In [1]:
import os
import re
from pathlib import Path
import itertools
import numpy as np
import pandas as pd

In [2]:
def numericalSort(value):
    """Split a string into alternating text and integer parts for natural sorting.

    Intended as a ``key`` function for ``sorted``/``list.sort`` so that names
    containing numbers order numerically (e.g. ``"img2"`` before ``"img10"``).

    Args:
        value: The string to split.

    Returns:
        A list whose even positions hold the non-digit text (possibly empty
        strings) and whose odd positions hold each digit run converted to
        ``int``.
    """
    # The capturing group must wrap the whole digit run: with r'(\d)+' only the
    # last digit of each run is captured, so "img12" would yield 2 instead of 12.
    numbers = re.compile(r'(\d+)')
    parts = numbers.split(value)
    # re.split places the captured digit runs at the odd indices.
    parts[1::2] = map(int, parts[1::2])
    return parts

def isMoreThanOneElement(array: list) -> bool:
    """Report whether ``array`` holds at least two elements.

    Args:
        array: Any sized collection (anything ``len`` accepts).

    Returns:
        ``True`` when the length is greater than one, ``False`` otherwise.
    """
    return len(array) > 1
    
def getFilePaths(dataset_dir: Path, labels: str) -> list:
    """List the ``.jpg`` file names stored in one label directory.

    Args:
        dataset_dir: Root directory of the dataset.
        labels: Name of the label subdirectory under ``dataset_dir``.

    Returns:
        The bare file names (no directory component) ending in ``.jpg``,
        sorted alphabetically so results are reproducible across runs.

    Raises:
        OSError: If the label directory is missing or is not a directory
            (propagated from ``os.listdir``).
    """
    # The dot is escaped so only a literal ".jpg" suffix matches; an unescaped
    # "." would also accept names such as "scanxjpg".
    jpg_suffix = re.compile(r'\.jpg$')
    # os.listdir returns entries in arbitrary order, hence the explicit sort.
    return sorted(f for f in os.listdir(dataset_dir / labels) if jpg_suffix.search(f))

### Step 1: Create Negative Pairs

In [51]:
# Groups of label IDs. Each ID is used (as a string) as a subdirectory name
# under the dataset directory, and every two distinct labels within a group
# are later combined into negative image pairs (similarity = 1).
# NOTE(review): presumably each group collects classes that are easy to
# confuse with one another -- confirm how the groups were chosen.
negative_pairs = [
    [64, 65, 71],
    [3, 4],
    [5, 6],
    [7, 8],
    [10, 11, 12],
    [14, 15],
    [16, 17],
    [19, 20],
    [22, 23, 24, 25, 26],
    [27, 28, 29],
    [32, 33],
    [34, 35],
    [68, 69, 72, 73, 74, 75, 79],
    [36, 37, 38, 39, 40],
    [42, 43],
    [45, 46, 47, 48],
    [49, 50, 51, 66],
    [56, 57],
    [76, 77],
    [62, 63, 67]
]

### Step 2: Expand Each Label Group into All Negative Label Pairs

In [52]:
# Replace each group of labels with the list of all unordered label pairs
# (2-combinations) that can be drawn from it.
for i, group in enumerate(negative_pairs):
    # A group whose elements are already tuples was expanded by an earlier run
    # of this cell. Expanding it again would build pairs of pairs and break the
    # following step, so leave it untouched to keep the cell safe to re-run.
    if group and isinstance(group[0], tuple):
        continue
    negative_pairs[i] = list(itertools.combinations(group, 2))

### Step 3: Create Image Negative Pairs

In [53]:
dataset_dir = Path("curated_dataset")

# Negative pairs: for every label pair, match each image of the first label
# with each image of the second label. The third field (1) is the similarity
# value written for negative pairs.
negative_rows = []
for label_pairs in negative_pairs:
    for pair in label_pairs:
        first_label, second_label = str(pair[0]), str(pair[1])
        first_images = getFilePaths(dataset_dir, first_label)
        second_images = getFilePaths(dataset_dir, second_label)
        negative_rows.extend(
            [first_image, second_image, 1, first_label, second_label]
            for first_image in first_images
            for second_image in second_images
        )
# Converting the mixed str/int rows to an array stores every field as a string.
image_negative_pairs = np.asarray(negative_rows)

In [54]:
# Preview the first five negative-pair rows: image1, image2, similarity, label1, label2.
image_negative_pairs[:5]

array([['20231017_135009.jpg', '20231017_135122.jpg', '1', '64', '65'],
       ['20231017_135009.jpg', '20231017_135128.jpg', '1', '64', '65'],
       ['20231017_135018.jpg', '20231017_135122.jpg', '1', '64', '65'],
       ['20231017_135018.jpg', '20231017_135128.jpg', '1', '64', '65'],
       ['20231017_135009.jpg', '20231017_140906.jpg', '1', '64', '71']],
      dtype='<U19')

### Step 4: Create Image Positive Pairs

In [86]:
dataset_dir = Path("curated_dataset")

# Positive pairs: every unordered pair of images that share a label. The third
# field (0) is the similarity value written for positive pairs.
image_positive_pairs = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and therefore the exported CSV) reproducible between runs.
for label in sorted(os.listdir(dataset_dir)):
    # Skip stray files in the dataset root: only directories can be listed for
    # images, and os.listdir raises on anything else.
    if not (dataset_dir / label).is_dir():
        continue

    filepath = getFilePaths(dataset_dir, label)

    # A label needs at least two images to form a pair.
    if isMoreThanOneElement(filepath):
        for combination in itertools.combinations(filepath, 2):
            image_positive_pairs.append([combination[0], combination[1], 0, label, label])
# Converting the mixed str/int rows to an array stores every field as a string.
image_positive_pairs = np.asarray(image_positive_pairs)

In [87]:
# Preview the first five positive-pair rows: image1, image2, similarity, label1, label2.
image_positive_pairs[:5]

array([['20231016_114833.jpg', '20231016_114838.jpg', '0', '10', '10'],
       ['20231016_145128.jpg', '20231016_145132.jpg', '0', '11', '11'],
       ['20231016_145630.jpg', '20231016_145637.jpg', '0', '12', '12'],
       ['20231016_114616.jpg', '20231016_114646.jpg', '0', '14', '14'],
       ['20231016_135246.jpg', '20231016_135308.jpg', '0', '15', '15']],
      dtype='<U19')

### Step 5: Combine the Positive and Negative Image Pairs

In [88]:
# Stack the positive rows on top of the negative rows (row-wise concatenation).
dataset = np.concatenate((image_positive_pairs, image_negative_pairs), axis=0)
df = pd.DataFrame(
    data=dataset,
    columns=['image1', 'image2', 'similarity', 'label1', 'label2'],
)
# Write the pair table to disk without the DataFrame index column.
df.to_csv('dataset.csv', index=False, encoding="utf-8")