In [1]:
import pandas as pd
import plotly.express as px
from sklearn.model_selection import StratifiedGroupKFold
import tqdm.auto as tqdm

In [2]:
# Plotly template name passed to every figure built in this notebook.
THEME: str = "simple_white"
# Seed handed to the cross-validation splitter so the split is reproducible.
SEED: int = 42

In [3]:
# Load the labelled image pairs from disk and preview the first ten rows.
train_data = pd.read_csv(filepath_or_buffer='train.csv')
train_data.head(n=10)

Unnamed: 0,image_url1,image_url2,is_same
0,https://storage.googleapis.com/lun-ua/images/8...,https://storage.googleapis.com/lun-ua/images/9...,0
1,https://storage.googleapis.com/lun-ua/images/9...,https://storage.googleapis.com/lun-ua/images/9...,1
2,https://storage.googleapis.com/lun-ua/images/8...,https://storage.googleapis.com/lun-ua/images/9...,0
3,https://storage.googleapis.com/lun-ua/images/9...,https://storage.googleapis.com/lun-ua/images/9...,1
4,https://storage.googleapis.com/lun-ua/images/9...,https://storage.googleapis.com/lun-ua/images/9...,1
5,https://storage.googleapis.com/lun-ua/images/9...,https://storage.googleapis.com/lun-ua/images/9...,0
6,https://storage.googleapis.com/lun-ua/images/8...,https://storage.googleapis.com/lun-ua/images/9...,0
7,https://storage.googleapis.com/lun-ua/images/8...,https://storage.googleapis.com/lun-ua/images/9...,0
8,https://storage.googleapis.com/lun-ua/images/9...,https://storage.googleapis.com/lun-ua/images/9...,1
9,https://storage.googleapis.com/lun-ua/images/9...,https://storage.googleapis.com/lun-ua/images/9...,0


In [4]:
# Class balance of the `is_same` label.
histogram_options = dict(x='is_same', title='Label distribution', template=THEME)
fig = px.histogram(train_data, **histogram_options)
fig.show()

In [5]:
# Reduce every image URL to its file name (the text after the last '/').
for url_col in ('image_url1', 'image_url2'):
    train_data[url_col] = train_data[url_col].apply(lambda url: url.rpartition('/')[2])
train_data

Unnamed: 0,image_url1,image_url2,is_same
0,892325437.jpg,944751814.jpg,0
1,965225293.jpg,965564035.jpg,1
2,892403612.jpg,927225968.jpg,0
3,917878082.jpg,921610429.jpg,1
4,941374542.jpg,941588763.jpg,1
...,...,...,...
90633,919255125.jpg,922397616.jpg,0
90634,940130111.jpg,944768264.jpg,0
90635,924310310.jpg,925806417.jpg,1
90636,927655303.jpg,931435054.jpg,1


In [6]:
# Count how often each file name occurs across both image columns
# (first column's names followed by the second column's names).
file_names = []
for url_col in ('image_url1', 'image_url2'):
    file_names.extend(train_data[url_col].tolist())
occurrences = pd.Series(file_names).value_counts()

# Keep only the names that occur more than once, as a two-column table.
duplicate_counts = (
    occurrences[occurrences > 1]
    .rename_axis('file_name')
    .reset_index(name='ammount')
)

# One bar per repeated file name; bar height is the number of occurrences.
fig = px.bar(
    duplicate_counts,
    x='file_name',
    y='ammount',
    title='Duplicate image distribution',
    template=THEME,
)
fig.update_traces(width=5)
fig.show()

In [7]:
# Create groups for StratifiedGroupKFold, step 1.
# Every row whose two images both occur only once in the data set gets its own
# group id (1, 2, 3, ... in row order). Rows that contain a repeated image keep
# the placeholder group 0 here.

# True for rows where at least one of the two images is listed in `duplicate_counts`.
duplicate_filter = train_data[['image_url1', 'image_url2']].isin(duplicate_counts['file_name'].tolist()).any(axis=1)
unique_rows = ~duplicate_filter

# A running count over the unique rows yields 1..n in row order; the masked
# (duplicate) rows are set to 0. This replaces a row-by-row `iterrows` loop.
train_data['group'] = unique_rows.cumsum().where(unique_rows, 0)

# Next unused group id; the cell that numbers the repeated images continues from it.
group_counter = int(unique_rows.sum()) + 1
train_data

Unnamed: 0,image_url1,image_url2,is_same,group
0,892325437.jpg,944751814.jpg,0,1
1,965225293.jpg,965564035.jpg,1,0
2,892403612.jpg,927225968.jpg,0,2
3,917878082.jpg,921610429.jpg,1,3
4,941374542.jpg,941588763.jpg,1,4
...,...,...,...,...
90633,919255125.jpg,922397616.jpg,0,0
90634,940130111.jpg,944768264.jpg,0,59270
90635,924310310.jpg,925806417.jpg,1,59271
90636,927655303.jpg,931435054.jpg,1,59272


In [8]:
# Rows that contain at least one repeated image.
train_data.loc[duplicate_filter]

Unnamed: 0,image_url1,image_url2,is_same,group
1,965225293.jpg,965564035.jpg,1,0
6,895585642.jpg,931334343.jpg,0,0
9,922392092.jpg,926902879.jpg,0,0
10,892607076.jpg,928668915.jpg,0,0
11,907987762.jpg,918548067.jpg,0,0
...,...,...,...,...
90618,913222774.jpg,925816126.jpg,0,0
90626,924358116.jpg,925851577.jpg,0,0
90627,965002283.jpg,965003616.jpg,1,0
90632,896077337.jpg,932363571.jpg,0,0


In [9]:
# Step 2: give every repeated image its own group id, continuing from
# `group_counter`, and index the table by file name so a group can be looked
# up directly from an image name.
duplicate_counts = (
    duplicate_counts
    .assign(group=duplicate_counts.index + group_counter)
    .set_index('file_name')
)
duplicate_counts

Unnamed: 0_level_0,ammount,group
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1
967187157.jpg,12,59274
966672455.jpg,10,59275
927147991.jpg,9,59276
928456758.jpg,9,59277
925879405.jpg,9,59278
...,...,...
893520477.jpg,2,75963
925511857.jpg,2,75964
939532093.jpg,2,75965
919455832.jpg,2,75966


In [10]:
# Step 3: for every row that contains a repeated image, collect the group ids of
# its repeated images (one or two ids per row), keyed by the row's index label.
index_group_map = {}
url_columns = ['image_url1', 'image_url2']
for row_label, image_pair in train_data.loc[duplicate_filter, url_columns].iterrows():
    index_group_map[row_label] = [
        duplicate_counts.at[file_name, 'group']
        for file_name in image_pair
        if file_name in duplicate_counts.index
    ]

In [11]:
# Step 4: merge groups that are linked through a common row.
# A row listing two group ids means both images must end up in the same fold, so
# the two groups are merged; merges chain transitively. After this cell every
# entry of `index_group_map` is a one-element list holding the smallest group id
# of the connected component the row belongs to.
#
# A union-find (disjoint-set) structure replaces the former "rescan every row for
# each merge" loop, which was quadratic in the number of rows.

# Maps a group id to its parent; a root maps to itself.
group_parent = {}


def _find_root(group):
    """Return the representative (smallest) group id of the set containing `group`."""
    root = group
    while group_parent.setdefault(root, root) != root:
        root = group_parent[root]
    # Path compression: point every visited id straight at the root.
    while group_parent[group] != root:
        next_group = group_parent[group]
        group_parent[group] = root
        group = next_group
    return root


# Union all group ids that occur together in a row, always keeping the smaller
# id as the root so the representative is the minimum of the merged set.
for row_groups in index_group_map.values():
    roots = {_find_root(group) for group in row_groups}
    if len(roots) > 1:
        smallest_root = min(roots)
        for root in roots:
            group_parent[root] = smallest_root

# Rewrite every row's list as its single representative group.
for index, row_groups in index_group_map.items():
    if row_groups:
        index_group_map[index] = [_find_root(row_groups[0])]

In [12]:
# Step 5: collapse each row's list of groups to its first entry, so the map
# goes from row index label straight to a single group id.
index_group_map.update(
    {row_label: row_groups[0] for row_label, row_groups in index_group_map.items()}
)

In [13]:
# Step 6: write the resolved group ids back into the dataframe.
# One label-based assignment replaces a per-row `.loc` loop; the values are
# listed in the same order as the row labels, so each row gets its own group.
if index_group_map:
    row_labels = list(index_group_map.keys())
    train_data.loc[row_labels, 'group'] = [index_group_map[label] for label in row_labels]

In [14]:
# Display the frame after the group ids have been written back.
train_data

Unnamed: 0,image_url1,image_url2,is_same,group
0,892325437.jpg,944751814.jpg,0,1
1,965225293.jpg,965564035.jpg,1,61555
2,892403612.jpg,927225968.jpg,0,2
3,917878082.jpg,921610429.jpg,1,3
4,941374542.jpg,941588763.jpg,1,4
...,...,...,...,...
90633,919255125.jpg,922397616.jpg,0,59346
90634,940130111.jpg,944768264.jpg,0,59270
90635,924310310.jpg,925806417.jpg,1,59271
90636,927655303.jpg,931435054.jpg,1,59272


In [15]:
# Validation share of the data. NOTE(review): no visible cell reads this value;
# the split cell hard-codes n_splits=5 instead - confirm whether it should be wired in.
VAL_SIZE: float = 0.2

In [16]:
# Stratify on the `is_same` label while keeping all rows that share a group id
# on the same side of the split.
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
fold_splits = cv.split(train_data[['image_url1', 'image_url2']], train_data['is_same'], train_data['group'])
# Only the first of the five folds is used as the train/validation split.
train_idxs, val_idxs = next(fold_splits)
# Last expression of the cell, so the split sizes are actually displayed
# (inside the former loop body this expression was silently discarded).
len(train_idxs), len(val_idxs)

In [17]:
# `train_idxs` / `val_idxs` come from `cv.split`, which yields positional row
# numbers, so select with `.iloc`. `.loc` is label-based and only returned the
# same rows as long as the index was the default 0..n-1 range.
fin_train_data, fin_val_data = train_data.iloc[train_idxs], train_data.iloc[val_idxs]
# Leakage guard: no group id may appear on both sides of the split.
assert set(fin_train_data['group']).isdisjoint(fin_val_data['group']), 'group leakage between train and validation splits'

In [18]:
# Persist the training part of the split (image file names and label only, no index).
fin_train_data.to_csv('train_split.csv', columns=['image_url1', 'image_url2', 'is_same'], index=False)

In [19]:
# Persist the validation part of the split (image file names and label only, no index).
# Fixed: this previously wrote `fin_train_data`, making val_split.csv a copy of the training split.
fin_val_data[["image_url1", "image_url2", "is_same"]].to_csv('val_split.csv', index=False)