In [2]:
from pathlib import Path

# Resolve the project root to an absolute path once. A later cell calls
# os.chdir(PATH_TO_DATA); with purely relative paths every subsequent use of
# PATH_TO_DATA / PATH_TO_MODELS would be re-interpreted against the new
# working directory and only keep working by coincidence of the layout.
BASE_PATH = Path('../').resolve()
PATH_TO_DATA = BASE_PATH/'data'
PATH_TO_MODELS = BASE_PATH/'checkpoints'

# Create both directories up front; exist_ok makes the cell safe to re-run.
PATH_TO_DATA.mkdir(exist_ok=True, parents=True)
PATH_TO_MODELS.mkdir(exist_ok=True, parents=True)

#### <b>Load Libraries</b>

In [3]:
import os
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

#### <b>Download data</b>

In [5]:
os.chdir(PATH_TO_DATA)

download = False
if download:
    !wget https://postackr-my.sharepoint.com/:u:/g/personal/dongbinna_postech_ac_kr/EbMhBPnmIb5MutZvGicPKggBWKm5hLs0iwKfGW7_TwQIKg?download=1 -O custom_korean_family_dataset_resolution_128.zip
    !unzip custom_korean_fechamily_dataset_resolution_128.zip -d ./custom_korean_family_dataset_resolution_128

    ! mkdir images; cp -r ./custom_korean_family_dataset_resolution_128/*_images/* ./images


#### <b>Read Data</b>

In [6]:
# Load the three CSV manifests shipped with the dataset and pool them into one
# frame. Note that the variable names do not mirror the file names
# (custom_unseen_dataset <- test CSV, custom_test_dataset <- val CSV); all
# three are concatenated into `df`, and the split cell assigns its own labels.
dataset_dir = PATH_TO_DATA/'custom_korean_family_dataset_resolution_128'
custom_train_dataset, custom_unseen_dataset, custom_test_dataset = (
    pd.read_csv(dataset_dir/csv_name)
    for csv_name in (
        'custom_train_dataset.csv',
        'custom_test_dataset.csv',
        'custom_val_dataset.csv',
    )
)

df = pd.concat(
    [custom_train_dataset, custom_unseen_dataset, custom_test_dataset],
    ignore_index=True,
)

#### <b>Do splits</b>

In [11]:
# Unique family ids in sorted order, so the seeded splits below always start
# from the same sequence.
inds = sorted(df['family_id'].unique().tolist())

# All splits are done on family ids, so a family never spans two samples.
# 80% train / 20% held out; the held-out part is halved into test and unseen.
train_inds, test_unseen_inds = train_test_split(inds, test_size=0.20, random_state=42)
test_families, unseen_families = train_test_split(test_unseen_inds, test_size=0.50, random_state=42)
# Half of the train families go on to retain/forget (5% of them are "forget");
# the other half receives no explicit label and ends up as 'shadow' below.
train_families, _ = train_test_split(train_inds, test_size=0.50, random_state=42)
retain_families, forget_families = train_test_split(train_families, test_size=0.05, random_state=42)

# Tag every row with the sample its family belongs to.
df['sample'] = None
for sample_label, sample_families in (
    ('retain', retain_families),
    ('forget', forget_families),
    ('unseen', unseen_families),
    ('test', test_families),
):
    df.loc[df['family_id'].isin(sample_families), 'sample'] = sample_label
# Rows whose family was not assigned above are labelled 'shadow'.
df['sample'] = df['sample'].fillna('shadow')

In [8]:
# Preview the first five rows of the pooled frame.
df.head(n=5)

Unnamed: 0,family_id,person_id,age_class,image_path,sample
0,F0001,D,a,F0001_AGE_D_18_a1.jpg,test
1,F0001,D,a,F0001_AGE_D_18_a2.jpg,test
2,F0001,D,a,F0001_AGE_D_18_a3.jpg,test
3,F0001,D,a,F0001_AGE_D_18_a4.jpg,test
4,F0001,D,b,F0001_AGE_D_18_b1.jpg,test


In [12]:
# Number of observations (rows) in each sample
(
    df['sample']
    .value_counts()
    .sort_index()
    .reset_index()
    .set_index('sample')
    .T
)

sample,forget,retain,shadow,test,unseen
count,270,4876,5352,1296,1274


In [13]:
# Number of observations in each sample, as a percentage of all rows
sample_share = (
    df['sample']
    .value_counts(normalize=True)
    .sort_index()
    .reset_index()
    .set_index('sample')
    .T
)
100 * sample_share

sample,forget,retain,shadow,test,unseen
proportion,2.066116,37.312519,40.955005,9.917355,9.749005


In [14]:
# Target rate: share of each age_class within every sample
(
    df.groupby(['sample'])['age_class']
    .value_counts(normalize=True)
    .unstack()
    .sort_index()
)

age_class,a,b,c,d,e,f,g,h
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
forget,0.225926,0.203704,0.174074,0.125926,0.159259,0.074074,0.025926,0.011111
retain,0.230722,0.21411,0.168786,0.125513,0.136177,0.069729,0.037326,0.017637
shadow,0.213004,0.202728,0.176009,0.127429,0.141629,0.076794,0.038677,0.023729
test,0.202932,0.190586,0.185185,0.13966,0.150463,0.072531,0.040123,0.018519
unseen,0.239403,0.193878,0.172684,0.125589,0.130298,0.071429,0.040816,0.025903


In [15]:
# Number of unique families in each sample
(
    df.groupby('sample')['family_id']
    .apply(lambda x: len(set(x)))
    .sort_index()
    .reset_index()
    .set_index('sample')
    .T
)

sample,forget,retain,shadow,test,unseen
family_id,8,151,160,40,40


In [16]:
# Average number of photos per family_id in each sample:
# rows per sample divided by distinct families per sample.
rows_per_sample = df.groupby(['sample'])['family_id'].apply(lambda x: len(x))
distinct_families_per_sample = df.groupby(['sample'])['family_id'].apply(lambda x: len(set(x)))

(
    (rows_per_sample / distinct_families_per_sample)
    .sort_index()
    .reset_index()
    .set_index('sample')
    .T
)

sample,forget,retain,shadow,test,unseen
family_id,33.75,32.291391,33.45,32.4,31.85


In [17]:
# Directory that receives one CSV manifest per shadow split.
shadow_datasets = PATH_TO_DATA/'shadow_manifests'
shadow_datasets.mkdir(exist_ok=True, parents=True)

num_shadows = 128
# Remaining number of shadow splits each train family can still be placed in;
# every family starts with a budget of num_shadows // 2.
counter = pd.Series(index=train_inds, data=num_shadows//2)

# With an odd number of families, odd-numbered shadows take one extra family.
correction = 1 if len(counter) % 2 != 0 else 0

for shadow_idx in range(num_shadows):

    split_size = len(counter)//2 + (correction if shadow_idx % 2 != 0 else 0)

    # Shuffle with a per-shadow seed, then stable-sort by remaining budget:
    # families with the most budget left come first, ties stay in shuffled order.
    shuffled = counter.sample(frac=1, random_state=shadow_idx)
    ranked = shuffled.sort_values(ascending=False, kind='stable')
    bootstrap_inds = ranked.iloc[:split_size].index
    # Each selected family spends one unit of its budget.
    counter[bootstrap_inds] -= 1

    in_shadow = df['family_id'].isin(bootstrap_inds)
    df.loc[in_shadow].to_csv(shadow_datasets/f'{shadow_idx:04}.csv', index=False)

# Manifests for the fixed samples: file name -> sample labels it contains.
manifest_samples = {
    'train_manifest.csv': ['forget', 'retain'],
    'retain_manifest.csv': ['retain'],
    'forget_manifest.csv': ['forget'],
    'test_manifest.csv': ['test'],
    'unseen_manifest.csv': ['unseen'],
}
for manifest_name, sample_labels in manifest_samples.items():
    df.loc[df['sample'].isin(sample_labels)].to_csv(PATH_TO_DATA/manifest_name, index=False)

In [18]:
# Families listed in each shadow manifest, one list per shadow index.
families_per_shadow = [
    pd.read_csv(shadow_datasets/f'{shadow_idx:04}.csv')['family_id'].unique().tolist()
    for shadow_idx in range(num_shadows)
]

# Transposed so that each column corresponds to one shadow index.
data_split = pd.DataFrame(families_per_shadow).T

# For every train family: the shadow indices (columns) whose manifest contains it.
data_split_dict = {
    family: data_split.columns[np.any(data_split == family, axis=0)].tolist()
    for family in train_inds
}

# One row per family, then expanded to one row per image via the merge on
# family_id, and finally indexed by image_path.
data_split_df = (
    pd.DataFrame(data_split_dict)
    .T
    .reset_index()
    .rename(columns={'index': 'family_id'})
)
data_split_df = (
    pd.merge(data_split_df, df[['family_id', 'image_path']])
    .drop(columns=['family_id'])
    .set_index('image_path')
)

# Persist the mapping image_path -> list of shadow indices.
image_to_shadows = dict(zip(data_split_df.index, data_split_df.values.tolist()))
with open(PATH_TO_DATA/'data_split_dict.pickle', 'wb') as file:
    pickle.dump(image_to_shadows, file)
    