In [2]:
from pathlib import Path

# Project locations, all expressed relative to the parent of the working directory.
BASE_PATH = Path('../')
PATH_TO_DATA = BASE_PATH.joinpath('data')
PATH_TO_MODELS = BASE_PATH.joinpath('checkpoints')

# Create both directories (and any missing parents); existing ones are left as they are.
for required_dir in (PATH_TO_DATA, PATH_TO_MODELS):
    required_dir.mkdir(parents=True, exist_ok=True)

#### <b>Load Libraries</b>

In [8]:
import os
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

#### <b>Download data</b>

In [17]:
os.chdir(PATH_TO_DATA)

!wget https://postackr-my.sharepoint.com/:u:/g/personal/dongbinna_postech_ac_kr/EbMhBPnmIb5MutZvGicPKggBWKm5hLs0iwKfGW7_TwQIKg?download=1 -O custom_korean_family_dataset_resolution_128.zip
!unzip custom_korean_fechamily_dataset_resolution_128.zip -d ./custom_korean_family_dataset_resolution_128

#### <b>Read Data</b>

In [5]:
# Folder the downloaded archive was extracted into.
dataset_dir = PATH_TO_DATA/'custom_korean_family_dataset_resolution_128'

custom_train_dataset = pd.read_csv(dataset_dir/'custom_train_dataset.csv')
# NOTE: the variable names below do not match the file names ("unseen" reads the
# test CSV, "test" reads the val CSV). All three frames are pooled right after,
# so the naming has no effect on the result.
custom_unseen_dataset = pd.read_csv(dataset_dir/'custom_test_dataset.csv')
custom_test_dataset = pd.read_csv(dataset_dir/'custom_val_dataset.csv')

# Pool everything into a single table with a fresh 0..n-1 index.
df = pd.concat(
    [custom_train_dataset, custom_unseen_dataset, custom_test_dataset],
    ignore_index=True,
)

#### <b>Do splits</b>

In [9]:
# Unique family ids in sorted order, so the seeded splits below do not depend
# on the row order of `df`.
inds = sorted(df['family_id'].unique().tolist())

# Family-level splits: 80/20 into train vs. held-out, the held-out part 50/50
# into test vs. unseen, and the train part 95/5 into retain vs. forget.
train_inds, test_unseen_inds = train_test_split(inds, test_size=0.20, random_state=42)
test_families, unseen_families = train_test_split(test_unseen_inds, test_size=0.50, random_state=42)
retain_families, forget_families = train_test_split(train_inds, test_size=0.05, random_state=42)

# Label every row with the split its family belongs to.
families_by_sample = {
    'retain': retain_families,
    'forget': forget_families,
    'unseen': unseen_families,
    'test': test_families,
}
for sample_name, families in families_by_sample.items():
    df.loc[df['family_id'].isin(families), 'sample'] = sample_name

In [24]:
# Preview the first rows, including the `sample` column assigned by the split step.
df.head()

Unnamed: 0,family_id,person_id,age_class,image_path,sample
0,F0001,D,a,F0001_AGE_D_18_a1.jpg,test
1,F0001,D,a,F0001_AGE_D_18_a2.jpg,test
2,F0001,D,a,F0001_AGE_D_18_a3.jpg,test
3,F0001,D,a,F0001_AGE_D_18_a4.jpg,test
4,F0001,D,b,F0001_AGE_D_18_b1.jpg,test


In [10]:
# Number of observations (rows) in each sample
sample_counts = df['sample'].value_counts().sort_index()
sample_counts.reset_index().set_index('sample').T

sample,forget,retain,test,unseen
count,547,9951,1296,1274


In [11]:
# Number of observations in each sample, as a percentage of all rows
sample_shares = df['sample'].value_counts(normalize=True).sort_index()
100 * sample_shares.reset_index().set_index('sample').T

sample,forget,retain,test,unseen
proportion,4.185797,76.147842,9.917355,9.749005


In [13]:
# Number of unique families in each sample
families_per_sample = df.groupby('sample')['family_id'].nunique()
families_per_sample.sort_index().reset_index().set_index('sample').T

sample,forget,retain,test,unseen
family_id,16,303,40,40


In [14]:
# Average number of photos per family in each sample
# (rows in the sample divided by the number of distinct family_id values)
rows_per_sample = df.groupby('sample')['family_id'].size()
families_per_sample = df.groupby('sample')['family_id'].nunique()
(rows_per_sample / families_per_sample).sort_index().reset_index().set_index('sample').T

sample,forget,retain,test,unseen
family_id,34.1875,32.841584,32.4,31.85


In [18]:
# Directory that receives one CSV manifest per shadow dataset.
shadow_datasets = PATH_TO_DATA/'shadow_manifests'
shadow_datasets.mkdir(exist_ok=True, parents=True)

num_shadows = 128
# Remaining "budget" per forget family, starting at num_shadows//2. It is
# decremented each time the family is selected for a shadow dataset.
# NOTE(review): presumably the goal is for every forget family to appear in
# about half of the shadow datasets -- confirm.
forget_counter = pd.Series(index=forget_families, data=num_shadows//2)

# 1 when the number of forget families is odd; in that case odd-indexed shadows
# take one extra forget family (see the `.iloc` slice below).
correction = 1 if len(forget_counter) % 2 != 0 else 0

for shadow_idx in range(num_shadows):
    
    # Shuffle the retain families with a per-shadow seed and keep the first 90%.
    bootstrap_retain_inds = np.random.RandomState(shadow_idx).permutation(retain_families)[:int(0.9*len(retain_families))]
    
    # Shuffle the budgets with the per-shadow seed, then stable-sort them in
    # descending order: families with the largest remaining budget come first and
    # ties keep their shuffled order. The first half of that ordering is selected.
    bootstrap_forget_inds = forget_counter\
        .sample(frac=1, random_state=shadow_idx)\
        .sort_values(ascending=False, kind='stable')\
        .iloc[:len(forget_counter)//2 + correction*(shadow_idx%2!=0)]\
        .index
    # Charge the selected families for this shadow.
    forget_counter[bootstrap_forget_inds] -= 1
    
    # Families (retain + forget) that make up this shadow dataset.
    bootstrap_inds = np.append(bootstrap_retain_inds, bootstrap_forget_inds)

    # Write every row of `df` whose family was selected; the file name is the
    # zero-padded shadow index (0000.csv, 0001.csv, ...).
    df.loc[df['family_id'].isin(bootstrap_inds)].to_csv(shadow_datasets/f'{shadow_idx:04}.csv', index=False)

# One manifest per split, written in this order. The train manifest is the
# union of the forget and retain rows.
manifest_samples = {
    'train_manifest.csv': ['forget', 'retain'],
    'retain_manifest.csv': ['retain'],
    'forget_manifest.csv': ['forget'],
    'test_manifest.csv': ['test'],
    'unseen_manifest.csv': ['unseen'],
}
for manifest_name, sample_names in manifest_samples.items():
    manifest_rows = df.loc[df['sample'].isin(sample_names)]
    manifest_rows.to_csv(PATH_TO_DATA/manifest_name, index=False)

! mkdir images; cp -r ./custom_korean_family_dataset_resolution_128/*_images/* ./images

mkdir: cannot create directory ‘images’: File exists


In [26]:
# For every shadow manifest, collect the forget families it contains.
forget_families_per_shadow = []
for shadow_idx in range(num_shadows):
    shadow_manifest = pd.read_csv(shadow_datasets/f'{shadow_idx:04}.csv')
    forget_rows = shadow_manifest[shadow_manifest['sample'] == 'forget']
    forget_families_per_shadow.append(forget_rows['family_id'].unique().tolist())

# After the transpose, each column corresponds to one shadow index.
data_split = pd.DataFrame(forget_families_per_shadow).T

# family_id -> list of shadow indices whose manifest includes that family.
data_split_dict = {
    family: data_split.columns[np.any(data_split == family, axis=0)].tolist()
    for family in forget_families
}

# One row per forget family, one column per position in its shadow-index list.
# NOTE: building a DataFrame from a dict of lists requires all lists to have the
# same length, i.e. every forget family must appear in the same number of shadows.
shadows_by_family = (
    pd.DataFrame(data_split_dict)
    .T
    .reset_index()
    .rename(columns={'index': 'family_id'})
)

# Expand from families to their images and index the table by image path.
data_split_df = (
    pd.merge(shadows_by_family, df[['family_id', 'image_path']])
    .drop(columns=['family_id'])
    .set_index('image_path')
)

# image_path -> list of shadow indices, persisted for later use.
image_to_shadows = dict(zip(data_split_df.index, data_split_df.values.tolist()))
with open(PATH_TO_DATA/'data_split_dict.pickle', 'wb') as file:
    pickle.dump(image_to_shadows, file)
    