In [1]:
from pathlib import Path

# Resolve the project root to an absolute path once, so that every derived
# path keeps pointing at the same directory even if the working directory of
# the kernel changes later (the download cell calls os.chdir).
BASE_PATH = Path('../').resolve()
PATH_TO_DATA = BASE_PATH/'data'            # raw data and generated manifests
PATH_TO_MODELS = BASE_PATH/'checkpoints'   # checkpoints directory

# Create both directories up front; exist_ok makes re-running this cell safe.
PATH_TO_DATA.mkdir(exist_ok=True, parents=True)
PATH_TO_MODELS.mkdir(exist_ok=True, parents=True)

#### <b>Load libraries</b>

In [2]:
import os
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

#### <b>Download data</b>

In [3]:
os.chdir(PATH_TO_DATA)

download = False
if download:
    !wget https://postechackr-my.sharepoint.com/:u:/g/personal/dongbinna_postech_ac_kr/Eb37jNPPA7hHl0fmktYqcV8B-qmPLx-ZKYQ1eFk4UPBV_A?download=1 -O CelebAMask-HQ.zip
    !wget https://postechackr-my.sharepoint.com/:t:/g/personal/dongbinna_postech_ac_kr/EVRoUY8_txRFv56-KWvZrksBDWbD6adkjBxwwRN7qAC6bg?download=1 -O CelebA-HQ-identity.txt
    !wget https://postechackr-my.sharepoint.com/:t:/g/personal/dongbinna_postech_ac_kr/EVrdIrPOkR1OlEWBVK8lE3AB9bFh741GnKBkNgPa8trNuA?download=1 -O CelebA-HQ-attribute.txt

    !rm -rf ./CelebAMask-HQ
    !unzip CelebAMask-HQ.zip
    ! mkdir images; cp -r ./CelebAMask-HQ/CelebA-HQ-img/* ./images

#### <b>Read data</b>

In [4]:
# Options shared by both space-separated, header-less annotation files.
read_options = dict(header=None, sep=' ')

# filename -> identity id
identity = pd.read_csv(
    PATH_TO_DATA/'CelebA-HQ-identity.txt',
    names=['filename', 'identity'],
    **read_options,
)

# Number of the attribute used as the target.
# Known choices: "male": 21, "smiling": 32, "young": 40
label_num = 40

# Only the filename column and the selected attribute column are read.
# NOTE(review): the "+1" offset presumably compensates for an extra empty
# column produced by a double space after the filename - confirm against the file.
attribute = pd.read_csv(
    PATH_TO_DATA/'CelebA-HQ-attribute.txt',
    names=['filename', 'label'],
    usecols=[0, label_num+1],
    skiprows=2,
    **read_options,
)
# Binary target: 1 where the raw attribute equals -1, 0 otherwise.
attribute['label'] = (attribute['label'] == -1).astype(int)

# One row per image: filename, identity, label.
df = identity.merge(attribute, on='filename')

display(df)

Unnamed: 0,filename,identity,label
0,0.jpg,0,0
1,1.jpg,1,0
2,2.jpg,2,0
3,3.jpg,3,0
4,4.jpg,4,0
...,...,...,...
29995,29995.jpg,5024,1
29996,29996.jpg,2271,0
29997,29997.jpg,1514,0
29998,29998.jpg,2875,0


#### <b>Do splits</b>

In [5]:
# Sorted list of identity ids, so the seeded splits below are reproducible.
inds = sorted(df['identity'].unique().tolist())

# 80% of the identities form the train pool; the other 20% is halved into test / unseen.
train_inds, test_unseen_inds = train_test_split(inds, random_state=42, test_size=0.20)
test_identities, unseen_identities = train_test_split(test_unseen_inds, random_state=42, test_size=0.50)
# Only half of the train pool is used for the actual training set ...
train_identities, _ = train_test_split(train_inds, random_state=42, test_size=0.50)
# ... which is split 95% / 5% into retain / forget identities.
retain_identities, forget_identities = train_test_split(train_identities, random_state=42, test_size=0.05)

# Tag every image with the sample its identity belongs to.
df['sample'] = None
sample_identities = {
    'retain': retain_identities,
    'forget': forget_identities,
    'unseen': unseen_identities,
    'test': test_identities,
}
for sample_name, sample_ids in sample_identities.items():
    df.loc[df['identity'].isin(sample_ids), 'sample'] = sample_name
# Identities in none of the groups above (the unused half of the train pool) become 'shadow'.
df['sample'] = df['sample'].fillna('shadow')


In [6]:
# Number of observations in each sample

df['sample'].value_counts().sort_index().to_frame().T

sample,forget,retain,shadow,test,unseen
count,570,11421,11864,2983,3162


In [7]:
# Share of observations in each sample (in percent)

100 * df['sample'].value_counts(normalize=True).sort_index().to_frame().T

sample,forget,retain,shadow,test,unseen
proportion,1.9,38.07,39.546667,9.943333,10.54


In [8]:
# Target rate (mean label) in each sample

df.groupby('sample')['label'].mean().sort_index().to_frame().T

sample,forget,retain,shadow,test,unseen
label,0.298246,0.226775,0.212492,0.205498,0.233397


In [9]:
# Number of unique identities in each sample

df.groupby('sample')['identity'].nunique().sort_index().to_frame().T

sample,forget,retain,shadow,test,unseen
identity,125,2361,2487,622,622


In [10]:
# Average number of photos per identity in each sample

identity_by_sample = df.groupby(['sample'])['identity']
(
    identity_by_sample.size() / identity_by_sample.nunique()
).sort_index().to_frame().T

sample,forget,retain,shadow,test,unseen
identity,4.56,4.837357,4.770406,4.79582,5.083601


In [11]:
# Folder that receives one CSV manifest per shadow dataset.
shadow_datasets = PATH_TO_DATA/'shadow_manifests'
shadow_datasets.mkdir(parents=True, exist_ok=True)

num_shadows = 128
# Remaining quota per train-pool identity: each starts with num_shadows // 2 slots.
counter = pd.Series(index=train_inds, data=num_shadows//2)

# 1 when the number of identities is odd, else 0; applied on odd shadow
# indices so those shadows take one extra identity.
correction = len(counter) % 2

for shadow_idx in range(num_shadows):
    # Number of identities drawn into this shadow dataset.
    n_selected = len(counter)//2 + correction*(shadow_idx % 2 != 0)

    # Shuffle (seeded by the shadow index), then stable-sort by remaining quota:
    # identities with the most quota left come first, ties stay in shuffled order.
    bootstrap_inds = (
        counter
        .sample(frac=1, random_state=shadow_idx)
        .sort_values(ascending=False, kind='stable')
        .iloc[:n_selected]
        .index
    )
    # Selected identities use up one slot of their quota.
    counter[bootstrap_inds] -= 1

    # Manifest rows: every image whose identity was selected for this shadow.
    df.loc[df['identity'].isin(bootstrap_inds)].to_csv(shadow_datasets/f'{shadow_idx:04}.csv', index=False)

# Manifest file name -> sample tags whose rows it contains.
manifest_samples = {
    'train_manifest.csv': ['forget', 'retain'],
    'retain_manifest.csv': ['retain'],
    'forget_manifest.csv': ['forget'],
    'test_manifest.csv': ['test'],
    'unseen_manifest.csv': ['unseen'],
}
for manifest_name, sample_names in manifest_samples.items():
    df.loc[df['sample'].isin(sample_names)].to_csv(PATH_TO_DATA/manifest_name, index=False)

In [12]:
# Build, for every train-pool identity, the list of shadow datasets (by index)
# whose manifest contains it. The mapping is filled while the manifests are
# read, instead of re-scanning the whole identity table once per identity.
data_split = [None for _ in range(num_shadows)]
data_split_dict = {identity_id: [] for identity_id in train_inds}
for shadow_idx in range(num_shadows):
    # Only the identity column is needed from each manifest.
    shadow_manifest = pd.read_csv(shadow_datasets/f'{shadow_idx:04}.csv', usecols=['identity'])
    data_split[shadow_idx] = shadow_manifest['identity'].unique().tolist()
    for identity_id in data_split[shadow_idx]:
        # Identities outside the train pool are ignored; shadow indices are
        # appended in increasing order.
        if identity_id in data_split_dict:
            data_split_dict[identity_id].append(shadow_idx)

# One column per shadow dataset holding its identity ids; shorter lists are
# padded with NaN by the DataFrame constructor.
data_split = pd.DataFrame(data_split).T

# One row per identity with its shadow indices as columns. This requires every
# identity to appear in the same number of shadow datasets.
data_split_df = pd.DataFrame(data_split_dict).T.reset_index().rename(columns={'index': 'identity'})
# Expand from identities to image files: one row per filename.
data_split_df = (
    pd.merge(data_split_df, df[['identity', 'filename']], on='identity')
    .drop(columns=['identity'])
    .set_index('filename')
)

# Persist filename -> list of shadow dataset indices containing that image.
with open(PATH_TO_DATA/'data_split_dict.pickle', 'wb') as file:
    pickle.dump(
        {filename: datasets.tolist() for filename, datasets in zip(data_split_df.index, data_split_df.values)},
        file
    )
    