In [1]:
from pathlib import Path

BASE_PATH = Path('../')
PATH_TO_DATA = BASE_PATH/'data'
PATH_TO_MODELS = BASE_PATH/'checkpoints'

PATH_TO_DATA.mkdir(exist_ok=True, parents=True)
PATH_TO_MODELS.mkdir(exist_ok=True, parents=True)

#### <b>Load Libraries</b>

In [2]:
import os
import numpy as np
import pandas as pd

#### <b>Download data</b>

In [3]:
os.chdir(PATH_TO_DATA)

!wget https://postechackr-my.sharepoint.com/:u:/g/personal/dongbinna_postech_ac_kr/Eb37jNPPA7hHl0fmktYqcV8B-qmPLx-ZKYQ1eFk4UPBV_A?download=1 -O CelebAMask-HQ.zip
!wget https://postechackr-my.sharepoint.com/:t:/g/personal/dongbinna_postech_ac_kr/EVRoUY8_txRFv56-KWvZrksBDWbD6adkjBxwwRN7qAC6bg?download=1 -O CelebA-HQ-identity.txt
!wget https://postechackr-my.sharepoint.com/:t:/g/personal/dongbinna_postech_ac_kr/EVrdIrPOkR1OlEWBVK8lE3AB9bFh741GnKBkNgPa8trNuA?download=1 -O CelebA-HQ-attribute.txt

%%capture
!rm -rf ./CelebAMask-HQ
!unzip CelebAMask-HQ.zip

zsh:1: no matches found: https://postechackr-my.sharepoint.com/:u:/g/personal/dongbinna_postech_ac_kr/Eb37jNPPA7hHl0fmktYqcV8B-qmPLx-ZKYQ1eFk4UPBV_A?download=1
zsh:1: no matches found: https://postechackr-my.sharepoint.com/:t:/g/personal/dongbinna_postech_ac_kr/EVRoUY8_txRFv56-KWvZrksBDWbD6adkjBxwwRN7qAC6bg?download=1
zsh:1: no matches found: https://postechackr-my.sharepoint.com/:t:/g/personal/dongbinna_postech_ac_kr/EVrdIrPOkR1OlEWBVK8lE3AB9bFh741GnKBkNgPa8trNuA?download=1


UsageError: Line magic function `%%capture` not found.


#### <b>Read data</b>

In [73]:
identity = pd.read_csv(
    PATH_TO_DATA/'CelebA-HQ-identity.txt', 
    header=None, 
    sep=' ', 
    names=['filename', 'identity']
)

label_num = 40  # "male": 21, "smiling": 32, "young": 40

attribute = pd.read_csv(
    PATH_TO_DATA/'CelebA-HQ-attribute.txt', 
    header=None, 
    sep=' ', 
    skiprows=2, 
    usecols=[0, label_num], 
    names=['filename', 'label']
)
attribute['label'] = np.where(attribute['label'] == -1, 0, 1)

df = identity.merge(attribute, on='filename')

display(df)

Unnamed: 0,filename,identity,label
0,0.jpg,0,0
1,1.jpg,1,0
2,2.jpg,2,0
3,3.jpg,3,0
4,4.jpg,4,0
...,...,...,...
29995,29995.jpg,5024,0
29996,29996.jpg,2271,0
29997,29997.jpg,1514,0
29998,29998.jpg,2875,0


#### <b>Do splits</b>

In [76]:
def split_data(x):
    if x <= 190:
        return 'test'
    if x <= 1600:
        return 'shadow'
    if x <= 1700:
        return 'forget'
    if x <= 4855:
        return 'retain'
    else:
        return 'unseen'


df['sample'] = df['identity'].apply(split_data)
df['sample'].value_counts().reset_index().T


Unnamed: 0,0,1,2,3,4
sample,shadow,retain,test,unseen,forget
count,12878,12382,2076,1999,665


In [84]:
shadow_datasets = PATH_TO_DATA/'shadow_manifests'
shadow_datasets.mkdir(exist_ok=True, parents=True)

identities = sorted(df.loc[df['sample'].eq('shadow'), 'identity'].unique())
for idx in range(128):
    random_identities = np.unique(np.random.RandomState(idx).choice(identities, size=len(identities)))
    df.loc[df['identity'].isin(random_identities), ['filename', 'label']].to_csv(shadow_datasets/f'{idx:04}.csv', index=False)
 

df.loc[df['sample'].isin(['forget', 'retain']), ['filename', 'label']].to_csv(PATH_TO_DATA/'train_manifest.csv', index=False)
df.loc[df['sample'].eq('retain'), ['filename', 'label']].to_csv(PATH_TO_DATA/'retain_manifest.csv', index=False)
df.loc[df['sample'].eq('forget'), ['filename', 'label']].to_csv(PATH_TO_DATA/'forget_manifest.csv', index=False)
df.loc[df['sample'].eq('test'), ['filename', 'label']].to_csv(PATH_TO_DATA/'test_manifest.csv', index=False)
df.loc[df['sample'].eq('unseen'), ['filename', 'label']].to_csv(PATH_TO_DATA/'unseen_manifest.csv', index=False)

! mkdir images; cp -r ./CelebAMask-HQ/CelebA-HQ-img/ ./images

mkdir: images: File exists
