In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

In [2]:
# read csv and remove irrelevant columns

In [3]:
# Directory that holds the training metadata file.
train_data_path = '/opt/ml/input/data/train'

# Load train.csv and discard the 'race' and 'id' columns, which are not used below.
df_original = pd.read_csv(
    os.path.join(train_data_path, 'train.csv')
).drop(columns=['race', 'id'])

In [4]:
## label correction (male -> female)

In [5]:
# Gender tallies before the fix (sort_index orders them by gender value).
print(df_original['gender'].value_counts().sort_index().to_numpy())

# Folders recorded as male that should be relabelled as female.
correction_female = [
    '001498-1_male_Asian_23',
    '004432_male_Asian_43',
    '005223_male_Asian_22',
]

# Boolean selector for the rows whose folder name is in the list above.
idxs = df_original['path'].isin(correction_female)
df_original['gender'] = df_original['gender'].mask(idxs, 'female')

# Tallies after the fix, for comparison with the ones printed above.
print(df_original['gender'].value_counts().sort_index().to_numpy())

[1658 1042]
[1661 1039]


In [6]:
## label correction (female -> male)

In [7]:
# Gender tallies before the fix (sort_index orders them by gender value).
print(df_original['gender'].value_counts().sort_index().to_numpy())

# Folders recorded as female that should be relabelled as male.
correction_male = [
    '001720_female_Asian_18',
    '006359_female_Asian_18',
    '006360_female_Asian_18',
    '006361_female_Asian_18',
    '006362_female_Asian_18',
    '006363_female_Asian_18',
    '006364_female_Asian_18',
]

# Boolean selector for the rows whose folder name is in the list above.
idxs = df_original['path'].isin(correction_male)
df_original['gender'] = df_original['gender'].mask(idxs, 'male')

# Tallies after the fix, for comparison with the ones printed above.
print(df_original['gender'].value_counts().sort_index().to_numpy())

[1661 1039]
[1654 1046]


In [8]:
# label data for stratifying

In [9]:
def label(row):
    """Fold one row's age bucket and gender code into a single stratification class.

    Returns ``age + 5 * gender``: with five age buckets (0-4) and gender codes
    0/1 this gives the classes 0-4 for gender code 0 and 5-9 for gender code 1.
    """
    return 5 * row['gender'] + row['age']


# Age bucket edges used only for the stratification label. pd.cut treats them
# as right-closed intervals: (0, 20], (20, 29], (29, 56], (56, 59], (59, 200].
bins = [0, 20, 29, 56, 59, 200]
labels = [0, 1, 2, 3, 4]

# Build an encoded copy; df_original keeps its raw gender strings and ages.
df = df_original.assign(
    gender=df_original['gender'].map({'male': 0, 'female': 1}),
    age=pd.cut(df_original['age'], bins=bins, labels=labels),
)

df['label'] = df.apply(label, axis=1)

In [10]:
# Kfold

In [11]:
# Stratification target: the combined age-bucket/gender class of every row.
# df was created as a copy of df_original, so both share the same row order.
t = df['label']

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

train_dfs, valid_dfs = [], []

# Only the labels drive the split, so a placeholder feature matrix of the
# right length is passed as X.
for train_index, test_index in skf.split(np.zeros(len(t)), t):
    # split() yields positional row numbers, so select with .iloc rather than
    # the label-based .loc (the two only coincide on a default RangeIndex).
    # .copy() gives every fold its own frame, so adding or overwriting columns
    # later never touches (or warns about) a slice of df_original.
    df_train = df_original.iloc[train_index].copy()
    df_valid = df_original.iloc[test_index].copy()
    train_dfs.append(df_train)
    valid_dfs.append(df_valid)

In [12]:
# further preprocessing for each fold

In [13]:
image_path = '/opt/ml/input/data/train/images'

def generate_mask_field(df, image_dir=None):
    """Expand every row into one row per image and resolve each image file.

    ``df`` must have a ``path`` column holding a folder name relative to the
    image root and a ``mask`` column holding a list of image names without
    extension (e.g. ``['mask1', 'normal']``).

    Args:
        df: input frame; it is not modified.
        image_dir: root folder that contains the per-row folders. Defaults to
            the module-level ``image_path``.

    Returns:
        A new DataFrame with one row per (input row, image name) pair, where
        ``path`` is the full path of the image file and ``mask`` is the class
        id: 0 for mask1-mask5, 1 for incorrect_mask, 2 for normal.

    Raises:
        FileNotFoundError: if a folder holds no file whose name, minus its
            extension, equals the requested image name.
    """
    root = image_path if image_dir is None else image_dir

    mask_map = {'incorrect_mask': 1, 'mask1': 0, 'mask2': 0, 'mask3': 0,
                'mask4': 0, 'mask5': 0, 'normal': 2}

    # Folder -> sorted file names. Every folder is requested once per image
    # name, so list it a single time; sorting makes the choice deterministic
    # if two files share a stem.
    listings = {}

    def _find_image(folder_name, mask_name):
        folder = os.path.join(root, folder_name)
        if folder not in listings:
            listings[folder] = sorted(os.listdir(folder))
        for fname in listings[folder]:
            # Compare the whole stem: a prefix test would let 'mask1' pick up
            # a file such as 'mask10.jpg'.
            if os.path.splitext(fname)[0] == mask_name:
                return os.path.join(folder, fname)
        raise FileNotFoundError(f"no image named '{mask_name}.*' in {folder}")

    df = df.explode('mask')
    # Positional list assignment: works on an empty frame and is unaffected by
    # the duplicated index labels that explode() leaves behind.
    df['path'] = [_find_image(folder_name, mask_name)
                  for folder_name, mask_name in zip(df['path'], df['mask'])]
    df['mask'] = df['mask'].map(mask_map)

    return df

def preprocess(df, cut_age=59):
    """Encode one fold's metadata and expand it to one row per image.

    Steps, applied to a private copy of ``df``:
      * keep the raw age in a new ``age_real`` column;
      * replace ``age`` with a class id: 0 for (0, 29], 1 for (29, cut_age],
        2 for (cut_age, 100]; ages outside these ranges become NaN;
      * map ``gender`` from 'male'/'female' to 0/1;
      * attach the seven image names to every row and let
        ``generate_mask_field`` explode them into individual image rows.

    Args:
        df: frame with at least ``path``, ``age`` and ``gender`` columns.
            It is not modified.
        cut_age: upper (inclusive) edge of the middle age class.

    Returns:
        The expanded, encoded DataFrame produced by ``generate_mask_field``.
    """
    # Work on a copy: the original overwrote columns of the caller's frame, so
    # running it twice on the same frame mapped the already-encoded gender to
    # NaN, and frames sliced from a parent triggered chained-assignment warnings.
    df = df.copy()

    df['age_real'] = df['age']

    bins = [0, 29, cut_age, 100]
    labels = [0, 1, 2]
    df['age'] = pd.cut(df['age'], bins=bins, labels=labels)

    df['gender'] = df['gender'].map({'male':0, 'female':1})

    # One independent list per row; generate_mask_field explodes this column.
    df['mask'] = [['incorrect_mask', 'mask1', 'mask2', 'mask3', 'mask4', 'mask5', 'normal'] for _ in range(len(df))]
    df = generate_mask_field(df)

    return df

In [14]:
# Run the per-fold preprocessing (default cut_age) over every training and
# validation frame, keeping the fold order.
train_dfs = list(map(preprocess, train_dfs))
valid_dfs = list(map(preprocess, valid_dfs))

In [15]:
# save

In [16]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing (a no-op when it is already there).
os.makedirs('fold', exist_ok=True)

# Write each fold's train/validation frame; file names are numbered from 1.
for fold_no, (tdf, vdf) in enumerate(zip(train_dfs, valid_dfs), start=1):
    tdf.to_csv(f'fold/df_train_fold{fold_no}.csv', index=False)
    vdf.to_csv(f'fold/df_valid_fold{fold_no}.csv', index=False)