In [24]:
import os
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from torchvision import transforms

In [25]:
# Root directory of the training data; train.csv is read from directly inside it.
train_data_path = '/opt/ml/input/data/train'
df = pd.read_csv(
    filepath_or_buffer=os.path.join(train_data_path, 'train.csv'),
)

In [26]:
# Remove the two columns that no later cell reads.
df = df.drop(columns=['race', 'id'])

# Encode gender as an integer code; values outside this mapping become NaN.
gender_codes = {'male': 0, 'female': 1}
df['gender'] = df['gender'].map(gender_codes)

# Bucket age into three classes. pd.cut uses right-closed intervals by default,
# so the buckets are (0, 29] -> 0, (29, 57] -> 1, (57, 100] -> 2; ages outside
# (0, 100] become NaN.
bins = [0, 29, 57, 100]
labels = [0, 1, 2]
df['age'] = pd.cut(df['age'], bins=bins, labels=labels)

In [28]:
# Number of rows falling into each age class after binning.
df.age.value_counts()

0    1281
1     983
2     436
Name: age, dtype: int64

In [29]:
def label(row):
    """Combine one row's age class and gender code into a single number.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            'age' and 'gender'.

    Returns:
        ``row['age'] + 3 * row['gender']``.
    """
    age_part = row['age']
    gender_part = 3 * row['gender']
    return age_part + gender_part

# Row-wise label built by `label` (age + 3 * gender).
df['label'] = df.apply(label, axis=1)

# Every row gets its own list of the seven image-name prefixes.
df['mask'] = [
    ['incorrect_mask', 'mask1', 'mask2', 'mask3', 'mask4', 'mask5', 'normal']
    for _ in range(len(df))
]

In [30]:
# Hold out 25% of the rows for validation, stratified on the label column;
# the fixed seed keeps the split reproducible.
df_train, df_valid = train_test_split(
    df,
    test_size=0.25,
    stratify=df['label'],
    random_state=42,
)

In [31]:
def generate_mask_field(df, data_root=None):
    """Expand a frame to one row per image and compute the combined label.

    Each input row must carry a list-like 'mask' column of file-name prefixes.
    The frame is exploded on that column, and for every resulting row:

    * 'path' is replaced by the full path of the first file (in sorted name
      order) inside ``<data_root>/images/<path>`` whose name starts with the
      row's mask prefix;
    * 'mask' is replaced by its class code from ``mask_map``;
    * 'label' is set to ``age + 3 * gender + 6 * mask``.

    Args:
        df: DataFrame with the columns 'age', 'gender', 'path' and 'mask'.
            It is not modified; ``explode`` returns a new frame.
        data_root: Directory that contains the ``images`` folder. When omitted,
            the notebook-level ``train_data_path`` is used.

    Returns:
        A new DataFrame with one row per (input row, mask prefix) pair. The
        index labels of the input rows are repeated, as produced by
        ``DataFrame.explode``.

    Raises:
        FileNotFoundError: If a directory is missing or contains no file whose
            name starts with the requested prefix.
    """
    if data_root is None:
        # Fall back to the dataset location defined at notebook level.
        data_root = train_data_path
    image_root = os.path.join(data_root, 'images')

    mask_map = {'incorrect_mask': 1, 'mask1': 0, 'mask2': 0, 'mask3': 0,
                'mask4': 0, 'mask5': 0, 'normal': 2}

    def _get_extension(row):
        # Only the file-name prefix is known, so the directory is scanned to
        # discover the actual file name (including its extension).
        person_dir = os.path.join(image_root, row['path'])
        # os.listdir returns entries in arbitrary order; sorting makes the
        # chosen file deterministic when several names share a prefix.
        for f in sorted(os.listdir(person_dir)):
            if f.startswith(row['mask']):
                return os.path.join(person_dir, f)
        raise FileNotFoundError(
            'no file starting with {!r} in {!r}'.format(row['mask'], person_dir)
        )

    def _label(row):
        return row['age'] + 3*row['gender'] + 6*row['mask']

    df = df.explode('mask')
    # 'path' must be resolved before 'mask' is mapped, because the lookup
    # needs the original string prefix.
    df['path'] = df.apply(_get_extension, axis=1)
    df['mask'] = df['mask'].map(mask_map)
    df['label'] = df.apply(_label, axis=1)

    return df

In [None]:
# Write the full frame to disk without the index column.
df.to_csv(path_or_buf='df_all.csv', index=False)

In [32]:
# Expand each split to one row per image. The split was made beforehand, so
# all rows derived from one input row stay in the same split.
df_train = generate_mask_field(df=df_train)
df_valid = generate_mask_field(df=df_valid)

In [33]:
# Write both expanded splits to disk without the index column.
df_train.to_csv(path_or_buf='df_train.csv', index=False)
df_valid.to_csv(path_or_buf='df_valid.csv', index=False)