In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

people = pd.read_csv('/opt/ml/input/data/train/train.csv')
people.head()

Unnamed: 0,id,gender,race,age,path
0,1,female,Asian,45,000001_female_Asian_45
1,2,female,Asian,52,000002_female_Asian_52
2,4,male,Asian,54,000004_male_Asian_54
3,5,female,Asian,58,000005_female_Asian_58
4,6,female,Asian,59,000006_female_Asian_59


Split for Age Detection

In [8]:
def map_age(x):
    if x < 30:
        return 0
    elif x < 60:
        return 1
    else:
        return 2
    
people['age'] = people['age'].apply(map_age)
people['age'].value_counts().sort_index()

0    1281
1    1227
2     192
Name: age, dtype: int64

In [40]:
train, valid = train_test_split(people, test_size=0.1, random_state=131, stratify=people['age'])

In [42]:
from glob import glob
import os

def make_full_path(labels, pred, base):
    data = []
    for path, prd in zip(labels['path'], labels[pred]):
        img_folder = glob(os.path.join(base, path, '*'))
        for img in img_folder:
            img_dict = {}
            img_dict['path'] = img
            img_dict[pred] = prd
            data.append(img_dict)
    return pd.DataFrame(data)

In [43]:
base_dir = '/opt/ml/input/data/train/images/'
train = make_full_path(train, 'age', base_dir)
valid = make_full_path(valid, 'age', base_dir)

In [37]:
os.makedirs('/opt/ml/split_labels/age/')

In [48]:
train.to_csv('/opt/ml/split_labels/age/train.csv', index=False)
valid.to_csv('/opt/ml/split_labels/age/valid.csv', index=False)

Split for Gender Detection

In [49]:
os.makedirs('/opt/ml/split_labels/gender/')

In [50]:
def gender_map(x):
    if x == 'male':
        return 0
    return 1

people['gender'] = people['gender'].apply(gender_map)
people.head(), len(people)

(       id  gender   race  age                    path
 0  000001       1  Asian    1  000001_female_Asian_45
 1  000002       1  Asian    1  000002_female_Asian_52
 2  000004       0  Asian    1    000004_male_Asian_54
 3  000005       1  Asian    1  000005_female_Asian_58
 4  000006       1  Asian    1  000006_female_Asian_59,
 2700)

In [51]:
train, valid = train_test_split(people, test_size=0.1, random_state=77, stratify=people['gender'])

In [52]:
base_dir = '/opt/ml/input/data/train/images/'
train = make_full_path(train, 'gender', base_dir)
valid = make_full_path(valid, 'gender', base_dir)

In [56]:
female_male = ['006359', '006360', '006361', '006362', '006363', '006364']
male_female = ['001498-1', '004432']
train['id'] = [path.split('/')[-2].split('_')[0] for path in train['path']]
valid['id'] = [path.split('/')[-2].split('_')[0] for path in valid['path']]
train.loc[train['id'].isin(female_male),'gender'] = 0
valid.loc[valid['id'].isin(female_male),'gender'] = 0
train.loc[train['id'].isin(male_female),'gender'] = 1
valid.loc[valid['id'].isin(male_female),'gender'] = 1

(17010, 1890)

In [59]:
train = train.drop('id', axis=1)
valid = valid.drop('id', axis=1)

Unnamed: 0,path,gender
0,/opt/ml/input/data/train/images/000278_female_...,1
1,/opt/ml/input/data/train/images/000278_female_...,1
2,/opt/ml/input/data/train/images/000278_female_...,1
3,/opt/ml/input/data/train/images/000278_female_...,1
4,/opt/ml/input/data/train/images/000278_female_...,1
...,...,...
17005,/opt/ml/input/data/train/images/004259_male_As...,0
17006,/opt/ml/input/data/train/images/004259_male_As...,0
17007,/opt/ml/input/data/train/images/004259_male_As...,0
17008,/opt/ml/input/data/train/images/004259_male_As...,0


In [60]:
train.to_csv('/opt/ml/split_labels/gender/train.csv', index=False)
valid.to_csv('/opt/ml/split_labels/gender/valid.csv', index=False)