In [1]:
import os
import PIL
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn

import torch

In [2]:
# Root directory of the training split; train.csv is read from directly inside it.
path = '/opt/ml/input/data/train'
csv_path = os.path.join(path, 'train.csv')

# Load the metadata table; its 'path', 'gender' and 'age' columns are used by the cells below.
df = pd.read_csv(csv_path)
# Bare expression as the last line so the notebook renders the frame.
df

Unnamed: 0,id,gender,race,age,path
0,000001,female,Asian,45,000001_female_Asian_45
1,000002,female,Asian,52,000002_female_Asian_52
2,000004,male,Asian,54,000004_male_Asian_54
3,000005,female,Asian,58,000005_female_Asian_58
4,000006,female,Asian,59,000006_female_Asian_59
...,...,...,...,...,...
2695,006954,male,Asian,19,006954_male_Asian_19
2696,006955,male,Asian,19,006955_male_Asian_19
2697,006956,male,Asian,19,006956_male_Asian_19
2698,006957,male,Asian,20,006957_male_Asian_20


In [3]:
# Build one directory path per metadata row: <path>/images/<value of the 'path' column>.
images_root = os.path.join(path, 'images')
img_dir_list = [os.path.join(images_root, folder_name) for folder_name in df['path']]
# Preview the first few entries.
img_dir_list[:5]

['/opt/ml/input/data/train/images/000001_female_Asian_45',
 '/opt/ml/input/data/train/images/000002_female_Asian_52',
 '/opt/ml/input/data/train/images/000004_male_Asian_54',
 '/opt/ml/input/data/train/images/000005_female_Asian_58',
 '/opt/ml/input/data/train/images/000006_female_Asian_59']

In [4]:
# image size
# Open one sample from the first image folder and print its (width, height).
sample_dir = img_dir_list[0]  # entries of img_dir_list are already complete paths
# os.listdir returns names in arbitrary order, so sort them and drop '._' entries
# (the labelling loop skips those files too) to pick the same real image every run.
sample_files = sorted(name for name in os.listdir(sample_dir) if '._' not in name)
# The context manager releases the file handle once the size has been read.
with PIL.Image.open(os.path.join(sample_dir, sample_files[0])) as img:
    print(img.size)

(384, 512)


In [5]:
# Age distribution within each gender: number of rows per (gender, age) pair.
ages_grouped_by_gender = df.groupby('gender')['age']
age_counts_by_gender = ages_grouped_by_gender.value_counts()
print(age_counts_by_gender)

gender  age
female  19     200
        20     187
        18     151
        58     109
        60     109
              ... 
male    37       2
        42       2
        36       1
        44       1
        47       1
Name: age, Length: 84, dtype: int64


In [6]:
# Walk every image folder and collect, for each image file, its age bucket,
# gender code, mask class and full file path. The four lists stay index-aligned:
# one entry is appended to each of them per accepted image.
age_list = [] # {0: <30, 1: >=30 <60, 2: >=60}
mask_list = [] # {0: mask, 1: incorrect, 2: not wear}
gender_list = [] # {0: male, 1:female}
path_list = []

for dir_name in img_dir_list:
    img_dir_path = os.path.join(path, 'images', dir_name)

    # Folder names look like <id>_<gender>_<race>_<age>. Parse only the last path
    # component so an underscore in a parent directory cannot shift the fields.
    fields = os.path.basename(os.path.normpath(dir_name)).split('_')

    gender = 1 if fields[1] == 'female' else 0

    raw_age = int(fields[-1])
    if raw_age < 30:
        age = 0
    elif raw_age < 60:
        age = 1
    else:
        age = 2

    for img in os.listdir(img_dir_path):
        # Skip '._' sidecar entries; they are not real images.
        if '._' in img:
            continue

        # 'incorrect_mask' contains the substring 'mask', so it has to be tested
        # first; otherwise every incorrectly worn mask is labelled as class 0.
        if 'incorrect_mask' in img:
            mask_class = 1
        elif 'mask' in img:
            mask_class = 0
        elif 'normal' in img:
            mask_class = 2
        else:
            # Unrecognised file name: leave it out of all four lists.
            continue

        age_list.append(age)
        gender_list.append(gender)
        mask_list.append(mask_class)
        path_list.append(os.path.join(img_dir_path, img))



In [8]:
# Sanity check: the four per-image lists should all report the same length.
for collected in (age_list, gender_list, mask_list, path_list):
    print(len(collected))

18900
18900
18900
18900


In [9]:
# Collapse (mask, gender, age) into a single class id in 0..17.
# The table enumerates every valid combination: mask in 0..2, gender in 0..1,
# age in 0..2, with class id = 6 * mask + 3 * gender + age.
combined_label = {
    (mask_code, gender_code, age_code): 6 * mask_code + 3 * gender_code + age_code
    for mask_code in range(3)
    for gender_code in range(2)
    for age_code in range(3)
}

labels = []
images = []
for age_code, gender_code, mask_code, img_path in zip(age_list, gender_list, mask_list, path_list):
    combo = (mask_code, gender_code, age_code)
    # Combinations outside the table contribute neither a label nor an image.
    if combo not in combined_label:
        continue
    labels.append(combined_label[combo])
    images.append(img_path)



In [10]:
# Sanity check: there should be exactly one combined label per image path.
for built in (labels, images):
    print(len(built))

18900
18900


In [17]:
import csv

# Persist the combined labels and the image paths. Each file receives a single
# CSV row containing the whole list.
for out_path, values in (
    ('/opt/ml/labels.csv', labels),
    ('/opt/ml/images.csv', images),
):
    with open(out_path, 'w', newline='') as f:
        csv.writer(f).writerow(values)



In [18]:
# Persist the per-attribute codes (gender, age bucket, mask class). Each file
# receives a single CSV row containing the whole list.
for out_path, values in (
    ('/opt/ml/genders.csv', gender_list),
    ('/opt/ml/ages.csv', age_list),
    ('/opt/ml/masks.csv', mask_list),
):
    with open(out_path, 'w', newline='') as f:
        csv.writer(f).writerow(values)