In [1]:
import os
import pandas as pd
import re
import torch
from tqdm import tqdm

In [2]:
class Labeling:
    def __init__(self, path, drop_features=[]):
        self.path = path
        self.data = pd.read_csv(path + 'train.csv')
        self.df = self._make_dataframe(drop_features)
        print("Labeling object has been created")
        
    def _make_dataframe(self, drop_features):
        img_dirs = self.data['path']
        df = self.data.drop(columns=drop_features)
        df['path'] = img_dirs
        df = df.rename(columns={'path':'img_dir'})
        df = self._merge_mask_data(df)
        df = df.drop(columns='img_dir')
        return df
        
    def _get_img_file_list(self, path):
        file_list = os.listdir(path)
        return [file for file in file_list if not file.startswith('.')]
    
    def _make_mask_data(self, img_name):
        if img_name.startswith('mask'):
            return 'Wear'
        elif img_name.startswith('normal'):
            return 'Not Wear'
        elif img_name.startswith('incorrect'):
            return 'Incorrect'
        else:
            raise NameError(f'invalid name {img_name}')
    
    def _merge_mask_data(self, df):
        joined_df = pd.DataFrame()
        for img_dir in tqdm(df['img_dir'], ncols=100):
            label_df = pd.DataFrame(columns=['img_dir', 'path', 'mask', 'label'])
            img_path = os.path.join(self.path, 'images', img_dir)
            img_list = self._get_img_file_list(img_path)
            labels = list(map(self._make_mask_data, img_list))
            
            label_df['mask'] = labels
            label_df['path'] = img_list
            label_df['path'] = img_path + '/' + label_df['path']
            label_df['img_dir'] = img_dir
            
            joined_df = joined_df.append(label_df, ignore_index=True)
            
        df = pd.merge(left=df, right=joined_df, how='outer', on='img_dir')
        return df
                
    def _check_age(self, age):
        if age < 30:
            return 0
        if age >= 60:
            return 2
        return 1
        
    def _check_gender(self, gender):
        if gender == "male":
            return 0
        if gender == "female":
            return 1
    
    def _check_mask(self, mask):
        if mask == 'Wear':
            return 0
        if mask == 'Incorrect':
            return 1
        if mask == 'Not Wear':
            return 2
    
    def _labeling(self, series):
        age = self._check_age(series['age'])
        gender = 3*self._check_gender(series['gender'])
        mask = 6*self._check_mask(series['mask'])
        return age + gender + mask
        
    def labeling(self):
        for idx, series in tqdm(self.df.iterrows(), ncols=100):
            self.df.loc[idx, 'label'] = self._labeling(series)
        print("labels have been added.")
    
    def to_csv_file(self, path, file_name="train_with_label.csv"):
        self.df.to_csv(os.path.join(path, file_name), index=False)
        print("csv file has been created.")
        
    
if __name__ == '__main__':
    train_path = '/opt/ml/input/data/train/'
    labeling = Labeling(train_path)
    labeling.labeling()
    labeling.to_csv_file(train_path, file_name='train_with_label.csv')

100%|██████████████████████████████████████████████████████████| 2700/2700 [00:07<00:00, 345.61it/s]
320it [00:00, 1597.41it/s]

Labeling object has been created


18900it [00:11, 1699.76it/s]


labels have been added.
csv file has been created.


# Test

In [300]:
for i in range(18):
    c = labeling.df['class'] == i
    print(labeling.df[c].head(1))
    

    gender  age                            path  mask class
854   male   29  000309_male_Asian_29/mask1.jpg  Wear     0
   gender  age                            path  mask class
14   male   54  000004_male_Asian_54/mask1.jpg  Wear     1
     gender  age                            path  mask class
2912   male   60  001038_male_Asian_60/mask1.jpg  Wear     2
     gender  age                              path  mask class
462  female   25  000225_female_Asian_25/mask1.jpg  Wear     3
   gender  age                              path  mask class
0  female   45  000001_female_Asian_45/mask1.jpg  Wear     4
      gender  age                              path  mask class
3101  female   60  001063_female_Asian_60/mask1.jpg  Wear     5
    gender  age                                     path       mask class
860   male   29  000309_male_Asian_29/incorrect_mask.jpg  Incorrect     6
   gender  age                                     path       mask class
20   male   54  000004_male_Asian_54/incorr

In [260]:
idx = 0
for idx, sr in labeling.df.iterrows():
    print(sr)
    print(type(sr))
    idx += 1
    if idx > 0:
        break

gender                              female
age                                     45
path      000001_female_Asian_45/mask1.jpg
mask                                  None
class                                  NaN
Name: 0, dtype: object
<class 'pandas.core.series.Series'>


In [34]:
import os
import pandas as pd
import pprint

path = '../input/data/train/'
file_name = 'train.csv'
train_file_name = path + file_name

df = pd.read_csv(train_file_name)
print(df.head)
print(df.columns)

classes = [str(i) for i in range(18)]
print(classes)

file_path = df.loc[0, 'path'] + '/'
image_path = path + 'images/'
print(image_path + file_path)

pprint.pprint(os.listdir(image_path + file_path))

<bound method NDFrame.head of           id  gender   race  age                    path
0     000001  female  Asian   45  000001_female_Asian_45
1     000002  female  Asian   52  000002_female_Asian_52
2     000004    male  Asian   54    000004_male_Asian_54
3     000005  female  Asian   58  000005_female_Asian_58
4     000006  female  Asian   59  000006_female_Asian_59
...      ...     ...    ...  ...                     ...
2695  006954    male  Asian   19    006954_male_Asian_19
2696  006955    male  Asian   19    006955_male_Asian_19
2697  006956    male  Asian   19    006956_male_Asian_19
2698  006957    male  Asian   20    006957_male_Asian_20
2699  006959    male  Asian   19    006959_male_Asian_19

[2700 rows x 5 columns]>
Index(['id', 'gender', 'race', 'age', 'path'], dtype='object')
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17']
../input/data/train/images/000001_female_Asian_45/
['mask1.jpg',
 '._mask5.jpg',
 '._mask1.jpg',
 

In [301]:
'._incorrect_mask.jpg'.startswith('.')

True