# 데이터 리펙토링

### 목적

- 하나의 폴더에 파일을 취합, 아이디를 통해서 데이터 접근하기 쉽도록 만들기
- 데이터 복원 시, 명령어 `cp -r data_backup/train data/`

### CODE Structure

- main()
    - src/dst path, open src/dst csv
    - read src files
        - label = get_label(tuple)
        - modify_csv(tuple, label)
        - dst_path = get_new_file_path(tuple, label)
        - change_file_path(src_path, dst_path)

In [6]:
from tqdm.notebook import tqdm
from time import time

import os
from glob import glob
import csv
import shutil
import pprint

In [7]:
def label_image(tup, ind):
    ret = 0
    
    # 성별
    if tup[1] == 'female': ret += 3
        
    # 나이
    age = int(tup[3])
    if age >= 30 and age < 60: ret+=1
    elif age >= 60: ret += 2
        
    # 마스크 착용여부 0, 6, 12
    ret += ind
    
    return ret

In [8]:
def change_path(src, dst):
    os.rename(src, dst)

In [17]:
def get_new_name(tup, ext):
    t = int(tup[3])
    mask_s = "mask"
    if t >= 6: mask_s = "incorrect"
    if t >= 12: mask_s = "normal"
    
    p_id = (int(tup[0])-1) // 7
    ret = tup[0] + '_' + str(p_id) + '_' + tup[1][0] + '_' + tup[2] + '_' + mask_s + '_' + tup[3] + '.' + ext
    return ret


In [18]:
def make_new_id(id_):
    return str(id_).zfill(6)

# main 함수

In [19]:
def main():
    abs_path = '/opt/ml/input/data/train'
    
    # source
    src_csv_path = abs_path + '/train.csv'
    src_img_path = abs_path + '/images/'

    src_csv = open(src_csv_path, 'r', encoding='utf-8')
    src_fd = csv.reader(src_csv)
    
    # destination
    dst_csv_path = abs_path + '/train_result.csv'
    dst_img_path = abs_path + '/images_result/'

    dst_csv = open(dst_csv_path, 'w', encoding='utf-8')
    dst_fd = csv.writer(dst_csv)
    dst_fd.writerow(['id', 'person_id', 'gender', 'age', 'mask', 'label', 'path'])
    
    # get tuple, change info
    new_id = 1
    for line in src_fd:
        print(line)
        folder_name = line[-1]
        if folder_name == "path": continue

        img_path = src_img_path + folder_name
        for img in glob(img_path + '/*'):
            
            # get label
            val = img.split('.')[0].split('/')[-1]
            ind = 0
            if val == 'normal': ind = 12
            elif val == 'incorrect_mask': ind = 6
            label = label_image(line, ind)
            
            # modify img info
            modi = []
            id_ = make_new_id(new_id)
            modi.append(id_) # id
            p_id = (int(id_) - 1) // 7
            modi.append(p_id) # person id
            modi.append(line[1]) # gender
            modi.append(line[3]) # age

            new_name = get_new_name(modi, img.split('.')[1]) # make new file name
            mask_ = new_name.split('_')[3]
            modi.append(mask_) # mask
            modi.append(str(label)) # label
            modi.append(new_name)
            
            dst_fd.writerow(modi) # append csv
#             change_path(img, dst_img_path + new_name) # change path
            
            new_id += 1


# main 실행

In [20]:
main()

['id', 'gender', 'race', 'age', 'path']
['000001', 'female', 'Asian', '45', '000001_female_Asian_45']


TypeError: 'int' object is not subscriptable

In [None]:
# abs_path = '/opt/ml/input/data/train'
    
# # source
# src_csv_path = abs_path + '/train.csv'

# src_csv = open(src_csv_path, 'r', encoding='utf-8')
# src_fd = csv.reader(src_csv)

# # destination
# dst_csv_path = abs_path + '/train_result.csv'

# dst_csv = open(dst_csv_path, 'w', encoding='utf-8')
# dst_fd = csv.writer(dst_csv)
# dst_fd.writerow(['id', 'person_id', 'gender', 'age', 'mask', 'label', 'path'])

# for line in src_fd:
#     if line[0] == 'id': continue
#     save_list = []
#     save_list.append(line[0]) # id
#     p_id = (int(line[0])-1) // 7
#     save_list.append(str(p_id)) # person_id
#     save_list.append(line[1]) # gender
#     save_list.append(line[2]) # age
#     mask_ = line[4].split('_')[3]
#     save_list.append(mask_) # mask
#     save_list.append(line[3]) # label
#     save_list.append(line[4]) # path
    
#     dst_fd.writerow(save_list)
#     print(save_list)
    


In [101]:
# path = '/opt/ml/input/data/train/images/*'

# for i in glob(path):
#     id_ = i.split('/')[-1].split('_')[0]
#     new_id_ = (int(id_)-1) // 7
#     ret = i.split('_')[0] + '_' + str(new_id_) + '_' + i.split('_')[1] + '_' + i.split('_')[2] + '_' + i.split('_')[3] + '_' + i.split('_')[4]
#     print(ret)

#     change_path(i, ret)

/opt/ml/input/data/train/images/008116_1159_f_19_mask_3.jpg
/opt/ml/input/data/train/images/000242_34_m_56_mask_1.jpg
/opt/ml/input/data/train/images/016450_2349_m_19_mask_0.jpg
/opt/ml/input/data/train/images/008473_1210_f_19_mask_3.jpg
/opt/ml/input/data/train/images/000646_92_f_58_normal_16.jpg
/opt/ml/input/data/train/images/004410_629_f_20_mask_3.jpg
/opt/ml/input/data/train/images/003981_568_m_26_incorrect_6.jpg
/opt/ml/input/data/train/images/004583_654_f_54_incorrect_10.jpg
/opt/ml/input/data/train/images/016173_2310_m_19_mask_0.png
/opt/ml/input/data/train/images/015019_2145_m_36_mask_1.jpg
/opt/ml/input/data/train/images/014874_2124_m_29_mask_0.jpg
/opt/ml/input/data/train/images/008707_1243_f_19_mask_3.jpg
/opt/ml/input/data/train/images/005498_785_f_47_mask_4.jpg
/opt/ml/input/data/train/images/002513_358_m_51_mask_1.jpg
/opt/ml/input/data/train/images/015491_2212_f_46_mask_4.jpg
/opt/ml/input/data/train/images/008202_1171_f_20_incorrect_9.jpg
/opt/ml/input/data/train/image