In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

import os 
from glob import glob 

from PIL import Image, ImageOps
import albumentations as A
import cv2

from tqdm import tqdm 

In [353]:
# Load the training label table; the cells below read its ID and target columns.
train_csv_path = '../data2A/train.csv'
df = pd.read_csv(train_csv_path)

In [354]:
# Write 90/180/270-degree rotated copies of every image listed in `df` next to
# the originals (file name prefixed with r90_/r180_/r270_) and append one row
# per rotated copy, carrying the source row's target, to `df`.
ids = []
targets = []
data_path = '../data2A/train/'
for index, ID, target in tqdm(df.itertuples(), desc='Image rotation', mininterval=0.1):
    image_path = os.path.join(data_path, ID)

    # Open the source once per row (instead of once per angle); the context
    # manager releases the file handle when the three copies are written.
    with Image.open(image_path) as source_image:
        # Normalise the EXIF orientation BEFORE rotating, so the angle in the
        # file-name prefix is relative to the upright image. Doing it after the
        # rotation gives a different result for mirrored EXIF orientations.
        upright_image = ImageOps.exif_transpose(source_image)

        for angle in (90, 180, 270):
            rotated_id = f'r{angle}_' + ID
            ids.append(rotated_id)
            targets.append(target)
            # PIL rotates counter-clockwise; expand=True grows the canvas so
            # nothing is cropped for non-square images.
            rotated_image = np.array(upright_image.rotate(angle, expand=True))
            Image.fromarray(rotated_image).save(os.path.join(data_path, rotated_id))

rotate_data = {
    'ID' : ids,
    'target' : targets
}
rotate_df = pd.DataFrame(rotate_data)
df = pd.concat([df, rotate_df])

Image rotation: 1570it [00:00, 259091.02it/s]


In [356]:
# Deterministic flip transforms (always applied).
H_flip, V_flip = (
    A.HorizontalFlip(always_apply=True, p=1),
    A.VerticalFlip(always_apply=True, p=1),
)

In [358]:
# Register one flipped variant per row of `df`: rows whose ID starts with
# r90/r270 get a 'Vflip_' entry, every other row gets an 'Hflip_' entry.
# NOTE: this cell only records the new IDs/targets. The code that would read
# the image, flip it and save it is disabled here, so no files are written.
ids = []
targets = []
data_path = '../data2A/train/'
for index, ID, target in tqdm(df.itertuples(), desc='Image flip', mininterval=0.1):
    image_path = os.path.join(data_path, ID)

    is_quarter_turn = ID.startswith(('r90', 'r270'))
    flip_prefix = 'Vflip_' if is_quarter_turn else 'Hflip_'

    ids.append(flip_prefix + ID)
    targets.append(target)

flip_data = dict(ID=ids, target=targets)
flip_df = pd.DataFrame(flip_data)
df = pd.concat([df, flip_df])

Image flip: 6280it [00:00, 321383.00it/s]


In [270]:
# Augmentation pipeline, assembled stage by stage and applied in this order.

# Stage 1 (p=0.85): one geometric distortion.
distortion_stage = A.OneOf([
    A.GridDistortion(num_steps=5, distort_limit=0.3, interpolation=cv2.INTER_LINEAR, border_mode=cv2.BORDER_REFLECT_101, value=None, mask_value=None, always_apply=True, p=1),
    A.ElasticTransform(always_apply=True, p=1, alpha=1.0, sigma=50.0, alpha_affine=50.0, interpolation=cv2.INTER_NEAREST, border_mode=cv2.BORDER_REPLICATE, value=(0, 0, 0), mask_value=None, approximate=False),
    A.OpticalDistortion(always_apply=True, p=1, distort_limit=(-0.3, -0.1)),
    A.OpticalDistortion(always_apply=True, p=1, distort_limit=(0.1, 0.3)),
], p=0.85)

# Stage 2 (p=0.85): SomeOf with n=2 over the colour / intensity transforms.
color_stage = A.SomeOf([
    A.RandomBrightnessContrast(brightness_limit=(-0.2, 0.2), contrast_limit=(-0.2, 0.2), p=1),
    A.HueSaturationValue(hue_shift_limit=15, sat_shift_limit=25, val_shift_limit=20, p=1),
    A.MultiplicativeNoise(p=1, multiplier=(1, 1.5), per_channel=True),
    A.Equalize(p=1, mode='cv', by_channels=True),
], n=2, p=0.85)

# Stage 3 (p=1): one rotation drawn from one of four angle bands, with a
# constant-colour border.
rotation_stage = A.OneOf([
    A.Rotate(limit=(10, 30), border_mode=cv2.BORDER_CONSTANT, p=1),
    A.Rotate(limit=(150, 170), border_mode=cv2.BORDER_CONSTANT, p=1),
    A.Rotate(limit=(190, 210), border_mode=cv2.BORDER_CONSTANT, p=1),
    A.Rotate(limit=(330, 350), border_mode=cv2.BORDER_CONSTANT, p=1),
], p=1)

# Stage 6 (p=0.5): one blur / resolution degradation.
blur_stage = A.OneOf([
    A.Blur(blur_limit=(3, 4), p=1),
    A.MotionBlur(blur_limit=(3, 5), p=1),
    A.Downscale(scale_min=0.455, scale_max=0.5, interpolation=cv2.INTER_CUBIC, p=1),
], p=0.5)

transforms = A.Compose([
    distortion_stage,
    color_stage,
    rotation_stage,
    # Stage 4 (p=0.5): cut 8-40 rectangular holes of 8-15 px per side.
    A.CoarseDropout(p=0.5, max_holes=40, max_height=15, max_width=15, min_holes=8, min_height=8, min_width=8),
    # Stage 5 (p=0.5): histogram equalisation.
    A.Equalize(p=0.5, mode='cv', by_channels=True),
    blur_stage,
    # Stage 7 (p=0.5): additive Gaussian noise.
    A.GaussNoise(var_limit=(100, 800), per_channel=True, p=0.5),
])

In [11]:
# Generate augmented copies of every image in `df` with `transforms`, save them
# as 'tf{i}_<ID>' beside the source file and append the new rows to `df`.
ids = []
targets = []
# NOTE(review): this cell uses '../data/train/' while the other cells use
# '../data2A/train/' - confirm which directory is intended.
data_path = '../data/train/'

# Number of augmented copies per target class; any class not listed gets 10.
copies_per_target = {13: 13, 14: 20, 1: 21}
default_copies = 10

for index, ID, target in tqdm(df.itertuples(), desc='Image augmentation', mininterval=0.1):
    image_path = os.path.join(data_path, ID)
    image = np.array(Image.open(image_path))
    n = copies_per_target.get(target, default_copies)

    for i in range(n):
        # Each call draws a fresh random augmentation of the same source image.
        transformed_image = transforms(image=image)['image']
        image_ID = f'tf{i}_' + ID
        ids.append(image_ID)
        targets.append(target)
        output_path = os.path.join(data_path, image_ID)
        Image.fromarray(transformed_image).save(output_path)

aug_data = dict(ID=ids, target=targets)
aug_df = pd.DataFrame(aug_data)
df = pd.concat([df, aug_df])

Image augmentation: 12560it [2:07:36,  1.64it/s]


In [28]:
# Load the table from add_transformed_train.csv and display it.
transformed_csv_path = '../data2/add_transformed_train.csv'
df = pd.read_csv(transformed_csv_path)
df

Unnamed: 0,ID,target
0,002f99746285dfdd.jpg,16
1,008ccd231e1fea5d.jpg,10
2,008f5911bfda7695.jpg,10
3,009235e4c9c07af5.jpg,4
4,00b2f44967580c74.jpg,16
...,...,...
147979,tf5_Vflip_r270_ffc22136f958deb1.jpg,9
147980,tf6_Vflip_r270_ffc22136f958deb1.jpg,9
147981,tf7_Vflip_r270_ffc22136f958deb1.jpg,9
147982,tf8_Vflip_r270_ffc22136f958deb1.jpg,9


In [360]:
# Keep only the rows whose target is one of the four classes that receive the
# extra augmentation pass, then display them.
extra_aug_targets = [14, 4, 3, 7]
new_aug_df = df[df['target'].isin(extra_aug_targets)]
new_aug_df

Unnamed: 0,ID,target
3,009235e4c9c07af5.jpg,4
9,012913977fd1d980.jpg,14
15,0250ee8107091ade.jpg,7
20,02dad82a9420ae86.jpg,7
24,03084c1b03921a99.jpg,7
...,...,...
6266,Hflip_r180_fed9e9ec4a77bc06.jpg,4
6267,Vflip_r270_fed9e9ec4a77bc06.jpg,4
6268,Vflip_r90_feeade617aa68c45.jpg,7
6269,Hflip_r180_feeade617aa68c45.jpg,7


In [363]:
# Second augmentation pipeline, used for the rows selected in `new_aug_df`.

# Stage 1 (p=0.7): one rotation drawn from one of four angle bands, with a
# constant-colour border.
extra_rotation_stage = A.OneOf([
    A.Rotate(limit=(50, 80), border_mode=cv2.BORDER_CONSTANT, p=1),
    A.Rotate(limit=(100, 130), border_mode=cv2.BORDER_CONSTANT, p=1),
    A.Rotate(limit=(200, 230), border_mode=cv2.BORDER_CONSTANT, p=1),
    A.Rotate(limit=(280, 310), border_mode=cv2.BORDER_CONSTANT, p=1),
], p=0.7)

# Stage 4 (p=0.7): either a fixed x1.5 per-channel multiplier or equalisation.
extra_intensity_stage = A.OneOf([
    A.MultiplicativeNoise(p=1, multiplier=(1.5, 1.5), per_channel=True),
    A.Equalize(p=1, mode='cv', by_channels=True),
], p=0.7)

new_transforms = A.Compose([
    extra_rotation_stage,
    # Stage 2 (p=0.7): Gaussian noise, same noise for all channels.
    A.GaussNoise(var_limit=(500, 1500), per_channel=False, p=0.7),
    # Stage 3 (p=0.7): brightness / contrast jitter.
    A.RandomBrightnessContrast(brightness_limit=(-0.2, 0.3), contrast_limit=(-0.2, 0.3), p=0.7),
    extra_intensity_stage,
    # Stage 5 (p=0.5): mild blur.
    A.Blur(blur_limit=(3, 4), p=0.5),
])

In [364]:
# Extra augmentation pass over `new_aug_df` with `new_transforms`: each source
# image yields 10 copies when its target is 14 and 5 copies otherwise, saved as
# 'another_tf{i}_<ID>' beside the source file. The new rows are collected in
# `aug_df`.
ids = []
targets = []
data_path = '../data2A/train/'
for index, ID, target in tqdm(new_aug_df.itertuples(), desc='Image augmentation', mininterval=0.1):
    image_path = os.path.join(data_path, ID)
    image = np.array(Image.open(image_path))
    n = 10 if target == 14 else 5

    for i in range(n):
        # Each call draws a fresh random augmentation of the same source image.
        transformed_image = new_transforms(image=image)['image']
        image_ID = f'another_tf{i}_' + ID
        ids.append(image_ID)
        targets.append(target)
        output_path = os.path.join(data_path, image_ID)
        Image.fromarray(transformed_image).save(output_path)

aug_data = dict(ID=ids, target=targets)
aug_df = pd.DataFrame(aug_data)

Image augmentation: 2800it [04:13, 11.05it/s]


In [371]:
# Count the newly generated rows per target class.
aug_df['target'].value_counts()

target
4     4000
14    4000
7     4000
3     4000
Name: count, dtype: int64

In [373]:
# Load the table from add_transformed_train.csv (data2A copy) and display it.
origin_csv_path = '../data2A/add_transformed_train.csv'
origin_df = pd.read_csv(origin_csv_path)
origin_df

Unnamed: 0,ID,target
0,002f99746285dfdd.jpg,16
1,008ccd231e1fea5d.jpg,10
2,008f5911bfda7695.jpg,10
3,009235e4c9c07af5.jpg,4
4,00b2f44967580c74.jpg,16
...,...,...
147979,tf5_Vflip_r270_ffc22136f958deb1.jpg,9
147980,tf6_Vflip_r270_ffc22136f958deb1.jpg,9
147981,tf7_Vflip_r270_ffc22136f958deb1.jpg,9
147982,tf8_Vflip_r270_ffc22136f958deb1.jpg,9


In [376]:
# Append the extra augmented rows to the loaded table.
# ignore_index=True gives the result a fresh 0..n-1 index; without it both
# frames contribute their own 0-based labels, so index labels are duplicated.
new_df = pd.concat([origin_df, aug_df], ignore_index=True)

In [378]:
# Manual label corrections: source file name -> corrected target.
# Each correction is applied to every row whose ID contains the file name.
label_corrections = {
    '45f0d2dfc7e47c03.jpg': 7,
    '4620f6e53442f3b6.jpg': 7,
    'aec62dced7af97cd.jpg': 14,
    'c5182ab809478f12.jpg': 14,
    '0583254a73b48ece.jpg': 10,
    '38d1796b6ad99ddd.jpg': 10,
    '1ec14a14bbe633db.jpg': 7,
    '8646f2c3280a4f49.jpg': 3,
}
for source_name, corrected_target in label_corrections.items():
    # regex=False: match the file name literally. With the default regex
    # matching, the '.' before 'jpg' would match any character.
    name_mask = new_df['ID'].str.contains(source_name, regex=False)
    new_df.loc[name_mask, 'target'] = corrected_target

In [381]:
# Write the corrected table to disk without the index column.
corrected_csv_path = '../data2A/modify_target_add_transformed_train3.csv'
new_df.to_csv(corrected_csv_path, index=False)

In [5]:
# Delete every file in the training directory whose name is not listed in
# temp['ID'].
directory = '../data2A/train/'
# NOTE(review): `temp` is not defined in any visible cell - confirm which
# table it refers to before running this destructive cell.
IDS = temp['ID'].values

# Set membership is O(1) per file; `in` on a numpy array scans the whole array.
keep_names = set(IDS)
if not keep_names:
    # An empty keep-list would wipe the whole directory; refuse to continue.
    raise ValueError('temp has no IDs; refusing to delete every file in ' + directory)

# List everything in the directory.
file_list = os.listdir(directory)

# Walk the listing and remove the files that are not in the keep-list.
for filename in file_list:
    filepath = os.path.join(directory, filename)

    # Skip sub-directories: os.remove() cannot delete them and would raise.
    if not os.path.isfile(filepath):
        continue

    if filename not in keep_names:
        os.remove(filepath)