In [27]:
import json
import os
import random
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd
from collections import Counter

### 학습 데이터 이미지 병합

In [3]:
# Directories whose file listings are combined into a single image-id table.
img_path1 = './img'
img_path2 = './train'


In [5]:
# Each directory is listed and sorted on its own; entries of img_path1 come
# first, followed by entries of img_path2.
# NOTE(review): os.listdir returns every entry (including sub-directories and
# hidden files), not only images — confirm both folders contain images only.
img_ids = [*sorted(os.listdir(img_path1)), *sorted(os.listdir(img_path2))]
img_dict = dict(img_id=img_ids)

In [6]:
# One-column table ("img_id") holding every listed file name.
img_df = pd.DataFrame(img_dict)
img_df.head()

In [23]:
# Save the image-id table; index=False keeps the row index out of the CSV.
img_df.to_csv("medicalOCR_info.csv", index=False)

In [2]:
# Path of the CSV that is loaded into `filled_data`.
# NOTE(review): 'df_file_name.csv' looks like a placeholder name — confirm the real file.
df_path = 'df_file_name.csv'

In [None]:
# Load the table; make_class() reads its 'shooting_status' and 'ratio' columns.
filled_data = pd.read_csv(df_path)
filled_data.head()

### 사진 촬영상태와 이미지 가로, 세로 비율을 고려한 class 설정

In [4]:
def make_class(df):
    """Return a copy of ``df`` with a combined class column ``cls``.

    The class is ``2 * shooting_status + ratio``, so every distinct
    (shooting_status, ratio) pair of 0/1-like values maps to its own integer.

    Args:
        df: DataFrame containing a ``shooting_status`` column and a ``ratio``
            column (numeric or boolean values).

    Returns:
        A new DataFrame with all columns of ``df`` plus ``cls``. The input
        frame is not modified and its index is preserved.
    """
    # Vectorised and index-aligned. The former row-by-row loop followed by a
    # positional concat(axis=1) produced NaN-padded, misaligned rows whenever
    # ``df`` did not carry a default 0..n-1 index.
    cls = 2 * df['shooting_status'] + df['ratio']
    return df.assign(cls=cls)


In [5]:
# Add the combined `cls` label to the loaded table.
new_df = make_class(filled_data)
new_df.head()

In [18]:
# Number of rows per shooting_status value.
new_df['shooting_status'].value_counts()

shooting_status
1    175
0    126
Name: count, dtype: int64

In [17]:
# Number of rows per ratio value.
new_df['ratio'].value_counts()

ratio
True     273
False     28
Name: count, dtype: int64

In [15]:
# Number of rows per combined class; these counts bound the usable number of folds.
new_df['cls'].value_counts()

cls
3    153
1    120
2     22
0      6
Name: count, dtype: int64

In [7]:
# Save the table including the `cls` column; index=False keeps the row index out of the CSV.
new_df.to_csv("medicalOCR_class_filled.csv", index=False)

### StratifiedKFold를 활용하여 train, validation set 설정

In [None]:
# Load the labelled table; the split below uses its 'img_id' and 'cls' columns.
data = pd.read_csv('./medicalOCR_class_filled.csv')
data.head()

In [25]:
# Class counts of the reloaded table.
data['cls'].value_counts()

cls
3    153
1    120
2     22
0      6
Name: count, dtype: int64

In [None]:
def get_distribution(y, num_classes=None):
    """Return the share of each integer class label in ``y`` as percentage strings.

    Args:
        y: Sequence (list, NumPy array, pandas Series) of non-negative integer
            class labels.
        num_classes: Number of classes to report. When ``None`` (default) it
            is ``max(y) + 1``. Pass it explicitly to get lists of equal length
            for subsets that may lack the highest class.

    Returns:
        List of strings such as ``'25.00%'``; position ``i`` is the share of
        class ``i``. For empty ``y`` every reported class is ``'0.00%'`` (an
        empty list when ``num_classes`` is not given).
    """
    labels = np.asarray(y)
    total = labels.size
    if total == 0:
        # Nothing to count: avoid np.max on an empty array and division by zero.
        return ['0.00%'] * (num_classes or 0)

    counts = Counter(labels.tolist())
    if num_classes is None:
        num_classes = int(labels.max()) + 1

    return [f'{counts[i] / total:.2%}' for i in range(num_classes)]


In [28]:
# Five shuffled stratified folds; the fixed random_state makes the shuffle repeatable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=411)

In [14]:
# Constant single-column matrix with one row per table row.
X = np.ones((len(data['img_id']), 1))
# NOTE(review): `y` holds the image ids and `groups` holds the class labels.
# The cv.split calls visible in this notebook pass data['img_id'] and
# data['cls'] directly, so these arrays are only inspected, not used for splitting.
y = np.array(data['img_id'])
groups = np.array(data['cls'])

In [19]:
# Sanity check: (number of rows, 1).
X.shape

(301, 1)

In [20]:
# Sanity check: one entry per table row.
y.shape

(301,)

In [None]:

# Print the class distribution of the train and validation part of every fold.
for idx, (train_idx, val_idx) in enumerate(cv.split(data['img_id'], data['cls'])):
    # split() yields positional row indices, so select rows with iloc; loc is
    # label-based and only agrees while `data` keeps a default 0..n-1 index.
    train_x, val_x = data.iloc[train_idx], data.iloc[val_idx]
    print(f'Fold {idx}')

    print("train")
    print(get_distribution(train_x['cls']))
    print("val")
    print(get_distribution(val_x['cls']))
    
    

In [41]:
# Keep the split of fold index 1 as the train / validation tables.
for idx, (train_idx, val_idx) in enumerate(cv.split(data['img_id'], data['cls'])):
    if idx == 1:
        # split() yields positional row indices, so select rows with iloc; loc
        # is label-based and only agrees while `data` keeps a default index.
        train_x, val_x = data.iloc[train_idx], data.iloc[val_idx]
        break
    

In [43]:
# Size of the selected training table (rows, columns).
train_x.shape

(241, 5)

### train, validation에 따라 annotation 파일 split 

In [45]:
# Load the merged annotation file; its "images" entry maps file names to annotations.
input_json = '../../ufo/merged.json'
# Read explicitly as UTF-8: open() otherwise uses the platform default
# encoding, which can fail or mis-decode non-ASCII text on some systems.
with open(input_json, encoding='utf-8') as json_reader:
    dataset = json.load(json_reader)

images = dataset['images']

In [46]:
# Number of entries under "images" in the annotation file.
img_id = images.keys()
len(img_id)

301

In [53]:
# img_id values of the selected train / validation rows as plain Python lists.
train_ids = train_x['img_id'].tolist()
val_ids = val_x['img_id'].tolist()

In [55]:
# Split the annotations by fold membership. Sets give constant-time lookups;
# testing against the id lists rescanned them once per image.
train_id_set = set(train_ids)
val_id_set = set(val_ids)

# Iterating `images` keeps the original annotation order in both subsets.
# Ids that do not occur in `images` are left out, as before.
train_data = {fname: ann for fname, ann in images.items() if fname in train_id_set}
val_data = {fname: ann for fname, ann in images.items() if fname in val_id_set}

In [64]:
# Wrap each subset under the same top-level "images" key that is read from the input JSON.
train_json = {"images": train_data}
val_json = {"images": val_data}

In [66]:
# Write the fold-1 train / validation annotation files.
# NOTE(review): the input annotation is read from '../../ufo/merged.json' while
# this writes under '../ufo/'; the recorded cell output also shows
# './fold1/...' paths. Confirm the intended output location and re-run.
output_dir = '../ufo/fold1/'
os.makedirs(output_dir, exist_ok=True)
output_train_json = os.path.join(output_dir, 'fold1_train.json')
output_val_json = os.path.join(output_dir, 'fold1_val.json')

print(f'write {output_train_json}')
with open(output_train_json, 'w') as train_writer:
    # json.dump escapes non-ASCII characters by default, so the file is plain ASCII.
    json.dump(train_json, train_writer, indent = 4)

print(f'write {output_val_json}')
with open(output_val_json, 'w') as val_writer:
    json.dump(val_json, val_writer, indent = 4)

write ./fold1/fold1_train.json
write ./fold1/fold1_val.json


### 이미지 파일 분리

In [68]:
import shutil

In [71]:
# Source image directory and per-split destination directories.
# NOTE(review): the validation directory contains a doubled '/img/img/' while
# the training directory uses '/img/fold1/...' — confirm which one is intended.
img_path = '/img/'
output_train_dir = '/img/fold1/train/'
output_val_dir = '/img/img/fold1/val/'

In [73]:
# shutil.copyfile does not create missing directories, so make sure the
# destination exists before copying the training images.
os.makedirs(output_train_dir, exist_ok=True)
for fname in train_ids:
    shutil.copyfile(os.path.join(img_path, fname), os.path.join(output_train_dir, fname))

In [74]:
# shutil.copyfile does not create missing directories, so make sure the
# destination exists before copying the validation images.
os.makedirs(output_val_dir, exist_ok=True)
for fname in val_ids:
    shutil.copyfile(os.path.join(img_path, fname), os.path.join(output_val_dir, fname))