In [1]:
import os
from tqdm import tqdm
import pandas as pd

In [2]:
DATA_DIR = 'data'
IMG_SIZE = 512

In [3]:
train_csv = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
train_csv = train_csv[train_csv['class_id'] != 14].reset_index(drop=True)
meta_csv = pd.read_csv(os.path.join(DATA_DIR, 'train_meta.csv'))

In [4]:
print(train_csv.shape)
train_csv.head()

(36096, 8)


Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max
0,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,691.0,1375.0,1653.0,1831.0
1,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,1264.0,743.0,1611.0,1019.0
2,1c32170b4af4ce1a3030eb8167753b06,Pleural thickening,11,R9,627.0,357.0,947.0,433.0
3,0c7a38f293d5f5e4846aa4ca6db4daf1,ILD,5,R17,1347.0,245.0,2188.0,2169.0
4,47ed17dcb2cbeec15182ed335a8b5a9e,Nodule/Mass,8,R9,557.0,2352.0,675.0,2484.0


In [5]:
print(meta_csv.shape)
meta_csv.head()

(15000, 3)


Unnamed: 0,image_id,dim0,dim1
0,4d390e07733ba06e5ff07412f09c0a92,3000,3000
1,289f69f6462af4933308c275d07060f0,3072,3072
2,68335ee73e67706aa59b8b55b54b11a4,2836,2336
3,7ecd6f67f649f26c05805c8359f9e528,2952,2744
4,2229148faa205e881cf0d932755c9e40,2880,2304


In [6]:
#Rescale bbox attribute to scaled image size
for i, rows in tqdm(train_csv.iterrows(), total=train_csv.shape[0], position=0, leave=True):
    scaled_image_id = rows['image_id']
    orig_img_id = meta_csv[meta_csv['image_id'] == scaled_image_id]
    actual_h, actual_w = orig_img_id['dim0'], orig_img_id['dim1']
    h_ratio = IMG_SIZE/actual_h
    w_ratio = IMG_SIZE/actual_w
    
    train_csv.at[i,'x_min'] = round(rows.x_min*w_ratio,1)
    train_csv.at[i,'y_min'] = round(rows.y_min*h_ratio,1)
    train_csv.at[i,'x_max'] = round(rows.x_max*w_ratio,1)
    train_csv.at[i,'y_max'] = round(rows.y_max*h_ratio,1)

100%|██████████| 36096/36096 [02:09<00:00, 278.42it/s]


In [7]:
train_csv.head()

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max
0,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,170.1,301.4,406.9,401.3
1,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,280.9,132.1,358.0,181.2
2,1c32170b4af4ce1a3030eb8167753b06,Pleural thickening,11,R9,126.4,59.5,190.9,72.2
3,0c7a38f293d5f5e4846aa4ca6db4daf1,ILD,5,R17,301.8,49.1,490.3,434.6
4,47ed17dcb2cbeec15182ed335a8b5a9e,Nodule/Mass,8,R9,111.1,359.1,134.6,379.3


In [8]:
# split folds
from sklearn.model_selection import GroupKFold

train_csv['fold'] = -1
gkf  = GroupKFold(n_splits = 5)
for fold, (train_idx, val_idx) in enumerate(gkf.split(train_csv, groups=train_csv.image_id.tolist())):
    train_csv.loc[val_idx, 'fold'] = fold

train_csv.head()

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,fold
0,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,170.1,301.4,406.9,401.3,3
1,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,280.9,132.1,358.0,181.2,2
2,1c32170b4af4ce1a3030eb8167753b06,Pleural thickening,11,R9,126.4,59.5,190.9,72.2,4
3,0c7a38f293d5f5e4846aa4ca6db4daf1,ILD,5,R17,301.8,49.1,490.3,434.6,2
4,47ed17dcb2cbeec15182ed335a8b5a9e,Nodule/Mass,8,R9,111.1,359.1,134.6,379.3,4


In [9]:
train_csv.groupby(['fold'])['class_id'].value_counts()

fold  class_id
0     0           1477
      3           1041
      13           961
      11           951
      8            504
      10           495
      7            486
      9            465
      6            245
      5            226
      2            174
      4             92
      1             65
      12            38
1     0           1453
      3           1135
      11           922
      13           889
      8            663
      7            487
      10           454
      9            392
      5            209
      2            202
      6            201
      4            104
      1             62
      12            47
2     0           1380
      3           1071
                  ... 
      1             52
      12            36
3     0           1467
      3           1100
      11           954
      13           895
      10           538
      8            505
      7            498
      9            458
      6            240
      5            

In [10]:
train_csv.to_csv(os.path.join(DATA_DIR, 'final_train.csv'), index=False)