In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as cm
import os
import glob
from tqdm import tqdm_notebook as tqdm
import pydicom
from sklearn.model_selection import KFold

In [2]:
# Fix NumPy's global RNG state so the np.random.shuffle call used for the
# holdout split produces the same permutation on every fresh run.
np.random.seed(seed=10)

## Train set

In [3]:
# Presumably the root directory of the training DICOM images (judging by the
# path); it is not referenced again in the cells shown here.
TRAIN_ROOT = '/media/nvme/Datasets/Pneumothorax/dicom-images-train/'
# Table loaded into train_df below; its preview shows ImageId, EncodedPixels
# and Path columns.
CSV = '/media/hdd/Kaggle/Pneumothorax/Data/trainset.csv'

In [4]:
# Load the full training table and preview its last rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which usually means
# the CSV was written with its index; it is carried into every file saved
# below. Consider index_col=0 if no consumer relies on that column.
train_df = pd.read_csv(filepath_or_buffer=CSV)
train_df.tail(n=5)

Unnamed: 0,ImageId,EncodedPixels,Path
11577,1.2.276.0.7230010.3.1.4.8323329.5792.151787519...,-1,/media/nvme/Datasets/Pneumothorax/dicom-images...
11578,1.2.276.0.7230010.3.1.4.8323329.5793.151787519...,-1,/media/nvme/Datasets/Pneumothorax/dicom-images...
11579,1.2.276.0.7230010.3.1.4.8323329.5794.151787519...,-1,/media/nvme/Datasets/Pneumothorax/dicom-images...
11580,1.2.276.0.7230010.3.1.4.8323329.5795.151787519...,174459 17 982 47 952 76 943 79 936 83 937 83 9...,/media/nvme/Datasets/Pneumothorax/dicom-images...
11581,1.2.276.0.7230010.3.1.4.8323329.5796.151787519...,-1,/media/nvme/Datasets/Pneumothorax/dicom-images...


In [5]:
# Fraction of the unique ImageIds that is set aside as the holdout split.
HOLDOUT = 0.1

In [6]:
# Distinct image identifiers (np.unique also sorts them), plus the number of
# ids reserved for the holdout split, truncated to an int.
ids = np.unique(train_df['ImageId'].values)
holdout_num = int(len(ids) * HOLDOUT)
print(ids.shape[0])

10675


In [7]:
# Shuffle positions (not the ids themselves) with the seeded global RNG, then
# cut the shuffled ids at holdout_num: the first part is the holdout split and
# the remainder is the training split.
ix = np.arange(ids.shape[0])
np.random.shuffle(ix)
holdout_ids, train_ids = np.split(ids[ix], [holdout_num])
print(len(holdout_ids), len(train_ids))

1067 9608


In [8]:
# Sanity check on the id split, computed from the live arrays rather than
# hand-typed numbers (which silently go stale when the data or seed changes).
# The two id sets must be disjoint and together cover every unique ImageId.
assert len(np.intersect1d(holdout_ids, train_ids)) == 0
assert len(holdout_ids) + len(train_ids) == len(np.unique(train_df.ImageId.values))
len(holdout_ids) + len(train_ids)

10675

In [9]:
# Pull every row that belongs to a holdout image. The frame is indexed by
# ImageId, and .loc with the id array returns rows in the order of holdout_ids.
# An id may match more than one row, so the row count can exceed the id count.
holdout_df = (
    train_df
    .set_index('ImageId')
    .loc[holdout_ids]
)
print(holdout_df.shape[0])
holdout_df.tail()

1141


Unnamed: 0_level_0,EncodedPixels,Path
ImageId,Unnamed: 1_level_1,Unnamed: 2_level_1
1.2.276.0.7230010.3.1.4.8323329.32208.1517875159.18115,-1,/media/nvme/Datasets/Pneumothorax/dicom-images...
1.2.276.0.7230010.3.1.4.8323329.10459.1517875223.809547,-1,/media/nvme/Datasets/Pneumothorax/dicom-images...
1.2.276.0.7230010.3.1.4.8323329.1892.1517875170.126873,158071 2 1014 9 1006 16 1000 23 992 31 985 37 ...,/media/nvme/Datasets/Pneumothorax/dicom-images...
1.2.276.0.7230010.3.1.4.8323329.32434.1517875160.578580,-1,/media/nvme/Datasets/Pneumothorax/dicom-images...
1.2.276.0.7230010.3.1.4.8323329.1157.1517875166.516622,-1,/media/nvme/Datasets/Pneumothorax/dicom-images...


In [10]:
# Remaining (non-holdout) rows, selected the same way: index by ImageId and
# look up the training ids, which returns all rows for each id in train_ids order.
new_train_df = (
    train_df
    .set_index('ImageId')
    .loc[train_ids]
)
print(new_train_df.shape[0])
new_train_df.tail()

10441


Unnamed: 0_level_0,EncodedPixels,Path
ImageId,Unnamed: 1_level_1,Unnamed: 2_level_1
1.2.276.0.7230010.3.1.4.8323329.4601.1517875183.675592,-1,/media/nvme/Datasets/Pneumothorax/dicom-images...
1.2.276.0.7230010.3.1.4.8323329.32184.1517875158.914008,371939 1 1019 4 1019 5 1017 6 1017 6 1016 7 10...,/media/nvme/Datasets/Pneumothorax/dicom-images...
1.2.276.0.7230010.3.1.4.8323329.11237.1517875231.898693,-1,/media/nvme/Datasets/Pneumothorax/dicom-images...
1.2.276.0.7230010.3.1.4.8323329.32186.1517875158.929340,-1,/media/nvme/Datasets/Pneumothorax/dicom-images...
1.2.276.0.7230010.3.1.4.8323329.11184.1517875231.611395,-1,/media/nvme/Datasets/Pneumothorax/dicom-images...


In [11]:
# Row-level sanity check: the holdout and training frames together must
# contain every row of train_df. Computed from the frames themselves so the
# check cannot drift from the sizes printed by the cells that built them.
assert len(new_train_df) + len(holdout_df) == len(train_df)
len(new_train_df) + len(holdout_df)

11582

In [12]:
HOLDOUT_SAVE = '/media/hdd/Kaggle/Pneumothorax/Data/Folds/holdout.csv'
# Create the target directory up front; to_csv does not create missing
# directories and would otherwise fail on a fresh machine.
os.makedirs(os.path.dirname(HOLDOUT_SAVE), exist_ok=True)
# index=True writes the ImageId index as the first CSV column.
holdout_df.to_csv(HOLDOUT_SAVE, index=True)

## Create folds

In [18]:
TRAINFOLDER_SAVE = '/media/hdd/Kaggle/Pneumothorax/Data/Folds/'
N_FOLDS = 10  # number of cross-validation folds written to disk

# Split on unique ImageIds rather than rows: an id can own several rows, and
# all rows of one image must end up on the same side of a fold.
ids = np.unique(new_train_df.index.values)
print(len(ids))

# Make sure the output directory exists before the first fold is written.
os.makedirs(TRAINFOLDER_SAVE, exist_ok=True)

# NOTE(review): KFold is used without shuffling, so each validation fold is a
# contiguous slice of the sorted ids. Left unchanged so already-written fold
# files stay reproducible; consider shuffle=True with a fixed random_state.
kf = KFold(n_splits=N_FOLDS)
# NOTE(review): `folds` is never populated in this cell; kept only in case a
# later cell references the name.
folds = []
for i, (train_index, val_index) in enumerate(kf.split(ids)):
    train_ids_i = ids[train_index]
    val_ids_i = ids[val_index]
    # Guard against leakage: no image id may be in both train and val.
    assert len(np.intersect1d(train_ids_i, val_ids_i)) == 0

    # .assign returns new frames, so labelling the split never writes into an
    # object derived from new_train_df via item assignment.
    train_df_i = new_train_df.loc[train_ids_i].assign(Set='train')
    val_df_i = new_train_df.loc[val_ids_i].assign(Set='val')
    fold_df_i = pd.concat([train_df_i, val_df_i], axis=0)
    # Each fold file is a relabelling of the same rows, so sizes must match.
    assert len(fold_df_i) == len(new_train_df)

    print('Fold {}:'.format(i))
    print('train size: {} val size: {}'.format(len(train_ids_i), len(val_ids_i)))
    print('train df size: {} val df size {}'.format(len(train_df_i), len(val_df_i)))
    print('fold df size: {}'.format(len(fold_df_i)))

    # One CSV per fold; the ImageId index is written as the first column.
    fold_df_i.to_csv(os.path.join(TRAINFOLDER_SAVE, 'fold{}.csv'.format(i)), index=True)

9608
Fold 0:
train size: 8647 val size: 961
train df size: 9397 val df size 1044
fold df size: 10441
Fold 1:
train size: 8647 val size: 961
train df size: 9411 val df size 1030
fold df size: 10441
Fold 2:
train size: 8647 val size: 961
train df size: 9407 val df size 1034
fold df size: 10441
Fold 3:
train size: 8647 val size: 961
train df size: 9405 val df size 1036
fold df size: 10441
Fold 4:
train size: 8647 val size: 961
train df size: 9389 val df size 1052
fold df size: 10441
Fold 5:
train size: 8647 val size: 961
train df size: 9395 val df size 1046
fold df size: 10441
Fold 6:
train size: 8647 val size: 961
train df size: 9403 val df size 1038
fold df size: 10441
Fold 7:
train size: 8647 val size: 961
train df size: 9408 val df size 1033
fold df size: 10441
Fold 8:
train size: 8648 val size: 960
train df size: 9375 val df size 1066
fold df size: 10441
Fold 9:
train size: 8648 val size: 960
train df size: 9379 val df size 1062
fold df size: 10441
