<h2>Notebook for generating folds for cross-validation</h2>

Scans from the same patient goes to the same fold

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
from glob import glob
import random
import json
import pydicom
import pickle
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

DICOM_TAGS_DF_PATH = '/kolos/m2/ct/data/rsna/df.pkl'
LABELS_PATH = '/kolos/storage/ct/data/rsna/stage_1_train.csv'

DISEASES = ['epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural']

NUM_FOLDS = 5

In [3]:
def load_dicom_tags():
    with open(DICOM_TAGS_DF_PATH, 'rb') as f:
        df = pickle.load(f)

    return df

def load_labels():
    labels = pd.read_csv(LABELS_PATH)
    labels[['SOPInstanceUID', 'Disease']] = labels.ID.str.rsplit("_", 1, expand=True)
    labels = labels[['SOPInstanceUID', 'Disease', 'Label']]
    labels = pd.pivot_table(labels, index="SOPInstanceUID", columns="Disease", values="Label")
    
    return labels


tags = load_dicom_tags()
labels = load_labels()

df = labels.merge(tags, on='SOPInstanceUID', how='outer')

In [4]:
df.columns

Index(['SOPInstanceUID', 'any', 'epidural', 'intraparenchymal',
       'intraventricular', 'subarachnoid', 'subdural', 'Modality', 'PatientID',
       'StudyInstanceUID', 'SeriesInstanceUID', 'StudyID',
       'ImagePositionPatient', 'ImageOrientationPatient', 'SamplesPerPixel',
       'PhotometricInterpretation', 'Rows', 'Columns', 'PixelSpacing',
       'BitsAllocated', 'BitsStored', 'HighBit', 'PixelRepresentation',
       'WindowCenter', 'WindowWidth', 'RescaleIntercept', 'RescaleSlope',
       'path', 'subset'],
      dtype='object')

In [32]:
dataset = df[df.subset == 'train']
dataset = dataset[['SOPInstanceUID', 'StudyInstanceUID', 'PatientID', 'any', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural']]
dataset.head()

Unnamed: 0,SOPInstanceUID,StudyInstanceUID,PatientID,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,ID_000039fa0,ID_134d398b61,ID_eeaf99e7,0.0,0.0,0.0,0.0,0.0,0.0
1,ID_00005679d,ID_b5c26cda09,ID_18f2d431,0.0,0.0,0.0,0.0,0.0,0.0
2,ID_00008ce3c,ID_974735bf79,ID_ce8a3cd2,0.0,0.0,0.0,0.0,0.0,0.0
3,ID_0000950d7,ID_8881b1c4b1,ID_d278c67b,0.0,0.0,0.0,0.0,0.0,0.0
4,ID_0000aee4b,ID_9aad90e421,ID_ce5f0b6c,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
mapping = pd.read_pickle('/kolos/m2/ct/data/rsna/id_to_path.pkl')
mapping = mapping.set_index('SOPInstanceUID')
dataset = dataset.set_index('SOPInstanceUID').join(mapping)
dataset.head()

Unnamed: 0_level_0,StudyInstanceUID,PatientID,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,path
SOPInstanceUID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ID_000039fa0,ID_134d398b61,ID_eeaf99e7,0.0,0.0,0.0,0.0,0.0,0.0,/kolos/m2/ct/data/rsna/train/ID_134d398b61/dic...
ID_00005679d,ID_b5c26cda09,ID_18f2d431,0.0,0.0,0.0,0.0,0.0,0.0,/kolos/m2/ct/data/rsna/train/ID_b5c26cda09/dic...
ID_00008ce3c,ID_974735bf79,ID_ce8a3cd2,0.0,0.0,0.0,0.0,0.0,0.0,/kolos/m2/ct/data/rsna/train/ID_974735bf79/dic...
ID_0000950d7,ID_8881b1c4b1,ID_d278c67b,0.0,0.0,0.0,0.0,0.0,0.0,/kolos/m2/ct/data/rsna/train/ID_8881b1c4b1/dic...
ID_0000aee4b,ID_9aad90e421,ID_ce5f0b6c,0.0,0.0,0.0,0.0,0.0,0.0,/kolos/m2/ct/data/rsna/train/ID_9aad90e421/dic...


In [34]:
grouped_by_patient = dataset.groupby('PatientID', group_keys=True)

In [35]:
len(grouped_by_patient.groups)

17079

In [36]:
len(grouped_by_patient.groups)//NUM_FOLDS

3415

In [37]:
curr_foldnum = 0
ids = []
folds = []

patient_ids = list(grouped_by_patient.groups.keys())
random.shuffle(patient_ids)
for idx, patient_id in enumerate(patient_ids):
    patient_slices = grouped_by_patient.groups[patient_id]
                                 
    if (idx+1) % (len(grouped_by_patient.groups)//NUM_FOLDS) == 0 and curr_foldnum < (NUM_FOLDS-1):
        curr_foldnum += 1
    for slice_id in patient_slices:
        ids.append(slice_id)
        folds.append(curr_foldnum)

In [38]:
cum_count = 0
for num in range(NUM_FOLDS):
    count =0 
    for val in folds:
        if val == num:
            count +=1
    cum_count += count
    print(num, count)

print(cum_count == len(dataset))

0 133145
1 134530
2 134393
3 136549
4 135641
True


In [39]:
folds = pd.DataFrame.from_dict({'SOPInstanceUID': ids, 'fold': folds})
folds = folds.set_index('SOPInstanceUID')
dataset = dataset.join(folds)
dataset.head()

Unnamed: 0_level_0,StudyInstanceUID,PatientID,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,path,fold
SOPInstanceUID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ID_000039fa0,ID_134d398b61,ID_eeaf99e7,0.0,0.0,0.0,0.0,0.0,0.0,/kolos/m2/ct/data/rsna/train/ID_134d398b61/dic...,4
ID_00005679d,ID_b5c26cda09,ID_18f2d431,0.0,0.0,0.0,0.0,0.0,0.0,/kolos/m2/ct/data/rsna/train/ID_b5c26cda09/dic...,1
ID_00008ce3c,ID_974735bf79,ID_ce8a3cd2,0.0,0.0,0.0,0.0,0.0,0.0,/kolos/m2/ct/data/rsna/train/ID_974735bf79/dic...,4
ID_0000950d7,ID_8881b1c4b1,ID_d278c67b,0.0,0.0,0.0,0.0,0.0,0.0,/kolos/m2/ct/data/rsna/train/ID_8881b1c4b1/dic...,2
ID_0000aee4b,ID_9aad90e421,ID_ce5f0b6c,0.0,0.0,0.0,0.0,0.0,0.0,/kolos/m2/ct/data/rsna/train/ID_9aad90e421/dic...,3


In [40]:
folds_grouped = dataset.groupby('fold')
folds_grouped.sum()

Unnamed: 0_level_0,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,19507.0,569.0,6674.0,4782.0,5966.0,8549.0
1,19077.0,363.0,6495.0,4777.0,6415.0,8579.0
2,18838.0,551.0,6117.0,4534.0,6302.0,8054.0
3,20058.0,704.0,6811.0,4884.0,6658.0,8591.0
4,19623.0,574.0,6467.0,4789.0,6781.0,8723.0


In [41]:
dataset.path = dataset.path.apply(lambda x : x.replace('/kolos/m2/ct/data/', '').replace('dicom', 'png').replace('dcm', 'png'))
dataset.head()

Unnamed: 0_level_0,StudyInstanceUID,PatientID,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,path,fold
SOPInstanceUID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ID_000039fa0,ID_134d398b61,ID_eeaf99e7,0.0,0.0,0.0,0.0,0.0,0.0,rsna/train/ID_134d398b61/png/003.png,4
ID_00005679d,ID_b5c26cda09,ID_18f2d431,0.0,0.0,0.0,0.0,0.0,0.0,rsna/train/ID_b5c26cda09/png/002.png,1
ID_00008ce3c,ID_974735bf79,ID_ce8a3cd2,0.0,0.0,0.0,0.0,0.0,0.0,rsna/train/ID_974735bf79/png/009.png,4
ID_0000950d7,ID_8881b1c4b1,ID_d278c67b,0.0,0.0,0.0,0.0,0.0,0.0,rsna/train/ID_8881b1c4b1/png/032.png,2
ID_0000aee4b,ID_9aad90e421,ID_ce5f0b6c,0.0,0.0,0.0,0.0,0.0,0.0,rsna/train/ID_9aad90e421/png/007.png,3


In [42]:
dataset_2d = dataset.drop(columns=['StudyInstanceUID', 'PatientID'])
dataset_2d.to_csv('{}fold.csv'.format(NUM_FOLDS), index=False)

In [43]:
grouped_series = dataset.groupby('StudyInstanceUID')
dataset_3d = grouped_series.max()
print(len(dataset_3d))
dataset_3d.path = dataset_3d.index
dataset_3d.path = dataset_3d.path.apply(lambda x : '{}/{}'.format('rsna/train', x))
dataset_3d.head()

19530


Unnamed: 0_level_0,PatientID,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural,path,fold
StudyInstanceUID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ID_00047d6503,ID_e0d2de32,0.0,0.0,0.0,0.0,0.0,0.0,rsna/train/ID_00047d6503,4
ID_0004f7a877,ID_8cd7ca78,1.0,0.0,0.0,0.0,0.0,1.0,rsna/train/ID_0004f7a877,0
ID_0006600dd8,ID_3a12cfa6,0.0,0.0,0.0,0.0,0.0,0.0,rsna/train/ID_0006600dd8,0
ID_000b852931,ID_782db7a2,1.0,0.0,1.0,0.0,1.0,1.0,rsna/train/ID_000b852931,1
ID_00135fb9ff,ID_0cddb2a2,0.0,0.0,0.0,0.0,0.0,0.0,rsna/train/ID_00135fb9ff,4


In [44]:
dataset_3d = dataset_3d.drop(columns=['PatientID'])
dataset_3d.to_csv('{}fold3D.csv'.format(NUM_FOLDS), index=False)

In [45]:
folds_grouped3d = dataset_3d.groupby('fold')
folds_grouped3d.sum()

Unnamed: 0_level_0,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1595.0,65.0,961.0,633.0,695.0,692.0
1,1555.0,45.0,961.0,663.0,688.0,692.0
2,1535.0,57.0,903.0,643.0,675.0,643.0
3,1683.0,88.0,1010.0,686.0,748.0,722.0
4,1635.0,58.0,961.0,688.0,743.0,693.0


In [47]:
len(dataset_3d)

19530

In [48]:
test_dataset = df[df.subset == 'train']
test_dataset = test_dataset[['SOPInstanceUID']]
test_dataset = test_dataset.set_index('SOPInstanceUID').join(mapping)
test_dataset.head()

Unnamed: 0_level_0,path
SOPInstanceUID,Unnamed: 1_level_1
ID_000039fa0,/kolos/m2/ct/data/rsna/train/ID_134d398b61/dic...
ID_00005679d,/kolos/m2/ct/data/rsna/train/ID_b5c26cda09/dic...
ID_00008ce3c,/kolos/m2/ct/data/rsna/train/ID_974735bf79/dic...
ID_0000950d7,/kolos/m2/ct/data/rsna/train/ID_8881b1c4b1/dic...
ID_0000aee4b,/kolos/m2/ct/data/rsna/train/ID_9aad90e421/dic...


In [50]:
test_dataset.path = test_dataset.path.apply(lambda x : x.replace('/kolos/m2/ct/data/', '').replace('dicom', 'png').replace('dcm', 'png'))
test_dataset.head()

Unnamed: 0_level_0,path
SOPInstanceUID,Unnamed: 1_level_1
ID_000039fa0,rsna/train/ID_134d398b61/png/003.png
ID_00005679d,rsna/train/ID_b5c26cda09/png/002.png
ID_00008ce3c,rsna/train/ID_974735bf79/png/009.png
ID_0000950d7,rsna/train/ID_8881b1c4b1/png/032.png
ID_0000aee4b,rsna/train/ID_9aad90e421/png/007.png


In [52]:
test_dataset.to_csv('test.csv'.format(NUM_FOLDS), index=False)