# Setup

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

from google.colab import files
files.upload()  # Upload your kaggle.json here.

!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

!mkdir rsna-miccai-png
%cd rsna-miccai-png/
!kaggle datasets download -d jonathanbesomi/rsna-miccai-png
!unzip -q rsna-miccai-png.zip
!rm rsna-miccai-png.zip
%cd ../

Mounted at /content/gdrive


Saving kaggle.json to kaggle.json
/content/rsna-miccai-png
Downloading rsna-miccai-png.zip to /content/rsna-miccai-png
100% 5.08G/5.08G [00:39<00:00, 151MB/s]
100% 5.08G/5.08G [00:39<00:00, 137MB/s]
/content


In [42]:
# General imports.
import os
import random

import numpy as np
import pandas as pd
import torch

# Specific imports.


from sklearn.model_selection import GroupKFold
from sklearn.model_selection import StratifiedKFold

# Utility Functions

In [20]:
def seed_everything(seed=123):
  random.seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)

# Sanity Checks

In [12]:
train_meta = pd.read_csv(r"/content/gdrive/MyDrive/Kaggle Competitions/RSNA-brain-tumor/train_meta.csv")
train_labels = pd.read_csv(r"/content/gdrive/MyDrive/Kaggle Competitions/RSNA-brain-tumor/train_labels.csv")
train_patientids = os.listdir(r"/content/rsna-miccai-png/train")

In [13]:
print(train_meta.PatientID.nunique()), print(pd.Series(train_patientids).nunique())

for x in train_labels.BraTS21ID.values:
  if x not in train_meta.PatientID.unique():
    print("X")

for x in train_labels.BraTS21ID.values:
  if x not in pd.Series(train_patientids).unique().astype(np.int32):
    print("Y")

# Refining Train Meta DataFrame

In [43]:
train_df = pd.merge(train_meta, train_labels, left_on="PatientID", right_on="BraTS21ID").drop(columns=["BraTS21ID"])

In [44]:
train_df_images = train_df.png_filepath.apply(lambda x: x.split("/")[-1])
train_df_png_images = []
for root, dirs, files in os.walk(r"/content/rsna-miccai-png/train"):
  if len(files) != 0 and (".png" in files[0]):
    split = root.split("/")
    patientid = split[-2]
    mpMRI_type = split[-1]
    for file in files:
      train_df_png_images.append(f"{patientid}_{mpMRI_type}_{file}")

In [45]:
train_df_sliced = list(set(train_df_png_images).intersection(set(train_df_images)))
train_df_sliced = pd.Series(train_df_sliced).apply(lambda x: os.path.join("./images/", x)).values
train_df = train_df[train_df.png_filepath.isin(train_df_sliced)].reset_index(drop=True)
train_df.png_filepath = train_df.png_filepath.apply(lambda x: x.replace("_", "/").replace("images", "rsna-miccai-png/train"))

# This part is for when the PatientIDs are turned into ints (for some weird reason; doing this again).
patientids = [x.split("/")[-3] for x in train_df.dicom_filepath.values]
train_df.PatientID = patientids

In [46]:
train_df.head()

Unnamed: 0,dicom_filepath,png_filepath,PatientID,SeriesDescription,ImageID,StudyInstanceUID,SeriesInstanceUID,MGMT_value
0,../input/rsna-miccai-brain-tumor-radiogenomic-...,./rsna-miccai-png/train/00688/T2w/Image-273.png,688,T2w,Image-273,1.2.826.0.1.3680043.8.498.11655577270489271674...,1.2.826.0.1.3680043.8.498.75434499750167731864...,0
1,../input/rsna-miccai-brain-tumor-radiogenomic-...,./rsna-miccai-png/train/00688/T2w/Image-245.png,688,T2w,Image-245,1.2.826.0.1.3680043.8.498.11655577270489271674...,1.2.826.0.1.3680043.8.498.75434499750167731864...,0
2,../input/rsna-miccai-brain-tumor-radiogenomic-...,./rsna-miccai-png/train/00688/T2w/Image-130.png,688,T2w,Image-130,1.2.826.0.1.3680043.8.498.11655577270489271674...,1.2.826.0.1.3680043.8.498.75434499750167731864...,0
3,../input/rsna-miccai-brain-tumor-radiogenomic-...,./rsna-miccai-png/train/00688/T2w/Image-98.png,688,T2w,Image-98,1.2.826.0.1.3680043.8.498.11655577270489271674...,1.2.826.0.1.3680043.8.498.75434499750167731864...,0
4,../input/rsna-miccai-brain-tumor-radiogenomic-...,./rsna-miccai-png/train/00688/T2w/Image-247.png,688,T2w,Image-247,1.2.826.0.1.3680043.8.498.11655577270489271674...,1.2.826.0.1.3680043.8.498.75434499750167731864...,0


# Hyperparameters

In [47]:
class Config:
  n_splits = 5

  seed_everything()

In [48]:
cfg = Config()

# Making KFold Training DataFrame

In [52]:
group_kfold = GroupKFold(n_splits=cfg.n_splits)

# GroupKFold-PatientID. 
for fold, train_val_idx in enumerate(group_kfold.split(train_df, groups=getattr(train_df, "PatientID"))):
  train_idx, val_idx = train_val_idx[0], train_val_idx[1]
  train_df.loc[val_idx, 'fold_gkf_patientid'] = fold

# GroupKFold-png_filepath.
for fold, train_val_idx in enumerate(group_kfold.split(train_df, groups=getattr(train_df, "png_filepath"))):
  train_idx, val_idx = train_val_idx[0], train_val_idx[1]
  train_df.loc[val_idx, 'fold_gkf_png_filepath'] = fold

strat_kfold = StratifiedKFold(n_splits=cfg.n_splits)

# StratifiedKFold-PatientID. 
for fold, train_val_idx in enumerate(strat_kfold.split(train_df, getattr(train_df, "PatientID"))):
  train_idx, val_idx = train_val_idx[0], train_val_idx[1]
  train_df.loc[val_idx, 'fold_skf_patientid'] = fold

# StratifiedKFold-SeriesDescription. 
for fold, train_val_idx in enumerate(strat_kfold.split(train_df, getattr(train_df, "SeriesDescription"))):
  train_idx, val_idx = train_val_idx[0], train_val_idx[1]
  train_df.loc[val_idx, 'fold_skf_seriesdescription'] = fold

# StratifiedKFold-MGMT_value. 
for fold, train_val_idx in enumerate(strat_kfold.split(train_df, getattr(train_df, "MGMT_value"))):
  train_idx, val_idx = train_val_idx[0], train_val_idx[1]
  train_df.loc[val_idx, 'fold_skf_MGMT_value'] = fold

In [53]:
train_df.head()

Unnamed: 0,dicom_filepath,png_filepath,PatientID,SeriesDescription,ImageID,StudyInstanceUID,SeriesInstanceUID,MGMT_value,fold_gkf_patientid,fold_skf_patientid,fold_skf_seriesdescription,fold_skf_MGMT_value,fold_gkf_png_filepath
0,../input/rsna-miccai-brain-tumor-radiogenomic-...,./rsna-miccai-png/train/00688/T2w/Image-273.png,688,T2w,Image-273,1.2.826.0.1.3680043.8.498.11655577270489271674...,1.2.826.0.1.3680043.8.498.75434499750167731864...,0,0.0,0.0,0.0,0.0,1.0
1,../input/rsna-miccai-brain-tumor-radiogenomic-...,./rsna-miccai-png/train/00688/T2w/Image-245.png,688,T2w,Image-245,1.2.826.0.1.3680043.8.498.11655577270489271674...,1.2.826.0.1.3680043.8.498.75434499750167731864...,0,0.0,0.0,0.0,0.0,3.0
2,../input/rsna-miccai-brain-tumor-radiogenomic-...,./rsna-miccai-png/train/00688/T2w/Image-130.png,688,T2w,Image-130,1.2.826.0.1.3680043.8.498.11655577270489271674...,1.2.826.0.1.3680043.8.498.75434499750167731864...,0,0.0,0.0,0.0,0.0,1.0
3,../input/rsna-miccai-brain-tumor-radiogenomic-...,./rsna-miccai-png/train/00688/T2w/Image-98.png,688,T2w,Image-98,1.2.826.0.1.3680043.8.498.11655577270489271674...,1.2.826.0.1.3680043.8.498.75434499750167731864...,0,0.0,0.0,0.0,0.0,4.0
4,../input/rsna-miccai-brain-tumor-radiogenomic-...,./rsna-miccai-png/train/00688/T2w/Image-247.png,688,T2w,Image-247,1.2.826.0.1.3680043.8.498.11655577270489271674...,1.2.826.0.1.3680043.8.498.75434499750167731864...,0,0.0,0.0,0.0,0.0,4.0


In [55]:
display(train_df.fold_gkf_patientid.value_counts()), 
display(train_df.fold_skf_patientid.value_counts()), 
display(train_df.fold_skf_seriesdescription.value_counts()), 
display(train_df.fold_skf_MGMT_value.value_counts()), 
display(train_df.fold_gkf_png_filepath.value_counts()), 

1.0    50777
2.0    50777
3.0    50774
4.0    50773
0.0    50755
Name: fold_gkf_patientid, dtype: int64

0.0    50772
1.0    50771
4.0    50771
3.0    50771
2.0    50771
Name: fold_skf_patientid, dtype: int64

0.0    50772
1.0    50771
4.0    50771
3.0    50771
2.0    50771
Name: fold_skf_seriesdescription, dtype: int64

0.0    50772
1.0    50771
4.0    50771
3.0    50771
2.0    50771
Name: fold_skf_MGMT_value, dtype: int64

0.0    50772
1.0    50771
4.0    50771
3.0    50771
2.0    50771
Name: fold_skf_MGMT_value, dtype: int64

0.0    50772
4.0    50771
3.0    50771
2.0    50771
1.0    50771
Name: fold_gkf_png_filepath, dtype: int64

(None, None, None)

In [56]:
train_kfold = train_df
train_kfold.to_csv(r"./train_kfold.csv", index=False)

In [58]:
from IPython.display import FileLink
FileLink(r"train_kfold.csv")