In [None]:
# This file is a modified version of the original:
# https://github.com/dangnh0611/kaggle_rsna_breast_cancer/blob/reproduce/src/tools/cv_split.py

In [2]:
import pandas as pd
import numpy as np
import os
import sys
from tqdm import tqdm
from sklearn.model_selection import StratifiedGroupKFold
from settings import SETTINGS

Using global configuration (SETTINGS.json):
--------------------------------------------------------------------------------
ASSETS_DIR: ./assets/
MODEL_CHECKPOINT_DIR: ./checkpoints/
MODEL_FINAL_SELECTION_DIR: ./assets/reproduce/
PROCESSED_DATA_DIR: ./datasets/processed/
RAW_DATA_DIR: 
SUBMISSION_DIR: ./submissions/
TEMP_DIR: ./tmp/
__JSON_PATH__: /media/na/e0adac50-20ce-4eb4-9c9d-98faf82ddd46/rsna_breast/SETTINGS.json
--------------------------------------------------------------------------------






In [None]:
# - There are no new site IDs in test dataset.
# - Patient IDs in train and test sets do not overlap.
# - Image IDs in train and test sets do not overlap
# - There are no new laterality values in test dataset. (R: 27439, L: 27267)
# - There are new machine IDs in test dataset. (This was already raised by @abebe9849.)
# - There are no new view values in test dataset. (MLO: 27903, CC: 26765, AT: 19, LM: 10, ML: 8, LMO: 1)
# - No. of images/patient are all >= 4 (train: True)
# - Site ID is always the same for each patient. (train: True) 
# - Age is always the same for each patient. (train: True)
# - There is no overlap of machine IDs between the two sites in test dataset. (train: True)
# - Some patients underwent mammography with multiple machines. (train: only 1 patient 22637 with more than 1 machine)
# - No. of images in site ID 1 > No. of images in site ID 2.
# - All patients have CC and MLO images for both sides
# - More than 40% of images are from machine ID 49 (43% for train dataset) (by @kaggleqrdl)
# - Mean age of patients is between 56-61 (58.6 for train set), and patients in site 1 are younger than those in site 2. (by @kaggleqrdl)
# - 1-2% of patients use implants (1.4% for train set).(by @kaggleqrdl)
# - Positive rate is between 0.0204-0.0256 (0.0206 for train dataset). (shown by @zzy990106 and others)
# - Age column contains nan, while others do not


# Each fold split should be stratified so that the following assumptions hold:
# - Group by 'patient_id'.
# - Positive rate is between 0.0204-0.0256 (0.0206 for train dataset).
# - There are new machine IDs in test dataset --> group by machine_id as well (generalization to un-seen machine or not?)
# - No. of images in site ID 1 > No. of images in site ID 2.
# - All patients have CC and MLO images for both sides.
# - More than 40% of images are from machine ID 49 (43% for train dataset).
# - Mean age of patients is between 56-61 (58.6 for train set), and patients in site 1 are younger than those in site 2.
# - 1-2% of patients use implants (1.4% for train set).

In [4]:

def fold_check(train_df, val_df):
    """Sanity-check one cross-validation fold and summarise its validation split.

    Args:
        train_df: Training rows of the fold. Only the ``patient_id`` and
            ``machine_id`` columns are read.
        val_df: Validation rows of the fold. The ``patient_id``, ``cancer``,
            ``machine_id``, ``site_id``, ``age`` and ``implant`` columns are read.

    Returns:
        dict with the following keys, in this insertion order:
        ``val_pos_sample_num``, ``val_pos_patient_num``,
        ``val_pos_sample_percent``, ``val_pos_patient_percent``,
        ``val_machine_ids``, ``train_machine_ids``,
        ``not_in_train_machine_ids``, ``mean_site_id``, ``num_machine_id_49``,
        ``mean_age``, ``mean_age_site1``, ``mean_age_site2``, ``implant_pct``.
        Despite their names, the ``*_percent`` / ``*_pct`` values are
        fractions (count divided by total), not values scaled by 100.

    Raises:
        AssertionError: If a patient_id occurs in both splits, if the mean age
            of site 1 is not strictly below that of site 2 in ``val_df``, or if
            the fraction of validation patients with an implant is not
            strictly between 0.01 and 0.02.
    """
    # Ensure no patient is shared between the train and validation splits.
    train_patients = set(train_df.patient_id.unique())
    val_patients = set(val_df.patient_id.unique())
    shared_patients = train_patients & val_patients
    assert not shared_patients, (
        f"{len(shared_patients)} patient_id value(s) appear in both train and val splits"
    )

    ret = {}
    num_samples = len(val_df)
    num_patients = val_df.patient_id.nunique()

    # Positive counts and rates, at row (sample) level and at patient level.
    ret['val_pos_sample_num'] = val_df.cancer.sum()
    ret['val_pos_patient_num'] = val_df[val_df.cancer == 1].patient_id.nunique()
    ret['val_pos_sample_percent'] = ret['val_pos_sample_num'] / num_samples
    ret['val_pos_patient_percent'] = ret['val_pos_patient_num'] / num_patients

    # Machine coverage: which validation machines never occur in the train split.
    val_machine_ids = sorted(val_df.machine_id.unique())
    train_machine_ids = sorted(train_df.machine_id.unique())
    # Sorted so that the reported list does not depend on set iteration order.
    not_in_train_machine_ids = sorted(set(val_machine_ids) - set(train_machine_ids))

    ret['val_machine_ids'] = val_machine_ids
    ret['train_machine_ids'] = train_machine_ids
    ret['not_in_train_machine_ids'] = not_in_train_machine_ids
    ret['mean_site_id'] = val_df.site_id.mean()
    ret['num_machine_id_49'] = len(val_df[val_df.machine_id == 49])
    # NOTE: the share of machine 49 (num_machine_id_49 / num_samples > 0.4) is
    # reported above but not enforced here.

    ret['mean_age'] = val_df.age.mean()
    ret['mean_age_site1'] = val_df[val_df.site_id == 1].age.mean()
    ret['mean_age_site2'] = val_df[val_df.site_id == 2].age.mean()
    # A NaN mean (e.g. a site absent from val_df) also fails this comparison.
    assert ret['mean_age_site1'] < ret['mean_age_site2'], (
        f"expected mean age of site 1 < site 2 in val split, got "
        f"{ret['mean_age_site1']} vs {ret['mean_age_site2']}"
    )

    ret['implant_pct'] = val_df[val_df.implant == 1].patient_id.nunique() / num_patients
    assert 0.01 < ret['implant_pct'] < 0.02, (
        f"expected implant patient fraction in (0.01, 0.02), got {ret['implant_pct']}"
    )
    return ret

In [6]:
if __name__ == '__main__':
    # Build a 4-fold cross-validation split of the cleaned label table that is
    # stratified on `cancer` and grouped by `patient_id`, check every fold with
    # fold_check(), and write one train/val CSV pair per fold into SAVE_DIR.
    CSV_LABEL_PATH = os.path.join(SETTINGS.PROCESSED_DATA_DIR, 'classification', 'rsna-breast-cancer-detection', 'cleaned_label.csv')
    SAVE_DIR = os.path.join(SETTINGS.PROCESSED_DATA_DIR, 'classification', 'rsna-breast-cancer-detection', 'cv', 'v2')
    os.makedirs(SAVE_DIR, exist_ok=True)

    df = pd.read_csv(CSV_LABEL_PATH)
    # Stratified K-Folds iterator variant with non-overlapping groups.
    spliter = StratifiedGroupKFold(n_splits=4, shuffle=True, random_state=67)

    ret = []
    # .split(X, y, groups) yields *positional* row indices, so rows are picked
    # with .iloc; this stays correct even if df's index is not 0..n-1.
    for i, (train_idxs, val_idxs) in enumerate(spliter.split(df, df.cancer, groups=df.patient_id)):

        print(f"Fold {i}:")
        fold_train_df = df.iloc[train_idxs].reset_index(drop=True)
        fold_val_df = df.iloc[val_idxs].reset_index(drop=True)
        print(len(fold_train_df), len(fold_val_df))

        save_fold_train_path = os.path.join(SAVE_DIR, f'train_fold_{i}.csv')
        save_fold_val_path = os.path.join(SAVE_DIR, f'val_fold_{i}.csv')

        # Validate before saving: a failed assertion in fold_check aborts the
        # run before this fold's CSV files are written.
        fold_ret = fold_check(fold_train_df, fold_val_df)
        print(fold_ret)
        ret.append(fold_ret)

        # save
        fold_train_df.to_csv(save_fold_train_path, index=False)
        fold_val_df.to_csv(save_fold_val_path, index=False)
        print('\n--------------------\n\n\n')

    # Per-metric summary: print each statistic once, followed by its value in
    # every fold, so folds can be compared side by side.
    for k in ret[0]:
        print(k)
        for fold_ret in ret:
            print('\t', fold_ret[k])
            
# all validation machine_ids are present in train data.

Fold 0:
41016 13690
{'val_pos_sample_num': 303, 'val_pos_patient_num': 122, 'val_pos_sample_percent': 0.022132943754565378, 'val_pos_patient_percent': 0.040980853207927444, 'val_machine_ids': [21, 29, 48, 49, 93, 170, 190, 197, 210, 216], 'train_machine_ids': [21, 29, 48, 49, 93, 170, 190, 197, 210, 216], 'not_in_train_machine_ids': [], 'mean_site_id': 1.4555880204528853, 'num_machine_id_49': 5921, 'mean_age': 58.5497990500548, 'mean_age_site1': 57.662593984962406, 'mean_age_site2': 59.609267275933945, 'implant_pct': 0.0141081625797783}

--------------------



Fold 1:
40981 13725
{'val_pos_sample_num': 283, 'val_pos_patient_num': 121, 'val_pos_sample_percent': 0.020619307832422586, 'val_pos_patient_percent': 0.04063129617192747, 'val_machine_ids': [21, 29, 48, 49, 93, 170, 190, 210, 216], 'train_machine_ids': [21, 29, 48, 49, 93, 170, 190, 197, 210, 216], 'not_in_train_machine_ids': [], 'mean_site_id': 1.450273224043716, 'num_machine_id_49': 5953, 'mean_age': 58.32597128070559, 'mean_