In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import StratifiedGroupKFold
import seaborn as sns

df = pd.read_csv(os.path.join("data","train.csv"))

y = df["cancer"].values
groups = df["patient_id"].values

cv = StratifiedGroupKFold(shuffle=True, random_state=42)
df["fold"] = -1
for fold, (train_idx, val_idx) in enumerate(cv.split(np.arange(len(df)), y, groups)):
    df.loc[val_idx, "fold"] = fold

    df_train = df[(df["fold"]!=fold)]
    df_val = df[(df["fold"]==fold)]

    # Check possible overlaps
    print(f"Fold Train {fold}, positives:", len(df_train[df_train["cancer"]==1]))
    print(f"Fold Val {fold}, positives:", len(df_val[df_val["cancer"]==1]))
    uniq_train = set(df_train["patient_id"].unique())
    uniq_val = set(df_val["patient_id"].unique())
    print("Overlap", uniq_train.intersection(uniq_val))


Fold Train 0, positives: 925
Fold Val 0, positives: 233
Overlap set()
Fold Train 1, positives: 923
Fold Val 1, positives: 235
Overlap set()
Fold Train 2, positives: 909
Fold Val 2, positives: 249
Overlap set()
Fold Train 3, positives: 937
Fold Val 3, positives: 221
Overlap set()
Fold Train 4, positives: 938
Fold Val 4, positives: 220
Overlap set()


In [2]:
df.head()

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case,fold
0,2,10006,462822612,L,CC,61.0,0,0,0,,0,,29,False,1
1,2,10006,1459541791,L,MLO,61.0,0,0,0,,0,,29,False,1
2,2,10006,1864590858,R,MLO,61.0,0,0,0,,0,,29,False,1
3,2,10006,1874946579,R,CC,61.0,0,0,0,,0,,29,False,1
4,2,10011,220375232,L,CC,55.0,0,0,0,0.0,0,,21,True,0


In [33]:
def countplot(c):
    print("-"*10, c, "-"*10)
    df_count = pd.DataFrame()
    
    for fold in sorted(df["fold"].unique()):
        count = df[df["fold"]==fold][c].value_counts()
        df_count[f"fold_{fold}"] = count
    print(df_count)


In [35]:
def std(c):
    print("-"*10, c, "-"*10)
    df_count = pd.DataFrame()
    
    for fold in sorted(df["fold"].unique()):
        count = df[df["fold"]==fold][c].value_counts()
        df_count[f"fold_{fold}"] = count
    print(df_count.T.std())

In [37]:
std("view")
std("laterality")
std("biopsy")
std("invasive")
std("BIRADS")
std("implant")
std("density")
std("machine_id")
std("difficult_negative_case")

---------- view ----------
MLO    57.348060
CC     39.089641
AT      1.095445
ML      0.000000
LM      1.224745
dtype: float64
---------- laterality ----------
R    26.966646
L    31.667018
dtype: float64
---------- biopsy ----------
0    52.861139
1    37.599202
dtype: float64
---------- invasive ----------
0    59.479408
1    10.549882
dtype: float64
---------- BIRADS ----------
1.0    176.003693
0.0     59.035582
2.0     11.937336
dtype: float64
---------- implant ----------
0    69.715135
1    26.463182
dtype: float64
---------- density ----------
B    71.314094
C    84.607919
A    67.937471
D    28.252434
dtype: float64
---------- machine_id ----------
49     177.682864
48      88.171424
29      52.314434
21      19.740821
93      37.343005
216     40.246739
210     29.807717
170     25.618353
190     18.881208
197      5.686241
dtype: float64
---------- difficult_negative_case ----------
False    75.357150
True     49.462107
dtype: float64


In [34]:
countplot("view")
countplot("laterality")
countplot("biopsy")
countplot("invasive")
countplot("BIRADS")
countplot("implant")
countplot("density")
countplot("machine_id")
countplot("difficult_negative_case")

---------- view ----------
     fold_0  fold_1  fold_2  fold_3  fold_4
MLO    5601    5546  5670.0    5562    5524
CC     5370    5324  5334.0    5323    5414
AT        5       4     4.0       2       4
ML        2       2     NaN       2       2
LM        1       2     4.0       2       1
---------- laterality ----------
   fold_0  fold_1  fold_2  fold_3  fold_4
R    5496    5466    5529    5462    5486
L    5483    5413    5483    5429    5459
---------- biopsy ----------
   fold_0  fold_1  fold_2  fold_3  fold_4
0   10365   10274   10415   10362   10321
1     614     605     597     529     624
---------- invasive ----------
   fold_0  fold_1  fold_2  fold_3  fold_4
0   10819   10699   10847   10740   10783
1     160     180     165     151     162
---------- BIRADS ----------
     fold_0  fold_1  fold_2  fold_3  fold_4
1.0    2984    3014    3407    3253    3114
0.0    1728    1652    1599    1586    1684
2.0     464     447     458     435     461
---------- implant ----------
   

# Save csv

In [None]:
df.to_csv(os.path.join("data", "train_5fold.csv"),  index=False)