In [7]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import StratifiedGroupKFold
import seaborn as sns

# Don't touch it!
RANDOM_STATE = 42
N_SPLITS = 4

train_csv_path = os.path.join("data", "train.csv")
df = pd.read_csv(train_csv_path)

# Target column; passed to the splitter below as the stratification label.
y = df["cancer"].to_numpy()

# Group key "<patient_id>_<laterality>": rows sharing a key are kept together
# by the group-aware splitter, so they never straddle train and validation.
groups = [
    f"{patient}_{side}"
    for patient, side in zip(df["patient_id"].to_numpy(), df["laterality"].to_numpy())
]
groups[:10]

['10006_L',
 '10006_L',
 '10006_R',
 '10006_R',
 '10011_L',
 '10011_L',
 '10011_R',
 '10011_R',
 '10025_L',
 '10025_L']

In [8]:
# Overall class balance, for comparison with the per-fold positives printed below.
print(df["cancer"].value_counts())

cv = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
df["fold"] = -1  # -1 marks rows that have not been assigned to a validation fold yet

group_labels = np.asarray(groups)
fold_col = df.columns.get_loc("fold")
for fold, (train_idx, val_idx) in enumerate(cv.split(np.arange(len(df)), y, groups)):
    # split() yields *positional* indices, so assign by position; this stays
    # correct even if df does not carry a default RangeIndex.
    df.iloc[val_idx, fold_col] = fold

    df_train = df.iloc[train_idx]
    df_val = df.iloc[val_idx]

    # Check possible overlaps: no group may appear on both sides of the split.
    leaked = np.intersect1d(group_labels[train_idx], group_labels[val_idx])
    assert leaked.size == 0, (
        f"Fold {fold}: {leaked.size} group(s) appear in both train and val"
    )

    print(f"Fold Train {fold}, positives:", int((df_train["cancer"] == 1).sum()))
    print(f"Fold Val {fold}, positives:", int((df_val["cancer"] == 1).sum()))

0    53548
1     1158
Name: cancer, dtype: int64
Fold Train 0, positives: 839
Fold Val 0, positives: 319
Fold Train 1, positives: 906
Fold Val 1, positives: 252
Fold Train 2, positives: 850
Fold Val 2, positives: 308
Fold Train 3, positives: 879
Fold Val 3, positives: 279


In [9]:
# Preview the first rows of df (as the cell's rich output).
df.head()

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case,fold
0,2,10006,462822612,L,CC,61.0,0,0,0,,0,,29,False,0
1,2,10006,1459541791,L,MLO,61.0,0,0,0,,0,,29,False,0
2,2,10006,1864590858,R,MLO,61.0,0,0,0,,0,,29,False,3
3,2,10006,1874946579,R,CC,61.0,0,0,0,,0,,29,False,3
4,2,10011,220375232,L,CC,55.0,0,0,0,0.0,0,,21,True,0


In [11]:
def _fold_counts(c, data=None):
    """Build a (category x fold) table of value counts for column ``c``.

    Parameters
    ----------
    c : str
        Name of the column whose values are counted.
    data : pandas.DataFrame, optional
        Frame containing ``c`` and a ``"fold"`` column. Defaults to the
        notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct non-null value of ``c`` and one ``fold_<k>``
        column per fold value, holding integer counts (0 when a value does
        not occur in a fold).
    """
    frame = df if data is None else data
    per_fold = {
        f"fold_{fold}": frame.loc[frame["fold"] == fold, c].value_counts()
        for fold in sorted(frame["fold"].unique())
    }
    if not per_fold:
        return pd.DataFrame()
    # Outer-join on the category index: assigning the Series column by column
    # into an empty frame would pin the index to the first fold's categories
    # and silently drop any category that the first fold does not contain.
    return pd.concat(per_fold, axis=1).fillna(0).astype(int)


def countplot(c, data=None):
    """Print the per-fold value counts of column ``c``.

    Parameters
    ----------
    c : str
        Column to summarise.
    data : pandas.DataFrame, optional
        Frame with ``c`` and ``"fold"`` columns; defaults to the
        notebook-level ``df``.

    Returns
    -------
    None
        The table is printed, not returned.
    """
    print("-"*10, c, "-"*10)
    print(_fold_counts(c, data))


def std(c, data=None):
    """Print, for each value of column ``c``, the std of its count across folds.

    A small value means that category is spread evenly over the folds.
    Categories missing from a fold contribute a count of 0 rather than being
    skipped.

    Parameters
    ----------
    c : str
        Column to summarise.
    data : pandas.DataFrame, optional
        Frame with ``c`` and ``"fold"`` columns; defaults to the
        notebook-level ``df``.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    print("-"*10, c, "-"*10)
    # Transpose so folds are rows; std() then reduces over folds per category.
    print(_fold_counts(c, data).T.std())

In [12]:
# Report the across-fold spread of the category counts, one column at a time.
for column in (
    "view",
    "laterality",
    "biopsy",
    "invasive",
    "BIRADS",
    "implant",
    "density",
    "machine_id",
    "difficult_negative_case",
):
    std(column)

---------- view ----------
MLO    39.802638
CC     24.662725
LM      1.290994
ML      1.154701
AT      2.500000
dtype: float64
---------- laterality ----------
L    161.281069
R    207.332864
dtype: float64
---------- biopsy ----------
0    115.952217
1     58.385929
dtype: float64
---------- invasive ----------
0    89.539563
1    31.932220
dtype: float64
---------- BIRADS ----------
1.0    93.520051
0.0    30.641747
2.0    17.858238
dtype: float64
---------- implant ----------
0    38.981833
1    43.645351
dtype: float64
---------- density ----------
B    55.602008
C    64.891576
A    42.200908
D    21.140404
dtype: float64
---------- machine_id ----------
49     116.979699
48      23.228933
21      59.292355
29      43.957366
93      27.825348
216     28.670542
210      8.888194
170     17.056279
190     11.729592
197      4.991660
dtype: float64
---------- difficult_negative_case ----------
False    22.455512
True     38.664152
dtype: float64


In [13]:
# Print the per-fold category counts, one column at a time.
for column in (
    "view",
    "laterality",
    "biopsy",
    "invasive",
    "BIRADS",
    "implant",
    "density",
    "machine_id",
    "difficult_negative_case",
):
    countplot(column)

---------- view ----------
     fold_0  fold_1  fold_2  fold_3
MLO    6954    7034    6947    6968
CC     6672    6719    6705    6669
LM        3       1       2       4
ML        3       1       1       3
AT        2       8       5       4
---------- laterality ----------
   fold_0  fold_1  fold_2  fold_3
L    7048    6674    6784    6761
R    6586    7090    6876    6887
---------- biopsy ----------
   fold_0  fold_1  fold_2  fold_3
0   12840   13103   12887   12907
1     794     661     773     741
---------- invasive ----------
   fold_0  fold_1  fold_2  fold_3
0   13401   13603   13437   13447
1     233     161     223     201
---------- BIRADS ----------
     fold_0  fold_1  fold_2  fold_3
1.0    3888    4083    3901    3900
0.0    2038    2099    2076    2036
2.0     568     542     585     570
---------- implant ----------
   fold_0  fold_1  fold_2  fold_3
0   13273   13340   13342   13274
1     361     424     318     374
---------- density ----------
   fold_0  fold_1  fold

# Save the fold assignments to CSV

In [None]:
# Write the frame (including the "fold" column) without the row index.
# NOTE(review): the file name says "5fold" but N_SPLITS is 4; the name is kept
# unchanged here because other code may read this exact path - confirm and rename.
output_path = os.path.join("data", "train_5fold.csv")
df.to_csv(output_path, index=False)