### Import libraries

In [7]:
import numpy as np
import os
import pandas as pd


from sklearn.model_selection import StratifiedGroupKFold

### Paths

In [3]:
TRAIN_PATH = '/Users/alejopaullier/Desktop/alejo/aidmed/data/melanoma-classification/train.csv'
TEST_PATH = '/Users/alejopaullier/Desktop/alejo/aidmed/data/melanoma-classification/test.csv'

### Load data

In [4]:
train_df = pd.read_csv(TRAIN_PATH, sep=',')
test_df = pd.read_csv(TEST_PATH, sep=',')
display(train_df.head())
display(test_df.head())

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0


Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge
0,ISIC_0052060,IP_3579794,male,70.0,
1,ISIC_0052349,IP_7782715,male,40.0,lower extremity
2,ISIC_0058510,IP_7960270,female,55.0,torso
3,ISIC_0073313,IP_6375035,female,50.0,torso
4,ISIC_0073502,IP_0589375,female,45.0,lower extremity


In [8]:
def create_folds(df, k):
    """
    Creates folds for training.
    :param df: a dataframe with a "target" column and an "patient_id" column.
    :param k: number of folds
    :return: folds
    """
    # Create Object
    group_fold = StratifiedGroupKFold(n_splits = k)

    length = len(df)

    # Generate indices to split data into training and test set.
    folds = group_fold.split(X = np.zeros(length),
                             y = df['target'],
                             groups = df['patient_id'].tolist())
    return folds

In [18]:
folds = create_folds(train_df, 5)

def sep():
    print("-"*117)
    

def target_distribution(df):
    neg_count = df.target.value_counts()[0]
    pos_count = df.target.value_counts()[1]
    neg_perc = neg_count/(neg_count+pos_count)
    pos_perc = pos_count/(neg_count+pos_count)
    
    return pos_perc*100, neg_perc*100, pos_count, neg_count
    
for fold, (train_index, valid_index) in enumerate(folds):
    print(f"====== Fold {fold} ======")
    train_set = train_df.iloc[train_index].reset_index(drop=True)
    valid_set = train_df.iloc[valid_index].reset_index(drop=True)
    pos_perc, neg_perc, pos_count, neg_count = target_distribution(train_set)
    print(f"Train set: {pos_perc} % malignant ({pos_count}) and {neg_perc} % benign ({neg_count})")
    pos_perc, neg_perc, pos_count, neg_count = target_distribution(valid_set)
    print(f"Train set: {pos_perc} % malignant ({pos_count}) and {neg_perc} % benign ({neg_count})")
    print(f"Train set has {})
    

Train set: 1.762463675133034 % malignant (467) and 98.23753632486697 % benign (26030)
Train set: 1.764972092321617 % malignant (117) and 98.23502790767839 % benign (6512)
Train set: 1.766171031775983 % malignant (468) and 98.23382896822402 % benign (26030)
Train set: 1.7501508750754375 % malignant (116) and 98.24984912492457 % benign (6512)
Train set: 1.7616658493341886 % malignant (467) and 98.2383341506658 % benign (26042)
Train set: 1.768172888015717 % malignant (117) and 98.23182711198429 % benign (6500)
Train set: 1.766304347826087 % malignant (468) and 98.2336956521739 % benign (26028)
Train set: 1.7496229260935143 % malignant (116) and 98.25037707390648 % benign (6514)
Train set: 1.758225173558708 % malignant (466) and 98.2417748264413 % benign (26038)
Train set: 1.7819389912413168 % malignant (118) and 98.21806100875868 % benign (6504)


In [20]:
train_set.patient_id.nunique()
valid_set.patient_id.nunique()

410