In [29]:
# Generate cross-validation splits for the dataset.
# Subjects are split into 5 folds, stratified so that each fold approximately preserves the class distribution of the full dataset.


from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import numpy as np
import pandas as pd

# Read the metadata table from disk.
metadata = pd.read_csv('metadata.csv')

# Class-count constant (presumably the number of distinct grades -- TODO confirm).
NUM_CLASSES = 4

# Per-row subject identifiers ('baby_ID') and grade labels ('grade'),
# pulled out of the metadata table as arrays.
subjects = metadata['baby_ID'].values
labels = metadata['grade'].values


In [30]:
# Cross-tabulate subjects against grades: one row per baby_ID, one column per
# grade, each cell holding the number of metadata rows with that combination.
subject_labels_df = pd.crosstab(
    index=metadata['baby_ID'],
    columns=metadata['grade'],
)

# Array views used by the splitter: the row index (subject IDs) and the
# matching count matrix, in the same row order.
subject_ids = subject_labels_df.index.to_numpy()
subject_labels = subject_labels_df.to_numpy()

In [31]:
# 5-fold splitter applied to the subject-level count matrix, shuffled with a
# fixed random_state.
mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# fold index -> train/test subject IDs plus the per-grade sample counts of each side
folds = {}

for fold, (train_idx, test_idx) in enumerate(mskf.split(subject_ids, subject_labels)):
    # The splitter yields positional indices; map them back to subject IDs.
    train_subjects = subject_ids[train_idx]
    test_subjects = subject_ids[test_idx]

    # Column-wise sums over the selected subjects' rows give the number of
    # samples per grade on each side of the split.
    grade_count_train = subject_labels_df.loc[train_subjects].sum()
    grade_count_test = subject_labels_df.loc[test_subjects].sum()

    folds[fold] = dict(
        train=train_subjects,
        test=test_subjects,
        train_label_counts=grade_count_train.to_numpy(),
        test_label_counts=grade_count_test.to_numpy(),
    )

# Persist the fold dictionary. It is a plain dict, so it has to be read back
# with np.load(..., allow_pickle=True).item().
np.save("5_folds_cross_validation.npy", folds)

### Visualize the folds and class counts

In [None]:
# Print every fold's subject assignment and per-grade sample counts as a sanity check.
# NOTE(review): the split is regenerated here rather than read from `folds`;
# with the fixed random_state on `mskf` it should match the saved folds -- confirm.

for fold, (train_idx, test_idx) in enumerate(mskf.split(subject_ids, subject_labels)):
    # Map the splitter's positional indices back to subject IDs.
    train_subjects = subject_ids[train_idx]
    test_subjects = subject_ids[test_idx]

    print(f"Fold {fold + 1}")
    print("Train subjects:", train_subjects)
    print("Test subjects:", test_subjects)

    # Count the number of samples per class in the training and test sets of the split.
    grade_count_train = subject_labels_df.loc[train_subjects].sum()
    grade_count_test = subject_labels_df.loc[test_subjects].sum()

    print("Train labels:", grade_count_train)
    # Fixed: this line previously printed the training counts under the test caption.
    print("Test labels:", grade_count_test)

In [32]:
# Reload the saved folds and show the contents of the first one.
# The file holds a pickled dict, hence allow_pickle=True; .item() unwraps the
# array returned by np.load back into that dict.
folds = np.load("5_folds_cross_validation.npy", allow_pickle=True).item()

fold = 0
first_fold = folds[fold]

print(f"Fold {fold + 1}")
for caption, key in (
    ("Train subjects:", "train"),
    ("Test subjects:", "test"),
    ("Train labels:", "train_label_counts"),
    ("Test labels:", "test_label_counts"),
):
    print(caption, first_fold[key])



Fold 1
Train subjects: ['ID01' 'ID02' 'ID04' 'ID05' 'ID06' 'ID07' 'ID08' 'ID09' 'ID10' 'ID11'
 'ID12' 'ID13' 'ID14' 'ID15' 'ID16' 'ID17' 'ID18' 'ID19' 'ID21' 'ID22'
 'ID24' 'ID25' 'ID26' 'ID27' 'ID28' 'ID32' 'ID33' 'ID35' 'ID36' 'ID38'
 'ID40' 'ID41' 'ID42' 'ID43' 'ID45' 'ID46' 'ID47' 'ID48' 'ID49' 'ID50'
 'ID51' 'ID52' 'ID53']
Test subjects: ['ID03' 'ID20' 'ID23' 'ID29' 'ID30' 'ID31' 'ID34' 'ID37' 'ID39' 'ID44']
Train labels: [79 25 18 10]
Test labels: [25  6  4  2]
