In [None]:
#default_exp create_folds

# Create Cross Validation Folds

> Script to create folds for 5-fold cross validation.

In [None]:
#hide
from nbdev.showdoc import *
from plant_pathology.config import DATA_PATH

In [None]:
#export
from pathlib import Path
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from fastcore.script import call_parse, Param
from fastcore.script import store_true, bool_arg

Since we have a small dataset, we will use **5-fold cross-validation** rather than the usual single **hold-out set** for validation. This gives us a more reliable estimate of the model's generalization ability, because the estimate is less influenced by the variance of any single split. The folds are **stratified** by class label, so each fold keeps roughly the same class proportions as the full dataset. Additionally, because the dataset is small, this form of validation won't be too compute-heavy.

In [None]:
#export
@call_parse
def create_folds(
    path:        Param("Path to train CSV", Path), 
    print_stats: Param("Print class distributions across folds?", store_true),
    save: Param("Save CSV with added folds", bool_arg)=True,
) -> pd.DataFrame:
    """Add stratified 5-fold CV assignments to the train CSV at `path`.

    The rows are shuffled with a fixed seed, each row's class label is taken
    to be the single column whose value equals 1, and `StratifiedKFold`
    assigns every row to exactly one validation fold. The assignment is
    stored in an integer `fold` column (values 0-4).

    Args:
        path: Location of the train CSV. A plain string is accepted as well
            as a `Path` when the function is called from Python.
        print_stats: If true, print the per-fold mean of every numeric
            column, i.e. the proportion of each class within each fold.
        save: If true, write the result to `train_folds.csv` in the same
            directory as `path`.

    Returns:
        The shuffled DataFrame with the added `fold` column.

    Raises:
        ValueError: From `.item()` when a row does not have exactly one
            column equal to 1, so no single class label can be derived.
    """
    # Coerce so that `path.parent` below also works for string arguments
    path = Path(path)

    # Load train CSV
    df = pd.read_csv(path)

    # Shuffle rows; the fixed seed keeps the fold assignment reproducible
    df = df.sample(frac=1.0, random_state=42).reset_index(drop=True)

    # Get class labels to stratify on: the name of the one column set to 1.
    # Computed before `fold` is added so fold ids can never be taken for labels.
    lbls = df.apply(lambda r: df.columns[r==1].item(), axis=1)

    # Pre-create the column as integers. Assigning into a missing column would
    # first fill it with NaN and leave the fold ids as floats (0.0, 1.0, ...).
    df["fold"] = -1

    # Create 5 folds; after `reset_index` the positional indices yielded by
    # `split` coincide with the index labels expected by `.loc`.
    kf = StratifiedKFold(n_splits=5)
    for fold, (_, val_idxs) in enumerate(kf.split(df, lbls.values)):
        df.loc[val_idxs, "fold"] = fold

    if print_stats:
        stats_df = df.groupby("fold").describe()
        print("Proportion of each class out of total examples in each fold:")
        # Keep only the "mean" statistic of every described column
        print(stats_df.iloc[:, stats_df.columns.get_level_values(1) == "mean"])

    if save:
        save_path = path.parent/"train_folds.csv"
        df.to_csv(save_path, index=False)
        print(f"Saved to {save_path}")

    return df

In [None]:
# Build the folds in memory only (nothing is written to disk) and preview them
folds_df = create_folds(DATA_PATH/"train.csv", print_stats=True, save=False)
folds_df.head()

Proportion of each class out of total examples in each fold:
       healthy multiple_diseases      rust      scab
          mean              mean      mean      mean
fold                                                
0.0   0.282192          0.052055  0.342466  0.323288
1.0   0.282967          0.049451  0.343407  0.324176
2.0   0.285714          0.049451  0.340659  0.324176
3.0   0.282967          0.049451  0.340659  0.326923
4.0   0.282967          0.049451  0.340659  0.326923


Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab,fold
0,Train_1511,0,0,1,0,0.0
1,Train_1799,1,0,0,0,0.0
2,Train_135,1,0,0,0,0.0
3,Train_408,0,0,1,0,0.0
4,Train_1693,1,0,0,0,0.0


In [None]:
#hide
from nbdev.export import notebook2script

# Run the nbdev notebook-to-script export
notebook2script()

Converted 00_utils.ipynb.
Converted 01_dataset.ipynb.
Converted 02_evaluate.ipynb.
Converted 03_train.ipynb.
Converted 04_generate_pseudo_labels.ipynb.
Converted 05_self_knowledge_distillation.ipynb.
Converted 06_create_folds.ipynb.
Converted config.ipynb.
Converted index.ipynb.
