In [None]:
# default_exp utils

# Utils

> This file holds functions to load the data, create folds for cross-validation, and average per-fold predictions.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
from fastcore.all import *
import pandas as pd
from sklearn.model_selection import StratifiedKFold

## Load Data

In [None]:
#export
def load_data(
    path_data: Path=Path("/home/brandon/projects/plant_pathology/data"),
    with_folds: bool=True,
    path_pseudo_labels: str=None,
) -> (Path, pd.DataFrame):
    """Load the training table and optionally append pseudo-labelled rows.

    Args:
        path_data: Directory containing `train.csv` / `train_folds.csv`. A plain
            string is accepted as well as a `Path`. The default is a
            machine-specific absolute path, so pass this explicitly elsewhere.
        with_folds: Read `train_folds.csv` when True, otherwise `train.csv`.
        path_pseudo_labels: Optional CSV whose rows are appended after the
            training rows.

    Returns:
        `(path_data, df_train)`: the data directory as a `Path`, and the loaded
        frame (re-indexed from 0 when pseudo-labels were appended).
    """
    # Coerce so that a string path works with the `/` operator below.
    path_data = Path(path_data)
    path_train = path_data/('train_folds.csv' if with_folds else 'train.csv')
    df_train = pd.read_csv(path_train)
    if path_pseudo_labels is not None:
        pseudo_labels = pd.read_csv(path_pseudo_labels)
        df_train = pd.concat([df_train, pseudo_labels], ignore_index=True)
    return path_data, df_train

In [None]:
# Load the raw training table (no fold column), then show a preview of it,
# the contents of the data directory, and the number of rows.
path, df = load_data(with_folds=False)
df.head(), path.ls(), len(df)

(  image_id  healthy  multiple_diseases  rust  scab
 0  Train_0        0                  0     0     1
 1  Train_1        0                  1     0     0
 2  Train_2        1                  0     0     0
 3  Train_3        0                  0     1     0
 4  Train_4        1                  0     0     0,
 (#5) [Path('/home/brandon/projects/plant_pathology/data/sample_submission.csv'),Path('/home/brandon/projects/plant_pathology/data/plant-pathology-2020-fgvc7.zip'),Path('/home/brandon/projects/plant_pathology/data/test.csv'),Path('/home/brandon/projects/plant_pathology/data/train.csv'),Path('/home/brandon/projects/plant_pathology/data/images')],
 1821)

## Create Folds

In [None]:
#export
def create_folds(path: Path, df: pd.DataFrame, prn_stats: bool = False, n_splits: int = 5) -> pd.DataFrame:
    """Shuffle `df`, assign every row to a stratified validation fold, and save it.

    Args:
        path: Directory in which `train_folds.csv` is written.
        df: Training table with one-hot label columns; each row's label is the
            single column whose value equals 1.
        prn_stats: When True, also print per-fold summary statistics.
        n_splits: Number of folds to create.

    Returns:
        The shuffled frame with an integer `fold` column holding, for each row,
        the index of the fold in which that row is used for validation.

    Raises:
        ValueError: If a row does not have exactly one column equal to 1.
    """
    # Fixed seed so the shuffle (and therefore the folds) is reproducible.
    df = df.sample(frac=1.0, random_state=42).reset_index(drop=True)

    # A "fold" column left over from an earlier run would look like a label on
    # rows where fold == 1, so leave it out when deriving the labels.
    label_source = df.drop(columns="fold", errors="ignore")
    lbls = label_source.apply(lambda r: label_source.columns[(r == 1).to_numpy()].item(), axis=1)

    # Create the column up front as integers; creating it by enlargement inside
    # the loop would introduce NaNs and turn the fold ids into floats.
    df["fold"] = -1
    kf = StratifiedKFold(n_splits=n_splits)
    for fold, (train_idxs, val_idxs) in enumerate(kf.split(df, lbls.values)):
        print(f"Fold {fold}: {len(train_idxs)/len(df)}, {len(val_idxs)/len(df)}")
        # The index was reset above, so positional indices are valid labels.
        df.loc[val_idxs, "fold"] = fold

    if prn_stats: print(df.groupby("fold").describe())

    # Save to file
    df.to_csv(Path(path)/"train_folds.csv", index=False)
    return df

In [None]:
# Build the folds from the raw training table and preview the result.
# Note: create_folds also writes train_folds.csv into `path` as a side effect.
path, df = load_data(with_folds=False)
create_folds(path, df).head()

Fold 0: 0.7995606809445359, 0.20043931905546403
Fold 1: 0.800109829763866, 0.19989017023613398
Fold 2: 0.800109829763866, 0.19989017023613398
Fold 3: 0.800109829763866, 0.19989017023613398
Fold 4: 0.800109829763866, 0.19989017023613398


Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab,fold
0,Train_1511,0,0,1,0,0.0
1,Train_1799,1,0,0,0,0.0
2,Train_135,1,0,0,0,0.0
3,Train_408,0,0,1,0,0.0
4,Train_1693,1,0,0,0,0.0


## Print Command to Submit to Kaggle

In [None]:
#export
def kaggle_submit_command() -> str:
    """Print and return the notebook command template for submitting to Kaggle.

    `{submission_path}` is intentionally left as a literal placeholder: the
    line is meant to be pasted into a notebook cell, where `!` runs a shell
    command and `{...}` interpolates a Python variable.

    Returns:
        The command template that was printed.
    """
    command = "!kaggle competitions submit -c plant-pathology-2020-fgvc7 -f {submission_path} -m 'hi'"
    print(command)
    # Return it as well so the declared `-> str` holds and callers can reuse it.
    return command

In [None]:
# The function prints the command itself; the trailing `;` suppresses any echoed return value.
kaggle_submit_command();

!kaggle competitions submit -c plant-pathology-2020-fgvc7 -f {submission_path} -m 'hi'


## Average Predictions

In [None]:
#export
def average_preds(path: Path) -> pd.DataFrame:
    """Average the per-fold prediction CSVs found in `path`.

    Reads every `predictions_fold_[0-4].csv` in `path` and averages the numeric
    columns per `image_id`.

    Args:
        path: Directory holding the per-fold prediction files (`Path` or str).

    Returns:
        One row per `image_id` (in order of first appearance), with `image_id`
        as the first column followed by the averaged numeric columns.

    Raises:
        ValueError: If no matching prediction file exists in `path`.
    """
    # Sort so the result does not depend on the file system's listing order.
    files = sorted(Path(path).glob("predictions_fold_[0-4].csv"))
    if not files:
        raise ValueError(f"No files matching 'predictions_fold_[0-4].csv' found in {path}")
    dfs = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)
    # Group on the id rather than the row position so files with differing row
    # order are still averaged correctly; numeric_only keeps string columns
    # out of the mean, which newer pandas versions would otherwise reject.
    return dfs.groupby("image_id", sort=False).mean(numeric_only=True).reset_index()

## TODO: Fix these tests!!! + more below!

path = Path("/home/jupyter/kaggle/plant-pathology/exps/baseline/"); path.ls()

df = average_preds(path)

test_eq(len(df), 1821)

### Save Averaged Preds

In [None]:
#export
@call_parse
def save_average_preds(
    path: Param("Path to prediction CSVs", Path)=".",
    name: Param("Name", str)="averaged_predictions.csv",
) -> Path:
    """Average predictions from multiple folds."""
    # The default is the string ".", which has no `/` operator or `.glob`;
    # normalise so a direct Python call with no arguments (or a str) works too.
    path = Path(path)
    out_path = path/name
    # Write the averaged table next to the per-fold files and report where it went.
    average_preds(path).to_csv(out_path, index=False)
    return out_path

In [None]:
# Self-contained test: build throwaway per-fold prediction files instead of
# relying on `path`, which at this point refers to the data directory and
# contains no prediction CSVs.
import tempfile

with tempfile.TemporaryDirectory() as tmp_dir:
    preds_dir = Path(tmp_dir)
    for fold in range(2):
        pd.DataFrame({"image_id": ["Test_0", "Test_1"], "healthy": [fold, 1 - fold]}).to_csv(
            preds_dir/f"predictions_fold_{fold}.csv", index=False)

    avg_preds_path = save_average_preds(preds_dir, name="TESTING_AVG_PREDS.csv")

    assert avg_preds_path.exists()
    assert len(pd.read_csv(avg_preds_path)) == 2
    # No explicit unlink needed: the temporary directory is removed on exit.

ValueError: No objects to concatenate

In [None]:
#hide 
from nbdev.export import *
# Run nbdev's notebook-to-module conversion for the project's notebooks.
notebook2script()

Converted 00_utils.ipynb.
Converted 01_dataset.ipynb.
Converted 02_evaluate.ipynb.
Converted 03_train.ipynb.
Converted 04_generate_pseudo_labels.ipynb.
Converted 05_self_knowledge_distillation.ipynb.
Converted index.ipynb.
