In [None]:
# default_exp utils

# Utils

> This file holds functions to load the data, create folds for cross-validation, and average per-fold predictions.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
from fastcore.all import *
import pandas as pd
from sklearn.model_selection import StratifiedKFold

## Load Data

In [None]:
#export
def load_data(with_folds=True, path_pseudo_labels: str=None, path: Path=None) -> (Path, pd.DataFrame):
    """Load the training table, optionally extended with pseudo-labelled rows.

    Args:
        with_folds: when true read `train_folds.csv`, otherwise read `train.csv`.
        path_pseudo_labels: optional path to a CSV whose rows are appended to the
            training table.
        path: directory containing the CSV files. When omitted, the original
            hard-coded project location is used, so existing callers are unaffected.

    Returns:
        A `(path, df_train)` tuple: the data directory and the loaded frame.
    """
    if path is None:
        path = "/home/jupyter/kaggle/plant-pathology/data/plant-pathology-2020/"
    path = Path(path)  # accept plain strings as well as Path objects

    path_train = path/('train_folds.csv' if with_folds else 'train.csv')
    df_train = pd.read_csv(path_train)

    if path_pseudo_labels is not None:
        pseudo_labels = pd.read_csv(path_pseudo_labels)
        # ignore_index gives the combined frame one continuous 0..n-1 index.
        df_train = pd.concat([df_train, pseudo_labels], ignore_index=True)
    return path, df_train

In [None]:
# Smoke check: load the default table (train_folds.csv), then show its first rows,
# the contents of the data directory and the number of rows.
path, df = load_data()
df.head(), path.ls(), len(df)

(     image_id  healthy  multiple_diseases  rust  scab  fold
 0  Train_1511        0                  0     1     0   0.0
 1  Train_1799        1                  0     0     0   0.0
 2   Train_135        1                  0     0     0   0.0
 3   Train_408        0                  0     1     0   0.0
 4  Train_1693        1                  0     0     0   0.0,
 (#13) [Path('/home/jupyter/kaggle/plant-pathology/data/plant-pathology-2020/test.csv'),Path('/home/jupyter/kaggle/plant-pathology/data/plant-pathology-2020/train_folds.csv'),Path('/home/jupyter/kaggle/plant-pathology/data/plant-pathology-2020/images'),Path('/home/jupyter/kaggle/plant-pathology/data/plant-pathology-2020/predictions_fold_4.csv'),Path('/home/jupyter/kaggle/plant-pathology/data/plant-pathology-2020/TESTING.csv'),Path('/home/jupyter/kaggle/plant-pathology/data/plant-pathology-2020/.ipynb_checkpoints'),Path('/home/jupyter/kaggle/plant-pathology/data/plant-pathology-2020/predictions_fold_1.csv'),Path('/home/jupyter

In [None]:
# Load again, this time appending the rows of a pseudo-label CSV to the training table.
path, df = load_data(path_pseudo_labels="~/kaggle/plant-pathology/exps/baseline/TESTING_PSEUDO_LABELS.csv")
df

Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab,fold
0,Train_1511,0.000000e+00,0.000000e+00,1.000000,0.000000e+00,0.0
1,Train_1799,1.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.0
2,Train_135,1.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.0
3,Train_408,0.000000e+00,0.000000e+00,1.000000,0.000000e+00,0.0
4,Train_1693,1.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.0
...,...,...,...,...,...,...
3200,Test_1815,1.739019e-07,3.421622e-07,1.000000,4.958453e-10,-1.0
3201,Test_1816,3.261356e-08,1.696263e-05,0.999983,1.126221e-11,-1.0
3202,Test_1818,1.389328e-07,1.130557e-04,0.999887,4.712892e-10,-1.0
3203,Test_1819,9.999977e-01,1.068754e-06,0.000001,2.593171e-07,-1.0


## Create Folds

In [None]:
#export
def create_folds(path: Path, df: pd.DataFrame, prn_stats: bool = False, n_splits: int = 5) -> pd.DataFrame:
    """Assign a stratified cross-validation fold to every row and save the result.

    Args:
        path: directory into which `train_folds.csv` is written.
        df: table with exactly one column equal to 1 per row (the row's label).
            A pre-existing `fold` column is ignored for label detection and overwritten.
        prn_stats: when true, print per-fold descriptive statistics.
        n_splits: number of folds (defaults to the original fixed value of 5).

    Returns:
        The shuffled frame with a `fold` column holding each row's validation fold.

    Side effects:
        Prints the train/validation fraction of every fold and writes
        `path/"train_folds.csv"`.
    """
    # Shuffle once with a fixed seed; StratifiedKFold below is used without
    # shuffling, so this is what makes the assignment both random and repeatable.
    df = df.sample(frac=1.0, random_state=42).reset_index(drop=True)

    # The label of a row is the name of the single column whose value is 1.
    # A "fold" column left over from an earlier run must not take part: a row
    # in fold 1 would otherwise match two columns and `.item()` would raise.
    candidates = df.drop(columns="fold", errors="ignore")
    lbls = candidates.apply(lambda r: candidates.columns[r == 1].item(), axis=1)

    # Create the folds
    kf = StratifiedKFold(n_splits=n_splits)
    for fold, (train_idxs, val_idxs) in enumerate(kf.split(df, lbls.values)):
        print(f"Fold {fold}: {len(train_idxs)/len(df)}, {len(val_idxs)/len(df)}")
        # A row's fold is the split in which it serves as validation data.
        df.loc[val_idxs, "fold"] = fold

    if prn_stats: print(df.groupby("fold").describe())

    # Save to file
    df.to_csv(path/"train_folds.csv", index=False)
    return df

In [None]:
# Load the table without folds, assign folds and preview the result.
# NOTE: create_folds also writes train_folds.csv into `path`, so running this cell rewrites that file.
path, df = load_data(False)
create_folds(path, df).head()

## Print Command to Submit to Kaggle

In [None]:
#export
def kaggle_submit_command() -> str:
    """Print and return the notebook command template for submitting to Kaggle.

    The template keeps a literal `{submission_path}` placeholder; it is not
    filled in here.

    Returns:
        The command string that was printed (the function previously returned
        `None` despite being annotated `-> str`).
    """
    cmd = "!kaggle competitions submit -c plant-pathology-2020-fgvc7 -f {submission_path} -m 'hi'"
    print(cmd)
    return cmd

In [None]:
# Show the submission command template.
kaggle_submit_command()

!kaggle competitions submit -c plant-pathology-2020-fgvc7 -f {submission_path} -m 'hi'


## Average Predictions

In [None]:
#export
def average_preds(path: Path) -> pd.DataFrame:
    """Average the per-fold prediction files found in `path`.

    Every file named `predictions_fold_<digit>.csv` is read and the numeric
    columns are averaged per `image_id`.

    Args:
        path: directory containing the per-fold prediction CSVs.

    Returns:
        A frame with `image_id` as first column followed by the averaged numeric
        columns, one row per image, in order of first appearance.

    Raises:
        ValueError: if no matching prediction file exists in `path`.
    """
    # Sort so the row order of the result does not depend on directory listing order.
    files = sorted(Path(path).glob("predictions_fold_[0-9].csv"))
    if not files:
        raise ValueError(f"No 'predictions_fold_[0-9].csv' files found in {path}")

    dfs = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)

    # Group on the image id rather than on row position, so files whose rows
    # are ordered differently are still averaged image by image.
    # numeric_only keeps non-numeric columns out of the mean, which would
    # otherwise raise on recent pandas versions.
    avg_preds = dfs.groupby("image_id", sort=False).mean(numeric_only=True).reset_index()
    return avg_preds

In [None]:
# Directory of the baseline experiment; list its contents to see which per-fold prediction CSVs are present.
path = Path("/home/jupyter/kaggle/plant-pathology/exps/baseline/"); path.ls()

(#5) [Path('/home/jupyter/kaggle/plant-pathology/exps/baseline/log.txt'),Path('/home/jupyter/kaggle/plant-pathology/exps/baseline/.ipynb_checkpoints'),Path('/home/jupyter/kaggle/plant-pathology/exps/baseline/predictions_fold_1.csv'),Path('/home/jupyter/kaggle/plant-pathology/exps/baseline/predictions_fold_2.csv'),Path('/home/jupyter/kaggle/plant-pathology/exps/baseline/predictions_fold_3.csv')]

In [None]:
# Average the per-fold prediction files found in `path`.
df = average_preds(path)

# Check the number of averaged rows; 1821 is presumably the number of test images — TODO confirm.
test_eq(len(df), 1821)

### Save Averaged Preds

In [None]:
#export
# path: directory holding the per-fold prediction CSVs; name: file name of the
# averaged CSV written into that directory. Returns the path of the written file.
# The docstring is intentionally left as-is because it may be surfaced as CLI help text.
@call_parse
def save_average_preds(
    path: Param("Path to prediction CSVs", Path)=".",
    name: Param("Name", str)="averaged_predictions.csv",
) -> Path:
    """Average predictions from multiple folds."""
    # The default "." (or any caller-supplied string) is a plain str, which
    # supports neither `/` nor `.glob`; normalise it to a Path first.
    path = Path(path)
    out_path = path/name
    average_preds(path).to_csv(out_path, index=False)
    return out_path

In [None]:
# Round-trip check: write the averaged predictions under a throwaway name,
# confirm the file was created, then delete it again.
avg_preds_path = save_average_preds(path, name="TESTING_AVG_PREDS.csv")

assert avg_preds_path.exists()
avg_preds_path.unlink()

In [None]:
#hide 
# Convert the notebooks into library modules via nbdev.
from nbdev.export import *
notebook2script()

Converted 00_utils.ipynb.
Converted 01_dataset.ipynb.
Converted 02_evaluate.ipynb.
Converted 03_train.ipynb.
Converted 04_generate_pseudo_labels.ipynb.
Converted index.ipynb.
