In [None]:
# default_exp utils

# Utils

> This file holds utility functions to load the training data, print the Kaggle submission command, and average test set predictions across cross-validation folds.

In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
#hide
from nbdev.showdoc import *
import os
from plant_pathology.config import TEST_DATA_PATH 
import numpy as np

In [None]:
#export
from fastcore.all import *
from typing import Tuple, List
import pandas as pd
from sklearn.model_selection import StratifiedKFold

For some of our tests, we need access to the competition's data; its location (`TEST_DATA_PATH`) is defined in our `config` module.

## Load Data

This reads the training CSV into a pandas DataFrame. You can choose to load the CSV that includes the cross-validation (CV) folds already added to it (if you've created it). You can also choose to load the training data with pseudo-labeled examples added as well (if you've created a CSV of pseudo-labels).

In [None]:
#export
def load_data(data_path: Path, with_folds: bool = False, pseudo_labels_path: str = None) -> Tuple[Path, pd.DataFrame]:
    """Load training data (with/without cross-validation folds) into a DataFrame.

    Args:
        data_path: Directory containing `train.csv` (and `train_folds.csv` when
            `with_folds=True`). A plain string is accepted and converted to a `Path`.
        with_folds: If True, read `train_folds.csv` instead of `train.csv`.
        pseudo_labels_path: Optional path to a CSV of pseudo-labeled examples; its
            rows are appended after the training rows.

    Returns:
        `(data_path, train_df)`: the data directory as a `Path` and the loaded DataFrame.
    """
    # Coerce up front so `data_path/filename` also works when a string path is passed
    data_path = Path(data_path)
    csv_name = 'train_folds.csv' if with_folds else 'train.csv'
    train_df = pd.read_csv(data_path/csv_name)
    if pseudo_labels_path is not None:
        # Append pseudo-labeled rows; `ignore_index=True` renumbers the combined rows 0..n-1
        train_df = pd.concat([train_df, pd.read_csv(pseudo_labels_path)], ignore_index=True)
    return data_path, train_df

In [None]:
path, df = load_data(TEST_DATA_PATH, with_folds=False)
df.head()

Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab
0,Train_0,0,0,0,1
1,Train_1,0,1,0,0
2,Train_2,1,0,0,0
3,Train_3,0,0,1,0
4,Train_4,1,0,0,0


## Print Command to Submit to Kaggle

This prints the Kaggle submission command so you don't have to remember it.

In [None]:
#export
def kaggle_submit_command() -> str:
    """Print the terminal command to submit a submission file, and return it.

    `{submission_path}` and `'message'` are emitted literally as placeholders;
    nothing is substituted into the command.

    Returns:
        The command string that was printed (matching the declared `str` return type).
    """
    command = "kaggle competitions submit -c plant-pathology-2020-fgvc7 -f {submission_path} -m 'message'"
    print(command)
    return command

In [None]:
kaggle_submit_command()

kaggle competitions submit -c plant-pathology-2020-fgvc7 -f {submission_path} -m 'message'


## Average Predictions

In [None]:
#export
def average_preds(dfs: List[pd.DataFrame]) -> pd.DataFrame:
    """Return the per-`image_id` mean of the predictions in the DataFrames in `dfs`.

    All frames are stacked, rows are grouped by their `image_id` value, and every
    remaining column is averaged within each group. The result is indexed by `image_id`.
    """
    return pd.concat(dfs).groupby("image_id").mean()

In [None]:
#hide
# Build fake all-zero test set prediction dataframes to exercise `average_preds`
NUM_EXAMPLES = 5
# Test filenames are the same for every fold, so build them once
test_fns = [f"Test_{i}" for i in range(NUM_EXAMPLES)]
all_zeros_prediction_dfs = []
for _ in range(5):  # One dataframe for each fold
    # Zeros for every column of `df`, then overwrite `image_id` with the test filenames
    preds_df = pd.DataFrame(np.zeros((NUM_EXAMPLES, len(df.columns))), columns=df.columns)
    preds_df["image_id"] = test_fns
    all_zeros_prediction_dfs.append(preds_df)

Let's test this by confirming that when every prediction is `0.0`, the averaged predictions are also all `0.0`.

In [None]:
len(all_zeros_prediction_dfs)

5

In [None]:
all_zeros_prediction_dfs[0]

Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab
0,Test_0,0.0,0.0,0.0,0.0
1,Test_1,0.0,0.0,0.0,0.0
2,Test_2,0.0,0.0,0.0,0.0
3,Test_3,0.0,0.0,0.0,0.0
4,Test_4,0.0,0.0,0.0,0.0


In [None]:
averaged_preds_df = average_preds(all_zeros_prediction_dfs); averaged_preds_df

Unnamed: 0_level_0,healthy,multiple_diseases,rust,scab
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Test_0,0.0,0.0,0.0,0.0
Test_1,0.0,0.0,0.0,0.0
Test_2,0.0,0.0,0.0,0.0
Test_3,0.0,0.0,0.0,0.0
Test_4,0.0,0.0,0.0,0.0


In [None]:
assert np.all(averaged_preds_df == 0.)    # Average of a bunch of 0's is 0
test_eq(averaged_preds_df.shape, (5, 4))  # 5 examples, 4 classes

### Get Averaged Preds

Utility function to load and average all test set prediction CSVs matching naming pattern `"predictions_fold_[0-4].csv"`, which is the default naming scheme when running the training script using 5-fold cross-validation.

In [None]:
#export
def get_averaged_preds(path: Path, verbose: bool = False) -> pd.DataFrame:
    """Return a DataFrame of averaged predictions from the prediction CSVs in `path` dir.

    Args:
        path: Directory searched for files named `predictions_fold_[0-4].csv`.
            A plain string is accepted and converted to a `Path`.
        verbose: If True, print the list of prediction files that were found.

    Returns:
        The result of `average_preds` applied to the loaded CSVs.

    Raises:
        ValueError: If no matching prediction files exist in `path`.
    """
    # Load test set prediction CSVs for each of 5 CV folds; sort so the order is
    # the same on every platform (glob order is otherwise filesystem-dependent)
    prediction_files = sorted(Path(path).glob("predictions_fold_[0-4].csv"))
    if verbose:
        print(prediction_files)
    if not prediction_files:
        # Fail with an actionable message instead of pandas' "No objects to concatenate"
        raise ValueError(f"No files matching 'predictions_fold_[0-4].csv' found in {path}")
    return average_preds([pd.read_csv(fn) for fn in prediction_files])

In [None]:
#hide 
from nbdev.export import notebook2script; notebook2script()

Converted 00_utils.ipynb.
Converted 01_dataset.ipynb.
Converted 02_evaluate.ipynb.
Converted 03_train.ipynb.
Converted 04_generate_pseudo_labels.ipynb.
Converted 05_self_knowledge_distillation.ipynb.
Converted 06_create_folds.ipynb.
Converted 07_pretrained_models.ipynb.
Converted config.ipynb.
Converted index.ipynb.
