In [None]:
#default_exp generate_pseudo_labels

# Generate Pseudo Labels

> Script to generate pseudo labels for the test set.

In [None]:
#hide
from nbdev.showdoc import *
import pandas as pd

In [None]:
#export
from fastcore.all import *

from plant_pathology.utils import get_averaged_preds

## Generate Pseudo Labels

In [None]:
#export
@call_parse
def generate_pseudo_labels(
    path:   Param("Directory of prediction CSVs to average", Path) = ".",
    name:   Param("File name to save as", str) = "pseudo_labels.csv",
    thresh: Param("Min probabilty for pseudo label", float) = 0.95,
) -> Path:
    """Generate pseudo labels from averaged predictions and save them in `path`.

    Predictions at or above `thresh` become 1.0 and all others 0.0; rows with
    no prediction at or above `thresh` are dropped. Returns the path of the
    CSV that was written.
    """
    # The default (".") and values supplied by Python callers may be plain
    # strings; coerce once so that `path / name` below is always valid.
    path = Path(path)
    avg_preds_df = get_averaged_preds(path)

    # One-hot encode highly confident predictions. The encoding is built as a
    # new frame from the boolean mask rather than by assigning into
    # `avg_preds_df`, so the averaged predictions are left untouched.
    # NOTE(review): assumes every column of `avg_preds_df` is numeric - confirm
    # against `get_averaged_preds`.
    high_confidence_preds_mask = avg_preds_df >= thresh
    one_hot_preds = high_confidence_preds_mask.astype(float)

    # Only keep predictions model was highly confident on, i.e. rows where at
    # least one column reached the threshold.
    pseudo_labels = one_hot_preds[high_confidence_preds_mask.any(axis=1)]

    save_path = path / name
    pseudo_labels.to_csv(save_path)
    return save_path

In [None]:
#slow
#hide
# Write pseudo labels for the baseline experiment under a throwaway file name,
# then load the saved CSV back so the following cell can check its contents.
path = Path("../exps/baseline/")
pseudo_labels = generate_pseudo_labels(path, name="TESTING_PSEUDO_LABELS.csv")
df = pd.read_csv(pseudo_labels)
df.head()

Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab
0,Test_0,0.0,0.0,1.0,0.0
1,Test_1,0.0,0.0,1.0,0.0
2,Test_10,0.0,0.0,1.0,0.0
3,Test_100,0.0,0.0,0.0,1.0
4,Test_1000,0.0,0.0,1.0,0.0


In [None]:
#slow
#hide

label_cols = ["healthy", "multiple_diseases", "rust", "scab"]

try:
    # Test column names are correct. Comparing plain lists means a wrong number
    # of columns fails the assertion instead of raising a length-mismatch error.
    assert list(df.columns) == ["image_id"] + label_cols

    # Test that each row sums to 1. Only the label columns are summed:
    # `image_id` holds strings and cannot take part in a numeric row sum.
    assert (df[label_cols].sum(axis=1) == 1.0).all()
finally:
    # Always remove the temporary CSV, even when an assertion above fails.
    pseudo_labels.unlink()

In [None]:
#hide
# Export the cells flagged for export in this notebook to the library module.
from nbdev.export import notebook2script
notebook2script()