# 005: Cross Validate Models

In [1]:
import os
import sys

import numpy as np

sys.path.append("../")
from models import OrdinaryLeastSquares, LogisticRegression, LinearSVM, KNearestNeighbors
from model_selection import cross_validation
from preprocessing import preprocess

%load_ext autoreload
%autoreload 2

def print_results(cv_results):
    """Print cross-validation metrics as mean ± standard deviation, in percent.

    Reads the ``f1_scores``, ``f2_scores`` and ``auc_rocs`` attributes of
    ``cv_results`` and prints one "Average ..." line per metric. A final line
    repeats the same numbers as a compact `` & ``-separated row (presumably
    meant for pasting into a LaTeX table -- confirm against the report).

    Args:
        cv_results: object exposing ``f1_scores``, ``f2_scores`` and
            ``auc_rocs``; each must be accepted by ``np.mean`` / ``np.std``.
    """
    metrics = (
        ("F1-score", "f1_scores"),
        ("F2-score", "f2_scores"),
        ("AUC-ROC", "auc_rocs"),
    )

    row_cells = []
    for label, attr_name in metrics:
        scores = getattr(cv_results, attr_name)
        # Scores are scaled by 100 so they read as percentages.
        avg = np.mean(scores) * 100
        spread = np.std(scores) * 100
        print(f"Average {label}: {avg:.1f}% ± {spread:.1f}")
        row_cells.append(f"{avg:.1f}±{spread:.1f}")

    # Same three metrics again, as one row with a leading " & " separator.
    print(" & " + " & ".join(row_cells))

In [None]:
# Run the preprocessing pipeline with one-hot encoding and have it write its
# output to ../data/dataset_prep. Only the 1st and 3rd returned values are
# kept (as x_train / y_train); the 2nd and 4th are discarded
# (presumably the test split -- confirm against preprocess()).
x_train, _, y_train, _ = preprocess(one_hot_encoding=True, save_dir="../data/dataset_prep")
# Bare last expression so the notebook displays the training matrix shape.
x_train.shape

Loading raw data...
Replacing missing value codes with np.nan...
Saving preprocessed data to ../data/dataset_prep...


(328135, 321)

In [2]:
# Load the cached training arrays from the preprocessed .npz archive.
# np.load on an .npz returns a lazy NpzFile that keeps the archive open, so
# both arrays are read inside a `with` block and the file is closed as soon
# as they are in memory.
with np.load("../data/dataset_prep/train.npz") as train:
    x_train, y_train = train["x_train"], train["y_train"]
# Bare last expression so the notebook displays the training matrix shape.
x_train.shape

(328135, 321)

In [7]:
# Re-run preprocessing with one-hot encoding, without a save_dir, and rebind
# x_train / y_train to the result (2nd and 4th returned values are discarded).
# NOTE(review): this overwrites the x_train / y_train bound by the earlier
# cells (including the cached .npz load); the out-of-order execution counts
# suggest only one of these loading cells is meant to be kept -- confirm.
x_train, _, y_train, _ = preprocess(one_hot_encoding=True)

Loading raw data...
Replacing missing value codes with np.nan...
Applying one-hot encoding...


In [8]:
# Model configurations to cross-validate. Each dict is forwarded as keyword
# arguments to cross_validation(); un-comment an entry to include that model.
model_settings = [
    #{"model_class": OrdinaryLeastSquares},
    #{"model_class": LogisticRegression, "gamma": 1e-2},
    {"model_class": LinearSVM},
    #{"model_class": KNearestNeighbors, "use_pca": True},
]

# Only the first num_samples rows are used for cross-validation. Every model
# (KNearestNeighbors included) uses the same value, so a single constant
# replaces the per-model conditional whose two branches were identical.
num_samples = int(1e4)

# One plain-text report per model is written into this directory.
out_dir = "../results"
os.makedirs(out_dir, exist_ok=True)

for model in model_settings:
    model_name = model["model_class"].__name__
    print(f"Cross-validating model: {model_name}")
    cv_results = cross_validation(x_train[:num_samples], y_train[:num_samples], verbose=True, **model)
    print_results(cv_results)

    # No `break` here: every configured model is cross-validated and its
    # results are persisted, instead of stopping after the first printout.
    out_path = os.path.join(out_dir, f"{model_name}.txt")

    # The report contains the non-ASCII "±", so the encoding is pinned rather
    # than left to the platform default.
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(f"Model: {model_name}\n\n")
        f.write("cv_results repr:\n")
        f.write(repr(cv_results))
        f.write("\n\n")
        f.write("Aggregated metrics:\n")
        for label, scores in (
            ("F1-score", cv_results.f1_scores),
            ("F2-score", cv_results.f2_scores),
            ("AUC-ROC", cv_results.auc_rocs),
        ):
            f.write(f"Average {label}: {np.mean(scores)*100:.1f}% ± {np.std(scores)*100:.1f}\n")
    

Cross-validating model: LinearSVM
Starting fold 1/5 with 8000 samples
Iteration 0, Training Loss: 1.0, Validation Loss: 1.0
Iteration 100, Training Loss: 0.9401860452979481, Validation Loss: 0.932197808877839
Iteration 200, Training Loss: 0.9340145899681016, Validation Loss: 0.9257373955130859
Iteration 300, Training Loss: 0.932085946409309, Validation Loss: 0.924096095161523
Iteration 400, Training Loss: 0.9309978168860384, Validation Loss: 0.9232947702936722
Iteration 500, Training Loss: 0.9302596398180382, Validation Loss: 0.9227521124797892
Iteration 600, Training Loss: 0.9297804563046741, Validation Loss: 0.9223695903837489
Iteration 700, Training Loss: 0.9294292595464433, Validation Loss: 0.9220856205826305
Iteration 800, Training Loss: 0.9291564166523801, Validation Loss: 0.9218720145035241
Iteration 900, Training Loss: 0.9289572185717501, Validation Loss: 0.9217206658842756
Iteration 0, Training Loss: 1.0, Validation Loss: 1.0
Iteration 100, Training Loss: 0.9288611714862879, V

KeyboardInterrupt: 