# 005: Cross Validate Models

In [1]:
import os
import sys

import numpy as np

sys.path.append("../")
from models import OrdinaryLeastSquares, LogisticRegression, LinearSVM, KNearestNeighbors
from model_selection import cross_validation
from preprocessing import preprocess, preprocess_splits

%load_ext autoreload
%autoreload 2

def print_results(cv_results):
    """Print the cross-validation metrics as mean ± standard deviation, in percent.

    Reads the ``f1_scores``, ``f2_scores`` and ``auc_rocs`` attributes of
    ``cv_results``. One "Average ..." line is printed per metric, followed by a
    single ``&``-separated row holding the same three mean±std pairs.
    """
    row_cells = []
    for label, attr in (
        ("F1-score", "f1_scores"),
        ("F2-score", "f2_scores"),
        ("AUC-ROC", "auc_rocs"),
    ):
        scores = getattr(cv_results, attr)
        mean_pct = np.mean(scores) * 100
        std_pct = np.std(scores) * 100
        print(f"Average {label}: {mean_pct:.1f}% ± {std_pct:.1f}")
        row_cells.append(f"{mean_pct:.1f}±{std_pct:.1f}")
    # Compact row: leading " & " followed by the three cells joined by " & ".
    print(" & " + " & ".join(row_cells))

In [3]:
# Run the preprocessing pipeline with save_dir pointing at ../data/dataset_prep and keep
# only the first and third returned values; the remaining return values are discarded.
# NOTE(review): presumably these are the training features and labels — confirm against
# preprocess()'s return signature.
x_train, _, y_train, *_ = preprocess(save_dir="../data/dataset_prep")
x_train.shape  # last expression: shown as the cell output

Loading raw data...
Replacing missing value codes with np.nan...
Applying one-hot encoding...
Saving preprocessed data to ../data/dataset_prep...


(328135, 622)

In [2]:
# Load the cached training arrays from the preprocessed .npz archive.
# For .npz input np.load returns a lazy NpzFile backed by an open file handle, so it is
# used as a context manager: indexing inside the block reads each array into memory, and
# the archive is closed on exit instead of staying open for the life of the kernel.
with np.load("../data/dataset_prep/train.npz") as train:
    x_train, y_train = train["x_train"], train["y_train"]
x_train.shape  # last expression: shown as the cell output

(328135, 622)

In [11]:
# Re-run preprocessing with one-hot encoding disabled, again keeping only the first and
# third returned values.
# NOTE(review): this rebinds x_train / y_train assigned by the earlier data-loading cells,
# so downstream cells see whichever of these cells ran last — confirm which is intended.
x_train, _, y_train, *_ = preprocess(one_hot_encoding=False)

Replacing missing value codes with np.nan...


In [None]:
# Cross-validate each configured model, print its metrics and, for full-size runs,
# persist them to ../results/<ModelName>.txt.

# Models to evaluate; each dict is forwarded to cross_validation() as keyword arguments.
# Commented entries are toggles for the other available models.
model_settings = [
    #{"model_class": OrdinaryLeastSquares},
    {"model_class": LogisticRegression},
    #{"model_class": LinearSVM},
    #{"model_class": KNearestNeighbors},
]

FULL_RUN_SAMPLES = int(1e6)  # row cap for a full run; slicing clamps to the rows available
KNN_SAMPLES = int(1e5)       # KNearestNeighbors is evaluated on a smaller subsample
NUM_FEATURES = 321           # only the first 321 feature columns are passed to the models
                             # NOTE(review): confirm why 321 — not derivable from this cell
out_dir = "../results"

for model in model_settings:
    model_name = model["model_class"].__name__
    print(f"Cross-validating model: {model_name}")

    if model["model_class"] is KNearestNeighbors:
        num_samples = KNN_SAMPLES
    else:
        num_samples = FULL_RUN_SAMPLES

    cv_results = cross_validation(
        x_train[:num_samples, :NUM_FEATURES],
        y_train[:num_samples],
        verbose=True,
        **model,
    )
    print_results(cv_results)

    # Subsampled runs are only printed, never written to disk. Use `continue` rather than
    # `break` so that models listed after a subsampled one are still cross-validated.
    if num_samples != FULL_RUN_SAMPLES:
        continue

    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{model_name}.txt")

    # Explicit UTF-8: the report contains "±", which must not depend on the platform's
    # default text encoding.
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(f"Model: {model_name}\n\n")
        f.write("cv_results repr:\n")
        f.write(repr(cv_results))
        f.write("\n\n")
        f.write("Aggregated metrics:\n")
        for label, scores in (
            ("F1-score", cv_results.f1_scores),
            ("F2-score", cv_results.f2_scores),
            ("AUC-ROC", cv_results.auc_rocs),
        ):
            f.write(f"Average {label}: {np.mean(scores)*100:.1f}% ± {np.std(scores)*100:.1f}\n")
    

Cross-validating model: LogisticRegression
Starting fold 1/5 with 80000 samples
Evaluating lambda=0
Iter    0: loss=0.6931
Iter   10: loss=0.6698
Iter   20: loss=0.6488
Iter   30: loss=0.6265
Iter   40: loss=0.6072
Iter   50: loss=0.5909
Iter   60: loss=0.5771
Iter   70: loss=0.5656
Iter   80: loss=0.5557
Iter   90: loss=0.5473
Iter  100: loss=0.5401
Iter  110: loss=0.5339
Iter  120: loss=0.5285
Iter  130: loss=0.5238
Iter  140: loss=0.5196
Iter  150: loss=0.5160
Iter  160: loss=0.5127
Iter  170: loss=0.5099
Iter  180: loss=0.5073
Iter  190: loss=0.5050
Iter  200: loss=0.5030
Iter  210: loss=0.5011
Iter  220: loss=0.4994
Iter  230: loss=0.4979
Iter  240: loss=0.4965
Iter  250: loss=0.4953
Iter  260: loss=0.4941
Iter  270: loss=0.4931
Iter  280: loss=0.4921
Iter  290: loss=0.4913
Iter  300: loss=0.4904
Iter  310: loss=0.4897
Iter  320: loss=0.4890
Iter  330: loss=0.4884
Iter  340: loss=0.4878
Iter  350: loss=0.4873
Iter  360: loss=0.4868
Iter  370: loss=0.4863
Iter  380: loss=0.4859
Ite

In [None]:
# Recorded result row in the format printed by print_results
# (F1 & F2 & AUC-ROC, mean±std in %): & 41.8±0.2 & 48.7±0.7 & 85.4±0.4