# 006: Running ablations

In [None]:
import json
from dataclasses import asdict
import sys

import numpy as np

sys.path.append("../")
from preprocessing import preprocess
from models import OrdinaryLeastSquares, LogisticRegression
from model_selection import cross_validation

%load_ext autoreload
%autoreload 2

Loading raw data...


In [2]:
def append_jsonl(obj, path):
    """Append ``obj`` to the file at ``path`` as a single line of JSON.

    Args:
        obj: A dict, which is written as-is, or any other object, which is
            first converted with ``dataclasses.asdict``.
        path: File to append to; it is opened in append mode as UTF-8.
    """
    record = obj if isinstance(obj, dict) else asdict(obj)
    with open(path, "a", encoding="utf-8") as handle:
        json.dump(record, handle)
        # One record per line: terminate the JSON document with a newline.
        handle.write("\n")

# Preprocessing ablations

In [5]:
def run_preprocessing_ablation(kwarg_list, num_samples=int(1e5), num_folds=5):
    """Compare preprocessing configurations by cross-validated mean F1.

    For each dict in ``kwarg_list`` the data is re-preprocessed with those
    keyword arguments, then ``LogisticRegression`` is cross-validated on the
    first ``num_samples`` training entries. Results are printed, not returned.

    Args:
        kwarg_list: Sequence of keyword-argument dicts passed to ``preprocess``.
            The first entry acts as the baseline.
        num_samples: Upper bound of the slice taken from the training data.
        num_folds: Number of folds forwarded to ``cross_validation``.
    """
    scores = []
    for preprocess_kwargs in kwarg_list:
        print("Running with args:", preprocess_kwargs)
        features, _, labels, _ = preprocess(**preprocess_kwargs)

        results = cross_validation(
            features[:num_samples],
            labels[:num_samples],
            num_folds=num_folds,
            verbose=False,
            model_class=LogisticRegression,
        )
        scores.append(np.mean(results.f1_scores))

    # Summary: absolute mean F1 for the baseline, then each later
    # configuration as a difference from that baseline.
    for position, (preprocess_kwargs, score) in enumerate(zip(kwarg_list, scores)):
        print(preprocess_kwargs)
        if position == 0:
            print(score)
        else:
            print(f"{score - scores[0]}")

## replace_nan_codes

In [6]:
# Ablate the `replace_nan_codes` preprocessing flag. The first entry (True) is
# the baseline; the second entry's mean F1 is printed as a difference from it.
kwarg_list = [{"replace_nan_codes": enabled} for enabled in (True, False)]
run_preprocessing_ablation(kwarg_list)

Running with args: {'replace_nan_codes': True}
Replacing missing value codes with np.nan...
Applying one-hot encoding...
Imputing missing values...
Removing invariant features...
Starting fold 1/5 with 80000 samples
Starting fold 2/5 with 80000 samples
Starting fold 3/5 with 80000 samples
Starting fold 4/5 with 80000 samples
Starting fold 5/5 with 80000 samples
Running with args: {'replace_nan_codes': False}
Applying one-hot encoding...
Imputing missing values...
Removing invariant features...
Starting fold 1/5 with 80000 samples
Starting fold 2/5 with 80000 samples
Starting fold 3/5 with 80000 samples
Starting fold 4/5 with 80000 samples
Starting fold 5/5 with 80000 samples
{'replace_nan_codes': True}
0.3735032352696262
{'replace_nan_codes': False}
-0.013057384818908724


## one_hot_encoding

In [8]:
# Ablate the `one_hot_encoding` preprocessing flag on a reduced slice of
# int(1e4) samples. The first entry (True) is the baseline; the second entry's
# mean F1 is printed as a difference from it.
kwarg_list = [{"one_hot_encoding": enabled} for enabled in (True, False)]
run_preprocessing_ablation(kwarg_list, num_samples=int(1e4))

Running with args: {'one_hot_encoding': True}
Replacing missing value codes with np.nan...
Applying one-hot encoding...
Imputing missing values...
Removing invariant features...
Starting fold 1/5 with 8000 samples
Starting fold 2/5 with 8000 samples
Starting fold 3/5 with 8000 samples
Starting fold 4/5 with 8000 samples
Starting fold 5/5 with 8000 samples
Running with args: {'one_hot_encoding': False}
Replacing missing value codes with np.nan...
Imputing missing values...
Removing invariant features...
Starting fold 1/5 with 8000 samples
Starting fold 2/5 with 8000 samples
Starting fold 3/5 with 8000 samples
Starting fold 4/5 with 8000 samples
Starting fold 5/5 with 8000 samples
{'one_hot_encoding': True}
0.35612664992698134
{'one_hot_encoding': False}
0.006440710033129238


## 

# Model ablations

In [None]:
# Preprocess once with the default settings so that every model ablation below
# runs on the same fixed data. preprocess() is unpacked into four values, as in
# run_preprocessing_ablation; only the first and third are kept. Unpacking into
# three targets would raise "ValueError: too many values to unpack" on a fresh
# Restart & Run All.
x_train, _, y_train, _ = preprocess()
def run_model_ablation(kwarg_list, num_samples=int(1e5), num_folds=5):
    """Compare model/training configurations by cross-validated mean F1.

    The data is not re-preprocessed: the notebook-level ``x_train`` and
    ``y_train`` are sliced to their first ``num_samples`` entries, and
    ``LogisticRegression`` is cross-validated once per dict in ``kwarg_list``.
    Results are printed, not returned.

    Args:
        kwarg_list: Sequence of dicts, each forwarded to ``cross_validation``
            as extra keyword arguments. The first entry acts as the baseline.
        num_samples: Upper bound of the slice taken from the training data.
        num_folds: Number of folds forwarded to ``cross_validation``.
    """
    scores = []
    for model_kwargs in kwarg_list:
        print("Running with args:", model_kwargs)
        results = cross_validation(
            x_train[:num_samples],
            y_train[:num_samples],
            num_folds=num_folds,
            verbose=False,
            model_class=LogisticRegression,
            **model_kwargs,
        )
        scores.append(np.mean(results.f1_scores))

    # Summary: absolute mean F1 for the baseline, then each later
    # configuration as a difference from that baseline.
    for position, (model_kwargs, score) in enumerate(zip(kwarg_list, scores)):
        print(model_kwargs)
        if position == 0:
            print(score)
        else:
            print(f"{score - scores[0]}")

Replacing missing value codes with np.nan...
Applying one-hot encoding...
Imputing missing values...
Removing invariant features...


## Weighting

In [None]:
# Ablate the `weighting` option forwarded to cross_validation. The first entry
# (True) is the baseline; the second entry's mean F1 is printed as a difference.
# NOTE(review): the saved output for this cell reports 8000-sample folds, which
# does not appear to match the default num_samples used here; re-run to refresh.
kwarg_list = [{"weighting": enabled} for enabled in (True, False)]
run_model_ablation(kwarg_list)

Running with args: {'weighting': True}
Starting fold 1/5 with 8000 samples
Starting fold 2/5 with 8000 samples
Starting fold 3/5 with 8000 samples
Starting fold 4/5 with 8000 samples
Starting fold 5/5 with 8000 samples
Running with args: {'weighting': False}
Starting fold 1/5 with 8000 samples
Starting fold 2/5 with 8000 samples
Starting fold 3/5 with 8000 samples
Starting fold 4/5 with 8000 samples
Starting fold 5/5 with 8000 samples
{'weighting': True}
0.36511378008378675
{'weighting': False}
0.005183613279058474
