In [None]:
%load_ext autoreload
%autoreload 2

from typing import Any, cast

import pandas as pd

from models import models, scalers
from read_data import read_datasets
from data_cleaning import prep_dataframe, preprocess_data
from training import train_test_random, split_x_y, calc_stats

## Generate Datasets

Copies of the dataset, each with a different scaler applied, are generated and stored for use in training.

In [None]:
# Name of the dependent variable. It is passed to prep_dataframe here and to
# the preprocessing / x-y split helpers in later cells.
# NOTE(review): presumably a column name in the combined frame -- confirm.
dep_var = "Log(Efficiency)"

# read_datasets() is called with no arguments; whatever it returns is handed
# straight to prep_dataframe together with the dependent-variable name.
all_data = read_datasets()
combined_data = prep_dataframe(all_data, dep_var)


In [None]:
# One entry per scaler: the result of split_x_y over that scaler's
# [train, test, holdout] partitions, keyed by the scaler's name.
datasets = {}
for scaler_name, scaler in scalers.items():
    # A fresh copy of combined_data is passed for every scaler; only the first
    # element of preprocess_data's return pair is used.
    scaled, _ = preprocess_data(combined_data.copy(), dep_var, scaler, True)

    # Two-stage split: separate the holdout first, then split what remains
    # into train and test.
    # NOTE(review): 0.1 is presumably the held-out fraction -- confirm against
    # training.train_test_random.
    remaining, holdout = train_test_random(scaled, 0.1)
    train, test = train_test_random(remaining, 0.1)

    datasets[scaler_name] = split_x_y([train, test, holdout], dep_var)


In [None]:
# Fit every model on every scaled copy of the data and collect one row of
# test-set metrics per (model, scaler) pair.
#
# Rows are accumulated in a plain list and converted to a DataFrame once at
# the end: DataFrame.append was removed in pandas 2.0, and appending row by
# row copied the whole frame on every iteration.
leading_columns = ["name", "scaler", "r2", "mae", "mape", "mse"]
records = []

for model_name, model in models.items():
    for scaler_name in scalers:
        # holdout is unpacked but not used in this cell.
        train, test, holdout = datasets[scaler_name]

        # The same model object is refit for each scaler. Elements 0 and 1 of
        # each split are passed as fit()'s first and second argument.
        model.fit(train[0], train[1])

        pred_y = model.predict(test[0])
        stats = cast(dict[str, Any], calc_stats(test[1], pred_y, print_res=False))

        # Build a new record instead of mutating calc_stats' return value.
        records.append({**stats, "name": model_name, "scaler": scaler_name})

results = pd.DataFrame(records)

# Put the expected columns first (created empty if a metric is missing, or if
# there were no records at all) and keep any additional metric keys after them.
extra_columns = [col for col in results.columns if col not in leading_columns]
results = results.reindex(columns=leading_columns + extra_columns)


In [None]:
# Save the complete model x scaler grid under its own file name.
# "results.csv" is rewritten further down with only the best row per model,
# which previously discarded this full table as soon as the notebook ran to
# the end. The final contents of "results.csv" are unaffected by this change.
results.to_csv("results_all.csv")


In [None]:
# For each model, look up the row label of its highest-r2 run, then select
# those rows so only the best scaler configuration per model remains.
max_indices = results["r2"].groupby(results["name"]).idxmax()
maximums = results.loc[max_indices, :]


In [None]:
# Write the best-per-model rows to results.csv.
maximums.to_csv(path_or_buf="results.csv")
