In [17]:
%load_ext autoreload
%autoreload 2

import os
from typing import Any, cast, Dict

import pandas as pd
import numpy as np
from sklearn.base import clone

from models import models, scalers
from read_data import read_datasets
from data_cleaning import prep_dataframe, DataCleaner
from training import train_test_random, split_x_y, calc_stats

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Generate Datasets

Copies of the dataset, each with a different scaler applied, are generated and stored for use in training.

In [18]:
# Name of the dependent (target) variable. It is handed to prep_dataframe here
# and to DataCleaner / split_x_y in the cells below.
dep_var = "Log(Rmax)"
# True  -> each (model, scaler) pair is scored by averaging R^2 over repeated
#          random train/test splits of the full dataset.
# False -> a single train/test/holdout split is made once per scaler and reused.
use_crossval = True

# Load the raw datasets and prepare them for the chosen dependent variable.
# NOTE(review): the file names and "Unknown processor" messages shown in this
# cell's output presumably come from these two helpers — confirm in
# read_data.py / data_cleaning.py.
all_data = read_datasets()
combined_data = prep_dataframe(all_data, dep_var)


TOP500_201906.xls
TOP500_202011.xlsx
TOP500_201911.xls
TOP500_202006.xlsx
TOP500_201811.xls
TOP500_201806.xls
TOP500_201206.xls
TOP500_201211.xls
TOP500_201406.xls
TOP500_202106.xlsx
TOP500_201611.xls
TOP500_201411.xls
TOP500_201606.xls
TOP500_201311.xls
TOP500_201306.xls
TOP500_201111.xls
TOP500_201511.xls
TOP500_201706.xls
TOP500_201506.xls
TOP500_201711.xls
Unknown processor: 'NEC', full name: 'NEC  3.200GHz' @ Earth Simulator, 2009
Unknown processor: 'NEC', full name: 'NEC  3.200GHz' @ Earth Simulator, 2009
Unknown processor: 'NEC', full name: 'NEC  3.200GHz' @ Earth Simulator, 2009
Unknown processor: 'NEC', full name: 'NEC  3.200GHz' @ Earth Simulator, 2009
Unknown processor: 'NEC', full name: 'NEC  3.20GHz' @ Earth Simulator, 2009
Unknown processor: 'Xeon EM64T', full name: 'Xeon EM64T  3.60GHz' @ Thunderbird, 2006


In [19]:
# Build one cleaned + scaled copy of the combined data per scaler.
#
# What is stored under each scaler name depends on `use_crossval`:
#   * True  -> the full cleaned dataset; train/test splits are drawn later,
#              once per repetition, by the training cell.
#   * False -> the value returned by split_x_y for [train, test, holdout],
#              which the training cell unpacks as three (X, y) pairs.
datasets = {}
for scaler_name, scaler in scalers.items():
    # Pass a copy so every scaler starts from the same, untouched combined_data.
    data = DataCleaner(scaler, dep_var).fit_transform(combined_data.copy())

    if use_crossval:
        # We'll do splits later
        datasets[scaler_name] = data
    else:
        # Splits are only needed in this mode, so they are computed here rather
        # than unconditionally (previously they were built and then discarded
        # whenever use_crossval was True).
        # NOTE(review): 0.1 is presumably the fraction split off each time —
        # confirm against train_test_random.
        non_holdout, holdout = train_test_random(data, 0.1)
        train, test = train_test_random(non_holdout, 0.1)
        datasets[scaler_name] = split_x_y([train, test, holdout], dep_var)


Filtered duplicates to go from 10000 rows to 2476
Filtered duplicates to go from 10000 rows to 2476
Filtered duplicates to go from 10000 rows to 2476


In [21]:
# Evaluate every model against every scaled copy of the data and collect one
# result row per (model, scaler) pair into `results`.
#
# Column layout of `results`:
#   * use_crossval=True  -> name, scaler, "avg r2" (mean R^2 over the repeats)
#   * use_crossval=False -> name, scaler, plus the metrics returned by calc_stats
if use_crossval:
    result_columns = ["name", "scaler", "avg r2"]
else:
    result_columns = ["name", "scaler", "r2", "mae", "mape", "mse"]

n_repeats = 20     # random train/test resamples per (model, scaler) pair
split_ratio = 0.1  # NOTE(review): presumably the test fraction — confirm in train_test_random

# Rows are gathered in a plain list and converted to a DataFrame once at the
# end. DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# growing a frame row by row copies the whole frame on every iteration.
records = []

for model_name, model in models.items():
    print(model_name)
    for scaler_name in scalers.keys():
        if use_crossval:
            all_r2 = []
            for repeat in range(n_repeats):
                train, test = train_test_random(datasets[scaler_name], split_ratio)
                (train_X, train_y), (test_X, test_y) = split_x_y([train, test], dep_var)
                # NOTE(review): the same estimator object is refit on every
                # repeat and for every scaler. That is fine for estimators whose
                # fit() starts from scratch; verify this holds for all entries
                # in `models` (otherwise fit a fresh copy per repeat).
                model.fit(train_X, train_y)

                pred_y = model.predict(test_X)
                stats = cast(Dict[str, Any], calc_stats(test_y, pred_y, print_res=False))
                all_r2.append(stats["r2"])

            record = {
                "name": model_name,
                "scaler": scaler_name,
                "avg r2": np.average(all_r2),
            }
        else:
            # Each entry is an (X, y) pair; holdout is unpacked but not used here.
            train, test, holdout = datasets[scaler_name]
            model.fit(train[0], train[1])

            pred_y = model.predict(test[0])
            stats = cast(Dict[str, Any], calc_stats(test[1], pred_y, print_res=False))
            # Copy the metrics and tag them with the model/scaler identifiers.
            record = {**stats, "name": model_name, "scaler": scaler_name}

        records.append(record)

results = pd.DataFrame(records)
# Put the expected columns first (creating them if there were no rows at all)
# and keep any additional metrics calc_stats may have returned after them.
extra_columns = [col for col in results.columns if col not in result_columns]
results = results.reindex(columns=result_columns + extra_columns)


lr_1
knn_1
knn_2
knn_3
knn_4
knn_5
svr_1
svr_2
rf_1
rf_2
rf_3
gbt_1
gbt_2
gbt_3
gbt_4
gbt_5
mlp_1




dnn1_1
dnn2_1
xgb_1
lgbm_1


In [None]:
# Persist every (model, scaler) result row.
# Create the output directory first so the write also succeeds on a fresh
# checkout where "out/" does not exist yet.
os.makedirs("out", exist_ok=True)
results.to_csv("out/all_results_toa.csv")


In [None]:
# Take only the best-scoring scaler configuration for each model.
# The score column differs by evaluation mode: repeated-split runs store their
# mean score under "avg r2", single-split runs under "r2". Selecting "r2"
# unconditionally raised a KeyError for repeated-split results.
score_column = "avg r2" if "avg r2" in results.columns else "r2"
max_indices = results.groupby(["name"])[score_column].idxmax()
maximums = results.loc[max_indices]


In [15]:
# Persist the best row per model.
# Create the output directory first so the write also succeeds on a fresh
# checkout where "out/" does not exist yet.
os.makedirs("out", exist_ok=True)
maximums.to_csv("out/best_results_toa.csv")
