# Example Training Code

This notebook is sample code that applies the project's data-processing steps, trains a model, and then evaluates it using several regression metrics.

In [11]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    mean_absolute_percentage_error,
)
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import RobustScaler

In [12]:
from data_cleaning import preprocess_data
from read_data import read_datasets
from training import train_test, split_x_y

## Prep the data

In [13]:
# Name of the dependent variable; it is handed to preprocess_data here and to
# split_x_y below, which separates each dataset into X and y.
dep_var = "Log(Rmax)"
all_data = read_datasets()
# preprocess_data returns two values; only the first is used in this notebook.
# NOTE(review): the meaning of the trailing positional `True` is not visible
# here -- confirm against data_cleaning.preprocess_data.
data, _ = preprocess_data(all_data, dep_var, RobustScaler(), True)
# Commented-out alternative target, kept for reference. It relies on `get_data`
# and `StandardScaler`, neither of which is imported in the import cells above,
# so it will not run if simply uncommented:
# dep_var = "Log(Efficiency)"
# data = get_data(dep_var, StandardScaler())

# Two successive splits: first separate a holdout set from the full data, then
# split the remainder into train and test.
# NOTE(review): 0.1 is presumably the fraction assigned to the second returned
# part (holdout / test) -- confirm against training.train_test.
non_holdout, holdout = train_test(data, 0.1)
train, test = train_test(non_holdout, 0.1)

# split_x_y takes a list of datasets and returns one (X, y) pair per entry.
(train_X, train_y), (test_X, test_y) = split_x_y([train, test], dep_var)

TOP500_201906.xls
TOP500_202011.xlsx
TOP500_201911.xls
TOP500_202006.xlsx
TOP500_201811.xls
TOP500_201806.xls
TOP500_201206.xls
TOP500_201211.xls
TOP500_201406.xls
TOP500_202106.xlsx
TOP500_201611.xls
TOP500_201411.xls
TOP500_201606.xls
TOP500_201311.xls
TOP500_201306.xls
TOP500_201111.xls
TOP500_201511.xls
TOP500_201706.xls
TOP500_201506.xls
TOP500_201711.xls
Unknown processor: 'Vector Engine Type10AE', full name: 'Vector Engine Type10AE 8C 1.58GHz' @ Plasma Simulator, 2020
Unknown processor: 'Vector Engine Type10AE', full name: 'Vector Engine Type10AE 8C 1.58GHz' @ nan, 2020
Unknown processor: 'NEC', full name: 'NEC  3.200GHz' @ Earth Simulator, 2009
Unknown processor: 'NEC', full name: 'NEC  3.200GHz' @ Earth Simulator, 2009
Unknown processor: 'AMD EPYC 7763', full name: 'AMD EPYC 7763 64C 2.45GHz' @ Perlmutter, 2021
Unknown processor: 'Xeon Platinum 8368Q', full name: 'Xeon Platinum 8368Q 38C 2.6GHz' @ Maru, 2021
Unknown processor: 'Xeon Platinum 8368Q', full name: 'Xeon Platinum 8

## Fit the model

In [14]:
# Seed the forest so that its bootstrap samples and per-split feature draws are
# repeatable: for a given training set, re-running this cell (or cloning the
# estimator during cross-validation) produces the same fitted trees.
# NOTE(review): this does not by itself make the train/test/holdout split
# reproducible -- that depends on how training.train_test draws its samples.
RANDOM_STATE = 42
model = RandomForestRegressor(n_estimators=1000, random_state=RANDOM_STATE)
model.fit(train_X, train_y)


RandomForestRegressor(n_estimators=1000)

## Measure performance

In [15]:
# Test split: only R^2 is reported.
print(f"Testing R^2: {r2_score(test_y, model.predict(test_X))}")

# Holdout split: predict once, then derive every metric from those predictions.
# The names bound here (hold_X, hold_y, pred_y, r2, mae, mape, mse) stay
# available to later cells.
[(hold_X, hold_y)] = split_x_y([holdout], dep_var)
pred_y = model.predict(hold_X)
r2, mae, mape, mse = (
    r2_score(hold_y, pred_y),
    mean_absolute_error(hold_y, pred_y),
    mean_absolute_percentage_error(hold_y, pred_y),
    mean_squared_error(hold_y, pred_y),
)
print(
    f"Holdout R^2: {r2}",
    f"Holdout MAE: {mae}",
    f"Holdout MAPE: {mape}",
    f"Holdout MSE: {mse}",
    sep="\n",
)


Testing R^2: 0.9518182452355517
Holdout R^2: 0.9348413364251363
Holdout MAE: 0.21850980147849497
Holdout MAPE: 0.03606633939457675
Holdout MSE: 0.1302186395058301


## Try cross-validation for an additional performance estimate

In [16]:
# 5-fold cross-validated R^2 on the training split; cv=5 spells out
# scikit-learn's default fold count. Prints the per-fold scores followed by
# their mean and standard deviation.
score: np.ndarray = cross_val_score(
    estimator=model, X=train_X, y=train_y, scoring="r2", cv=5
)
print(score, np.mean(score), np.std(score))


[0.93392174 0.94634943 0.94694618 0.95328172 0.96004152] 0.9481081167762602 0.008658737560770867
