# Example Training Code

This notebook is sample code that runs the project's data-processing steps, trains a random forest regressor, and then evaluates it with several regression metrics (R^2, MAE, MAPE, and MSE).

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    mean_absolute_percentage_error,
)
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import RobustScaler

In [None]:
from data_cleaning import preprocess_data
from read_data import read_datasets
from training import train_test, split_x_y, select_past

## Prep the data

In [None]:
# Name of the dependent (target) variable; it is passed to both
# preprocess_data and split_x_y below.
dep_var = "Log(Rmax)"
# Alternative target:
# dep_var = "Log(Efficiency)"
all_data = read_datasets()

# Two preprocessing variants follow; they differ only in the final boolean
# argument. Exactly one of them should be active.
# NOTE(review): the meaning of the two boolean flags is defined in
# data_cleaning.preprocess_data, not in this notebook - confirm there.

# For train on all (ToA)
# data, _ = preprocess_data(all_data, dep_var, RobustScaler(), True, False)

# For train on past (ToP)
# preprocess_data returns a pair; its second element is not used here.
data, _ = preprocess_data(all_data, dep_var, RobustScaler(), True, True)
num_past_to_use = 3
# NOTE(review): 2021 and 6 look like a year and a month that anchor the
# selection, with num_past_to_use controlling how far back it reaches -
# confirm against training.select_past.
data = select_past(data, 2021, 6, num_past_to_use)

# Split twice: first set aside a holdout set, then split what remains into
# train and test.
# NOTE(review): 0.1 is presumably the fraction assigned to the second
# returned split - confirm against training.train_test, including whether the
# split is shuffled/seeded.
non_holdout, holdout = train_test(data, 0.1)
train, test = train_test(non_holdout, 0.1)

# split_x_y takes a list of datasets and returns one (X, y) pair per dataset,
# in the same order.
(train_X, train_y), (test_X, test_y) = split_x_y([train, test], dep_var)

## Fit the model

In [None]:
# Random forest with 1000 trees.
# - random_state is pinned so that refitting on the same training data yields
#   the same forest, which keeps the metrics below comparable between runs.
#   NOTE(review): the train/test/holdout split may still vary between runs,
#   depending on how train_test shuffles - confirm there.
# - n_jobs=-1 builds the trees on all available CPU cores; it speeds up
#   fitting and prediction without changing the fitted model.
model = RandomForestRegressor(n_estimators=1000, random_state=0, n_jobs=-1)
model.fit(train_X, train_y)


## Measure performance

In [None]:
# Test split: report R^2 only.
r2 = r2_score(test_y, model.predict(test_X))
print(f"Testing R^2: {r2}")

# Holdout split: separate features from the target, predict once, then
# compute the full metric set from that single prediction.
((hold_X, hold_y),) = split_x_y([holdout], dep_var)
pred_y = model.predict(hold_X)
r2, mae, mape, mse = (
    r2_score(hold_y, pred_y),
    mean_absolute_error(hold_y, pred_y),
    mean_absolute_percentage_error(hold_y, pred_y),
    mean_squared_error(hold_y, pred_y),
)
# A single print call; sep="\n" puts each metric on its own line.
print(
    f"Holdout R^2: {r2}",
    f"Holdout MAE: {mae}",
    f"Holdout MAPE: {mape}",
    f"Holdout MSE: {mse}",
    sep="\n",
)


## Try cross-validation for an additional performance estimate

In [None]:
# K-fold cross-validation on the training split, scored with R^2.
# cv=5 spells out scikit-learn's default of 5 folds.
score: np.ndarray = cross_val_score(
    estimator=model, X=train_X, y=train_y, scoring="r2", cv=5
)
# Per-fold scores, followed by their mean and standard deviation.
print(score, np.mean(score), np.std(score))
