# Model Evaluation

A Lasso Regression, Ridge Regression, Random Forest Regression, and AdaBoost Regression model was created on the overall dataset to determine which model could predict the finish time the best.

In [119]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

## Data

The features are limited to those that were discovered using Lasso Regression.

In [120]:
marathon_data = pd.read_csv(
    r"data\marathon_runners.csv", header=0).drop(columns="Unnamed: 0")
marathon_data = marathon_data.drop_duplicates()
marathon_data = marathon_data[marathon_data["week_1_total_distance"] > 0]
marathon_data = marathon_data[marathon_data["duration"] < 480].reset_index()

for column in marathon_data.columns:
    if "_pace" in column:
        marathon_data[column] = (
            marathon_data[column] - marathon_data["pace"]) / (marathon_data["pace"])

scaler = StandardScaler().set_output(transform="pandas")

X = marathon_data.loc[:, ["week_5_total_distance", "week_6_total_distance",
                          "week_9_total_distance", "week_10_total_distance",
                          "week_11_total_distance", "week_13_total_distance",
                          "week_14_total_distance", "week_15_total_distance",
                          "week_1_average_distance", "week_7_average_distance",
                          "week_10_average_distance", "week_11_average_distance",
                          "week_12_average_distance", "week_13_average_distance",
                          "week_14_average_distance", "week_16_average_distance",
                          "week_1_shortest_distance", "week_2_longest_time",
                          "week_3_longest_time", "week_6_longest_time", "week_7_longest_time",
                          "week_9_longest_time", "week_10_longest_time", "week_11_longest_time",
                          "week_12_longest_time", "week_13_longest_time", "week_14_longest_time",
                          "week_15_longest_time", "week_1_average_duration",
                          "week_2_average_duration", "week_3_average_duration",
                          "week_4_average_duration", "week_5_average_duration",
                          "week_8_average_duration", "week_9_average_duration",
                          "week_11_average_duration", "week_13_average_duration",
                          "week_14_average_duration", "week_15_average_duration",
                          "week_16_average_duration", "week_1_fastest_pace",
                          "week_2_fastest_pace", "week_3_fastest_pace", "week_4_fastest_pace",
                          "week_5_fastest_pace", "week_6_fastest_pace", "week_8_fastest_pace",
                          "week_10_fastest_pace", "week_11_fastest_pace", "week_12_fastest_pace",
                          "week_13_fastest_pace", "week_14_fastest_pace", "week_15_fastest_pace",
                          "week_16_fastest_pace"]]
y = marathon_data.loc[:, "duration"]
X = scaler.fit_transform(X)
y = marathon_data.loc[:, "duration"].astype("int32")
y = np.array(scaler.fit_transform(np.array(y).reshape(-1, 1))).ravel()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


## Metrics

To measure each model the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), Mean Absolute Error (MAE), and rhe Mean Absolute Percentage Error (MAPE) are used to determine the models ability to predict the race finish time.

In [121]:
def report_score(test_value, predict_value):
    test_value = scaler.inverse_transform(test_value.reshape(-1,1))
    predict_value = scaler.inverse_transform(predict_value.reshape(-1,1))
    print(f"Mean Squared Error: {round(mean_squared_error(test_value, predict_value),2)}")
    print(f"Root Mean Squared Error: {round(np.sqrt(mean_squared_error(test_value, predict_value)),2)}")
    print(f"Mean Absolute Error: {round(mean_absolute_error(test_value, predict_value),2)}")
    print(f"Mean Absolute Percentage Error: {round(mean_absolute_percentage_error(test_value, predict_value),2) * 100}%")
    

The Lasso model is the first model fitted to the training data using cross folds validation of 10 folds.

In [122]:
lasso_model = LassoCV(cv=10).fit(X_train, y_train)
lasso_model.score(X_train, y_train)

0.7579077256391812

In [123]:
lasso_predict = lasso_model.predict(X_test)
report_score(y_test, lasso_predict)

Mean Squared Error: 6553.83
Root Mean Squared Error: 80.96
Mean Absolute Error: 18.14
Mean Absolute Percentage Error: 8.0%


The Lasso coefficients.

In [124]:
for feature, coef in zip(lasso_model.feature_names_in_, lasso_model.coef_):
    print(f"{round(coef,2)} * {feature}")

-0.05 * week_5_total_distance
-0.02 * week_6_total_distance
-0.02 * week_9_total_distance
-0.01 * week_10_total_distance
0.04 * week_11_total_distance
-0.0 * week_13_total_distance
-0.0 * week_14_total_distance
-0.02 * week_15_total_distance
-0.54 * week_1_average_distance
-0.02 * week_7_average_distance
-0.04 * week_10_average_distance
-0.11 * week_11_average_distance
-0.01 * week_12_average_distance
-0.18 * week_13_average_distance
-0.22 * week_14_average_distance
-0.48 * week_16_average_distance
0.04 * week_1_shortest_distance
-0.02 * week_2_longest_time
-0.02 * week_3_longest_time
0.02 * week_6_longest_time
0.02 * week_7_longest_time
-0.0 * week_9_longest_time
0.05 * week_10_longest_time
-0.03 * week_11_longest_time
0.03 * week_12_longest_time
0.01 * week_13_longest_time
-0.01 * week_14_longest_time
0.02 * week_15_longest_time
0.48 * week_1_average_duration
0.05 * week_2_average_duration
0.06 * week_3_average_duration
0.05 * week_4_average_duration
0.1 * week_5_average_duration
0.0

Next the Ridge model is used with the same criteria as the Lasso algorithm.

In [125]:
ridge_model = RidgeCV(cv=10).fit(X_train, y_train)
ridge_model.score(X_train, y_train)

0.7582352295299393

In [126]:
ridge_predict = ridge_model.predict(X_test)
report_score(y_test, ridge_predict)

Mean Squared Error: 6417.83
Root Mean Squared Error: 80.11
Mean Absolute Error: 18.16
Mean Absolute Percentage Error: 8.0%


A Random Forest model is then used.

In [127]:
rf_regr = RandomForestRegressor(min_samples_leaf=2).fit(X_train, y_train)
rf_regr.score(X_train, y_train)

0.957201379698136

In [128]:
rf_regr_predict = rf_regr.predict(X_test)
report_score(y_test, rf_regr_predict)

Mean Squared Error: 702.14
Root Mean Squared Error: 26.5
Mean Absolute Error: 18.35
Mean Absolute Percentage Error: 8.0%


A grid search is used to find the best parameters for an AdaBoost Regression model/

In [129]:
params = {"n_estimators": np.arange(50, 100, 50), "learning_rate": np.arange(
    0.1, 1, 0.1), "loss": ["linear", "square", "exponential"]}
grid_search_ABR = GridSearchCV(
    AdaBoostRegressor(), param_grid=params, scoring="neg_mean_squared_error")
grid_search_ABR.fit(X_train, y_train)
grid_search_ABR.cv_results_.keys()


dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_learning_rate', 'param_loss', 'param_n_estimators', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

The best values for "learning_rate", "loss", and "n_estimators" is shown below.

In [131]:
best_params = grid_search_ABR.cv_results_.get("params")[np.argmax(
    grid_search_ABR.cv_results_.get("rank_test_score"))]

best_params

{'learning_rate': 0.1, 'loss': 'square', 'n_estimators': 50}

The Adaboost model with the best parameters is fit with the training data.

In [138]:
adaboost = AdaBoostRegressor(learning_rate=best_params.get("learning_rate"),
                             loss=best_params.get("loss"),
                             n_estimators=best_params.get("n_estimators")
                             ).fit(X_train, y_train)
adaboost.score(X_train, y_train)


0.6229847897241528

In [139]:
adaboost_predict = adaboost.predict(X_test)
report_score(y_test, adaboost_predict)

Mean Squared Error: 1137.73
Root Mean Squared Error: 33.73
Mean Absolute Error: 25.65
Mean Absolute Percentage Error: 12.0%
