# Prediction model on clustered data

Now the same models that were trained on the overall data will be trained on each cluster.

In [104]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

In [105]:
marathon_data = pd.read_csv(
    r"data\marathon_runners.csv", header=0).drop(columns='Unnamed: 0')
marathon_data = marathon_data.drop_duplicates()
marathon_data = marathon_data[marathon_data["week_1_total_distance"] > 0]
marathon_data = marathon_data[marathon_data["duration"] < 480].reset_index()

for column in marathon_data.columns:
    if "_pace" in column:
        marathon_data[column] = (
            marathon_data[column] - marathon_data["pace"]) / (marathon_data["pace"])

scaler = StandardScaler().set_output(transform="pandas")

X = marathon_data.loc[:, ['week_5_total_distance', 'week_6_total_distance',
                          'week_9_total_distance', 'week_10_total_distance',
                          'week_11_total_distance', 'week_13_total_distance',
                          'week_14_total_distance', 'week_15_total_distance',
                          'week_1_average_distance', 'week_7_average_distance',
                          'week_10_average_distance', 'week_11_average_distance',
                          'week_12_average_distance', 'week_13_average_distance',
                          'week_14_average_distance', 'week_16_average_distance',
                          'week_1_shortest_distance', 'week_2_longest_time',
                          'week_3_longest_time', 'week_6_longest_time', 'week_7_longest_time',
                          'week_9_longest_time', 'week_10_longest_time', 'week_11_longest_time',
                          'week_12_longest_time', 'week_13_longest_time', 'week_14_longest_time',
                          'week_15_longest_time', 'week_1_average_duration',
                          'week_2_average_duration', 'week_3_average_duration',
                          'week_4_average_duration', 'week_5_average_duration',
                          'week_8_average_duration', 'week_9_average_duration',
                          'week_11_average_duration', 'week_13_average_duration',
                          'week_14_average_duration', 'week_15_average_duration',
                          'week_16_average_duration', 'week_1_fastest_pace',
                          'week_2_fastest_pace', 'week_3_fastest_pace', 'week_4_fastest_pace',
                          'week_5_fastest_pace', 'week_6_fastest_pace', 'week_8_fastest_pace',
                          'week_10_fastest_pace', 'week_11_fastest_pace', 'week_12_fastest_pace',
                          'week_13_fastest_pace', 'week_14_fastest_pace', 'week_15_fastest_pace',
                          'week_16_fastest_pace']]
y = marathon_data.loc[:, "duration"]
X = scaler.fit_transform(X)
y = np.array(marathon_data.loc[:, "duration"].astype("int32")).reshape(-1,1)
y = np.array(scaler.fit_transform(y)).ravel()


In [106]:
def report_score(test_value, predict_value):
    print(f"Mean Squared Error: {mean_squared_error(test_value, predict_value)}")
    print(f"Root Mean Squared Error: {np.sqrt(mean_squared_error(test_value, predict_value))}")
    print(f"Mean Absolute Error: {mean_absolute_error(test_value, predict_value)}")
    print(f"Mean Absolute Percentage Error: {mean_absolute_percentage_error(test_value, predict_value)*100}%")
    

In [107]:
kmeans = KMeans(init="k-means++", n_clusters=6, n_init="auto").fit(X)
for label in set(kmeans.labels_):
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X[kmeans.labels_ == label], y[kmeans.labels_ == label], test_size=0.2)
    except ValueError:
        print("Training and Test Data are the same")
        X_train = X[kmeans.labels_ == label]
        X_test = X[kmeans.labels_ == label]
        y_train = y[kmeans.labels_ == label]
        y_test = y[kmeans.labels_ == label]
    y_train = np.array(y_train).ravel()
    y_test = np.array(y_test).ravel()
    try:
        lasso_model = LassoCV(cv=10, max_iter=100000).fit(X_train, y_train)
    except ValueError:
        lasso_model = LassoCV(cv=2, max_iter=100000).fit(X_train, y_train)
    lasso_model_predict = lasso_model.predict(X_test)
    print(f"Number of training samples in label {label} = {len(y_train)}")
    print(f"Number of test samples in label {label} = {len(y_test)}")
    print(
        f"Maximum duration: {np.max(scaler.inverse_transform(y[kmeans.labels_ == label].reshape(-1,1)))}")
    print(
        f"Minimum duration: {np.min(scaler.inverse_transform(y[kmeans.labels_ == label].reshape(-1,1)))}")
    print(
        f"Standard Deviation of duration: {np.std(scaler.inverse_transform(y[kmeans.labels_ == label].reshape(-1,1)))}")
    print()
    print("Lasso Regression")
    print(f"R^2: {lasso_model.score(X_train, y_train)}")
    report_score(scaler.inverse_transform(y_test.reshape(-1, 1)),
                 scaler.inverse_transform(lasso_model_predict.reshape(-1, 1)))
    print("*"*79)

    try:
        ridge_model = RidgeCV(cv=10).fit(X_train, y_train)
    except ValueError:
        ridge_model = RidgeCV(cv=2).fit(X_train, y_train)
    ridge_model_predict = ridge_model.predict(X_test)
    print("Ridge Regression")
    print(f"R^2: {ridge_model.score(X_train, y_train)}")
    report_score(scaler.inverse_transform(y_test.reshape(-1, 1)),
                 scaler.inverse_transform(ridge_model_predict.reshape(-1, 1)))
    print("*"*79)

    rf_regr = RandomForestRegressor(min_samples_leaf=2).fit(X_train, y_train)
    rf_regr_predict = rf_regr.predict(X_test)
    print("Random Forest Regression")
    print(f"R^2: {rf_regr.score(X_train, y_train)}")
    report_score(scaler.inverse_transform(y_test.reshape(-1, 1)),
                 scaler.inverse_transform(rf_regr_predict.reshape(-1, 1)))
    print("*"*79)

    adaboost = AdaBoostRegressor(
        learning_rate=0.5, loss="square", n_estimators=50).fit(X_train, y_train)
    adaboost_predict = adaboost.predict(X_test)
    print("Adaboost Regressor")
    print(f"R^2: {adaboost.score(X_train, y_train)}")
    report_score(scaler.inverse_transform(y_test.reshape(-1, 1)), scaler.inverse_transform(adaboost_predict.reshape(-1, 1)))
    print("*"*79)


Number of training samples in label 0 = 822
Number of test samples in label 0 = 206
Maximum duration: 334.0
Minimum duration: 137.0
Standard Deviation of duration: 30.703123984327853

Lasso Regression
R^2: 0.8728755308490435
Mean Squared Error: 158.98744176592916
Root Mean Squared Error: 12.609022236713248
Mean Absolute Error: 9.110386491584011
Mean Absolute Percentage Error: 4.633604703694778%
*******************************************************************************
Ridge Regression
R^2: 0.8745571574593904
Mean Squared Error: 159.85258643026796
Root Mean Squared Error: 12.643282264913173
Mean Absolute Error: 9.070692860002744
Mean Absolute Percentage Error: 4.633566882219875%
*******************************************************************************
Random Forest Regression
R^2: 0.9482546051030439
Mean Squared Error: 373.9194332442165
Root Mean Squared Error: 19.336996489739985
Mean Absolute Error: 14.328354036335833
Mean Absolute Percentage Error: 7.199587164790794%
******