In [52]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.neural_network import MLPRegressor
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.ensemble import RandomForestRegressor
import itertools

In [53]:
df = pd.read_csv("final_df.csv")
print(df.head())

   ervaring  500_split  2k tijd  binary_trainingtype  binary_geslacht  \
0         1      104.6    379.9                    0                0   
1         1      104.7    379.9                    0                0   
2         1      104.3    379.9                    0                0   
3         1      104.0    379.9                    0                0   
4         1      104.1    379.9                    0                0   

   binary_gewichtsklasse  
0                      1  
1                      1  
2                      1  
3                      1  
4                      1  


In [54]:
from sklearn.model_selection import train_test_split

# Eerst de data opschudden om bias te voorkomen
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Zorg ervoor dat elke ervaring, geslacht en gewichtsklasse in elke set vertegenwoordigd zijn
train_data = pd.DataFrame()
val_data = pd.DataFrame()
test_data = pd.DataFrame()

for ervaring in df['ervaring'].unique():
    for geslacht in df['binary_geslacht'].unique():
        for gewichtsklasse in df['binary_gewichtsklasse'].unique():
            subset = df[(df['ervaring'] == ervaring) & (df['binary_geslacht'] == geslacht) & (df['binary_gewichtsklasse'] == gewichtsklasse)]
            if not subset.empty:
                temp_train, temp_temp = train_test_split(subset, test_size=0.3, random_state=42)
                temp_val, temp_test = train_test_split(temp_temp, test_size=0.3, random_state=42)
                train_data = pd.concat([train_data, temp_train])
                val_data = pd.concat([val_data, temp_val])
                test_data = pd.concat([test_data, temp_test])

# Reset indexen
train_data.reset_index(drop=True, inplace=True)
val_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

# Controleren op juiste verdeling
print(f"Trainingsdata: {len(train_data)} rijen")
print(f"Validatiedata: {len(val_data)} rijen")
print(f"Testdata: {len(test_data)} rijen")

X_train = train_data.drop(columns=['2k tijd'])
y_train = train_data['2k tijd']

X_val = val_data.drop(columns=['2k tijd'])
y_val = val_data['2k tijd']

X_test = test_data.drop(columns=['2k tijd'])
y_test = test_data['2k tijd']

Trainingsdata: 3018 rijen
Validatiedata: 905 rijen
Testdata: 391 rijen


In [55]:
# RIDGE REGRESSION
# 1. Basis training op de trainingsdata met default alpha
# (Hier doen we nog niets met hyperparameter-tuning)
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)

# Evaluatie op de validatieset
y_val_pred = ridge_model.predict(X_val)

# Validatiemetrics (basis zonder tuning)
val_mse = mean_squared_error(y_val, y_val_pred)
val_r2 = r2_score(y_val, y_val_pred)
print(f"Validatie MSE RR (zonder tuning): {val_mse}")
print(f"Validatie R-squared RR (zonder tuning): {val_r2}")

# 2. Hyperparameter-tuning met de validatieset
# Zoek de beste alpha (lambda) waarde via cross-validatie op de trainingsset
poss_lam = [0.001, 0.01, 0.02, 0.25, 0.03, 0.04, 0.1, 1.0, 10.0, 50.0, 100.0]
ridge_cv_model = RidgeCV(alphas=poss_lam, store_cv_values=True)
ridge_cv_model.fit(X_train, y_train)

# Print de beste alpha
print(f"Beste alpha na tuning: {ridge_cv_model.alpha_}")

# 3. Train het model opnieuw met de beste alpha
tuned_ridge_model = Ridge(alpha=ridge_cv_model.alpha_)
tuned_ridge_model.fit(X_train, y_train)

# Voorspellingen op de testset
y_test_pred = tuned_ridge_model.predict(X_test)

# 4. Evaluatie op de testset
test_mse_rr = mean_squared_error(y_test, y_test_pred)
test_rmse_rr = test_mse ** 0.5
test_r2_rr = r2_score(y_test, y_test_pred)

# Print de evaluatiemetrics
print('\n')
print(f"Test MSE RR (na tuning): {test_mse_rr}")
print(f"Test RMSE RR (na tuning): {test_rmse_rr}")
print(f"Test R-squared RR (na tuning): {test_r2_rr}")

Validatie MSE RR (zonder tuning): 112.18862624068504
Validatie R-squared RR (zonder tuning): 0.8903284765445899
Beste alpha na tuning: 0.02


Test MSE RR (na tuning): 116.1538480038079
Test RMSE RR (na tuning): 10.777469461975194
Test R-squared RR (na tuning): 0.8868906617653229




In [51]:
# RANDOM FOREST
# Implementatie van random alpha met standaard parameters
rf_model = RandomForestRegressor(max_depth=10, random_state=0)
rf_model.fit(X_train, y_train)

# evaluatie op de dataset
val_pred_rf = rf_model.predict(X_val)

# Validatiemetrics (basis zonder tuning)
val_mse_rf = mean_squared_error(y_val, val_pred_rf)
val_r2_rf = r2_score(y_val, val_pred_rf)
print(f"Validatie MSE RF (zonder tuning): {val_mse_rf}")
print(f"Validatie R-squared RF (zonder tuning): {val_r2_rf}")

# hyperparameter tuning na validation met gridsearch
param_grid = {'max_depth': [3, 5, 7, 10, 13, 16],
              'n_estimators': [100, 200, 300],
              'min_samples_split': [2, 5, 10, 13, 16]}

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, 
                           scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

grid_search.fit(X_train, y_train)

best_maxdepth = grid_search.best_params_['max_depth']
best_nestimators = grid_search.best_params_['n_estimators']
best_samples_split = grid_search.best_params_['min_samples_split']


# Hertrainen van het model op basis van de beste parameters
tuned_random_forest = RandomForestRegressor(max_depth=best_maxdepth, n_estimators=best_nestimators, min_samples_split=best_samples_split, random_state=0)
tuned_random_forest.fit(X_train, y_train)

y_test_pred_rf = tuned_ridge_model.predict(X_test)

# 4. Evaluatie op de testset
test_mse_rf = mean_squared_error(y_test, y_test_pred_rf)
test_rmse_rf = test_mse_rf ** 0.5
test_r2_rf = r2_score(y_test, y_test_pred_rf)

# Print de evaluatiemetrics
print('\n')
print(f"Test MSE RR (na tuning): {test_mse_rf}")
print(f"Test RMSE RR (na tuning): {test_rmse_rf}")
print(f"Test R-squared RR (na tuning): {test_r2_rf}")

Validatie MSE RF (zonder tuning): 63.84021760100294
Validatie R-squared RF (zonder tuning): 0.9375921235811709


Test MSE RR (na tuning): 116.1538480038079
Test RMSE RR (na tuning): 10.777469461975194
Test R-squared RR (na tuning): 0.8868906617653229
