In [33]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.model_selection import cross_val_score, GridSearchCV

In [34]:
from sklearn.model_selection import train_test_split

# Load the data
df = pd.read_csv("final_df.csv")
print(df.head())

# Shuffle first so the row order of the file cannot bias the split
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Stratified split: every (ervaring, geslacht, gewichtsklasse) combination is
# split on its own so that each combination ends up in the train, validation
# and test sets.
# Effective ratio: 70% train, then the remaining 30% is split 70/30 again,
# giving roughly 70% / 21% / 9% train / validation / test.
# NOTE(review): if an even validation/test split was intended, the second
# test_size should be 0.5 -- confirm before changing, it alters all results.

# With test_size=0.3 a combination needs at least 4 rows: the 30% hold-out
# (rounded up) must contain 2 rows so the second split has something on both
# sides. Smaller groups would make train_test_split raise a ValueError.
MIN_STRATUM_ROWS = 4

# Collect the pieces in lists and concatenate once at the end instead of
# growing a DataFrame inside the loop (quadratic copying, and it depended on
# how pandas treats empty frames in concat).
train_parts, val_parts, test_parts = [], [], []

for ervaring in df['ervaring'].unique():
    for geslacht in df['binary_geslacht'].unique():
        for gewichtsklasse in df['binary_gewichtsklasse'].unique():
            subset = df[(df['ervaring'] == ervaring) & (df['binary_geslacht'] == geslacht) & (df['binary_gewichtsklasse'] == gewichtsklasse)]
            if subset.empty:
                continue
            if len(subset) < MIN_STRATUM_ROWS:
                # Too small to appear in all three sets: keep the rows for
                # training rather than aborting the whole cell.
                print(f"Waarschuwing: groep (ervaring={ervaring}, geslacht={geslacht}, "
                      f"gewichtsklasse={gewichtsklasse}) heeft slechts {len(subset)} rijen; "
                      f"volledig toegevoegd aan de trainingsdata")
                train_parts.append(subset)
                continue
            temp_train, temp_temp = train_test_split(subset, test_size=0.3, random_state=42)
            temp_val, temp_test = train_test_split(temp_temp, test_size=0.3, random_state=42)
            train_parts.append(temp_train)
            val_parts.append(temp_val)
            test_parts.append(temp_test)


def _combine_parts(parts):
    """Concatenate the collected pieces into one frame with a fresh 0..n-1 index.

    Returns an empty frame that still has the columns of ``df`` when no piece
    was collected, so later column lookups keep working.
    """
    if not parts:
        return df.iloc[0:0].copy()
    return pd.concat(parts, ignore_index=True)


train_data = _combine_parts(train_parts)
val_data = _combine_parts(val_parts)
test_data = _combine_parts(test_parts)

# Check the resulting sizes
print(f"Trainingsdata: {len(train_data)} rijen")
print(f"Validatiedata: {len(val_data)} rijen")
print(f"Testdata: {len(test_data)} rijen")

# Optional: persist the three sets as separate files
train_data.to_csv('train_data.csv', index=False)
val_data.to_csv('val_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

# Range of the target column over the full data set
print(df['2k tijd'].min())
print(df['2k tijd'].max())

   ervaring  500_split  2k tijd  binary_trainingtype  binary_geslacht  \
0       1.0      104.6    379.9                    0                0   
1       1.0      104.7    379.9                    0                0   
2       1.0      104.3    379.9                    0                0   
3       1.0      104.0    379.9                    0                0   
4       1.0      104.1    379.9                    0                0   

   binary_gewichtsklasse  
0                      1  
1                      1  
2                      1  
3                      1  
4                      1  
Trainingsdata: 3018 rijen
Validatiedata: 905 rijen
Testdata: 391 rijen
371.8
510.1


In [35]:
# Separate the regression target ('2k tijd') from the remaining feature
# columns for each of the three data sets.
X_train, y_train = train_data.drop('2k tijd', axis=1), train_data['2k tijd']
X_val, y_val = val_data.drop('2k tijd', axis=1), val_data['2k tijd']
X_test, y_test = test_data.drop('2k tijd', axis=1), test_data['2k tijd']

In [36]:
from sklearn.metrics import mean_squared_error

import xgboost as xgb

# Gradient-boosted regressor with squared-error loss; reg_lambda is the L2
# regularisation strength.
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42, reg_lambda=100)

# fit() returns the estimator itself, so its result is not stored (it used to
# be assigned to `predictions`, which was misleading).
xgb_model.fit(X_train, y_train)

# Predict once on the held-out test set and reuse the result below.
predictions = xgb_model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
print(f"mse = {mse}")

# Square root of the MSE (i.e. the RMSE), in the same unit as the target.
rse = np.sqrt(mse)
print(f"rse = {rse}")

# Prediction for one hand-written sample. The values are given by column name
# and then ordered like the training features, so the mapping no longer
# depends silently on the positional order of X_train's columns; a missing or
# renamed column raises a KeyError instead of producing a wrong prediction.
single_sample = pd.DataFrame([{
    'ervaring': 1,
    '500_split': 104.6,
    'binary_trainingtype': 0,
    'binary_geslacht': 0,
    'binary_gewichtsklasse': 1,
}])[list(X_train.columns)]
prediction1 = xgb_model.predict(single_sample)
print(prediction1)
print(predictions)

mse = 50.64223629541192
rse = 7.116335875674498
[382.7271]
[420.69235 426.64517 423.50128 425.944   426.14224 424.70056 419.37814
 431.14038 429.8887  423.9475  428.63486 420.9329  427.45816 422.67456
 434.2514  428.26776 435.38678 431.96942 415.46893 425.218   418.3372
 425.5304  420.175   427.89273 409.3272  425.717   427.99872 417.21582
 416.44327 423.44516 424.49875 425.4141  408.55884 419.761   423.8822
 428.63486 416.7889  421.29645 415.79913 420.9329  408.04572 427.8236
 425.90707 420.6049  423.23502 427.62296 424.95917 422.0626  423.70108
 416.62982 425.82904 425.55212 420.69836 417.51126 415.0736  422.18515
 424.2229  427.53864 415.01233 419.61395 400.43646 397.29816 413.5838
 415.66824 407.53378 414.16113 397.24826 413.69406 394.56924 415.6759
 401.77097 406.22937 404.69156 412.98447 407.06516 409.44766 409.6068
 414.26135 407.39673 406.16864 416.721   412.31445 408.94092 405.0348
 404.908   407.8651  402.55035 408.66446 397.1072  408.1496  411.21906
 401.43777 411.1518  404.

In [37]:

# Candidate values for the regularisation parameter reg_lambda.
param_grid = dict(reg_lambda=[0.1, 1, 10, 100, 150, 500])

# Exhaustive search over the candidates with 5-fold cross-validation on the
# training data, scored by negative mean squared error; n_jobs=-1 asks for
# all available cores.
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    cv=5,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train, y_train)

# Report the candidate with the best cross-validated score.
print("Best lambda (reg_lambda):", grid_search.best_params_['reg_lambda'])


Best lambda (reg_lambda): 100
