In [86]:
import numpy as np 
import pandas as pd 
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt 

In [87]:
# Load the dataset and show the first rows as a quick sanity check.
df = pd.read_csv("final_df.csv")
print(df.head())

# Shuffle the rows first so the ordering of the file cannot bias the split.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split fractions. NOTE: the second split takes 30% of the 30% hold-out, so the
# overall proportions come out at roughly 70% train / 21% validation / 9% test.
HOLDOUT_FRACTION = 0.3
TEST_FRACTION_OF_HOLDOUT = 0.3
SPLIT_SEED = 42


def _stack_parts(parts, template):
    """Concatenate per-stratum frames into one frame with a fresh 0..n-1 index.

    Returns an empty frame with the columns of `template` when `parts` is empty,
    because `pd.concat` refuses an empty list.
    """
    if not parts:
        return template.iloc[0:0].copy()
    return pd.concat(parts, ignore_index=True)


# Split every (ervaring, geslacht, gewichtsklasse) combination separately so each
# combination that has enough rows is represented in all three sets.
# The parts are collected in lists and concatenated once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all previous rows each time.
train_parts, val_parts, test_parts = [], [], []

for ervaring in df['ervaring'].unique():
    for geslacht in df['binary_geslacht'].unique():
        for gewichtsklasse in df['binary_gewichtsklasse'].unique():
            subset = df[
                (df['ervaring'] == ervaring)
                & (df['binary_geslacht'] == geslacht)
                & (df['binary_gewichtsklasse'] == gewichtsklasse)
            ]
            if subset.empty:
                continue

            try:
                temp_train, temp_holdout = train_test_split(
                    subset, test_size=HOLDOUT_FRACTION, random_state=SPLIT_SEED
                )
                temp_val, temp_test = train_test_split(
                    temp_holdout, test_size=TEST_FRACTION_OF_HOLDOUT, random_state=SPLIT_SEED
                )
            except ValueError:
                # train_test_split raises ValueError when one side of a split would
                # be empty, i.e. the combination has too few rows to appear in all
                # three sets. Keep those rows for training instead of aborting.
                print(
                    f"Combination ervaring={ervaring}, geslacht={geslacht}, "
                    f"gewichtsklasse={gewichtsklasse} has only {len(subset)} rows; "
                    "added to the training set only"
                )
                train_parts.append(subset)
                continue

            train_parts.append(temp_train)
            val_parts.append(temp_val)
            test_parts.append(temp_test)

# Build the final frames (index is reset by ignore_index=True in the helper).
train_data = _stack_parts(train_parts, df)
val_data = _stack_parts(val_parts, df)
test_data = _stack_parts(test_parts, df)

# Check the resulting split sizes.
print(f"Trainingsdata: {len(train_data)} rijen")
print(f"Validatiedata: {len(val_data)} rijen")
print(f"Testdata: {len(test_data)} rijen")

# Optional: persist each split to its own file.
train_data.to_csv('train_data.csv', index=False)
val_data.to_csv('val_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

   ervaring  500_split  2k tijd  binary_trainingtype  binary_geslacht  \
0       1.0      104.6    379.9                    0                0   
1       1.0      104.7    379.9                    0                0   
2       1.0      104.3    379.9                    0                0   
3       1.0      104.0    379.9                    0                0   
4       1.0      104.1    379.9                    0                0   

   binary_gewichtsklasse  
0                      1  
1                      1  
2                      1  
3                      1  
4                      1  
Trainingsdata: 3018 rijen
Validatiedata: 905 rijen
Testdata: 391 rijen


In [88]:
# Separate the features from the target column ('2k tijd') for every split.
X_train = train_data.drop(columns=['2k tijd'])
y_train = train_data['2k tijd']

# NOTE(review): X_val / y_val are prepared here but not used further in this cell;
# the regularisation strength is chosen by RidgeCV on the training data alone.
X_val = val_data.drop(columns=['2k tijd'])
y_val = val_data['2k tijd']

X_test = test_data.drop(columns=['2k tijd'])
y_test = test_data['2k tijd']

print(f'len test nu: {len(X_test)}')

# Give validation and test exactly the training feature columns, in the same order.
# Columns missing from a split are filled with 0; columns the model never saw in
# training are dropped. (Two successive outer `align` calls could leave X_val
# without columns that the second call added to X_train.)
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Candidate regularisation strengths (lambda / alpha) to cross-validate over.
poss_lam = [0.001, 0.01, 0.1, 1.0, 10.0, 50.0, 100.0]

# The per-sample CV values are never inspected, so they are not stored. This also
# avoids the `store_cv_values` keyword, which newer scikit-learn releases renamed
# to `store_cv_results`.
ridge_cv_model = RidgeCV(alphas=poss_lam)

# Fit the cross-validated model on the training data.
ridge_cv_model.fit(X_train, y_train)

# Report the selected lambda.
print(f"most fitting lambda: {ridge_cv_model.alpha_}")

# Train a plain ridge model with the lambda (alpha) selected by RidgeCV.
ridge_model = Ridge(alpha=ridge_cv_model.alpha_)
ridge_model.fit(X_train, y_train)

# Predict on the held-out test set.
y_pred = ridge_model.predict(X_test)

# Evaluation metrics: MSE, RMSE, MAE and R-squared.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")

# Side-by-side view of predictions, true values and the signed error per test row.
print(pd.DataFrame({'y_pred': y_pred, 'y_test': y_test, 'pred - test': y_pred - y_test}))


len test nu: 391
most fitting lambda: 0.01
MSE: 116.1539240537786
RMSE: 10.777472990166972
MAE: 8.468323136834067
R-squared: 0.8868905877086988
         y_pred  y_test  pred - test
0    425.583305   427.7    -2.116695
1    426.593780   420.5     6.093780
2    420.124316   418.6     1.524316
3    426.060378   427.7    -1.639622
4    430.037698   427.7     2.337698
..          ...     ...          ...
386  382.695123   381.6     1.095123
387  385.636140   379.9     5.736140
388  383.121845   389.4    -6.278155
389  393.636439   382.2    11.436439
390  383.661758   381.6     2.061758

[391 rows x 3 columns]




In [None]:
# Scratch cell: prints a fixed greeting and does not touch any notebook variable.
print("hello")