In [9]:
import numpy as np 
import pandas as pd 
from sklearn.linear_model import Ridge
import matplotlib.pyplot as plt 

In [12]:
from sklearn.model_selection import train_test_split

# # Data inladen
df = pd.read_csv("final_df.csv")
print(df.head())
# df = groot_gemid_df

# Eerst de data opschudden om bias te voorkomen
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Zorg ervoor dat elke ervaring, geslacht en gewichtsklasse in elke set vertegenwoordigd zijn
train_data = pd.DataFrame()
val_data = pd.DataFrame()
test_data = pd.DataFrame()

for ervaring in df['ervaring'].unique():
    for geslacht in df['geslacht'].unique():
        for gewichtsklasse in df['gewichtsklasse'].unique():
            subset = df[(df['ervaring'] == ervaring) & (df['geslacht'] == geslacht) & (df['gewichtsklasse'] == gewichtsklasse)]
            if not subset.empty:
                temp_train, temp_temp = train_test_split(subset, test_size=0.3, random_state=42)
                temp_val, temp_test = train_test_split(temp_temp, test_size=0.3, random_state=42)
                train_data = pd.concat([train_data, temp_train])
                val_data = pd.concat([val_data, temp_val])
                test_data = pd.concat([test_data, temp_test])

# Reset indexen
train_data.reset_index(drop=True, inplace=True)
val_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

# Controleren op juiste verdeling
print(f"Trainingsdata: {len(train_data)} rijen")
print(f"Validatiedata: {len(val_data)} rijen")
print(f"Testdata: {len(test_data)} rijen")

# Optioneel: data opslaan in aparte bestanden
train_data.to_csv('train_data.csv', index=False)
val_data.to_csv('val_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

  unique_naam  binary_trainingtype  ervaring geslacht gewichtsklasse  \
0  D2510_2024                10000         0        V              Z   
1  D2510_2024                 1001         0        V              Z   
2  D2510_2024               101100         0        V              Z   
3  D2510_2024                 1100         0        V              Z   
4  D2511_2024                10000         0        V              L   

    500_split  2k tijd  binary_geslacht  binary_gewichtsklasse  
0  129.822222    496.3                1                      1  
1  113.638462    496.3                1                      1  
2  130.800000    496.3                1                      1  
3  125.116667    496.3                1                      1  
4  132.408333    484.1                1                      0  
Trainingsdata: 607 rijen
Validatiedata: 180 rijen
Testdata: 83 rijen


In [19]:
X_train = train_data.drop(columns=['2k tijd'])
y_train = train_data['2k tijd']

X_val = val_data.drop(columns=['2k tijd'])
y_val = val_data['2k tijd']

X_test = test_data.drop(columns=['2k tijd'])
y_test = test_data['2k tijd']

non_numeric_columns = X_train.select_dtypes(include=['object', 'category']).columns
print(f"Non-numeric columns: {non_numeric_columns}")

X_train = pd.get_dummies(X_train, columns=non_numeric_columns)
X_val = pd.get_dummies(X_val, columns=non_numeric_columns)
X_test = pd.get_dummies(X_test, columns=non_numeric_columns)

# Ensure all datasets have the same columns after encoding
X_train, X_test = X_train.align(X_val, join='outer', axis=1, fill_value=0)
X_train, X_test = X_train.align(X_test, join='outer', axis=1, fill_value=0)

# Step 3: Verify data types
print(X_train.dtypes)

ridge_model = Ridge()
ridge_model.fit(X_train, y_train)

val_predictions = ridge_model.predict(X_val)
val_mse = mean_squared_error(y_val, val_predictions)
print(f"Validation MSE: {val_mse}")

# Optionally evaluate on test data
test_predictions = ridge_model.predict(X_test)
test_mse = mean_squared_error(y_test, test_predictions)
print(f"Test MSE: {test_mse}")

Non-numeric columns: Index(['unique_naam', 'geslacht', 'gewichtsklasse'], dtype='object')
500_split                float64
binary_geslacht            int64
binary_gewichtsklasse      int64
binary_trainingtype        int64
ervaring                   int64
                          ...   
unique_naam_z5_2021         bool
unique_naam_z6_2020         bool
unique_naam_z6_2021         bool
unique_naam_z8_2020         bool
unique_naam_z9_2020         bool
Length: 181, dtype: object


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- unique_naam_D2510_2024
- unique_naam_D2512_2024
- unique_naam_D257_2024
- unique_naam_D258_2024
- unique_naam_Z2511_2024
- ...
