In [7]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler

In [8]:
from sklearn.model_selection import train_test_split

# # Data inladen
df = pd.read_csv("final_df.csv")
print(df.head())
# df = groot_gemid_df

# Eerst de data opschudden om bias te voorkomen
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Zorg ervoor dat elke ervaring, geslacht en gewichtsklasse in elke set vertegenwoordigd zijn
train_data = pd.DataFrame()
val_data = pd.DataFrame()
test_data = pd.DataFrame()

for ervaring in df['ervaring'].unique():
    for geslacht in df['binary_geslacht'].unique():
        for gewichtsklasse in df['binary_gewichtsklasse'].unique():
            subset = df[(df['ervaring'] == ervaring) & (df['binary_geslacht'] == geslacht) & (df['binary_gewichtsklasse'] == gewichtsklasse)]
            if not subset.empty:
                temp_train, temp_temp = train_test_split(subset, test_size=0.3, random_state=42)
                temp_val, temp_test = train_test_split(temp_temp, test_size=0.3, random_state=42)
                train_data = pd.concat([train_data, temp_train])
                val_data = pd.concat([val_data, temp_val])
                test_data = pd.concat([test_data, temp_test])

# Reset indexen
train_data.reset_index(drop=True, inplace=True)
val_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

# Controleren op juiste verdeling
print(f"Trainingsdata: {len(train_data)} rijen")
print(f"Validatiedata: {len(val_data)} rijen")
print(f"Testdata: {len(test_data)} rijen")

# Optioneel: data opslaan in aparte bestanden
train_data.to_csv('train_data.csv', index=False)
val_data.to_csv('val_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

print(df['2k tijd'].min())
print(df['2k tijd'].max())

   ervaring  500_split  2k tijd  binary_trainingtype  binary_geslacht  \
0       1.0      104.6    379.9                    0                0   
1       1.0      104.7    379.9                    0                0   
2       1.0      104.3    379.9                    0                0   
3       1.0      104.0    379.9                    0                0   
4       1.0      104.1    379.9                    0                0   

   binary_gewichtsklasse  
0                      1  
1                      1  
2                      1  
3                      1  
4                      1  
Trainingsdata: 3018 rijen
Validatiedata: 905 rijen
Testdata: 391 rijen
371.8
510.1


In [9]:
X_train = train_data.drop(columns=['2k tijd'])
y_train = train_data['2k tijd']

X_val = val_data.drop(columns=['2k tijd'])
y_val = val_data['2k tijd']

X_test = test_data.drop(columns=['2k tijd'])
y_test = test_data['2k tijd']

# Standardize the features
scaler = StandardScaler()
scaler.fit(X_train)  # Fit on training data only
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [12]:
regr = RandomForestRegressor(max_depth=10, random_state=0)

regr.fit(X_train_scaled, y_train)

predictions = regr.predict(X_test_scaled)
prediction1 = regr.predict(np.array([[1, 104.6, 0, 0, 1]]))
print(prediction1)
print(predictions)

from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, predictions)
print(f"mse = {mse}")

rse = np.sqrt(mse)
print(f"rse = {rse}")

[414.71087038]
[419.44280894 427.12883923 423.3962863  426.55324747 424.55061326
 426.34906324 420.35678189 426.78417265 429.33591594 424.25078386
 428.2779801  419.77773257 426.98442123 424.18701238 437.71324066
 427.30062489 437.70388649 425.92960251 412.97961434 426.11252485
 416.23459255 424.42728576 418.68037751 426.72033531 407.12271701
 426.31053161 430.42600515 415.21132009 417.28922657 422.00333904
 424.40978512 431.37811381 405.5        407.4572974  423.82798602
 426.6596272  417.51421238 425.26100405 415.91320153 420.09495876
 405.5        427.1615875  423.41414839 419.52950951 422.3616991
 423.01219151 426.14710821 422.16700468 421.05821946 418.31551775
 424.6182124  422.50137546 420.86031023 416.17147931 411.98387741
 420.74098994 425.70860989 428.26715722 411.45972333 415.45560103
 401.62356521 399.49028794 410.0895956  416.30253675 406.4605331
 413.9442     397.38276046 409.96588972 394.44589953 414.06742281
 402.19771055 406.85303738 405.93406538 407.36454523 407.120540

In [11]:
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

max_depths = range(1, 15)
validation_errors = []

for depth in max_depths:
    model = RandomForestRegressor(max_depth=depth, random_state=42)
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    mse = mean_squared_error(y_val, predictions)
    validation_errors.append(mse)

best_depth = max_depths[validation_errors.index(min(validation_errors))]
print(f"Best max_depth: {best_depth}")


Best max_depth: 10
