In [1]:
import pandas as pd
import numpy as np

In [2]:
### splitting the data

from sklearn.model_selection import train_test_split

df = pd.read_csv("encoded_df.csv")
# Shuffle all rows once with a fixed seed so the row order feeding the splits is reproducible.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Each (ervaring_0, geslacht_M, gewichtsklasse_L) combination is split separately.
# The per-combination pieces are collected in lists and concatenated once after the
# loop: calling pd.concat inside the loop recopies every previously collected row on
# each iteration and starts by concatenating against an empty placeholder frame.
train_parts = []
val_parts = []
test_parts = []

for ervaring_0 in df['ervaring_0'].unique():
    for geslacht_M in df['geslacht_M'].unique():
        for gewichtsklasse_L in df['gewichtsklasse_L'].unique():
            subset = df[(df['ervaring_0'] == ervaring_0) &
                        (df['geslacht_M'] == geslacht_M) &
                        (df['gewichtsklasse_L'] == gewichtsklasse_L)]
            # Not every combination of the three flags has to occur in the data.
            if not subset.empty:
                # First cut: 70% train, 30% held out.
                temp_train, temp_temp = train_test_split(subset, test_size=0.3, random_state=42)
                # Second cut: the held-out 30% is split 70/30 again, i.e. roughly
                # 21% validation and 9% test of the combination's rows.
                temp_val, temp_test = train_test_split(temp_temp, test_size=0.3, random_state=42)
                train_parts.append(temp_train)
                val_parts.append(temp_val)
                test_parts.append(temp_test)

# Single concatenation per split; the pieces keep the loop order, then get a fresh 0..n-1 index.
train_data = pd.concat(train_parts).reset_index(drop=True)
val_data = pd.concat(val_parts).reset_index(drop=True)
test_data = pd.concat(test_parts).reset_index(drop=True)

print(f"Train data: {len(train_data)} rows")
print(f"Validation data: {len(val_data)} rows")
print(f"Test data: {len(test_data)} rows")

# Separate the regression target ('2k tijd') from the feature columns for every split.
X_train = train_data.drop(columns=['2k tijd'])
y_train = train_data['2k tijd']
X_val = val_data.drop(columns=['2k tijd'])
y_val = val_data['2k tijd']
X_test = test_data.drop(columns=['2k tijd'])
y_test = test_data['2k tijd']

Train data: 3018 rows
Validation data: 905 rows
Test data: 391 rows


In [3]:
### with the one-hot-encoded dataset, effect coding, and other parameters

from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
import plotly.graph_objects as go
import plotly.express as px

# '500_split' is the only column treated as continuous; every other feature
# column is handled as a 0/1 indicator.
continuous_features = ['500_split']
binary_features = [name for name in X_train.columns if name not in continuous_features]

### scaling the data
# The scaler is fitted on the training split only and then applied to all three splits.
scaler = StandardScaler()
scaler.fit(X_train[continuous_features])
X_train_cont = scaler.transform(X_train[continuous_features])
X_val_cont = scaler.transform(X_val[continuous_features])
X_test_cont = scaler.transform(X_test[continuous_features])

### effect coding
# Recode the indicator columns from {0, 1} to {-1, 1}.
effect_coding = {0: -1, 1: 1}
X_train_bin = X_train[binary_features].replace(effect_coding).to_numpy()
X_val_bin = X_val[binary_features].replace(effect_coding).to_numpy()
X_test_bin = X_test[binary_features].replace(effect_coding).to_numpy()

# Final design matrices: scaled continuous column(s) first, effect-coded columns after.
X_train_processed = np.concatenate([X_train_cont, X_train_bin], axis=1)
X_val_processed = np.concatenate([X_val_cont, X_val_bin], axis=1)
X_test_processed = np.concatenate([X_test_cont, X_test_bin], axis=1)

### defining the network
mlp_params = {
    'hidden_layer_sizes': (64, 32),
    'activation': 'relu',
    'solver': 'adam',
    'alpha': 0.001,
    'learning_rate_init': 0.001,
    'batch_size': 32,
    'max_iter': 2000,
    'random_state': 42,
}
model = MLPRegressor(**mlp_params)

model.fit(X_train_processed, y_train)

### validation metrics
y_val_pred = model.predict(X_val_processed)
mse_val = mean_squared_error(y_val, y_val_pred)
r2_val = r2_score(y_val, y_val_pred)

print(f"Validation MSE: {mse_val:.3f}")
print(f"Validation R^2: {r2_val:.3f}")

### test metrics
y_test_pred = model.predict(X_test_processed)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

print(f"Test MSE: {mse_test:.3f}")
print(f"Test R^2: {r2_test:.3f}")

# Per-row comparison on the test split; 'Difference' is actual minus predicted.
results = pd.DataFrame({
    'Actual 2k tijd': y_test,
    'Predicted 2k tijd': y_test_pred,
    'Difference': y_test - y_test_pred,
})
print(results.head(10))


### plotting the actual vs predicted values
data = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})
# Common axis range so the reference diagonal spans both actual and predicted values.
min_val = min(data['Actual'].min(), data['Predicted'].min())
max_val = max(data['Actual'].max(), data['Predicted'].max())

fig = px.scatter(data, x='Actual', y='Predicted', title="Actual vs Predicted Values (Test Set)")
perfect_line = go.Scatter(
    x=[min_val, max_val],
    y=[min_val, max_val],
    mode='lines',
    name='Perfect Prediction',
    line={'color': 'red'},
)
fig.add_trace(perfect_line)
fig.show()


Validation MSE: 66.094
Validation R^2: 0.936
Test MSE: 56.916
Test R^2: 0.945
   Actual 2k tijd  Predicted 2k tijd  Difference
0             427         419.717878    7.282122
1             420         426.352743   -6.352743
2             418         425.543492   -7.543492
3             427         425.338352    1.661648
4             427         426.899984    0.100016
5             420         422.207903   -2.207903
6             421         420.732269    0.267731
7             449         435.570891   13.429109
8             431         426.746206    4.253794
9             431         424.717424    6.282576


In [4]:
### with the one-hot-encoded dataset, but without extras

### defining the network
# Baseline: the raw feature frames go in as-is (no scaling, no effect coding).
model = MLPRegressor(
    hidden_layer_sizes=(128, 64, 32),
    activation='identity',
    max_iter=2000,
    random_state=42,
)

model.fit(X_train, y_train)

### validation metrics
y_val_pred = model.predict(X_val)
mse_val = mean_squared_error(y_val, y_val_pred)
r2_val = r2_score(y_val, y_val_pred)

print(f"Validation MSE: {mse_val:.3f}")
print(f"Validation R^2: {r2_val:.3f}")

### test metrics
y_test_pred = model.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

print(f"Test MSE: {mse_test:.3f}")
print(f"Test R^2: {r2_test:.3f}")

# Per-row comparison on the test split; 'Difference' is actual minus predicted.
results = pd.DataFrame({
    'Actual 2k tijd': y_test,
    'Predicted 2k tijd': y_test_pred,
    'Difference': y_test - y_test_pred,
})
print(results.head(10))

### plotting the actual vs predicted values
data = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})
# Common axis range so the reference diagonal spans both actual and predicted values.
min_val = min(data['Actual'].min(), data['Predicted'].min())
max_val = max(data['Actual'].max(), data['Predicted'].max())

fig = px.scatter(data, x='Actual', y='Predicted', title="Actual vs Predicted Values (Test Set)")
perfect_line = go.Scatter(
    x=[min_val, max_val],
    y=[min_val, max_val],
    mode='lines',
    name='Perfect Prediction',
    line={'color': 'red'},
)
fig.add_trace(perfect_line)
fig.show()


Validation MSE: 86.868
Validation R^2: 0.915
Test MSE: 82.895
Test R^2: 0.919
   Actual 2k tijd  Predicted 2k tijd  Difference
0             427         418.277552    8.722448
1             420         423.348486   -3.348486
2             418         427.267767   -9.267767
3             427         421.317488    5.682512
4             427         432.335193   -5.335193
5             420         419.409045    0.590955
6             421         420.308549    0.691451
7             449         443.954977    5.045023
8             431         423.214717    7.785283
9             431         419.152722   11.847278
