In [6]:
import pandas as pd
import numpy as np

In [7]:
### splitting the data

from sklearn.model_selection import train_test_split

# Load the encoded dataset and shuffle it once with a fixed seed so the split
# below is reproducible.
df = pd.read_csv("encoded_df.csv")
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# train_test_split raises a ValueError when either side of a split would be
# empty. With test_size=0.3 applied twice, a stratum needs at least 4 rows for
# both splits to succeed; smaller strata are kept entirely in the training set.
MIN_STRATUM_ROWS = 4

# Collect the per-stratum pieces in lists and concatenate once at the end,
# instead of re-copying a growing DataFrame on every loop iteration.
train_parts, val_parts, test_parts = [], [], []

# Split separately inside every (ervaring_0, geslacht_M, gewichtsklasse_L)
# combination so each combination is represented in all three sets.
for ervaring_0 in df['ervaring_0'].unique():
    for geslacht_M in df['geslacht_M'].unique():
        for gewichtsklasse_L in df['gewichtsklasse_L'].unique():
            subset = df[(df['ervaring_0'] == ervaring_0) &
                        (df['geslacht_M'] == geslacht_M) &
                        (df['gewichtsklasse_L'] == gewichtsklasse_L)]
            if subset.empty:
                continue
            if len(subset) < MIN_STRATUM_ROWS:
                # Too few rows to populate all three sets; keep them for training.
                train_parts.append(subset)
                continue
            # 70% train; the 30% hold-out is split 70/30 again, which gives
            # roughly 21% validation and 9% test of the stratum.
            temp_train, temp_temp = train_test_split(subset, test_size=0.3, random_state=42)
            temp_val, temp_test = train_test_split(temp_temp, test_size=0.3, random_state=42)
            train_parts.append(temp_train)
            val_parts.append(temp_val)
            test_parts.append(temp_test)


def _stack_parts(parts):
    """Concatenate the collected pieces into one frame with a fresh 0..n-1 index.

    Returns an empty frame with the columns of ``df`` when ``parts`` is empty,
    because ``pd.concat`` raises on an empty list.
    """
    if not parts:
        return df.iloc[0:0].copy()
    return pd.concat(parts, ignore_index=True)


train_data = _stack_parts(train_parts)
val_data = _stack_parts(val_parts)
test_data = _stack_parts(test_parts)

print(f"Train data: {len(train_data)} rows")
print(f"Validation data: {len(val_data)} rows")
print(f"Test data: {len(test_data)} rows")

# '2k tijd' is the regression target; every other column is a feature.
X_train = train_data.drop(columns=['2k tijd'])
y_train = train_data['2k tijd']
X_val = val_data.drop(columns=['2k tijd'])
y_val = val_data['2k tijd']
X_test = test_data.drop(columns=['2k tijd'])
y_test = test_data['2k tijd']

Train data: 3018 rows
Validation data: 905 rows
Test data: 391 rows


In [10]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler

### Feature groups: '500_split' is standardised, every other column is effect coded
continuous_features = ['500_split']
binary_features = [name for name in X_train.columns if name not in continuous_features]

### Scaling the data
# The scaler is fitted on the training split only and then applied to all
# three splits, so validation/test statistics never influence the scaling.
scaler = StandardScaler().fit(X_train[continuous_features])
X_train_cont, X_val_cont, X_test_cont = (
    scaler.transform(split[continuous_features])
    for split in (X_train, X_val, X_test)
)


### Effect coding
def _effect_code(frame):
    """Return the binary feature columns of `frame` as an array with 0 mapped to -1."""
    return frame[binary_features].replace({0: -1, 1: 1}).to_numpy()


X_train_bin = _effect_code(X_train)
X_val_bin = _effect_code(X_val)
X_test_bin = _effect_code(X_test)

# Final design matrices: scaled continuous column(s) first, then the coded columns.
X_train_processed = np.concatenate([X_train_cont, X_train_bin], axis=1)
X_val_processed = np.concatenate([X_val_cont, X_val_bin], axis=1)
X_test_processed = np.concatenate([X_test_cont, X_test_bin], axis=1)

### Defining and fitting the network
# Two hidden ReLU layers (64 and 32 units) trained with Adam on mini-batches
# of 32; random_state fixes the weight initialisation and batch shuffling.
model = MLPRegressor(
    solver='adam',
    activation='relu',
    hidden_layer_sizes=(64, 32),
    batch_size=32,
    learning_rate_init=0.001,
    alpha=0.001,
    max_iter=2000,
    random_state=42,
).fit(X_train_processed, y_train)

### Validation Set Performance
y_val_pred = model.predict(X_val_processed)
mse_val, mae_val, r2_val = (
    metric(y_val, y_val_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(
    f"Validation MSE: {mse_val:.3f}",
    f"Validation MAE: {mae_val:.3f}",
    f"Validation R²: {r2_val:.3f}",
    sep="\n",
)

### Test Set Performance
y_test_pred = model.predict(X_test_processed)
mse_test, mae_test, r2_test = (
    metric(y_test, y_test_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(
    f"Test MSE: {mse_test:.3f}",
    f"Test MAE: {mae_test:.3f}",
    f"Test R²: {r2_test:.3f}",
    sep="\n",
)

### Display first 10 results
# Side-by-side table of the true target, the prediction and the residual
# (actual minus predicted) for the test set.
results = pd.DataFrame({'Actual 2k tijd': y_test, 'Predicted 2k tijd': y_test_pred})
results['Difference'] = results['Actual 2k tijd'] - results['Predicted 2k tijd']
print(results.head(10))

### Plotting Actual vs Predicted Values
data = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})
fig = px.scatter(data, x='Actual', y='Predicted', title="Actual vs Predicted Values (Test Set)")

# The reference diagonal spans the full range covered by both columns.
min_val = min(data[column].min() for column in ('Actual', 'Predicted'))
max_val = max(data[column].max() for column in ('Actual', 'Predicted'))

perfect_line = go.Scatter(
    x=[min_val, max_val],
    y=[min_val, max_val],
    mode='lines',
    name='Perfect Prediction',
    line=dict(color='red'),
)
fig.add_trace(perfect_line)
fig.show()


Validation MSE: 59.590
Validation MAE: 5.886
Validation R²: 0.942
Test MSE: 52.364
Test MAE: 5.647
Test R²: 0.949
   Actual 2k tijd  Predicted 2k tijd  Difference
0             427         419.482261    7.517739
1             420         425.737814   -5.737814
2             418         417.724317    0.275683
3             427         423.654897    3.345103
4             427         427.069499   -0.069499
5             420         423.077547   -3.077547
6             421         420.314186    0.685814
7             449         441.315095    7.684905
8             431         430.398375    0.601625
9             431         427.784472    3.215528


In [11]:
### With the one-hot-encoded dataset, but without extras:
### the X_* frames are passed to the network as-is.

### Defining and fitting the network
# Three hidden layers (128, 64, 32) with the identity activation.
model = MLPRegressor(
    activation='identity',
    hidden_layer_sizes=(128, 64, 32),
    max_iter=2000,
    random_state=42,
).fit(X_train, y_train)

### Validation set performance
y_val_pred = model.predict(X_val)
mse_val = mean_squared_error(y_val, y_val_pred)
r2_val = r2_score(y_val, y_val_pred)

print(
    f"Validation MSE: {mse_val:.3f}",
    f"Validation R^2: {r2_val:.3f}",
    sep="\n",
)

### Test set performance
y_test_pred = model.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

print(
    f"Test MSE: {mse_test:.3f}",
    f"Test R^2: {r2_test:.3f}",
    sep="\n",
)

### Display first 10 results
# True target, prediction and residual (actual minus predicted) on the test set.
results = pd.DataFrame({'Actual 2k tijd': y_test, 'Predicted 2k tijd': y_test_pred})
results['Difference'] = results['Actual 2k tijd'] - results['Predicted 2k tijd']
print(results.head(10))

### Plotting the actual vs predicted values
data = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})
fig = px.scatter(data, x='Actual', y='Predicted', title="Actual vs Predicted Values (Test Set)")

# The reference diagonal spans the full range covered by both columns.
min_val = min(data[column].min() for column in ('Actual', 'Predicted'))
max_val = max(data[column].max() for column in ('Actual', 'Predicted'))

perfect_line = go.Scatter(
    x=[min_val, max_val],
    y=[min_val, max_val],
    mode='lines',
    name='Perfect Prediction',
    line=dict(color='red'),
)
fig.add_trace(perfect_line)
fig.show()


Validation MSE: 117.693
Validation R^2: 0.885
Test MSE: 113.845
Test R^2: 0.889
   Actual 2k tijd  Predicted 2k tijd  Difference
0             427         416.585320   10.414680
1             420         421.962344   -1.962344
2             418         424.553286   -6.553286
3             427         419.385709    7.614291
4             427         434.622501   -7.622501
5             420         417.133996    2.866004
6             421         419.161955    1.838045
7             449         448.171443    0.828557
8             431         422.036570    8.963430
9             431         416.883300   14.116700
