In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the pre-encoded dataset from the working directory and print it.
# Printing a long frame shows an abbreviated view (first and last rows).
df = pd.read_csv(filepath_or_buffer="encoded_df.csv")
print(df)

      500_split  2k tijd  geslacht_M  geslacht_V  gewichtsklasse_L  \
0           104      379           1           0                 0   
1           104      379           1           0                 0   
2           104      379           1           0                 0   
3           104      379           1           0                 0   
4           104      379           1           0                 0   
...         ...      ...         ...         ...               ...   
4309        120      484           0           1                 1   
4310         96      382           1           0                 0   
4311        101      382           1           0                 0   
4312         99      399           1           0                 0   
4313         94      377           1           0                 0   

      gewichtsklasse_Z  ervaring_0  ervaring_1  binary_trainingtype_0  \
0                    1           0           1                      1   
1            

In [48]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the encoded dataset and shuffle it once with a fixed seed.
# Re-reading the CSV keeps this cell idempotent: running it again never
# reshuffles an already-shuffled frame.
df = pd.read_csv("encoded_df.csv")
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split each (ervaring_0, geslacht_M, gewichtsklasse_L) stratum on its own so
# every combination of these three indicators is represented in train,
# validation and test. The per-stratum pieces are collected in lists and
# concatenated once afterwards, which avoids re-copying all previously
# collected rows on every iteration.
train_parts, val_parts, test_parts = [], [], []

# Strata are visited in the order of each column's unique() values, which
# fixes the row order of the resulting splits.
for ervaring_0 in df['ervaring_0'].unique():
    for geslacht_M in df['geslacht_M'].unique():
        for gewichtsklasse_L in df['gewichtsklasse_L'].unique():
            subset = df[(df['ervaring_0'] == ervaring_0) &
                        (df['geslacht_M'] == geslacht_M) &
                        (df['gewichtsklasse_L'] == gewichtsklasse_L)]
            if subset.empty:
                continue
            # 70% train; the remaining 30% is halved into 15% validation
            # and 15% test.
            # NOTE: train_test_split raises ValueError when a stratum is too
            # small to leave at least one row on each side of both splits.
            temp_train, temp_temp = train_test_split(subset, test_size=0.3, random_state=42)
            temp_val, temp_test = train_test_split(temp_temp, test_size=0.5, random_state=42)
            train_parts.append(temp_train)
            val_parts.append(temp_val)
            test_parts.append(temp_test)

# Single concatenation per split; ignore_index=True yields a fresh 0..n-1 index.
train_data = pd.concat(train_parts, ignore_index=True)
val_data = pd.concat(val_parts, ignore_index=True)
test_data = pd.concat(test_parts, ignore_index=True)

# Every row must land in exactly one split. Rows with a missing value in a
# stratification column never match an equality filter and would otherwise
# be dropped without notice.
assert len(train_data) + len(val_data) + len(test_data) == len(df), (
    "Splits do not cover all rows; check the stratification columns for missing values"
)

# Print the sizes of each split
print(f"Train data: {len(train_data)} rows")
print(f"Validation data: {len(val_data)} rows")
print(f"Test data: {len(test_data)} rows")

# Separate the features from the regression target ('2k tijd') for each split
X_train = train_data.drop(columns=['2k tijd'])
y_train = train_data['2k tijd']
X_val = val_data.drop(columns=['2k tijd'])
y_val = val_data['2k tijd']
X_test = test_data.drop(columns=['2k tijd'])
y_test = test_data['2k tijd']

Train data: 3018 rows
Validation data: 646 rows
Test data: 650 rows


In [49]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler

# Column groups: '500_split' is the only feature treated as continuous; every
# remaining column of X_train is handled as a 0/1 indicator.
continuous_features = ['500_split']  # Add other continuous variables if there are any
binary_features = [name for name in X_train.columns if name not in continuous_features]

# Fit the scaler on the training split only, then apply the same
# transformation to all three splits.
scaler = StandardScaler().fit(X_train[continuous_features])
X_train_cont = scaler.transform(X_train[continuous_features])
X_val_cont = scaler.transform(X_val[continuous_features])
X_test_cont = scaler.transform(X_test[continuous_features])

# Effect coding for the indicator columns: 0 becomes -1, 1 stays 1.
effect_coding = {0: -1, 1: 1}
X_train_bin = X_train[binary_features].replace(effect_coding).to_numpy()
X_val_bin = X_val[binary_features].replace(effect_coding).to_numpy()
X_test_bin = X_test[binary_features].replace(effect_coding).to_numpy()

# Final design matrices: scaled continuous column(s) first, coded indicators after.
X_train_processed = np.concatenate((X_train_cont, X_train_bin), axis=1)
X_val_processed = np.concatenate((X_val_cont, X_val_bin), axis=1)
X_test_processed = np.concatenate((X_test_cont, X_test_bin), axis=1)

# Three-hidden-layer MLP fitted on the processed training features.
model = MLPRegressor(
    hidden_layer_sizes=(128, 64, 32),
    max_iter=1000,
    random_state=42,
).fit(X_train_processed, y_train)

# Validation metrics
y_val_pred = model.predict(X_val_processed)
mse_val = mean_squared_error(y_val, y_val_pred)
r2_val = r2_score(y_val, y_val_pred)

print(f"Validation MSE: {mse_val:.3f}")
print(f"Validation R^2: {r2_val:.3f}")

# Test metrics
y_test_pred = model.predict(X_test_processed)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

print(f"Test MSE: {mse_test:.3f}")
print(f"Test R^2: {r2_test:.3f}")

# Per-row comparison of actual and predicted values on the test split
results = pd.DataFrame({'Actual 2k tijd': y_test, 'Predicted 2k tijd': y_test_pred})
results['Difference'] = results['Actual 2k tijd'] - results['Predicted 2k tijd']
print(results.head(10))

# Scatter of predicted against actual values for the test split
data = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})
fig = px.scatter(data, x='Actual', y='Predicted', title="Actual vs Predicted Values (Test Set)")

# Reference diagonal spanning the combined range of both columns; points on
# this line are perfect predictions.
min_val = min(data[axis].min() for axis in ('Actual', 'Predicted'))
max_val = max(data[axis].max() for axis in ('Actual', 'Predicted'))
fig.add_trace(
    go.Scatter(
        x=[min_val, max_val],
        y=[min_val, max_val],
        mode='lines',
        name='Perfect Prediction',
        line={'color': 'red'},
    )
)

# Render the figure
fig.show()


Validation MSE: 74.321
Validation R^2: 0.928
Test MSE: 60.654
Test R^2: 0.940
   Actual 2k tijd  Predicted 2k tijd  Difference
0             427         418.004805    8.995195
1             420         424.700218   -4.700218
2             418         423.764388   -5.764388
3             427         423.429131    3.570869
4             427         427.605284   -0.605284
5             420         420.970181   -0.970181
6             421         419.180109    1.819891
7             449         435.551768   13.448232
8             431         424.814820    6.185180
9             431         422.273555    8.726445


In [50]:
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
import plotly.express as px
import plotly.graph_objects as go

# Same layer sizes, but with activation='identity' (no non-linearity between
# layers) and fitted directly on X_train rather than on processed features.
model = MLPRegressor(
    hidden_layer_sizes=(128, 64, 32),
    activation='identity',
    max_iter=2000,
    random_state=42,
).fit(X_train, y_train)

# Score the validation split
y_val_pred = model.predict(X_val)
mse_val = mean_squared_error(y_val, y_val_pred)
r2_val = r2_score(y_val, y_val_pred)

print(f"Validation MSE: {mse_val:.3f}")
print(f"Validation R^2: {r2_val:.3f}")

# Score the test split
y_test_pred = model.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

print(f"Test MSE: {mse_test:.3f}")
print(f"Test R^2: {r2_test:.3f}")

# Side-by-side table of actual values, predictions and their difference
results = pd.DataFrame(
    {'Actual 2k tijd': y_test, 'Predicted 2k tijd': y_test_pred}
).assign(
    Difference=lambda frame: frame['Actual 2k tijd'] - frame['Predicted 2k tijd']
)
print(results.head(10))

# Predicted-versus-actual scatter for the test split
data = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})
fig = px.scatter(data, x='Actual', y='Predicted', title="Actual vs Predicted Values (Test Set)")

# Red diagonal marking where predicted equals actual, drawn across the full
# range covered by either column.
min_val = min(data[axis].min() for axis in ('Actual', 'Predicted'))
max_val = max(data[axis].max() for axis in ('Actual', 'Predicted'))
diagonal = [min_val, max_val]
fig.add_trace(go.Scatter(x=diagonal, y=list(diagonal), mode='lines',
                         name='Perfect Prediction', line={'color': 'red'}))

# Display the figure
fig.show()


Validation MSE: 93.169
Validation R^2: 0.910
Test MSE: 78.216
Test R^2: 0.923
   Actual 2k tijd  Predicted 2k tijd  Difference
0             427         418.277552    8.722448
1             420         423.348486   -3.348486
2             418         427.267767   -9.267767
3             427         421.317488    5.682512
4             427         432.335193   -5.335193
5             420         419.409045    0.590955
6             421         420.308549    0.691451
7             449         443.954977    5.045023
8             431         423.214717    7.785283
9             431         419.152722   11.847278
