In [20]:
import pandas as pd
import numpy as np

df = pd.read_csv('final_df.csv')

In [21]:
from sklearn.preprocessing import OneHotEncoder

categorical_columns = ['binary_geslacht']
encoder = OneHotEncoder(sparse_output=False)

one_hot_encoded = encoder.fit_transform(df[categorical_columns])

one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))

df_encoded = pd.concat([one_hot_df, df['500_split'], df['2k tijd']], axis=1)
df_encoded = df_encoded.dropna()

print(df_encoded)

      binary_geslacht_0  binary_geslacht_1  500_split  2k tijd
0                   1.0                0.0    104.600    379.9
1                   1.0                0.0    104.700    379.9
2                   1.0                0.0    104.300    379.9
3                   1.0                0.0    104.000    379.9
4                   1.0                0.0    104.100    379.9
...                 ...                ...        ...      ...
4309                0.0                1.0    120.400    484.1
4310                1.0                0.0     96.975    382.8
4311                1.0                0.0    101.600    382.8
4312                1.0                0.0     99.775    399.1
4313                1.0                0.0     94.250    377.0

[4314 rows x 4 columns]


In [22]:
from sklearn.model_selection import train_test_split

encoded_array = np.array(df_encoded)

X, r = encoded_array[:, 0:-1], encoded_array[:, -1].reshape(-1, 1)

X_train, X_test, r_train, r_test = train_test_split(X, r, test_size=0.3, random_state=77)

In [23]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaling_split_train = scaler.fit_transform(X_train[:, 2].reshape(-1,1))
X_train = np.concatenate((X_train[:, 0:2], scaling_split_train), axis=1)

scaling_split_test = scaler.transform(X_test[:, 2].reshape(-1,1))
X_test = np.concatenate((X_test[:, 0:2], scaling_split_test), axis=1)

In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
import random

model = Pipeline([
    ('regressor', MLPRegressor())
])

param_grid = {
    'regressor__hidden_layer_sizes': [(64, 32), (128, 64), (128,)],
    'regressor__activation': ['relu', 'tanh', 'logistic'],
    'regressor__max_iter': [1000],
    'regressor__learning_rate_init': [0.0001, 0.001, 0.01],
    'regressor__solver': ['adam', 'sgd'],
    'regressor__alpha': [0.0001, 0.001, 0.01],
    'regressor__tol': [0.0001, 0.00001],
}

grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')

grid.fit(X_train, np.ravel(r_train))

print("Best parameters found: ", grid.best_params_)
print("Best cross-validation score: ", grid.best_score_)

best_model = grid.best_estimator_


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/imme/miniconda3/lib/python3.12/site-packages/joblib/externals/loky/backend/popen_loky_posix.py", line 180, in <module>
    exitcode = process_obj._bootstrap()
  File "/home/imme/miniconda3/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/imme/miniconda3/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)


Best parameters found:  {'regressor__activation': 'tanh', 'regressor__alpha': 0.01, 'regressor__hidden_layer_sizes': (128,), 'regressor__learning_rate_init': 0.001, 'regressor__max_iter': 1000, 'regressor__solver': 'sgd', 'regressor__tol': 0.0001}
Best cross-validation score:  -169.76525643199906


In [25]:
from sklearn.metrics import mean_squared_error

r_test_pred = best_model.predict(X_test)
MSE = mean_squared_error(r_test, r_test_pred)

print(MSE)

176.99170395991837
