In [13]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer

In [14]:
clean_df = pd.read_csv('test_clean_df.csv')

categorical_cols = ['geslacht', 'gewichtsklasse', 'trainingype', 'ervaring', 
                    'datum', 'ploeg', 'zone', 'intervaltype',
                    'aantal_intervallen', 'interval_tijd', 
                    'interval_nummer', 'spm']

for col in categorical_cols:
    clean_df[col] = clean_df[col].astype(str)

numerical_cols = ['500_split', 'ervaring', 'interval_afstand', 'rust']
clean_df = clean_df.dropna(subset=numerical_cols)

features = clean_df.drop(columns=['2k tijd', 'datum', '2k datum'])
target = clean_df['2k tijd']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.1, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'),categorical_cols)
    ]
)

X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

ValueError: With n_samples=0, test_size=0.1 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [17]:
model = XGBRegressor(
    n_estimators=200,
    max_depth=3,
    learning_rate=0.1,
    subsample=0.6,
    colsample_bytree=0.7,
    random_state=42
)

model.fit(X_train_processed, y_train)

y_pred = model.predict(X_test_processed)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('MSE: ', mse)
print('R2: ', r2)

neg_mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

cv_scores = cross_val_score(model, X_train_processed, y_train, cv=5, scoring=neg_mse_scorer)

mean_mse = -cv_scores.mean()
cv_scores = cross_val_score(model, X_train_processed, y_train, cv=5, scoring='r2')
print('Mean cross-val R2 score: ', cv_scores.mean())
print('Mean MSE from cross-val: ', mean_mse)

NameError: name 'X_train_processed' is not defined