In [1]:
import numpy as np
import pandas as pd
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score

#Regression algorythms
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [2]:
#Data split
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=345)

In [3]:
#Data standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [4]:
models = [
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('SVR', SVR()),
    ('Decision Tree', DecisionTreeRegressor()),
    ('Random Forest', RandomForestRegressor()),
    ('k-NN', KNeighborsRegressor()),
    ('Gradient Boosting', GradientBoostingRegressor()),
    ('Neural Network', MLPRegressor(max_iter=10000)),
    ('XGBoost', XGBRegressor(objective='reg:squarederror')),
    ('LightGBM', LGBMRegressor()),
    ('CatBoost', CatBoostRegressor(verbose=0))
]

In [5]:
#Cross-validation parameters
cv = KFold(n_splits=5, shuffle=True, random_state=345)

In [6]:
#Default hyperparameters
default_results = []
for name, model in models:
    nmse_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error') #negative becouse cross_val_score is maximalizing
    nmae_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_absolute_error') #negative becouse cross_val_score is maximalizing
    r2_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')
    ev_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='explained_variance')
    
    default_results.append({
        'Model': name,
        'MSE': -nmse_scores.mean(),  # conversion NMSE to MSE
        'MAE': -nmae_scores.mean(),  # conversion NMAE to MAE
        'R2': r2_scores.mean(),
        'Explained Variance': ev_scores.mean()
    })



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000029 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 282, number of used features: 10
[LightGBM] [Info] Start training from score 152.716312
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000042 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 521
[LightGBM] [Info] Number of data points in the train set: 282, number of used features: 10
[LightGBM] [Info] Start training from score 155.638298
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000068 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 520
[LightGBM] [Info] Number of data points in the train set: 2

In [52]:
def styled_df(df):
    styled_df = df.style.background_gradient(subset=['R2', 'Explained Variance'], cmap='RdYlGn')
    styled_df = styled_df.background_gradient(subset=['MSE', 'MAE'], cmap='RdYlGn_r') #reverse color map for minimized metrics
    return styled_df

In [53]:
default_results_df = pd.DataFrame(default_results)
styled_df(default_results_df)


Unnamed: 0,Model,MSE,MAE,R2,Explained Variance
0,Ridge Regression,3017.384479,44.988392,0.455947,0.470067
1,Lasso Regression,3002.962386,44.841481,0.460822,0.473699
2,SVR,5048.65862,60.023966,0.118126,0.144811
3,Decision Tree,6226.417103,64.378068,-0.12266,-0.165075
4,Random Forest,3393.829573,49.213008,0.384307,0.398079
5,k-NN,3685.557257,48.579219,0.340087,0.349525
6,Gradient Boosting,3484.646286,47.659598,0.366107,0.379074
7,Neural Network,3712.27135,47.333618,0.367778,0.379849
8,XGBoost,4167.346671,52.814794,0.24419,0.263011
9,LightGBM,3667.813168,49.288241,0.340606,0.349145


In [34]:
#Defining hyperparameter grids for individual models

param_grids = {
    'Ridge Regression': {
        'alpha': [0.01, 0.1, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0]
    },
    'Lasso Regression': {
        'alpha': [0.01, 0.1, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0]
    },
    'SVR': {
        'C': [0.01, 0.1, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0],
        'gamma': [0.001, 0.01, 0.1, 0.2, 0.5, 1],
        'kernel': ['rbf', 'linear']
    },
    'Decision Tree': {
        'max_depth': [None, 2, 5, 10, 20, 30, 40, 50, 100],
        'min_samples_split': [2, 4, 5, 6, 8, 10, 12, 15],
        'min_samples_leaf': [1, 2, 4, 5, 8, 10]
    },
    'Random Forest': {
        'n_estimators': [10, 25, 40, 50, 80, 100],
        'max_depth': [None, 5, 10, 20, 25, 30, 40],
        'min_samples_split': [2, 5, 8, 10],
        'min_samples_leaf': [1, 2, 4, 5, 8, 10, 15]
    },
    'k-NN': {
        'n_neighbors': [3, 5, 7, 9, 11],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    },
    'Gradient Boosting': {
        'n_estimators': [10, 20, 25, 30, 40, 50, 80, 100, 150, 200],
        'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.5, 1],
        'max_depth': [3, 5, 7, 9, 11]
    },
    'XGBoost': {
        'n_estimators': [50, 75, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'max_depth': [3, 5, 7, 9]
    },
    'LightGBM': {
        'n_estimators': [50, 75, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'num_leaves': [31, 50, 75, 100]
    },
    'CatBoost': {
        'iterations': [100, 200, 500],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'depth': [3, 5, 7, 9]
    },
    'Neural Network': {
        'hidden_layer_sizes': [(50,), (100,)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'lbfgs'],
        'alpha': [0.0001, 0.01],
        'learning_rate': ['constant', 'adaptive']
    }

}

In [10]:
best_models = []
for name, model in models:
    print(f"Tuning {name}...")
    grid_search = GridSearchCV(estimator=model, param_grid=param_grids[name], cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_models.append((name, grid_search.best_estimator_, grid_search.best_params_))
    print(f"Best parameters for {name}: {grid_search.best_params_}")

Tuning Ridge Regression...
Best parameters for Ridge Regression: {'alpha': 5.0}
Tuning Lasso Regression...
Best parameters for Lasso Regression: {'alpha': 1.0}
Tuning SVR...
Best parameters for SVR: {'C': 50.0, 'gamma': 0.01, 'kernel': 'rbf'}
Tuning Decision Tree...
Best parameters for Decision Tree: {'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}
Tuning Random Forest...
Best parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_estimators': 25}
Tuning k-NN...
Best parameters for k-NN: {'algorithm': 'auto', 'n_neighbors': 7, 'weights': 'distance'}
Tuning Gradient Boosting...
Best parameters for Gradient Boosting: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Tuning Neural Network...




Best parameters for Neural Network: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (50,), 'learning_rate': 'constant', 'solver': 'adam'}
Tuning XGBoost...
Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Tuning LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000081 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 597
[LightGBM] [Info] Number of data points in the train set: 353, number of used features: 10
[LightGBM] [Info] Start training from score 153.427762
Best parameters for LightGBM: {'learning_rate': 0.01, 'n_estimators': 200, 'num_leaves': 31}
Tuning CatBoost...
Best parameters for CatBoost: {'depth': 3, 'iterations': 100, 'learning_rate': 0.1}


In [11]:
tuned_results = []
for name, model, params in best_models:
    mse_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error')
    mae_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_absolute_error')
    r2_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')
    ev_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='explained_variance')
    
    tuned_results.append({
        'Model': name,
        'MSE': -mse_scores.mean(),  # neg_mean_squared_error zwraca wartości ujemne
        'MAE': -mae_scores.mean(),  # neg_mean_absolute_error zwraca wartości ujemne
        'R2': r2_scores.mean(),
        'Explained Variance': ev_scores.mean()
    })

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000052 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 282, number of used features: 10
[LightGBM] [Info] Start training from score 152.716312
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000066 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 521
[LightGBM] [Info] Number of data points in the train set: 282, number of used features: 10
[LightGBM] [Info] Start training from score 155.638298
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000039 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 520
[LightGBM] [Info] Number of data points in the train set: 282, number of used features: 10
[LightGBM] [Info] Start training

In [55]:
tuned_results_df = pd.DataFrame(tuned_results)
styled_df(tuned_results_df)

Unnamed: 0,Model,MSE,MAE,R2,Explained Variance
0,Ridge Regression,3015.265061,44.88823,0.457149,0.470543
1,Lasso Regression,3002.962386,44.841481,0.460822,0.473699
2,SVR,2998.148894,44.802185,0.462328,0.477003
3,Decision Tree,3889.810261,49.963001,0.304464,0.323282
4,Random Forest,3264.350226,47.300805,0.406648,0.415435
5,k-NN,3445.401476,47.025107,0.385431,0.395019
6,Gradient Boosting,3356.853352,47.365533,0.394296,0.4058
7,Neural Network,3339.990336,45.806853,0.376346,0.418749
8,XGBoost,3461.701999,48.454532,0.376167,0.389286
9,LightGBM,3341.621179,48.153615,0.405178,0.413828


In [13]:
best_models_random = []
for name, model in models:
    print(f"Randomized Tuning {name}...")
    randomized_search = RandomizedSearchCV(estimator=model, param_distributions=param_grids[name], cv=cv, scoring='neg_mean_squared_error', n_iter=20, n_jobs=-1, random_state=42)
    randomized_search.fit(X_train, y_train)
    best_models_random.append((name, randomized_search.best_estimator_, randomized_search.best_params_))
    print(f"Best parameters for {name}: {randomized_search.best_params_}")

Randomized Tuning Ridge Regression...
Best parameters for Ridge Regression: {'alpha': 5.0}
Randomized Tuning Lasso Regression...
Best parameters for Lasso Regression: {'alpha': 1.0}
Randomized Tuning SVR...




Best parameters for SVR: {'kernel': 'linear', 'gamma': 0.5, 'C': 2.0}
Randomized Tuning Decision Tree...
Best parameters for Decision Tree: {'min_samples_split': 6, 'min_samples_leaf': 5, 'max_depth': 2}
Randomized Tuning Random Forest...
Best parameters for Random Forest: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 8, 'max_depth': None}
Randomized Tuning k-NN...
Best parameters for k-NN: {'weights': 'distance', 'n_neighbors': 7, 'algorithm': 'ball_tree'}
Randomized Tuning Gradient Boosting...
Best parameters for Gradient Boosting: {'n_estimators': 25, 'max_depth': 3, 'learning_rate': 0.2}
Randomized Tuning Neural Network...




Best parameters for Neural Network: {'solver': 'adam', 'learning_rate': 'constant', 'hidden_layer_sizes': (50,), 'alpha': 0.01, 'activation': 'relu'}
Randomized Tuning XGBoost...
Best parameters for XGBoost: {'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.1}
Randomized Tuning LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000083 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 597
[LightGBM] [Info] Number of data points in the train set: 353, number of used features: 10
[LightGBM] [Info] Start training from score 153.427762
Best parameters for LightGBM: {'num_leaves': 31, 'n_estimators': 200, 'learning_rate': 0.01}
Randomized Tuning CatBoost...
Best parameters for CatBoost: {'learning_rate': 0.01, 'iterations': 500, 'depth': 3}


In [14]:
randomized_results = []
for name, model, params in best_models_random:
    mse_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error')
    mae_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_absolute_error')
    r2_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')
    ev_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='explained_variance')
    
    randomized_results.append({
        'Model': name,
        'MSE': -mse_scores.mean(),  # neg_mean_squared_error zwraca wartości ujemne
        'MAE': -mae_scores.mean(),  # neg_mean_absolute_error zwraca wartości ujemne
        'R2': r2_scores.mean(),
        'Explained Variance': ev_scores.mean()
    })



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000099 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 282, number of used features: 10
[LightGBM] [Info] Start training from score 152.716312
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000038 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 521
[LightGBM] [Info] Number of data points in the train set: 282, number of used features: 10
[LightGBM] [Info] Start training from score 155.638298
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000075 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 520
[LightGBM] [Info] Number of data points in the train set: 282, number of used features: 10
[LightGBM] [Info] Start training

In [56]:
randomized_results_df = pd.DataFrame(randomized_results)
styled_df(randomized_results_df)

Unnamed: 0,Model,MSE,MAE,R2,Explained Variance
0,Ridge Regression,3015.265061,44.88823,0.457149,0.470543
1,Lasso Regression,3002.962386,44.841481,0.460822,0.473699
2,SVR,3027.373709,45.286646,0.456227,0.469717
3,Decision Tree,3889.810261,49.963001,0.304464,0.323282
4,Random Forest,3274.191068,47.292105,0.416558,0.415494
5,k-NN,3445.401476,47.025107,0.385431,0.395019
6,Gradient Boosting,3540.296834,49.119401,0.3606,0.374712
7,Neural Network,3485.319448,45.369163,0.399794,0.401979
8,XGBoost,3461.701999,48.454532,0.376167,0.389286
9,LightGBM,3341.621179,48.153615,0.405178,0.413828
