In [1]:
#1.
import numpy as np
import pandas as pd
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score

#Regression algorythms
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [2]:
#2. Data split
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=345)

In [3]:
#3. Data standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [4]:
#4.
models = [
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('SVR', SVR()),
    ('Decision Tree', DecisionTreeRegressor()),
    ('Random Forest', RandomForestRegressor()),
    ('k-NN', KNeighborsRegressor()),
    ('Gradient Boosting', GradientBoostingRegressor()),
    ('Neural Network', MLPRegressor(max_iter=10000)),
    ('XGBoost', XGBRegressor(objective='reg:squarederror')),
    ('CatBoost', CatBoostRegressor(verbose=0))
]

In [5]:
#5. Cross-validation parameters
cv = KFold(n_splits=5, shuffle=True, random_state=345)

In [6]:
#6. Default hyperparameters
default_results = []
for name, model in models:
    nmse_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error') #negative becouse cross_val_score is maximalizing
    nmae_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_absolute_error') #negative becouse cross_val_score is maximalizing
    r2_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')
    ev_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='explained_variance')
    
    default_results.append({
        'Model': name,
        'MSE': -nmse_scores.mean(),  # conversion NMSE to MSE
        'MAE': -nmae_scores.mean(),  # conversion NMAE to MAE
        'R2': r2_scores.mean(),
        'Explained Variance': ev_scores.mean()
    })



In [9]:
#7. Defining hyperparameter grids for individual models

param_grids = {
    'Ridge Regression': {
        'alpha': [0.01, 0.1, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0]
    },
    'Lasso Regression': {
        'alpha': [0.01, 0.1, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0]
    },
    'SVR': {
        'C': [0.01, 0.1, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0],
        'gamma': [0.001, 0.01, 0.1, 0.2, 0.5, 1],
        'kernel': ['rbf', 'linear']
    },
    'Decision Tree': {
        'max_depth': [None, 2, 5, 10, 20, 30, 40, 50, 100],
        'min_samples_split': [2, 4, 5, 6, 8, 10, 12, 15],
        'min_samples_leaf': [1, 2, 4, 5, 8, 10]
    },
    'Random Forest': {
        'n_estimators': [10, 25, 40, 50, 80, 100],
        'max_depth': [None, 5, 10, 20, 25, 30, 40],
        'min_samples_split': [2, 5, 8, 10],
        'min_samples_leaf': [1, 2, 4, 5, 8, 10, 15]
    },
    'k-NN': {
        'n_neighbors': [3, 5, 7, 9, 11],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    },
    'Gradient Boosting': {
        'n_estimators': [10, 20, 25, 30, 40, 50, 80, 100, 150, 200],
        'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.5, 1],
        'max_depth': [3, 5, 7, 9, 11]
    },
    'Neural Network': {
        'hidden_layer_sizes': [(50,), (100,)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'lbfgs'],
        'alpha': [0.0001, 0.01],
        'learning_rate': ['constant', 'adaptive']
    },
    'XGBoost': {
        'n_estimators': [50, 75, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'max_depth': [3, 5, 7, 9]
    },
    'CatBoost': {
        'iterations': [100, 200, 500],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'depth': [3, 5, 7, 9]
    }

}

In [10]:
#8. 
best_models = []
for name, model in models:
    print(f"Tuning {name}...")
    grid_search = GridSearchCV(estimator=model, param_grid=param_grids[name], cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_models.append((name, grid_search.best_estimator_, grid_search.best_params_))
    print(f"Best parameters for {name}: {grid_search.best_params_}")

Tuning Ridge Regression...
Best parameters for Ridge Regression: {'alpha': 5.0}
Tuning Lasso Regression...
Best parameters for Lasso Regression: {'alpha': 1.0}
Tuning SVR...
Best parameters for SVR: {'C': 50.0, 'gamma': 0.01, 'kernel': 'rbf'}
Tuning Decision Tree...
Best parameters for Decision Tree: {'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}
Tuning Random Forest...
Best parameters for Random Forest: {'max_depth': 25, 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 10}
Tuning k-NN...
Best parameters for k-NN: {'algorithm': 'auto', 'n_neighbors': 7, 'weights': 'distance'}
Tuning Gradient Boosting...
Best parameters for Gradient Boosting: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Tuning Neural Network...




Best parameters for Neural Network: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (50,), 'learning_rate': 'adaptive', 'solver': 'adam'}
Tuning XGBoost...
Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Tuning CatBoost...
Best parameters for CatBoost: {'depth': 3, 'iterations': 100, 'learning_rate': 0.1}


In [11]:
#9. 
tuned_results = []
for name, model, params in best_models:
    mse_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error')
    mae_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_absolute_error')
    r2_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')
    ev_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='explained_variance')
    
    tuned_results.append({
        'Model': name,
        'MSE': -mse_scores.mean(),  # neg_mean_squared_error zwraca wartości ujemne
        'MAE': -mae_scores.mean(),  # neg_mean_absolute_error zwraca wartości ujemne
        'R2': r2_scores.mean(),
        'Explained Variance': ev_scores.mean()
    })

In [13]:
#10. 
best_models_random = []
for name, model in models:
    print(f"Randomized Tuning {name}...")
    randomized_search = RandomizedSearchCV(estimator=model, param_distributions=param_grids[name], cv=cv, scoring='neg_mean_squared_error', n_iter=20, n_jobs=-1, random_state=42)
    randomized_search.fit(X_train, y_train)
    best_models_random.append((name, randomized_search.best_estimator_, randomized_search.best_params_))
    print(f"Best parameters for {name}: {randomized_search.best_params_}")

Randomized Tuning Ridge Regression...
Best parameters for Ridge Regression: {'alpha': 5.0}
Randomized Tuning Lasso Regression...
Best parameters for Lasso Regression: {'alpha': 1.0}
Randomized Tuning SVR...




Best parameters for SVR: {'kernel': 'linear', 'gamma': 0.5, 'C': 2.0}
Randomized Tuning Decision Tree...
Best parameters for Decision Tree: {'min_samples_split': 6, 'min_samples_leaf': 5, 'max_depth': 2}
Randomized Tuning Random Forest...
Best parameters for Random Forest: {'n_estimators': 50, 'min_samples_split': 5, 'min_samples_leaf': 8, 'max_depth': 40}
Randomized Tuning k-NN...
Best parameters for k-NN: {'weights': 'distance', 'n_neighbors': 7, 'algorithm': 'ball_tree'}
Randomized Tuning Gradient Boosting...
Best parameters for Gradient Boosting: {'n_estimators': 25, 'max_depth': 3, 'learning_rate': 0.2}
Randomized Tuning Neural Network...




Best parameters for Neural Network: {'solver': 'adam', 'learning_rate': 'constant', 'hidden_layer_sizes': (50,), 'alpha': 0.0001, 'activation': 'relu'}
Randomized Tuning XGBoost...
Best parameters for XGBoost: {'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.1}
Randomized Tuning CatBoost...
Best parameters for CatBoost: {'learning_rate': 0.01, 'iterations': 500, 'depth': 3}


In [14]:
#11. 
randomized_results = []
for name, model, params in best_models_random:
    mse_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error')
    mae_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_absolute_error')
    r2_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')
    ev_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='explained_variance')
    
    randomized_results.append({
        'Model': name,
        'MSE': -mse_scores.mean(),  # neg_mean_squared_error zwraca wartości ujemne
        'MAE': -mae_scores.mean(),  # neg_mean_absolute_error zwraca wartości ujemne
        'R2': r2_scores.mean(),
        'Explained Variance': ev_scores.mean()
    })



In [18]:
#12.
def styled_df(df):
    styled_df = df.style.background_gradient(subset=['R2', 'Explained Variance'], cmap='RdYlGn')
    styled_df = styled_df.background_gradient(subset=['MSE', 'MAE'], cmap='RdYlGn_r') #reverse color map for minimized metrics
    return styled_df

In [16]:
#13.
default_results_df = pd.DataFrame(default_results)
styled_df(default_results_df)

Unnamed: 0,Model,MSE,MAE,R2,Explained Variance
0,Ridge Regression,3017.384479,44.988392,0.455947,0.470067
1,Lasso Regression,3002.962386,44.841481,0.460822,0.473699
2,SVR,5048.65862,60.023966,0.118126,0.144811
3,Decision Tree,6451.217706,63.527485,-0.152907,-0.095699
4,Random Forest,3499.229101,49.461876,0.364262,0.394193
5,k-NN,3685.557257,48.579219,0.340087,0.349525
6,Gradient Boosting,3487.278368,47.565666,0.368581,0.378442
7,Neural Network,3497.700546,46.610147,0.380975,0.348059
8,XGBoost,4167.346671,52.814794,0.24419,0.263011
9,CatBoost,3452.565694,48.454933,0.376479,0.384748


In [17]:
#13.
tuned_results_df = pd.DataFrame(tuned_results)
styled_df(tuned_results_df)

Unnamed: 0,Model,MSE,MAE,R2,Explained Variance
0,Ridge Regression,3015.265061,44.88823,0.457149,0.470543
1,Lasso Regression,3002.962386,44.841481,0.460822,0.473699
2,SVR,2998.148894,44.802185,0.462328,0.477003
3,Decision Tree,3889.810261,49.963001,0.304464,0.323282
4,Random Forest,3405.020378,47.066173,0.381501,0.38667
5,k-NN,3445.401476,47.025107,0.385431,0.395019
6,Gradient Boosting,3364.870071,47.50037,0.396096,0.405929
7,Neural Network,3445.584699,45.553173,0.37958,0.392306
8,XGBoost,3461.701999,48.454532,0.376167,0.389286
9,CatBoost,3114.725718,45.809044,0.437775,0.448923


In [15]:
#13.
randomized_results_df = pd.DataFrame(randomized_results)
styled_df(randomized_results_df)

Unnamed: 0,Model,MSE,MAE,R2,Explained Variance
0,Ridge Regression,3015.265061,44.88823,0.457149,0.470543
1,Lasso Regression,3002.962386,44.841481,0.460822,0.473699
2,SVR,3027.373709,45.286646,0.456227,0.469717
3,Decision Tree,3889.810261,49.963001,0.304464,0.323282
4,Random Forest,3250.355249,47.547238,0.403998,0.412049
5,k-NN,3445.401476,47.025107,0.385431,0.395019
6,Gradient Boosting,3492.978306,49.071806,0.367579,0.371844
7,Neural Network,3329.821431,47.277513,0.39334,0.409072
8,XGBoost,3461.701999,48.454532,0.376167,0.389286
9,CatBoost,3161.838497,46.535268,0.43441,0.44526
