In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score

#Regression algorithms
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Load the diabetes dataset and hold out 20% of the samples as a test split.
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target

# Fixed random_state so the same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=345,
)

In [None]:
# Standardize the features: the scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
# NOTE(review): the scaler is fitted on the whole training split before the
# cross-validation cells run, so validation folds share its statistics;
# wrapping each model in a Pipeline would remove that (small) leakage.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Candidate regressors as (display name, estimator) pairs. The display name is
# also the key used to look up each model's grid in `param_grids`.
# Estimators with randomized training get the same fixed seed (345) as the
# train/test split and the KFold splitter, so repeated runs give the same scores.
models = [
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('SVR', SVR()),
    ('Decision Tree', DecisionTreeRegressor(random_state=345)),
    ('Random Forest', RandomForestRegressor(random_state=345)),
    ('k-NN', KNeighborsRegressor()),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=345)),
    ('XGBoost', XGBRegressor(objective='reg:squarederror', random_state=345)),
    ('LightGBM', LGBMRegressor(random_state=345)),
    # verbose=0 silences CatBoost's per-iteration training log
    ('CatBoost', CatBoostRegressor(verbose=0, random_seed=345))
]

In [None]:
# Cross-validation splitter shared by every evaluation and tuning cell:
# 5 shuffled folds with a fixed seed, so each model sees the same fold assignment.
cv = KFold(
    n_splits=5,
    shuffle=True,
    random_state=345,
)

In [None]:
#Default hyperparameters
# Cross-validated baseline for every model with its default settings.
# A single cross_validate call fits each fold once and evaluates all four
# metrics on that same fitted model (instead of refitting once per metric).
cv_scoring = {
    'mse': 'neg_mean_squared_error',
    'mae': 'neg_mean_absolute_error',
    'r2': 'r2',
    'ev': 'explained_variance',
}

default_results = []
for name, model in models:
    cv_scores = cross_validate(model, X_train, y_train, cv=cv, scoring=cv_scoring)

    default_results.append({
        'Model': name,
        'MSE': -cv_scores['test_mse'].mean(),  # neg_mean_squared_error returns negative values
        'MAE': -cv_scores['test_mae'].mean(),  # neg_mean_absolute_error returns negative values
        'R2': cv_scores['test_r2'].mean(),
        'Explained Variance': cv_scores['test_ev'].mean()
    })

# One row per model, one column per fold-averaged metric
default_results_df = pd.DataFrame(default_results)

In [None]:
# Colour-code the baseline table so that green always means "better":
# MSE/MAE are errors (lower is better) and get the reversed colormap,
# R2 / explained variance (higher is better) keep the regular one.
(
    default_results_df.style
    .background_gradient(cmap="RdYlGn_r", subset=["MSE", "MAE"])
    .background_gradient(cmap="RdYlGn", subset=["R2", "Explained Variance"])
)

Unnamed: 0,neg_mean_squared_error,neg_mean_absolute_error,r2,explained_variance
Ridge Regression,-3017.384479,-44.988392,0.455947,0.470067
Lasso Regression,-3002.962386,-44.841481,0.460822,0.473699
SVR,-5048.65862,-60.023966,0.118126,0.144811
Decision Tree,-6483.768813,-63.566278,-0.118599,-0.167123
Random Forest,-3440.529477,-48.833468,0.373728,0.393319
k-NN,-3685.557257,-48.579219,0.340087,0.349525
Gradient Boosting,-3465.569959,-47.75282,0.369698,0.377552


In [None]:
#Defining hyperparameter grids for individual models

# Hyperparameter search space per model, keyed by the display names used in
# `models`. Each value is passed unchanged as GridSearchCV's `param_grid`.
param_grids = {
    'Ridge Regression': {
        'alpha': [0.01, 0.1, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0]
    },
    'Lasso Regression': {
        'alpha': [0.01, 0.1, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0]
    },
    # `gamma` has no effect on the linear kernel, so the grid is split per
    # kernel; a single combined grid would fit every linear candidate once per
    # gamma value for identical results.
    'SVR': [
        {
            'kernel': ['rbf'],
            'C': [0.01, 0.1, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0],
            'gamma': [0.001, 0.01, 0.1, 0.2, 0.5, 1]
        },
        {
            'kernel': ['linear'],
            'C': [0.01, 0.1, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0]
        }
    ],
    'Decision Tree': {
        'max_depth': [None, 2, 5, 10, 20, 30, 40, 50, 100],
        'min_samples_split': [2, 4, 5, 6, 8, 10, 12, 15],
        'min_samples_leaf': [1, 2, 4, 5, 8, 10]
    },
    'Random Forest': {
        'n_estimators': [10, 25, 40, 50, 80, 100],
        'max_depth': [None, 5, 10, 20, 25, 30, 40],
        'min_samples_split': [2, 5, 8, 10],
        'min_samples_leaf': [1, 2, 4, 5, 8, 10, 15]
    },
    # NOTE(review): `algorithm` selects the neighbour-search strategy; it is
    # presumably irrelevant to the scores and only multiplies the number of fits.
    'k-NN': {
        'n_neighbors': [3, 5, 7, 9, 11],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    },
    'Gradient Boosting': {
        'n_estimators': [10, 20, 25, 30, 40, 50, 80, 100, 150, 200],
        'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.5, 1],
        'max_depth': [3, 5, 7, 9, 11]
    },
    'XGBoost': {
        'n_estimators': [50, 75, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'max_depth': [3, 5, 7, 9]
    },
    'LightGBM': {
        'n_estimators': [50, 75, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'num_leaves': [31, 50, 75, 100]
    },
    'CatBoost': {
        'iterations': [100, 200, 500],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'depth': [3, 5, 7, 9]
    }

}

In [None]:
# Exhaustive grid search for every model over its entry in `param_grids`.
# Candidates are ranked by negated MSE on the shared `cv` folds, and
# n_jobs=-1 spreads the fits over all available cores.
# Each result is stored as (name, refitted best estimator, best parameters).
best_models = []
for name, model in models:
    print(f"Tuning {name}...")
    search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        cv=cv,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    winner = (name, search.best_estimator_, search.best_params_)
    best_models.append(winner)
    print(f"Best parameters for {name}: {search.best_params_}")

Tuning Ridge Regression...
Best parameters for Ridge Regression: {'alpha': 20.0}
Tuning Lasso Regression...
Best parameters for Lasso Regression: {'alpha': 2.0}
Tuning SVR...
Best parameters for SVR: {'C': 50.0, 'gamma': 0.01, 'kernel': 'rbf'}
Tuning Decision Tree...
Best parameters for Decision Tree: {'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}
Tuning Random Forest...
Best parameters for Random Forest: {'max_depth': 30, 'min_samples_leaf': 10, 'min_samples_split': 8, 'n_estimators': 40}
Tuning k-NN...
Best parameters for k-NN: {'algorithm': 'auto', 'n_neighbors': 7, 'weights': 'uniform'}
Tuning Gradient Boosting...
Best parameters for Gradient Boosting: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 20}


In [None]:
# Cross-validated scores of the tuned estimators, reported with the same
# metrics as the default-hyperparameter table so the two are comparable.
# A single cross_validate call fits each fold once and evaluates all four
# metrics on that same fitted model (instead of refitting once per metric).
# NOTE(review): these folds are the ones the grid search optimized on, so the
# scores are optimistic; X_test / y_test remain unused for a final check.
tuned_cv_scoring = {
    'mse': 'neg_mean_squared_error',
    'mae': 'neg_mean_absolute_error',
    'r2': 'r2',
    'ev': 'explained_variance',
}

tuned_results = []
for name, model, _params in best_models:
    cv_scores = cross_validate(model, X_train, y_train, cv=cv, scoring=tuned_cv_scoring)

    tuned_results.append({
        'Model': name,
        'MSE': -cv_scores['test_mse'].mean(),  # neg_mean_squared_error returns negative values
        'MAE': -cv_scores['test_mae'].mean(),  # neg_mean_absolute_error returns negative values
        'R2': cv_scores['test_r2'].mean(),
        'Explained Variance': cv_scores['test_ev'].mean()
    })

# One row per model, one column per fold-averaged metric
tuned_results_df = pd.DataFrame(tuned_results)

Ridge Regression - CV MSE: 3017.7723, CV MAE: 44.9255, CV R2: 0.4578, CV Explained Variance: 0.4705
Lasso Regression - CV MSE: 3005.2981, CV MAE: 45.0118, CV R2: 0.4618, CV Explained Variance: 0.4745
SVR - CV MSE: 2998.1489, CV MAE: 44.8022, CV R2: 0.4623, CV Explained Variance: 0.4770
Decision Tree - CV MSE: 3889.8103, CV MAE: 49.9630, CV R2: 0.3045, CV Explained Variance: 0.3233
Random Forest - CV MSE: 3311.5203, CV MAE: 46.7223, CV R2: 0.4106, CV Explained Variance: 0.4107
k-NN - CV MSE: 3448.1764, CV MAE: 47.0574, CV R2: 0.3855, CV Explained Variance: 0.3954
Gradient Boosting - CV MSE: 3391.7342, CV MAE: 48.6594, CV R2: 0.3969, CV Explained Variance: 0.4113


In [None]:
# Colour-code the tuned-model table so that green always means "better":
# MSE/MAE are errors (lower is better) and get the reversed colormap,
# R2 / explained variance (higher is better) keep the regular one.
(
    tuned_results_df.style
    .background_gradient(cmap="RdYlGn_r", subset=["MSE", "MAE"])
    .background_gradient(cmap="RdYlGn", subset=["R2", "Explained Variance"])
)

Unnamed: 0,MSE,MAE,R2,Explained Variance
Ridge Regression,3017.77229,44.925472,0.45779,0.470459
Lasso Regression,3005.298095,45.01184,0.461837,0.474466
SVR,2998.148894,44.802185,0.462328,0.477003
Decision Tree,3889.810261,49.963001,0.304464,0.323282
Random Forest,3311.520329,46.722321,0.410587,0.410737
k-NN,3448.176399,47.057407,0.385497,0.39535
Gradient Boosting,3391.734231,48.659367,0.396939,0.411337


Sprawdzić inne modele, więcej hiperparametrów,
inne metody 
feature engineering
brakujące wartości lub outliers
RandomizedSearchCV