# Regression Models

For the regression task, our target variable will be the `AMT_CREDIT` column.

First of all, we need to import the necessary libraries.

In [2]:
from random import random, Random

import numpy as np
import optuna
import pandas as pd
import seaborn as sns
# NOTE(review): the fontTools and statsmodels imports below look like accidental IDE
# auto-imports (they are not used as imported anywhere in this notebook) - confirm and drop.
from fontTools.misc.bezierTools import epsilon
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression, QuantileRegressor, Ridge, TweedieRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, root_mean_squared_error
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from statsmodels.sandbox.panel.sandwich_covariance_generic import kernel



First, we create a class that computes all the metrics. It will be used to evaluate the performance of each model with K-fold cross-validation.

Hyperparameter tuning will be done using the Optuna library.

In [3]:
from sklearn.model_selection import KFold


class RegressionMetrics:
    """Evaluate a regressor with shuffled 10-fold CV and collect per-fold metrics.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X : array-like or DataFrame of features; stored as a DataFrame.
    y : array-like or Series with the target; stored as a Series.

    Attributes
    ----------
    mse, rmse, r2, mae, mape, adj_r2 : list of float
        One entry per evaluated fold, filled by ``compute_metrics``.
    """

    def __init__(self, model, X, y):
        self.model = model
        # Fresh 0..n-1 indexes so the positional fold indices used below are unambiguous.
        self.X = pd.DataFrame(X).reset_index(drop=True)
        self.y = pd.Series(y).reset_index(drop=True)
        self._reset()

    def _reset(self):
        """Clear all per-fold metric lists."""
        self.mse = []
        self.rmse = []
        self.r2 = []
        self.mae = []
        self.mape = []
        self.adj_r2 = []

    def compute_metrics(self, y_test, y_pred):
        """Append the metrics of one fold (``y_test`` vs ``y_pred``) to the lists."""
        self.mse.append(mean_squared_error(y_test, y_pred))
        self.rmse.append(root_mean_squared_error(y_test, y_pred))

        r2 = r2_score(y_test, y_pred)
        self.r2.append(r2)

        # Adjusted R^2 must use the size of the sample R^2 was computed on (this fold),
        # not the size of the whole dataset.
        n = len(y_test)
        p = self.X.shape[1]
        dof = n - p - 1
        # With too few samples for the number of features the statistic is undefined.
        adj_r2 = 1 - (1 - r2) * (n - 1) / dof if dof > 0 else float("nan")
        self.adj_r2.append(adj_r2)

        self.mae.append(mean_absolute_error(y_test, y_pred))
        self.mape.append(mean_absolute_percentage_error(y_test, y_pred))

    def Kfold_evaluation(self):
        """Fit and score ``self.model`` on each of 10 shuffled folds (seed 42).

        Previously collected metrics are discarded first, so calling this method
        twice does not mix the results of two runs.
        """
        self._reset()
        kf = KFold(n_splits=10, random_state=42, shuffle=True)
        for train_index, test_index in kf.split(self.X, self.y):
            X_train, X_test = self.X.iloc[train_index, :], self.X.iloc[test_index, :]
            y_train, y_test = self.y.iloc[train_index], self.y.iloc[test_index]
            self.model.fit(X_train, y_train)
            y_pred = self.model.predict(X_test)
            self.compute_metrics(y_test, y_pred)

    def printResults(self):
        """Print the mean of every metric over the evaluated folds."""
        print(f'MSE: {np.mean(self.mse)}')
        print(f'RMSE: {np.mean(self.rmse)}')
        print(f'R2: {np.mean(self.r2)}')
        print(f'MAE: {np.mean(self.mae)}')
        print(f'MAPE: {np.mean(self.mape)}')
        print(f'Adjusted R2: {np.mean(self.adj_r2)}')


Loading the data.

In [4]:
# Load the pre-selected feature table and split it into target and features.
df = pd.read_parquet('../data/processed/selected_features_df.parquet')
y = df['AMT_CREDIT']
# AMT_GOODS_PRICE is excluded from the features together with the target itself.
# NOTE(review): presumably because it is a near-proxy for AMT_CREDIT - confirm.
X = df.drop(columns=['AMT_CREDIT', 'AMT_GOODS_PRICE'])

Standardizing the data.

In [5]:
# Standardize every feature column.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before any
# cross-validation split is made further down.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

## Lasso Regression

SelectFromModel is a meta-transformer that can be used along with any estimator that assigns importance to each feature through a coef_ or feature_importances_ attribute. The features are considered unimportant and removed, if the corresponding coef_ or feature_importances_ values are below the provided threshold parameter.

In [None]:
from sklearn.linear_model import Lasso

# Fit an L1-penalized model and let SelectFromModel pick features based on its coefficients.
lasso = Lasso(alpha=0.001, max_iter=10000)
selector = SelectFromModel(estimator=lasso).fit(X_scaled, y)
X_selected = selector.transform(X_scaled)

Hyperparameter Tuning with Optuna


In [None]:
def objective(trial):
    """Optuna objective for Lasso: mean 5-fold negative MSE on ``X_selected``.

    Samples ``tol`` and ``alpha`` log-uniformly in [1e-6, 1e-2]; higher is better.
    """
    params = {
        'tol': trial.suggest_float('tol', 1e-6, 1e-2, log=True),
        'alpha': trial.suggest_float('alpha', 1e-6, 1e-2, log=True),
    }
    estimator = Lasso(random_state=42, max_iter=10000, **params)
    fold_scores = cross_val_score(
        estimator, X_selected, y, cv=5, scoring='neg_mean_squared_error'
    )
    return np.mean(fold_scores)

# Seed the sampler so the trials are reproducible across kernel restarts,
# in line with the random_state=42 used for the estimators.
study = optuna.create_study(
    direction='maximize',
    study_name='Lasso Regression',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=3)

Train the model with the best found hyperparameters and compute the metrics.

In [None]:
# Rebuild Lasso with the best sampled hyperparameters and score it with RegressionMetrics.
model = Lasso(random_state=42, max_iter=10000, **study.best_params)
lasso_metrics = RegressionMetrics(model, X_selected, y)
lasso_metrics.Kfold_evaluation()
lasso_metrics.printResults()

Visualizing the study.

In [None]:
# Plot the objective value of each trial of `study` (the study of this section when cells run top to bottom).
optuna.visualization.plot_optimization_history(study)

In [None]:
# Plot the estimated importance of each sampled hyperparameter of `study`.
optuna.visualization.plot_param_importances(study)

## Support Vector Regression (SVR)

This Optuna study tunes the hyperparameters of an SVR model. It aims to minimize the mean squared error (MSE) by adjusting the following parameters:

* C: regularization parameter
* Epsilon: the tolerance margin for errors in the model's predictions
* Kernel: type of kernel used
* Gamma: kernel coefficient

The model is evaluated using 5-fold cross-validation and the negative mean squared error (MSE). The goal is to maximize performance by tuning the hyperparameters.

In [None]:
from sklearn.svm import SVR
def objective(trial):
    """Optuna objective for SVR: mean 5-fold negative MSE on ``X_scaled``.

    Samples ``C`` (log-uniform in [1e-3, 1e3]), ``epsilon`` (uniform in [0.01, 0.5]),
    ``kernel`` and ``gamma``; higher return values are better.
    """
    C = trial.suggest_float('C', 1e-3, 1e3, log=True)
    epsilon = trial.suggest_float('epsilon', 0.01, 0.5)
    # 'poly' was misspelled as 'pòly', which SVR rejects as an invalid kernel.
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
    gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])

    svr = SVR(C=C, epsilon=epsilon, kernel=kernel, gamma=gamma)

    cv_scores = cross_val_score(svr, X_scaled, y, cv=5, scoring='neg_mean_squared_error')

    return np.mean(cv_scores)

# Seed the sampler so the trials are reproducible across kernel restarts.
study = optuna.create_study(
    direction='maximize',
    study_name='SVR Regression',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=3)
    
    
    

Train the model with the best found hyperparameters and compute SVR metrics 

In [None]:
# SVR has no `random_state` parameter, so only the tuned hyperparameters are passed.
svr_model = SVR(**study.best_params)
# Evaluate on X_scaled, the same feature matrix the hyperparameters were tuned on.
svr_metrics = RegressionMetrics(svr_model, X_scaled, y)
svr_metrics.Kfold_evaluation()
svr_metrics.printResults()



Study Visualization

In [None]:
# Plot the objective value of each trial of `study`; the hyperparameter
# importances are plotted in the following cell.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Plot the estimated importance of each sampled hyperparameter of `study`.
optuna.visualization.plot_param_importances(study)

## Nearest Neighbor Regression (KNN Regression)


In [None]:
from sklearn.neighbors import KNeighborsRegressor
def objective(trial):
    """Optuna objective for KNN regression: mean per-fold MSE (lower is better).

    Samples ``n_neighbors`` in [1, 20] and the distance ``metric``, then scores
    the model on ``X_scaled`` through ``RegressionMetrics.Kfold_evaluation``.
    """
    params = {
        'n_neighbors': trial.suggest_int('n_neighbors', 1, 20),
        'metric': trial.suggest_categorical('metric', ['euclidean', 'manhattan', 'minkowski']),
    }
    evaluator = RegressionMetrics(KNeighborsRegressor(**params), X_scaled, y)
    evaluator.Kfold_evaluation()
    return np.mean(evaluator.mse)
# The objective returns a mean MSE, hence 'minimize'. Seed the sampler so the
# trials are reproducible across kernel restarts.
study = optuna.create_study(
    direction='minimize',
    study_name='KNN Regression',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=3)



In [None]:
# The study stores the parameter as 'n_neighbors' (the name passed to suggest_int);
# the previous 'n_neighbours' key raised a KeyError.
knn_model = KNeighborsRegressor(
    n_neighbors=study.best_params['n_neighbors'],
    metric=study.best_params['metric'],
)
knn_metrics = RegressionMetrics(knn_model, X_scaled, y)
knn_metrics.Kfold_evaluation()
knn_metrics.printResults()


## Generalized Linear Regression

Hyperparameter Tuning with Optuna

In [None]:
def objective(trial):
    """Optuna objective for TweedieRegressor: mean 5-fold negative MSE on ``X_scaled``.

    Samples ``power`` uniformly in [1, 2] and ``alpha`` log-uniformly in [1e-6, 1e-2].
    """
    params = {
        'power': trial.suggest_float('power', 1, 2),
        'alpha': trial.suggest_float('alpha', 1e-6, 1e-2, log=True),
    }
    estimator = TweedieRegressor(max_iter=10000, **params)
    fold_scores = cross_val_score(
        estimator, X_scaled, y, cv=5, scoring='neg_mean_squared_error'
    )
    return np.mean(fold_scores)

# Seed the sampler so the trials are reproducible across kernel restarts.
study = optuna.create_study(
    direction='maximize',
    study_name='Tweedie Regression',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=3)

Train the model with the best found hyperparameters and compute the metrics

In [None]:
# Rebuild the Tweedie model with the best sampled hyperparameters and score it.
model = TweedieRegressor(max_iter=10000, **study.best_params)
tweedie_metrics = RegressionMetrics(model, X_scaled, y)
tweedie_metrics.Kfold_evaluation()
tweedie_metrics.printResults()

Visualizing the study

In [None]:
# Plot the objective value of each trial of `study` (the study of this section when cells run top to bottom).
optuna.visualization.plot_optimization_history(study)

In [None]:
# Plot the estimated importance of each sampled hyperparameter of `study`.
optuna.visualization.plot_param_importances(study)

## Quantile Regression


Determine the solver based on the SciPy version


In [None]:
# Import both helpers from sklearn.utils.fixes rather than relying on a private
# module (sklearn.metrics._dist_metrics) happening to expose parse_version.
from sklearn.utils.fixes import parse_version, sp_version

# Use the "highs" solver when SciPy is at least 1.6.0, otherwise "interior-point".
solver = "highs" if sp_version >= parse_version("1.6.0") else "interior-point"


Define the quantiles

In [None]:
# Lower bound, median and upper bound of the predicted interval.
quantiles = [0.05, 0.5, 0.95]
# Filled later with one prediction vector per quantile.
predictions = dict()
# Boolean flag per observation; starts as all False.
out_bounds_predictions = np.zeros_like(y, dtype=bool)

Hyperparameter Tuning with Optuna

In [None]:
def objective(trial):
    """Optuna objective for QuantileRegressor: mean 3-fold negative MSE on ``X_scaled``.

    Samples ``alpha`` log-uniformly in [1e-6, 1e-2] and ``quantile`` from the
    module-level ``quantiles`` list; the solver comes from the module-level ``solver``.
    """
    params = {
        'alpha': trial.suggest_float('alpha', 1e-6, 1e-2, log=True),
        'quantile': trial.suggest_categorical('quantile', quantiles),
    }
    estimator = QuantileRegressor(solver=solver, **params)
    fold_scores = cross_val_score(
        estimator, X_scaled, y, cv=3, scoring='neg_mean_squared_error'
    )
    return np.mean(fold_scores)

# Seed the sampler so the trials are reproducible across kernel restarts.
study = optuna.create_study(
    direction='maximize',
    study_name='Quantile Regression',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=3)

Fit the QuantileRegressor for each quantile using the best hyperparameters found

In [None]:
# Fit one model per quantile, reusing the best sampled parameters but forcing the quantile.
for q in quantiles:
    best_params = {**study.best_params, 'quantile': q}
    qr = QuantileRegressor(solver=solver, **best_params)
    qr.fit(X_scaled, y)
    y_pred = qr.predict(X_scaled)
    predictions[q] = y_pred

    # Flag observations below the lowest-quantile prediction or above the highest one.
    if q == min(quantiles):
        out_bounds_predictions = np.logical_or(out_bounds_predictions, y_pred >= y)
    elif q == max(quantiles):
        out_bounds_predictions = np.logical_or(out_bounds_predictions, y_pred <= y)

Plot the results

In [None]:
from matplotlib import pyplot as plt

# X_scaled is a 2-D feature matrix, so it cannot serve as the x-axis: scattering it
# against the 1-D target raises a size-mismatch error as soon as there is more than
# one feature. Instead, order the observations by their median prediction and plot
# predicted quantiles and observed targets against that rank.
y_true = np.asarray(y)
outside = np.asarray(out_bounds_predictions, dtype=bool)
median_q = sorted(predictions)[len(predictions) // 2]
order = np.argsort(predictions[median_q])
rank = np.arange(order.size)
outside_sorted = outside[order]
y_sorted = y_true[order]

fig, ax = plt.subplots()

for quantile, y_pred in predictions.items():
    ax.plot(rank, np.asarray(y_pred)[order], label=f"Quantile: {quantile}")

ax.scatter(
    rank[outside_sorted],
    y_sorted[outside_sorted],
    color="black",
    marker="+",
    alpha=0.5,
    label="Outside interval",
)
ax.scatter(
    rank[~outside_sorted],
    y_sorted[~outside_sorted],
    color="black",
    alpha=0.5,
    label="Inside interval",
)

ax.legend()
ax.set_xlabel("Observations ordered by predicted median")
ax.set_ylabel("AMT_CREDIT")
ax.set_title("Quantile regression predictions vs. observed AMT_CREDIT")
plt.show()

## Polynomial Regression


Create polynomial features

In [None]:
# Expand the standardized features with all degree-2 polynomial terms.
poly_features = PolynomialFeatures(degree=2)
X_poly = poly_features.fit(X_scaled).transform(X_scaled)

Hyperparameter Tuning with Optuna

In [None]:
def objective(trial):
    """Optuna objective for polynomial regression: mean 5-fold negative MSE on ``X_poly``.

    ``alpha`` (log-uniform in [1e-6, 1e-2]) is the L2 penalty of a Ridge model
    fitted on the polynomial features. Previously ``alpha`` was sampled but never
    used and the model was fitted on ``X_scaled``, so every trial scored the same
    plain linear regression.
    """
    alpha = trial.suggest_float('alpha', 1e-6, 1e-2, log=True)
    model = Ridge(alpha=alpha)
    cv_scores = cross_val_score(model, X_poly, y, cv=5, scoring='neg_mean_squared_error')
    return np.mean(cv_scores)

# Perform optimization with Optuna; the sampler is seeded so the trials are
# reproducible across kernel restarts.
study = optuna.create_study(
    direction='maximize',
    study_name='Polynomial Regression',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=3)

Train the model with the best found hyperparameters and compute the metrics

In [None]:
# Use the tuned `alpha` and the polynomial features; the previous cell evaluated an
# untuned LinearRegression on X_scaled, i.e. not a polynomial model at all.
model = Ridge(**study.best_params)
poly_metrics = RegressionMetrics(model, X_poly, y)
poly_metrics.Kfold_evaluation()
poly_metrics.printResults()

Visualize the study

In [None]:
# Plot the objective value of each trial of `study` (the study of this section when cells run top to bottom).
optuna.visualization.plot_optimization_history(study)

In [None]:
# NOTE(review): this repeats the optimization-history plot of the preceding cell; the other
# sections follow it with plot_param_importances - confirm which plot was intended here.
optuna.visualization.plot_optimization_history(study)

## Stochastic Gradient Descent Regression

In [None]:
from sklearn.linear_model import SGDRegressor

def objective(trial):
    """Optuna objective for SGDRegressor: mean 5-fold negative MSE on ``X_scaled``.

    Samples ``alpha``, ``tol`` and ``eta0`` log-uniformly in [1e-6, 1e-2],
    ``max_iter`` in 100..4600 (step 500) and the ``penalty`` type.
    """
    params = {
        'alpha': trial.suggest_float('alpha', 1e-6, 1e-2, log=True),
        'tol': trial.suggest_float('tol', 1e-6, 1e-2, log=True),
        'max_iter': trial.suggest_int('max_iter', 100, 4600, step=500),
        'eta0': trial.suggest_float('eta0', 1e-6, 1e-2, log=True),
        'penalty': trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet']),
    }
    estimator = SGDRegressor(random_state=42, **params)
    fold_scores = cross_val_score(
        estimator, X_scaled, y, cv=5, scoring='neg_mean_squared_error'
    )
    return np.mean(fold_scores)

# Seed the sampler so the trials are reproducible across kernel restarts.
study = optuna.create_study(
    direction='maximize',
    study_name='SGDRegressor',
    sampler=optuna.samplers.TPESampler(seed=42),
)

study.optimize(objective, n_trials=3)
    

In [None]:
# Rebuild the SGD regressor with the best sampled hyperparameters and score it.
sgd_model = SGDRegressor(random_state=42, **study.best_params)
sgd_metrics = RegressionMetrics(sgd_model, X_scaled, y)
sgd_metrics.Kfold_evaluation()
sgd_metrics.printResults()


In [None]:
# Plot the objective value of each trial of `study` (the study of this section when cells run top to bottom).
optuna.visualization.plot_optimization_history(study)

In [None]:
# Plot the estimated importance of each sampled hyperparameter of `study`.
optuna.visualization.plot_param_importances(study)

## Random Forest Regression 

In [None]:

 def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 50,500, step= 50)
    max_depth = trial.suggest_int('max_depth', 2, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    
    rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, min_samples_split= min_samples_split,min_samples_leaf= min_samples_leaf, max_features= max_features, random_state=42)
    
    cv_scores = cross_val_score(rf, X_scaled, y, cv=5, scoring='neg_mean_squared_error')
    return np.mean(cv_scores)
# Seed the sampler so the trials are reproducible across kernel restarts.
study_rf = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_rf.optimize(objective, n_trials=3)


[I 2024-12-21 14:02:56,453] A new study created in memory with name: no-name-c30a207f-4f42-44a9-8ec1-5d19df56aeb9


In [None]:
# Rebuild the random forest with the best sampled hyperparameters and score it.
rf_model = RandomForestRegressor(random_state=42, **study_rf.best_params)
rf_metrics = RegressionMetrics(rf_model, X_scaled, y)
rf_metrics.Kfold_evaluation()
rf_metrics.printResults()

In [None]:
# Plot the objective value of each trial of the random forest study.
optuna.visualization.plot_optimization_history(study_rf)


In [None]:
# Plot the estimated importance of each sampled hyperparameter of the random forest study.
optuna.visualization.plot_param_importances(study_rf)

## Gaussian Process Regression


In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C 
from sklearn.model_selection import  cross_val_score
from sklearn.metrics import mean_squared_error
def objective(trial):
    """Optuna objective for GaussianProcessRegressor: mean 5-fold negative MSE on ``X_scaled``.

    Samples the initial ``length_scale`` and ``constant_value`` of the kernel
    log-uniformly inside the kernel's own bounds, so the starting point always
    lies in the region the kernel hyperparameters are allowed to take.
    """
    bounds = (1e-4, 1e1)
    length_scale = trial.suggest_float('length_scale', *bounds, log=True)
    constant_value = trial.suggest_float('constant_value', *bounds, log=True)
    # The kernel is the product of a constant kernel and an RBF kernel.
    kernel = C(constant_value, bounds) * RBF(length_scale, bounds)
    gpr = GaussianProcessRegressor(kernel=kernel, random_state=42)

    # The target `y` was missing from this call and the scorer name was misspelled
    # ("meg_mean_squared_error").
    cv_scores = cross_val_score(gpr, X_scaled, y, cv=5, scoring='neg_mean_squared_error')

    return np.mean(cv_scores)

# Seed the sampler so the trials are reproducible across kernel restarts.
study_gpr = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

study_gpr.optimize(objective, n_trials=3)
    
    
 

In [None]:
gps_params = study_gpr.best_params

# The study stores the parameter as 'length_scale'; the previous 'lenght_scale' key
# raised a KeyError.
gps_kernel = C(gps_params['constant_value'], (1e-4, 1e1)) * RBF(gps_params['length_scale'], (1e-4, 1e1))
# Pass the kernel built above. The bare name `kernel` at module level refers to an
# unrelated function imported from statsmodels at the top of the notebook.
gps_model = GaussianProcessRegressor(kernel=gps_kernel, random_state=42)
gps_model.fit(X_scaled, y)

In [None]:
# Plot the objective value of each trial of the Gaussian process study.
optuna.visualization.plot_optimization_history(study_gpr)

In [None]:
# Plot the estimated importance of each sampled hyperparameter of the Gaussian process study.
optuna.visualization.plot_param_importances(study_gpr)