# Regression Models

For the regression task, our target variable will be the `AMT_CREDIT` column.

First of all, we need to import the necessary libraries.

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

Next, we create a class that computes all the evaluation metrics. This class will be used to evaluate the performance of the models with Stratified K-Fold cross-validation.

Hyperparameter tuning will be done using the Optuna library.

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, root_mean_squared_error


class RegressionMetrics:
    """Cross-validate a regression model and collect per-fold metrics.

    The model is refitted on every training fold and scored on the matching
    test fold. Each score is appended to a per-metric list (``mse``, ``rmse``,
    ``r2``, ``adj_r2``, ``mae``, ``mape``); ``printResults`` reports the mean
    of every list.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X: Feature matrix; anything accepted by ``pd.DataFrame``.
        y: Target values; anything accepted by ``pd.Series``.
    """

    def __init__(self, model, X, y):
        self.model = model
        # Reset both indexes so positional fold indices line up with the
        # labels regardless of the index the caller's data arrived with.
        self.X = pd.DataFrame(X).reset_index(drop=True)
        self.y = pd.Series(y).reset_index(drop=True)
        self.mse = []
        self.rmse = []
        self.r2 = []
        self.mae = []
        self.mape = []
        self.adj_r2 = []

    @staticmethod
    def _adjusted_r2(r2, n_samples, n_features):
        """Return R^2 adjusted for the number of predictors.

        Returns NaN when there are not enough samples for the adjustment to
        be defined (``n_samples - n_features - 1 <= 0``).
        """
        dof = n_samples - n_features - 1
        if dof <= 0:
            return float('nan')
        return 1 - (1 - r2) * (n_samples - 1) / dof

    def _stratification_labels(self, n_bins):
        """Return integer labels that discretise the target for stratifying.

        A target with at most ``n_bins`` distinct values is encoded value by
        value; otherwise it is cut into (up to) ``n_bins`` quantile bins.
        """
        if self.y.nunique() <= n_bins:
            codes, _ = pd.factorize(self.y)
            return np.asarray(codes)
        # duplicates='drop' merges bins whose quantile edges coincide, which
        # happens when many samples share the same target value.
        binned = pd.qcut(self.y, q=n_bins, labels=False, duplicates='drop')
        return np.asarray(binned)

    def compute_metrics(self, y_test, y_pred):
        """Score one set of predictions and append every metric to its list.

        Args:
            y_test: True target values of the evaluated samples.
            y_pred: Model predictions for the same samples.
        """
        self.mse.append(mean_squared_error(y_test, y_pred))
        self.rmse.append(root_mean_squared_error(y_test, y_pred))

        r2 = r2_score(y_test, y_pred)
        self.r2.append(r2)

        # The adjustment must use the size of the sample R^2 was computed on
        # (the evaluated fold), not the size of the whole dataset.
        self.adj_r2.append(
            self._adjusted_r2(r2, len(y_test), self.X.shape[1])
        )

        self.mae.append(mean_absolute_error(y_test, y_pred))
        self.mape.append(mean_absolute_percentage_error(y_test, y_pred))

    def stratifiedKfold(self, n_splits=10, n_bins=10, random_state=42):
        """Run shuffled stratified K-fold cross-validation and record metrics.

        StratifiedKFold only accepts discrete class labels, so the folds are
        stratified on a binned copy of the continuous target; the model is
        still fitted and scored on the original target values. Results are
        appended to the metric lists, so repeated calls accumulate.

        Args:
            n_splits: Number of folds.
            n_bins: Maximum number of quantile bins used for stratification.
            random_state: Seed for the fold shuffling.
        """
        strata = self._stratification_labels(n_bins)
        skf = StratifiedKFold(
            n_splits=n_splits, random_state=random_state, shuffle=True
        )
        for train_index, test_index in skf.split(self.X, strata):
            X_train = self.X.iloc[train_index, :]
            X_test = self.X.iloc[test_index, :]
            y_train = self.y.iloc[train_index]
            y_test = self.y.iloc[test_index]

            self.model.fit(X_train, y_train)
            y_pred = self.model.predict(X_test)

            self.compute_metrics(y_test, y_pred)

    def printResults(self):
        """Print the mean of every recorded metric across the folds."""
        summary = (
            ('MSE', self.mse),
            ('RMSE', self.rmse),
            ('R2', self.r2),
            ('MAE', self.mae),
            ('MAPE', self.mape),
            ('Adjusted R2', self.adj_r2),
        )
        for label, values in summary:
            print(f'{label}: {np.mean(values)}')


Loading the data.

In [None]:
# Read the processed feature table, then split it into the regression
# target (the AMT_CREDIT column) and the remaining predictor columns.
df = pd.read_parquet('../data/processed/selected_features_df.parquet')
y = df['AMT_CREDIT']
X = df.drop(columns='AMT_CREDIT')

Standardizing the data.

In [None]:
# Standardize the feature columns. The scaler is fitted and then applied in
# two explicit steps, which yields the same result as one fit_transform call.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before any
# cross-validation split — confirm that sharing statistics across folds is
# intended.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

## Linear Regression