## Importing needed moduls, libs

In [45]:
import pandas as pd
import numpy as np
import json
from preprocessing.check_nans import CheckNans
from preprocessing.log_transformation import LogTransformation
from preprocessing.standardscaling import StandardScaling
from preprocessing.minmax_scaler import MinMaxScaling
from preprocessing.check_and_remove_outliers import CheckAndRemoveOutliers
from Models.LinearRegression import LinearRegressionModel
from Models.LassoRegression import LassoRegression
from Models.RidgeRegression import RidgeRegression
from Models.RandomForestRegressor import RandomForestReg
from sklearn.model_selection import train_test_split
from Utils.GridSearch import grid_search
from pipeline import Pipeline

## Preprocessing

In [3]:
df = pd.read_csv("Data/cleaned_data.csv")
df

Unnamed: 0,INCOME,SAVINGS,DEBT,R_SAVINGS_INCOME,R_DEBT_INCOME,R_DEBT_SAVINGS,T_CLOTHING_12,T_CLOTHING_6,R_CLOTHING,R_CLOTHING_INCOME,...,R_EXPENDITURE_INCOME,R_EXPENDITURE_SAVINGS,R_EXPENDITURE_DEBT,CAT_GAMBLING,CAT_DEBT,CAT_CREDIT_CARD,CAT_MORTGAGE,CAT_SAVINGS_ACCOUNT,CAT_DEPENDENTS,CREDIT_SCORE
0,33269,0,532304,0.0000,16.0000,1.2000,1889,945,0.5003,0.0568,...,1.0000,0.0000,0.0625,2,1,0,0,0,0,444
1,77158,91187,315648,1.1818,4.0909,3.4615,5818,111,0.0191,0.0754,...,0.9091,0.7692,0.2222,0,1,0,0,1,0,625
2,30917,21642,534864,0.7000,17.3000,24.7142,1157,860,0.7433,0.0374,...,1.0000,1.4286,0.0578,2,1,0,0,1,0,469
3,149971,1172498,2399531,7.8182,16.0000,2.0465,1978,322,0.1628,0.0132,...,0.9091,0.1163,0.0568,2,1,1,1,1,1,473
4,181636,339055,1695274,1.8667,9.3334,5.0000,11446,1910,0.1669,0.0630,...,0.6667,0.3571,0.0714,0,1,0,0,1,0,596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,18830,2354,393068,0.1250,20.8746,166.9788,1282,592,0.4618,0.0681,...,1.2500,9.9987,0.0599,0,1,0,0,1,0,411
896,81404,88805,680837,1.0909,8.3637,7.6667,5400,1936,0.3585,0.0663,...,0.9091,0.8333,0.1087,0,1,0,0,1,0,589
897,0,42428,30760,3.2379,8.1889,0.7250,0,0,0.8779,0.0047,...,1.0668,0.2500,0.3448,0,1,0,0,1,0,499
898,36011,8002,604181,0.2222,16.7777,75.5037,1993,1271,0.6377,0.0553,...,1.1111,5.0002,0.0662,0,1,1,0,1,0,507


In [4]:
df_check = CheckNans().transform(df)
df_check

[32m2024-03-16 15:05:45.878[0m | [1mINFO    [0m | [36mpreprocessing.check_nans[0m:[36mtransform[0m:[36m14[0m - [1mCheckNans is starting[0m
[32m2024-03-16 15:05:45.881[0m | [1mINFO    [0m | [36mpreprocessing.check_nans[0m:[36mtransform[0m:[36m17[0m - [1mNo NaNs in dataframe[0m


Unnamed: 0,INCOME,SAVINGS,DEBT,R_SAVINGS_INCOME,R_DEBT_INCOME,R_DEBT_SAVINGS,T_CLOTHING_12,T_CLOTHING_6,R_CLOTHING,R_CLOTHING_INCOME,...,R_EXPENDITURE_INCOME,R_EXPENDITURE_SAVINGS,R_EXPENDITURE_DEBT,CAT_GAMBLING,CAT_DEBT,CAT_CREDIT_CARD,CAT_MORTGAGE,CAT_SAVINGS_ACCOUNT,CAT_DEPENDENTS,CREDIT_SCORE
0,33269,0,532304,0.0000,16.0000,1.2000,1889,945,0.5003,0.0568,...,1.0000,0.0000,0.0625,2,1,0,0,0,0,444
1,77158,91187,315648,1.1818,4.0909,3.4615,5818,111,0.0191,0.0754,...,0.9091,0.7692,0.2222,0,1,0,0,1,0,625
2,30917,21642,534864,0.7000,17.3000,24.7142,1157,860,0.7433,0.0374,...,1.0000,1.4286,0.0578,2,1,0,0,1,0,469
3,149971,1172498,2399531,7.8182,16.0000,2.0465,1978,322,0.1628,0.0132,...,0.9091,0.1163,0.0568,2,1,1,1,1,1,473
4,181636,339055,1695274,1.8667,9.3334,5.0000,11446,1910,0.1669,0.0630,...,0.6667,0.3571,0.0714,0,1,0,0,1,0,596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,18830,2354,393068,0.1250,20.8746,166.9788,1282,592,0.4618,0.0681,...,1.2500,9.9987,0.0599,0,1,0,0,1,0,411
896,81404,88805,680837,1.0909,8.3637,7.6667,5400,1936,0.3585,0.0663,...,0.9091,0.8333,0.1087,0,1,0,0,1,0,589
897,0,42428,30760,3.2379,8.1889,0.7250,0,0,0.8779,0.0047,...,1.0668,0.2500,0.3448,0,1,0,0,1,0,499
898,36011,8002,604181,0.2222,16.7777,75.5037,1993,1271,0.6377,0.0553,...,1.1111,5.0002,0.0662,0,1,1,0,1,0,507


In [5]:
df_std = StandardScaling().transform(df)
df_std

[32m2024-03-16 15:05:45.904[0m | [1mINFO    [0m | [36mpreprocessing.standardscaling[0m:[36mtransform[0m:[36m14[0m - [1m_StandardScaling preprocessing class_ is starting[0m
[32m2024-03-16 15:05:45.910[0m | [1mINFO    [0m | [36mpreprocessing.standardscaling[0m:[36mtransform[0m:[36m19[0m - [1m_StandardScaling preprocessing class_ ended[0m


Unnamed: 0,INCOME,SAVINGS,DEBT,R_SAVINGS_INCOME,R_DEBT_INCOME,R_DEBT_SAVINGS,T_CLOTHING_12,T_CLOTHING_6,R_CLOTHING,R_CLOTHING_INCOME,...,R_EXPENDITURE_INCOME,R_EXPENDITURE_SAVINGS,R_EXPENDITURE_DEBT,CAT_GAMBLING,CAT_DEBT,CAT_CREDIT_CARD,CAT_MORTGAGE,CAT_SAVINGS_ACCOUNT,CAT_DEPENDENTS,CREDIT_SCORE
0,-0.776089,-0.940191,-0.258649,-1.038621,1.684624,-0.308809,-0.659417,-0.496046,0.205435,0.039203,...,0.348734,-0.557241,-0.426584,1.548644,0.245093,-0.553390,-0.454350,-11.294752,-0.416417,-2.234096
1,-0.390283,-0.733901,-0.486944,-0.739305,-0.333355,-0.152000,-0.135180,-0.661346,-1.830147,0.540088,...,-0.199584,-0.083924,-0.297272,-0.743757,0.245093,-0.553390,-0.454350,0.088537,-0.416417,0.596843
2,-0.796764,-0.891231,-0.255952,-0.861331,1.904907,1.321626,-0.757087,-0.512894,1.233379,-0.483225,...,0.348734,0.321829,-0.430390,1.548644,0.245093,-0.553390,-0.454350,0.088537,-0.416417,-1.843083
3,0.249780,1.712319,1.708888,0.941504,1.684624,-0.250114,-0.647542,-0.619525,-1.222264,-1.134913,...,-0.199584,-0.485677,-0.431199,1.548644,0.245093,1.807043,2.200944,0.088537,2.401441,-1.780520
4,0.528131,-0.173156,0.966797,-0.565839,0.554979,-0.045323,0.615752,-0.304783,-1.204920,0.206165,...,-1.661768,-0.337504,-0.419378,-0.743757,0.245093,-0.553390,-0.454350,0.088537,-0.416417,0.143267
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,-0.903015,-0.934866,-0.405365,-1.006962,2.510618,11.186012,-0.740408,-0.566011,0.042572,0.343504,...,1.856762,5.595329,-0.428689,-0.743757,0.245093,-0.553390,-0.454350,0.088537,-0.416417,-2.750235
896,-0.352958,-0.739290,-0.102137,-0.762327,0.390664,0.139581,-0.190953,-0.299630,-0.394410,0.295031,...,-0.199584,-0.044481,-0.389175,-0.743757,0.245093,-0.553390,-0.454350,0.088537,-0.416417,0.033784
897,-1.068540,-0.844207,-0.787137,-0.218554,0.361045,-0.341744,-0.911462,-0.683346,1.802767,-1.363812,...,0.751679,-0.403407,-0.198000,-0.743757,0.245093,-0.553390,-0.454350,0.088537,-0.416417,-1.373866
898,-0.751985,-0.922088,-0.182911,-0.982344,1.816404,4.843284,-0.645541,-0.431433,0.786668,-0.001191,...,1.018902,2.519567,-0.423588,-0.743757,0.245093,1.807043,-0.454350,0.088537,-0.416417,-1.248742


In [6]:
df = MinMaxScaling().transform(df)
df

[32m2024-03-16 15:05:45.935[0m | [1mINFO    [0m | [36mpreprocessing.minmax_scaler[0m:[36mfit[0m:[36m16[0m - [1m_MinMaxScaler preprocessing class_ is fitting[0m
[32m2024-03-16 15:05:45.937[0m | [1mINFO    [0m | [36mpreprocessing.minmax_scaler[0m:[36mfit[0m:[36m19[0m - [1m_MinMaxScaler preprocessing class_ fitting ended[0m
[32m2024-03-16 15:05:45.938[0m | [1mINFO    [0m | [36mpreprocessing.minmax_scaler[0m:[36mtransform[0m:[36m38[0m - [1m_MinMaxScaler preprocessing class_ is starting transformation[0m
[32m2024-03-16 15:05:45.941[0m | [1mINFO    [0m | [36mpreprocessing.minmax_scaler[0m:[36mtransform[0m:[36m41[0m - [1m_MinMaxScaler preprocessing class_ transformation ended[0m


Unnamed: 0,INCOME,SAVINGS,DEBT,R_SAVINGS_INCOME,R_DEBT_INCOME,R_DEBT_SAVINGS,T_CLOTHING_12,T_CLOTHING_6,R_CLOTHING,R_CLOTHING_INCOME,...,R_EXPENDITURE_INCOME,R_EXPENDITURE_SAVINGS,R_EXPENDITURE_DEBT,CAT_GAMBLING,CAT_DEBT,CAT_CREDIT_CARD,CAT_MORTGAGE,CAT_SAVINGS_ACCOUNT,CAT_DEPENDENTS,CREDIT_SCORE
0,0.050248,0.000000,0.089184,0.000000,0.432425,0.007187,0.043671,0.023674,0.472739,0.215062,...,0.249944,0.000000,0.006247,1.0,1.0,0.0,0.0,0.0,0.0,0.288
1,0.116536,0.031316,0.052885,0.073353,0.110563,0.020730,0.134505,0.002781,0.018048,0.289972,...,0.181777,0.076844,0.022208,0.0,1.0,0.0,0.0,1.0,0.0,0.650
2,0.046696,0.007432,0.089613,0.043448,0.467560,0.148008,0.026748,0.021544,0.702353,0.136931,...,0.249944,0.142719,0.005777,1.0,1.0,0.0,0.0,1.0,0.0,0.338
3,0.226510,0.402662,0.402024,0.485265,0.432425,0.012256,0.045729,0.008067,0.153832,0.039468,...,0.181777,0.011618,0.005677,1.0,1.0,1.0,1.0,1.0,1.0,0.346
4,0.274336,0.116439,0.284031,0.115863,0.252250,0.029944,0.264617,0.047848,0.157706,0.240032,...,0.000000,0.035675,0.007136,0.0,1.0,0.0,0.0,1.0,0.0,0.592
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,0.028440,0.000808,0.065856,0.007759,0.564169,1.000000,0.029638,0.014830,0.436360,0.260572,...,0.437420,0.998881,0.005987,0.0,1.0,0.0,0.0,1.0,0.0,0.222
896,0.122949,0.030498,0.114069,0.067711,0.226042,0.045914,0.124841,0.048499,0.338751,0.253323,...,0.181777,0.083248,0.010864,0.0,1.0,0.0,0.0,1.0,0.0,0.578
897,0.000000,0.014571,0.005154,0.200972,0.221318,0.004342,0.000000,0.000000,0.829538,0.005236,...,0.300037,0.024975,0.034462,0.0,1.0,0.0,0.0,1.0,0.0,0.398
898,0.054390,0.002748,0.101226,0.013792,0.453444,0.452175,0.046076,0.031840,0.602570,0.209021,...,0.333258,0.499525,0.006616,0.0,1.0,1.0,0.0,1.0,0.0,0.414


## Selecting features

### selected_features_5

In [7]:
features_5 = pd.read_json("Data/selected_features_5.json")[0]
df_with_5 = df[features_5]
df_with_5

Unnamed: 0,R_EDUCATION_INCOME,R_ENTERTAINMENT,R_EXPENDITURE_DEBT,R_DEBT_INCOME,CAT_GAMBLING,R_TAX_DEBT,R_EXPENDITURE,R_UTILITIES_DEBT,INCOME,R_ENTERTAINMENT_DEBT,R_GAMBLING_INCOME,R_DEBT_SAVINGS,DEBT
0,0.000000,0.293363,0.006247,0.432425,1.0,0.000000,0.711912,0.001717,0.050248,0.002075,0.191934,0.007187,0.089184
1,0.000000,0.287582,0.022208,0.110563,0.0,0.025080,0.320078,0.009089,0.116536,0.019536,0.000000,0.020730,0.052885
2,0.000000,0.573152,0.005777,0.467560,1.0,0.000000,0.694415,0.001616,0.046696,0.002433,0.188533,0.148008,0.089613
3,0.000000,0.314731,0.005677,0.432425,1.0,0.006369,0.501060,0.004343,0.226510,0.000787,0.056365,0.012256,0.402024
4,0.059034,0.303167,0.007136,0.252250,0.0,0.018710,0.483740,0.006766,0.274336,0.004616,0.000000,0.029944,0.284031
...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,0.000000,0.307190,0.005987,0.564169,0.0,0.000000,0.654472,0.002222,0.028440,0.002540,0.000000,1.000000,0.065856
896,0.000000,0.623680,0.010864,0.226042,0.0,0.016720,0.438848,0.005150,0.122949,0.015171,0.000000,0.045914,0.114069
897,0.000000,0.569382,0.034462,0.221318,0.0,0.000000,1.000000,0.008786,0.000000,0.000000,0.092323,0.004342,0.005154
898,0.503961,0.303419,0.006616,0.453444,0.0,0.000000,0.836515,0.001818,0.054390,0.002111,0.000000,0.452175,0.101226


In [8]:
y = df["CREDIT_SCORE"]
y

0      0.288
1      0.650
2      0.338
3      0.346
4      0.592
       ...  
895    0.222
896    0.578
897    0.398
898    0.414
899    0.714
Name: CREDIT_SCORE, Length: 900, dtype: float64

### With selected_features_10

In [9]:
features_10 = pd.read_json("Data/selected_features_10.json")[0]
df_with_10 = df[features_10]
df_with_10


Unnamed: 0,R_ENTERTAINMENT,CAT_CREDIT_CARD,R_CLOTHING_DEBT,R_HEALTH_DEBT,R_ENTERTAINMENT_DEBT,R_EDUCATION_INCOME,R_DEBT_INCOME,R_GROCERIES,R_EXPENDITURE_INCOME,R_TAX_DEBT,...,INCOME,R_EXPENDITURE_DEBT,T_TAX_12,T_FINES_12,R_EXPENDITURE,R_DEBT_SAVINGS,CAT_GAMBLING,T_FINES_6,R_GROCERIES_DEBT,R_UTILITIES_DEBT
0,0.293363,0.0,0.002762,0.000295,0.002075,0.000000,0.432425,0.783925,0.249944,0.000000,...,0.050248,0.006247,0.000000,0.000000,0.711912,0.007187,1.0,0.000000,0.003253,0.001717
1,0.287582,0.0,0.014518,0.000738,0.019536,0.000000,0.110563,0.494259,0.181777,0.025080,...,0.116536,0.022208,0.115970,0.000000,0.320078,0.020730,0.0,0.000000,0.005935,0.009089
2,0.573152,0.0,0.001736,0.000837,0.002433,0.000000,0.467560,0.758090,0.249944,0.000000,...,0.046696,0.005777,0.000000,0.025334,0.694415,0.148008,1.0,0.033272,0.002896,0.001616
3,0.314731,1.0,0.000631,0.001526,0.000787,0.000000,0.432425,0.454593,0.181777,0.006369,...,0.226510,0.005677,0.231646,0.000000,0.501060,0.012256,1.0,0.000000,0.005184,0.004343
4,0.303167,0.0,0.005365,0.002166,0.004616,0.059034,0.252250,0.714509,0.000000,0.018710,...,0.274336,0.007136,0.463704,0.000000,0.483740,0.029944,0.0,0.000000,0.005220,0.006766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,0.307190,0.0,0.002604,0.000886,0.002540,0.000000,0.564169,0.646399,0.437420,0.000000,...,0.028440,0.005987,0.000000,0.000000,0.654472,1.000000,0.0,0.000000,0.003611,0.002222
896,0.623680,0.0,0.006233,0.001280,0.015171,0.000000,0.226042,0.634656,0.181777,0.016720,...,0.122949,0.010864,0.169870,0.000000,0.438848,0.045914,0.0,0.000000,0.003682,0.005150
897,0.569382,0.0,0.000000,0.002363,0.000000,0.000000,0.221318,0.783142,0.300037,0.000000,...,0.000000,0.034462,0.000000,0.000000,1.000000,0.004342,0.0,0.000000,0.007079,0.008786
898,0.303419,1.0,0.002604,0.000689,0.002111,0.503961,0.453444,0.845251,0.333258,0.000000,...,0.054390,0.006616,0.000000,0.000000,0.836515,0.452175,0.0,0.000000,0.003575,0.001818


### With selected_features_15

In [10]:
features_15 = pd.read_json("Data/selected_features_15.json")[0]
df_with_15 = df[features_15]
df_with_15

Unnamed: 0,R_ENTERTAINMENT,CAT_CREDIT_CARD,R_CLOTHING_DEBT,R_GAMBLING,R_TAX_INCOME,R_HEALTH_DEBT,R_TAX_SAVINGS,R_ENTERTAINMENT_DEBT,R_EDUCATION_INCOME,R_TAX,...,R_HOUSING_DEBT,R_SAVINGS_INCOME,T_FINES_12,R_EXPENDITURE,R_DEBT_SAVINGS,CAT_GAMBLING,T_FINES_6,R_GROCERIES_DEBT,R_UTILITIES_DEBT,R_TRAVEL_DEBT
0,0.293363,0.0,0.002762,0.355319,0.000000,0.000295,0.169429,0.002075,0.000000,0.595138,...,0.001651,0.000000,0.000000,0.711912,0.007187,1.0,0.000000,0.003253,0.001717,0.004579
1,0.287582,0.0,0.014518,0.586304,0.298368,0.000738,0.132597,0.019536,0.000000,0.500900,...,0.015272,0.073353,0.000000,0.320078,0.020730,0.0,0.000000,0.005935,0.009089,0.008736
2,0.573152,0.0,0.001736,0.485265,0.000000,0.000837,0.000000,0.002433,0.000000,0.548319,...,0.001533,0.043448,0.025334,0.694415,0.148008,1.0,0.033272,0.002896,0.001616,0.004088
3,0.314731,1.0,0.000631,0.538310,0.306527,0.001526,0.020872,0.000787,0.000000,0.364046,...,0.005248,0.485265,0.000000,0.501060,0.012256,1.0,0.000000,0.005184,0.004343,0.001595
4,0.303167,0.0,0.005365,0.532697,0.505828,0.002166,0.143033,0.004616,0.059034,0.200180,...,0.005336,0.115863,0.000000,0.483740,0.029944,0.0,0.000000,0.005220,0.006766,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,0.307190,0.0,0.002604,0.557395,0.000000,0.000886,0.000000,0.002540,0.000000,0.532113,...,0.002064,0.007759,0.000000,0.654472,1.000000,0.0,0.000000,0.003611,0.002222,0.003870
896,0.623680,0.0,0.006233,0.364580,0.413753,0.001280,0.199509,0.015171,0.000000,0.499400,...,0.000000,0.067711,0.000000,0.438848,0.045914,0.0,0.000000,0.003682,0.005150,0.004933
897,0.569382,0.0,0.000000,0.706988,0.403263,0.002363,0.000000,0.000000,0.000000,0.484394,...,0.007489,0.200972,0.000000,1.000000,0.004342,0.0,0.000000,0.007079,0.008786,0.039004
898,0.303419,1.0,0.002604,0.680887,0.000000,0.000689,0.000000,0.002111,0.503961,0.473589,...,0.001739,0.013792,0.000000,0.836515,0.452175,0.0,0.000000,0.003575,0.001818,0.002985


In [11]:
df_list = [df_with_5, df_with_10, df_with_15]
y_list = [y, y, y]

for i in range(len(df_list)):
    check = CheckAndRemoveOutliers()
    df_list[i] = check.transform(df_list[i])

    print(df_list[i].shape)
    
    y_list[i] = y.loc[df_list[i].index]
    print(y_list[i].shape)

[32m2024-03-16 15:05:46.158[0m | [1mINFO    [0m | [36mpreprocessing.check_and_remove_outliers[0m:[36mtransform[0m:[36m31[0m - [1mNumber of outliers in dataframe: 63[0m


[32m2024-03-16 15:05:46.232[0m | [1mINFO    [0m | [36mpreprocessing.check_and_remove_outliers[0m:[36mtransform[0m:[36m31[0m - [1mNumber of outliers in dataframe: 33[0m
[32m2024-03-16 15:05:46.302[0m | [1mINFO    [0m | [36mpreprocessing.check_and_remove_outliers[0m:[36mtransform[0m:[36m31[0m - [1mNumber of outliers in dataframe: 42[0m


(837, 13)
(837,)
(867, 22)
(867,)
(858, 37)
(858,)


In [12]:
for i in range(3):
    print(df_list[i].shape, y_list[i].shape) 

(837, 13) (837,)
(867, 22) (867,)
(858, 37) (858,)


## Base Model(LinearRegression)

In [53]:
df = pd.read_csv("Data/cleaned_data.csv")

In [54]:
df

Unnamed: 0,INCOME,SAVINGS,DEBT,R_SAVINGS_INCOME,R_DEBT_INCOME,R_DEBT_SAVINGS,T_CLOTHING_12,T_CLOTHING_6,R_CLOTHING,R_CLOTHING_INCOME,...,R_EXPENDITURE_INCOME,R_EXPENDITURE_SAVINGS,R_EXPENDITURE_DEBT,CAT_GAMBLING,CAT_DEBT,CAT_CREDIT_CARD,CAT_MORTGAGE,CAT_SAVINGS_ACCOUNT,CAT_DEPENDENTS,CREDIT_SCORE
0,33269,0,532304,0.0000,16.0000,1.2000,1889,945,0.5003,0.0568,...,1.0000,0.0000,0.0625,2,1,0,0,0,0,444
1,77158,91187,315648,1.1818,4.0909,3.4615,5818,111,0.0191,0.0754,...,0.9091,0.7692,0.2222,0,1,0,0,1,0,625
2,30917,21642,534864,0.7000,17.3000,24.7142,1157,860,0.7433,0.0374,...,1.0000,1.4286,0.0578,2,1,0,0,1,0,469
3,149971,1172498,2399531,7.8182,16.0000,2.0465,1978,322,0.1628,0.0132,...,0.9091,0.1163,0.0568,2,1,1,1,1,1,473
4,181636,339055,1695274,1.8667,9.3334,5.0000,11446,1910,0.1669,0.0630,...,0.6667,0.3571,0.0714,0,1,0,0,1,0,596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,18830,2354,393068,0.1250,20.8746,166.9788,1282,592,0.4618,0.0681,...,1.2500,9.9987,0.0599,0,1,0,0,1,0,411
896,81404,88805,680837,1.0909,8.3637,7.6667,5400,1936,0.3585,0.0663,...,0.9091,0.8333,0.1087,0,1,0,0,1,0,589
897,0,42428,30760,3.2379,8.1889,0.7250,0,0,0.8779,0.0047,...,1.0668,0.2500,0.3448,0,1,0,0,1,0,499
898,36011,8002,604181,0.2222,16.7777,75.5037,1993,1271,0.6377,0.0553,...,1.1111,5.0002,0.0662,0,1,1,0,1,0,507


In [55]:
cols = pd.read_json("Data/selected_features_15.json")[0]

In [56]:
y = df["CREDIT_SCORE"]

In [57]:
X = df[cols]

In [58]:
X.shape, y.shape

((900, 37), (900,))

In [59]:
X["CREDIT_SCORE"] = y


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["CREDIT_SCORE"] = y


In [60]:
X.shape, y.shape

((900, 38), (900,))

In [61]:
model = LinearRegressionModel()

In [63]:
data_preprocessed = Pipeline(X, model=LinearRegressionModel).data_preprocessing(X)

[32m2024-03-16 17:09:43.500[0m | [1mINFO    [0m | [36mpipeline[0m:[36mdata_preprocessing[0m:[36m20[0m - [1mData Preprocessing[0m
[32m2024-03-16 17:09:43.501[0m | [1mINFO    [0m | [36mpreprocessing.check_nans[0m:[36mtransform[0m:[36m14[0m - [1mCheckNans is starting[0m
[32m2024-03-16 17:09:43.503[0m | [1mINFO    [0m | [36mpreprocessing.check_nans[0m:[36mtransform[0m:[36m17[0m - [1mNo NaNs in dataframe[0m
[32m2024-03-16 17:09:43.504[0m | [1mINFO    [0m | [36mpreprocessing.minmax_scaler[0m:[36mfit[0m:[36m16[0m - [1m_MinMaxScaler preprocessing class_ is fitting[0m
[32m2024-03-16 17:09:43.506[0m | [1mINFO    [0m | [36mpreprocessing.minmax_scaler[0m:[36mfit[0m:[36m19[0m - [1m_MinMaxScaler preprocessing class_ fitting ended[0m
[32m2024-03-16 17:09:43.507[0m | [1mINFO    [0m | [36mpreprocessing.minmax_scaler[0m:[36mtransform[0m:[36m38[0m - [1m_MinMaxScaler preprocessing class_ is starting transformation[0m
[32m2024-03-16 1

In [64]:
data_preprocessed.shape

(859, 38)

In [65]:
X = data_preprocessed.drop("CREDIT_SCORE", axis=1)
y = data_preprocessed["CREDIT_SCORE"]

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=63)

In [67]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((644, 37), (215, 37), (644,), (215,))

In [68]:
model.train(X_train, y_train)
pred_valid = model.predict(X_test)

[32m2024-03-16 17:12:41.406[0m | [1mINFO    [0m | [36mModels.abstract_model[0m:[36mtrain[0m:[36m75[0m - [1mModel was trained successfully: status:True[0m


In [69]:
from sklearn.metrics import mean_squared_error
import optuna
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error

In [70]:
predicted = MinMaxScaling().inverse_transform(pred_valid)
actual = MinMaxScaling().inverse_transform(y_test)
print(mean_squared_error(actual, predicted))
print(r2_score(actual, predicted))
print(mean_absolute_error(actual, predicted))
print(np.sqrt(mean_squared_error(actual, predicted)))

731.7819843850364
0.8258413548020719
19.88245030657026
27.051469172394988


In [37]:
model.train(X_train, y_train)
model.predict(X_test)
for sctype in ["MAE", "MSE", "RMSE", "R2", "MAPE"]:
        print(f"{sctype} -> {model.score(y_test, score_type=sctype)}")


[32m2024-03-16 15:34:08.787[0m | [1mINFO    [0m | [36mModels.abstract_model[0m:[36mtrain[0m:[36m75[0m - [1mModel was trained successfully: status:True[0m


MAE -> 19.99456239914598
MSE -> 776.162374763549
RMSE -> 27.85969085908078
R2 -> 0.7782401425160457
MAPE -> 0.033817467615376066


## Linear Regression

In [13]:
for i in range(3):
    linregmodel = LinearRegressionModel()
    print(f"{linregmodel} --> {df_list[i].shape} df")
    X_train, X_test, y_train, y_test = train_test_split(df_list[i], y_list[i], test_size=0.25, random_state=63)
    linregmodel.train(X_train, y_train)
    linregmodel.predict(X_test)
    for sctype in ["MAE", "MSE", "RMSE", "R2", "MAPE"]:
        print(f"{sctype} -> {linregmodel.score(y_test, score_type=sctype)}")
    print('FINISHED\n')

[32m2024-03-16 15:05:46.323[0m | [1mINFO    [0m | [36mModels.abstract_model[0m:[36mtrain[0m:[36m75[0m - [1mModel was trained successfully: status:True[0m
[32m2024-03-16 15:05:46.330[0m | [1mINFO    [0m | [36mModels.abstract_model[0m:[36mtrain[0m:[36m75[0m - [1mModel was trained successfully: status:True[0m
[32m2024-03-16 15:05:46.338[0m | [1mINFO    [0m | [36mModels.abstract_model[0m:[36mtrain[0m:[36m75[0m - [1mModel was trained successfully: status:True[0m


LinearRegression --> (837, 13) df
MAE -> 0.03778643607323955
MSE -> 0.0027068589969169777
RMSE -> 0.05202748309227516
R2 -> 0.757123432086953
MAPE -> 0.06570458704989628
FINISHED

LinearRegression --> (867, 22) df
MAE -> 0.04028951466222352
MSE -> 0.002862591800198103
RMSE -> 0.053503194299014546
R2 -> 0.8421832925743077
MAPE -> 0.08653734959700636
FINISHED

LinearRegression --> (858, 37) df
MAE -> 0.04082290008625606
MSE -> 0.003227586238444142
RMSE -> 0.05681184945452966
R2 -> 0.8316621486514271
MAPE -> 276464739469.02545
FINISHED



## Lasso Regression

In [14]:
params = {
    "alpha" : [0.001, 0.0001, 0.00001],
    "max_iter" : list(range(10, 200, 10)),
    "selection" : ["random", "cyclic"]
}

results = [0, 0, 0]
for i in range(3):
    results[i] = grid_search(LassoRegression, df_list[i], y_list[i], params)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [15]:
for i in range(3):
    print(results[i])

{'alpha': 0.0001, 'max_iter': 20, 'selection': 'random', 'best_score': 0.7849471585562691}
{'alpha': 0.0001, 'max_iter': 10, 'selection': 'cyclic', 'best_score': 0.8083251345545536}
{'alpha': 0.0001, 'max_iter': 180, 'selection': 'cyclic', 'best_score': 0.8395142414976027}


In [16]:
# Saving the grid search

json.dump(results, open("Data/LassoGridSearchResults.json", "w"), indent=4)

In [17]:
model1 = LassoRegression("LassoReg", alpha=0.0001, max_iter=20, selection="random")
model2 = LassoRegression("LassoReg", alpha=0.0001, max_iter=10, selection="cyclic")
model3 = LassoRegression("LassoReg", alpha=0.0001, max_iter=180, selection="cyclic")
models_list = [model1, model2, model3]

In [18]:
for dfi in [df_with_5, df_with_10, df_with_15]:
    for i in range(3):
        model = models_list[i]
    print(f"LASSOREG model with {dfi.shape} df")
    X_train, X_test, y_train, y_test = train_test_split(dfi, y, test_size=0.25, random_state=63)
    model.train(X_train, y_train)
    model.predict(X_test)
    for sctype in ["MAE", "MSE", "RMSE", "R2", "MAPE"]:
        print(f"{sctype} -> {model.score(y_test, score_type=sctype)}")
    print('FINISHED\n')

[32m2024-03-16 15:05:55.089[0m | [1mINFO    [0m | [36mModels.abstract_model[0m:[36mtrain[0m:[36m75[0m - [1mModel was trained successfully: status:True[0m
[32m2024-03-16 15:05:55.103[0m | [1mINFO    [0m | [36mModels.abstract_model[0m:[36mtrain[0m:[36m75[0m - [1mModel was trained successfully: status:True[0m
  model = cd_fast.enet_coordinate_descent(
[32m2024-03-16 15:05:55.123[0m | [1mINFO    [0m | [36mModels.abstract_model[0m:[36mtrain[0m:[36m75[0m - [1mModel was trained successfully: status:True[0m


LASSOREG model with (900, 13) df
MAE -> 0.03889914118567895
MSE -> 0.002823174297091141
RMSE -> 0.0531335515196485
R2 -> 0.7983454396491391
MAPE -> 0.07063773942409787
FINISHED

LASSOREG model with (900, 22) df
MAE -> 0.03931041985443405
MSE -> 0.002877999035164957
RMSE -> 0.053646985331563195
R2 -> 0.7944294014279007
MAPE -> 0.07155655442546105
FINISHED

LASSOREG model with (900, 37) df
MAE -> 0.038754823222253175
MSE -> 0.002811402478615518
RMSE -> 0.05302266004846907
R2 -> 0.7991862807129295
MAPE -> 0.0705947709300268
FINISHED



## Random Forest Regression

In [19]:
for dfi in [df_with_5, df_with_10, df_with_15]:
    linregmodel = RandomForestReg("RandomForestModelsss")
    print(f"RANDOMFOREST model with {dfi.shape} df")
    X_train, X_test, y_train, y_test = train_test_split(dfi, y, test_size=0.25, random_state=63)
    linregmodel.train(X_train, y_train)
    linregmodel.predict(X_test)
    for sctype in ["MAE", "MSE", "RMSE", "R2", "MAPE"]:
        print(f"{sctype} -> {linregmodel.score(y_test, score_type=sctype)}")
    print('FINISHED\n')

RANDOMFOREST model with (900, 13) df


[32m2024-03-16 15:05:55.349[0m | [1mINFO    [0m | [36mModels.abstract_model[0m:[36mtrain[0m:[36m75[0m - [1mModel was trained successfully: status:True[0m


MAE -> 0.04400568888888886
MSE -> 0.003508377916444441
RMSE -> 0.05923156182682034
R2 -> 0.7494025051821186
MAPE -> 0.07949561551024645
FINISHED

RANDOMFOREST model with (900, 22) df


[32m2024-03-16 15:05:55.591[0m | [1mINFO    [0m | [36mModels.abstract_model[0m:[36mtrain[0m:[36m75[0m - [1mModel was trained successfully: status:True[0m


MAE -> 0.04690515555555554
MSE -> 0.0037584380071111097
RMSE -> 0.06130610089633094
R2 -> 0.7315411362625164
MAPE -> 0.08486500667727752
FINISHED

RANDOMFOREST model with (900, 37) df


[32m2024-03-16 15:05:55.822[0m | [1mINFO    [0m | [36mModels.abstract_model[0m:[36mtrain[0m:[36m75[0m - [1mModel was trained successfully: status:True[0m


MAE -> 0.04647626666666666
MSE -> 0.0037896468515555537
RMSE -> 0.06156010763112386
R2 -> 0.7293119413410456
MAPE -> 0.08373068275003936
FINISHED



In [20]:
params = {
    "n_estimators": list(range(10, 100, 23)),
    "criterion" : ["squared_error", "absolute_error"],
    "max_depth": list(range(10, 100, 25)),
    "min_samples_split": list(range(2, 10, 3)),
    "min_samples_leaf": list(range(1, 10, 3))
}

result_forest = [0, 0, 0]
for i in range(3):
    result_forest[i] = grid_search(RandomForestReg, df_list[i], y_list[i], params)

100%|██████████| 288/288 [03:03<00:00,  1.57it/s]
100%|██████████| 288/288 [03:42<00:00,  1.29it/s]
100%|██████████| 288/288 [05:16<00:00,  1.10s/it]


In [21]:
result_forest

[{'n_estimators': 79,
  'criterion': 'squared_error',
  'max_depth': 10,
  'min_samples_split': 5,
  'min_samples_leaf': 1,
  'best_score': 0.7196651703481908},
 {'n_estimators': 79,
  'criterion': 'squared_error',
  'max_depth': 35,
  'min_samples_split': 8,
  'min_samples_leaf': 1,
  'best_score': 0.7527890383886096},
 {'n_estimators': 79,
  'criterion': 'squared_error',
  'max_depth': 35,
  'min_samples_split': 2,
  'min_samples_leaf': 1,
  'best_score': 0.7734889635931518}]

### SAVING GRID SEARCH result

In [22]:
json.dump(result_forest, open("Data/RandomForestGridSearchResults.json", "w"), indent=4)

In [23]:
model1 = RandomForestReg("RandomForestModelsss", 
                         n_estimators=79, 
                         criterion="squared_error", 
                         max_depth=10, 
                         min_samples_split=5, 
                         min_samples_leaf=1)
model2 = RandomForestReg("RandomForestModelsss", 
                         n_estimators=79, 
                         criterion="squared_error", 
                         max_depth=35, 
                         min_samples_split=8, 
                         min_samples_leaf=1)
model3 = RandomForestReg("RandomForestModelsss", 
                         n_estimators=79, 
                         criterion="squared_error", 
                         max_depth=35, 
                         min_samples_split=2, 
                         min_samples_leaf=1)
model_list = [model1, model2, model3]

In [24]:
for dfi in [df_with_5, df_with_10, df_with_15]:
    for i in range(3):
        model = model_list[i]
    print(f"RANDOMFOREST model with {dfi.shape} df")
    X_train, X_test, y_train, y_test = train_test_split(dfi, y, test_size=0.25, random_state=63)
    model.train(X_train, y_train)
    model.predict(X_test)
    for sctype in ["MAE", "MSE", "RMSE", "R2", "MAPE"]:
        print(f"{sctype} -> {model.score(y_test, score_type=sctype)}")
    print('FINISHED\n')

[32m2024-03-16 15:17:58.323[0m | [1mINFO    [0m | [36mModels.abstract_model[0m:[36mtrain[0m:[36m75[0m - [1mModel was trained successfully: status:True[0m


RANDOMFOREST model with (900, 13) df
MAE -> 0.04461097046413501
MSE -> 0.003628745598461784
RMSE -> 0.06023907036518562
R2 -> 0.7408048454405047
MAPE -> 0.08045619351607129
FINISHED

RANDOMFOREST model with (900, 22) df


[32m2024-03-16 15:17:58.456[0m | [1mINFO    [0m | [36mModels.abstract_model[0m:[36mtrain[0m:[36m75[0m - [1mModel was trained successfully: status:True[0m
[32m2024-03-16 15:17:58.635[0m | [1mINFO    [0m | [36mModels.abstract_model[0m:[36mtrain[0m:[36m75[0m - [1mModel was trained successfully: status:True[0m


MAE -> 0.047962194092826996
MSE -> 0.003888101129092558
RMSE -> 0.062354639996495514
R2 -> 0.7222795189816317
MAPE -> 0.08658165654052546
FINISHED

RANDOMFOREST model with (900, 37) df
MAE -> 0.046327651195499295
MSE -> 0.003757963451726041
RMSE -> 0.061302230397645735
R2 -> 0.7315750329502402
MAPE -> 0.08349243405953839
FINISHED



## Ridge regression

In [25]:
for dfi in [df_with_5, df_with_10, df_with_15]:
    model = RidgeRegression("Ridge model")
    print(f"RIDGE model with {dfi.shape} df")
    X_train, X_test, y_train, y_test = train_test_split(dfi, y, test_size=0.25, random_state=63)
    model.train(X_train, y_train)
    model.predict(X_test)
    for sctype in ["MAE", "MSE", "RMSE", "R2", "MAPE"]:
        print(f"{sctype} -> {model.score(y_test, score_type=sctype)}")
    print('FINISHED\n')

[32m2024-03-16 15:17:58.650[0m | [1mINFO    [0m | [36mModels.abstract_model[0m:[36mtrain[0m:[36m75[0m - [1mModel was trained successfully: status:True[0m
[32m2024-03-16 15:17:58.655[0m | [1mINFO    [0m | [36mModels.abstract_model[0m:[36mtrain[0m:[36m75[0m - [1mModel was trained successfully: status:True[0m
[32m2024-03-16 15:17:58.661[0m | [1mINFO    [0m | [36mModels.abstract_model[0m:[36mtrain[0m:[36m75[0m - [1mModel was trained successfully: status:True[0m


RIDGE model with (900, 13) df
MAE -> 0.03890428156859313
MSE -> 0.0028051353138695005
RMSE -> 0.05296352814786323
R2 -> 0.7996339336803027
MAPE -> 0.07020712680693414
FINISHED

RIDGE model with (900, 22) df
MAE -> 0.03950430415315832
MSE -> 0.002876505841843807
RMSE -> 0.053633066683192815
R2 -> 0.7945360576988245
MAPE -> 0.0715881757206933
FINISHED

RIDGE model with (900, 37) df
MAE -> 0.03907731535070195
MSE -> 0.002810013973654255
RMSE -> 0.053009564926098526
R2 -> 0.7992854592715459
MAPE -> 0.07092845704752176
FINISHED



In [26]:
params = {
    "alpha" : np.linspace(0.5, 2, 4),
    "max_iter" : list(range(100, 200, 20))
}

results_ridge = [0, 0, 0]
for i in range(3):
    results_ridge[i] = grid_search(RidgeRegression, df_list[i], y_list[i], params)

100%|██████████| 20/20 [00:00<00:00, 84.29it/s]
100%|██████████| 20/20 [00:00<00:00, 72.92it/s]
100%|██████████| 20/20 [00:00<00:00, 38.43it/s]


In [27]:
results_ridge

[{'alpha': 0.5, 'max_iter': 100, 'best_score': 0.783990125231396},
 {'alpha': 0.5, 'max_iter': 100, 'best_score': 0.8066143035157027},
 {'alpha': 0.5, 'max_iter': 100, 'best_score': 0.8353821261994646}]

In [28]:
json.dump(results_ridge, open("Data/RidgeGridSearchResults.json", "w"), indent=4)

In [29]:
for dfi in [df_with_5, df_with_10, df_with_15]:
    linregmodel = RidgeRegression("Ridge model", alpha=0.5, max_iter=100)
    print(f"RIDGE model with {dfi.shape} df")
    X_train, X_test, y_train, y_test = train_test_split(dfi, y, test_size=0.25, random_state=63)
    linregmodel.train(X_train, y_train)
    linregmodel.predict(X_test)
    for sctype in ["MAE", "MSE", "RMSE", "R2", "MAPE"]:
        print(f"{sctype} -> {linregmodel.score(y_test, score_type=sctype)}")
    print('FINISHED\n')

[32m2024-03-16 15:17:59.759[0m | [1mINFO    [0m | [36mModels.abstract_model[0m:[36mtrain[0m:[36m75[0m - [1mModel was trained successfully: status:True[0m
[32m2024-03-16 15:17:59.774[0m | [1mINFO    [0m | [36mModels.abstract_model[0m:[36mtrain[0m:[36m75[0m - [1mModel was trained successfully: status:True[0m
[32m2024-03-16 15:17:59.789[0m | [1mINFO    [0m | [36mModels.abstract_model[0m:[36mtrain[0m:[36m75[0m - [1mModel was trained successfully: status:True[0m


RIDGE model with (900, 13) df
MAE -> 0.0386794756620964
MSE -> 0.0027853018306432126
RMSE -> 0.052775958832059246
R2 -> 0.801050605808674
MAPE -> 0.06984114213946244
FINISHED

RIDGE model with (900, 22) df
MAE -> 0.03925152120949069
MSE -> 0.002859376262820805
RMSE -> 0.05347313589851267
R2 -> 0.7957595945277188
MAPE -> 0.0711709927091428
FINISHED

RIDGE model with (900, 37) df
MAE -> 0.0387998415362179
MSE -> 0.0028044830414392266
RMSE -> 0.05295737003892118
R2 -> 0.7996805243956869
MAPE -> 0.07047252854384749
FINISHED

