In [1]:
import pandas as pd 
%matplotlib inline 
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sb

In [2]:
# pip install pycaret
# Tutorial source: https://github.com/pycaret/pycaret/blob/master/tutorials/Regression%20Tutorial%20Level%20Beginner%20-%20REG101.ipynb

In [3]:
# Import CSV
dataset = pd.read_csv('credit_master.csv')

In [4]:
dataset.shape

(5426, 26)

In [5]:
dataset.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,...,MONTHS_0,MONTHS_1,MONTHS_2,MONTHS_3,MONTHS_4,MONTHS_5,MONTHS_C,MONTHS_X,MONTHS_TOT,RISK
0,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,...,1.0,1.0,0.0,0.0,0.0,0.0,12.0,1.0,15.0,0
1,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,...,7.0,0.0,0.0,0.0,0.0,0.0,7.0,16.0,30.0,0
2,5008811,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,6.0,0.0,0.0,0.0,0.0,0.0,27.0,6.0,39.0,0
3,5008814,F,N,Y,0,283500.0,Pensioner,Higher education,Separated,House / apartment,...,14.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,17.0,0
4,5008824,M,Y,Y,0,135000.0,Commercial associate,Secondary / secondary special,Married,House / apartment,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0


In [6]:
dataset.dtypes

ID                       int64
CODE_GENDER             object
FLAG_OWN_CAR            object
FLAG_OWN_REALTY         object
CNT_CHILDREN             int64
AMT_INCOME_TOTAL       float64
NAME_INCOME_TYPE        object
NAME_EDUCATION_TYPE     object
NAME_FAMILY_STATUS      object
NAME_HOUSING_TYPE       object
DAYS_BIRTH               int64
DAYS_EMPLOYED            int64
FLAG_WORK_PHONE          int64
FLAG_PHONE               int64
FLAG_EMAIL               int64
CNT_FAM_MEMBERS        float64
MONTHS_0               float64
MONTHS_1               float64
MONTHS_2               float64
MONTHS_3               float64
MONTHS_4               float64
MONTHS_5               float64
MONTHS_C               float64
MONTHS_X               float64
MONTHS_TOT             float64
RISK                     int64
dtype: object

#### Regression

In [7]:
# Withhold data to be used for predictions to check model accuracies
data = dataset.sample(frac=0.7, random_state=786)
data_unseen = dataset.drop(data.index)

data.reset_index(drop=True, inplace=True)
data_unseen.reset_index(drop=True, inplace=True)

print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))

Data for Modeling: (3798, 26)
Unseen Data For Predictions: (1628, 26)


In [8]:
# Prepare environment and data for modeling and deployment
from pycaret.regression import *
exp_reg101 = setup(data = data, target = 'RISK', session_id=123)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,RISK
2,Original Data,"(3798, 26)"
3,Missing Values,False
4,Numeric Features,14
5,Categorical Features,11
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(2658, 47)"


In [9]:
# Compare models to evaluate performance
best = compare_models(exclude = ['ransac'])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
dt,Decision Tree Regressor,0.0,0.0,0.0,1.0,0.0,0.0,0.432
ada,AdaBoost Regressor,0.0,0.0,0.0,1.0,0.0,0.0,0.006
gbr,Gradient Boosting Regressor,0.0,0.0,0.0,1.0,0.0,0.0,0.056
rf,Random Forest Regressor,0.0002,0.0,0.0034,0.9998,0.0,0.001,0.278
et,Extra Trees Regressor,0.0217,0.0039,0.0617,0.9747,0.0,0.0534,0.081
br,Bayesian Ridge,0.279,0.1182,0.3434,0.2357,0.0,0.6061,0.006
ridge,Ridge Regression,0.279,0.119,0.3447,0.2296,0.0,0.5991,0.006
lar,Least Angle Regression,0.2792,0.1191,0.3448,0.2287,0.0,0.5988,0.006
lr,Linear Regression,0.2789,0.1192,0.3449,0.2284,0.0,0.5991,0.439
omp,Orthogonal Matching Pursuit,0.28,0.1199,0.3459,0.2236,0.0,0.6097,0.005


In [10]:
# Create models for Extra Trees, Bayesian Ridge, and Ridge Regression since they are the least overfit
# BR
br = create_model('br')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.2761,0.1115,0.3339,0.2913,-0.0,0.5924
1,0.2796,0.1212,0.3481,0.2405,-0.0,0.6106
2,0.2723,0.1133,0.3367,0.2575,-0.0,0.6286
3,0.2809,0.1182,0.3437,0.2137,-0.0,0.6157
4,0.2855,0.1187,0.3446,0.222,-0.0,0.5848
5,0.2552,0.0996,0.3155,0.1287,-0.0,0.6023
6,0.3023,0.1381,0.3716,0.2942,-0.0,0.6269
7,0.2879,0.1299,0.3604,0.1356,-0.0,0.596
8,0.2884,0.1229,0.3506,0.2811,-0.0,0.5881
9,0.2621,0.1083,0.3292,0.2922,-0.0,0.6151


In [11]:
print(br)

BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None,
              compute_score=False, copy_X=True, fit_intercept=True,
              lambda_1=1e-06, lambda_2=1e-06, lambda_init=None, n_iter=300,
              normalize=False, tol=0.001, verbose=False)


In [12]:
# Ridge
ridge = create_model('ridge')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.2717,0.1099,0.3314,0.3015,-0.0,0.5819
1,0.2821,0.1239,0.352,0.2233,-0.0,0.6102
2,0.2736,0.1168,0.3418,0.2348,-0.0,0.6321
3,0.276,0.1156,0.34,0.2308,-0.0,0.6046
4,0.2866,0.1203,0.3469,0.2118,-0.0,0.5654
5,0.26,0.1031,0.3211,0.0979,-0.0,0.5961
6,0.3014,0.1379,0.3713,0.2953,-0.0,0.6194
7,0.2887,0.13,0.3606,0.1346,-0.0,0.5918
8,0.2888,0.1245,0.3528,0.2718,-0.0,0.5837
9,0.2613,0.108,0.3287,0.2943,-0.0,0.6058


In [13]:
print(ridge)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=123, solver='auto', tol=0.001)


In [14]:
# ET
et = create_model('et')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.0182,0.0024,0.0492,0.9846,-0.0,0.0463
1,0.0238,0.0064,0.08,0.9599,-0.0,0.0606
2,0.0183,0.0021,0.0456,0.9864,-0.0,0.0352
3,0.0229,0.0036,0.0601,0.9759,-0.0,0.0653
4,0.0216,0.0038,0.0617,0.975,-0.0,0.067
5,0.0194,0.0034,0.0585,0.9701,-0.0,0.0517
6,0.0227,0.004,0.0636,0.9793,-0.0,0.0499
7,0.0182,0.0026,0.0507,0.9829,-0.0,0.0412
8,0.025,0.0053,0.073,0.9688,-0.0,0.0353
9,0.0266,0.0056,0.0746,0.9637,-0.0,0.0812


In [15]:
print(et)

ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=100, n_jobs=-1, oob_score=False,
                    random_state=123, verbose=0, warm_start=False)


In [16]:
# Tune models
# BR

tuned_br = tune_model(br)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.2758,0.1113,0.3336,0.2923,-0.0,0.5919
1,0.2797,0.1213,0.3483,0.2399,-0.0,0.6107
2,0.2723,0.1134,0.3367,0.2572,-0.0,0.6287
3,0.2806,0.118,0.3435,0.215,-0.0,0.6153
4,0.2855,0.1186,0.3444,0.2227,-0.0,0.5839
5,0.2552,0.0996,0.3156,0.1285,-0.0,0.602
6,0.3022,0.1381,0.3716,0.2944,-0.0,0.6268
7,0.288,0.1299,0.3604,0.1358,-0.0,0.5958
8,0.2884,0.1229,0.3506,0.2811,-0.0,0.5878
9,0.2621,0.1083,0.3291,0.2924,-0.0,0.6148


In [17]:
print(tuned_br)

BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None,
              compute_score=False, copy_X=True, fit_intercept=True,
              lambda_1=0.05, lambda_2=0.0001, lambda_init=None, n_iter=300,
              normalize=False, tol=0.001, verbose=False)


In [18]:
# ridge

tuned_ridge = tune_model(ridge)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.272,0.1099,0.3315,0.3014,-0.0,0.5827
1,0.2819,0.1239,0.3519,0.2236,-0.0,0.6108
2,0.273,0.1163,0.341,0.2381,-0.0,0.6319
3,0.2761,0.1157,0.3401,0.2302,-0.0,0.6063
4,0.286,0.1194,0.3456,0.2176,-0.0,0.5666
5,0.2588,0.1022,0.3196,0.1058,-0.0,0.5964
6,0.3014,0.138,0.3714,0.2949,-0.0,0.6206
7,0.2886,0.13,0.3605,0.1352,-0.0,0.592
8,0.2884,0.1242,0.3524,0.2738,-0.0,0.5836
9,0.2614,0.108,0.3287,0.2943,-0.0,0.6064


In [19]:
print(tuned_ridge)

Ridge(alpha=8.6, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=123, solver='auto', tol=0.001)


In [20]:
# ET

tuned_et = tune_model(et)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.2665,0.1178,0.3432,0.2513,-0.0,0.6767
1,0.2767,0.1264,0.3556,0.2077,-0.0,0.6989
2,0.268,0.1233,0.3512,0.1921,-0.0,0.7156
3,0.2657,0.1182,0.3438,0.2135,-0.0,0.6973
4,0.2679,0.1186,0.3443,0.2233,-0.0,0.6897
5,0.2395,0.0932,0.3053,0.1843,-0.0,0.6876
6,0.3049,0.1593,0.3991,0.1858,-0.0,0.7182
7,0.2684,0.1194,0.3456,0.2052,-0.0,0.6967
8,0.2822,0.1313,0.3623,0.2322,-0.0,0.6915
9,0.2729,0.1233,0.3511,0.1947,-0.0,0.6998


In [21]:
print(tuned_et)

ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=8, max_features='sqrt', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0001,
                    min_impurity_split=None, min_samples_leaf=2,
                    min_samples_split=5, min_weight_fraction_leaf=0.0,
                    n_estimators=240, n_jobs=-1, oob_score=False,
                    random_state=123, verbose=0, warm_start=False)


In [22]:
# Evaluate models
# BR
evaluate_model(tuned_br)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [23]:
# RIDGE
evaluate_model(tuned_ridge)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [24]:
# ET
evaluate_model(tuned_et)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [25]:
# Predict model on hold-out sample
# BR
predict_model(tuned_br);

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Bayesian Ridge,0.2683,0.1088,0.3298,0.2707,0.2362,0.5995


In [26]:
# RIDGE
predict_model(tuned_ridge);

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Ridge Regression,0.2691,0.1102,0.332,0.2613,0.2381,0.5982


In [27]:
# ET
predict_model(tuned_et);

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,0.2651,0.118,0.3436,0.2087,0.239,0.7021


In [28]:
# Finalize model for deployment
# BR
final_br = finalize_model(tuned_br)

In [29]:
print(final_br)

BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None,
              compute_score=False, copy_X=True, fit_intercept=True,
              lambda_1=0.05, lambda_2=0.0001, lambda_init=None, n_iter=300,
              normalize=False, tol=0.001, verbose=False)


In [30]:
# RIDGE
final_ridge = finalize_model(tuned_ridge)

In [31]:
print(final_ridge)

Ridge(alpha=8.6, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=123, solver='auto', tol=0.001)


In [32]:
# ET
final_et = finalize_model(tuned_et)

In [33]:
print(final_et)

ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=8, max_features='sqrt', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0001,
                    min_impurity_split=None, min_samples_leaf=2,
                    min_samples_split=5, min_weight_fraction_leaf=0.0,
                    n_estimators=240, n_jobs=-1, oob_score=False,
                    random_state=123, verbose=0, warm_start=False)


In [34]:
# Predict on unseen data
# BR
unseen_pred_br = predict_model(final_br, data = data_unseen)

In [35]:
unseen_pred_br.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,...,MONTHS_1,MONTHS_2,MONTHS_3,MONTHS_4,MONTHS_5,MONTHS_C,MONTHS_X,MONTHS_TOT,RISK,Label
0,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,...,0.0,0.0,0.0,0.0,0.0,7.0,16.0,30.0,0,0.306137
1,5008824,M,Y,Y,0,135000.0,Commercial associate,Secondary / secondary special,Married,House / apartment,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0,0.253283
2,5008826,F,Y,N,0,130500.0,Working,Incomplete higher,Married,House / apartment,...,7.0,0.0,0.0,0.0,0.0,18.0,0.0,30.0,0,0.299521
3,5008837,M,Y,Y,3,270000.0,Working,Secondary / secondary special,Married,House / apartment,...,0.0,0.0,0.0,0.0,0.0,16.0,0.0,17.0,0,-0.030346
4,5008843,M,N,Y,1,405000.0,Commercial associate,Higher education,Married,House / apartment,...,0.0,0.0,0.0,0.0,0.0,20.0,0.0,30.0,0,-0.307348


In [36]:
# RIDGE
unseen_pred_ridge = predict_model(final_ridge, data = data_unseen)

In [37]:
unseen_pred_ridge.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,...,MONTHS_1,MONTHS_2,MONTHS_3,MONTHS_4,MONTHS_5,MONTHS_C,MONTHS_X,MONTHS_TOT,RISK,Label
0,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,...,0.0,0.0,0.0,0.0,0.0,7.0,16.0,30.0,0,0.268235
1,5008824,M,Y,Y,0,135000.0,Commercial associate,Secondary / secondary special,Married,House / apartment,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0,0.220147
2,5008826,F,Y,N,0,130500.0,Working,Incomplete higher,Married,House / apartment,...,7.0,0.0,0.0,0.0,0.0,18.0,0.0,30.0,0,0.272183
3,5008837,M,Y,Y,3,270000.0,Working,Secondary / secondary special,Married,House / apartment,...,0.0,0.0,0.0,0.0,0.0,16.0,0.0,17.0,0,-0.008024
4,5008843,M,N,Y,1,405000.0,Commercial associate,Higher education,Married,House / apartment,...,0.0,0.0,0.0,0.0,0.0,20.0,0.0,30.0,0,-0.311763


In [38]:
# ET
unseen_pred_et = predict_model(final_et, data = data_unseen)

In [39]:
unseen_pred_et.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,...,MONTHS_1,MONTHS_2,MONTHS_3,MONTHS_4,MONTHS_5,MONTHS_C,MONTHS_X,MONTHS_TOT,RISK,Label
0,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,...,0.0,0.0,0.0,0.0,0.0,7.0,16.0,30.0,0,0.150099
1,5008824,M,Y,Y,0,135000.0,Commercial associate,Secondary / secondary special,Married,House / apartment,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0,0.112931
2,5008826,F,Y,N,0,130500.0,Working,Incomplete higher,Married,House / apartment,...,7.0,0.0,0.0,0.0,0.0,18.0,0.0,30.0,0,0.146211
3,5008837,M,Y,Y,3,270000.0,Working,Secondary / secondary special,Married,House / apartment,...,0.0,0.0,0.0,0.0,0.0,16.0,0.0,17.0,0,0.103564
4,5008843,M,N,Y,1,405000.0,Commercial associate,Higher education,Married,House / apartment,...,0.0,0.0,0.0,0.0,0.0,20.0,0.0,30.0,0,0.079052
