 ### Import necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from scipy.stats import chi2_contingency
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
import warnings
import os

### Load the dataset

In [2]:
# Read the two raw case-study workbooks (paths are relative to the notebook's working directory).
a1, a2 = (pd.read_excel(workbook) for workbook in ('case_study1.xlsx', 'case_study2.xlsx'))

In [3]:
# Work on copies so the raw frames a1 / a2 stay available unchanged.
df1, df2 = a1.copy(), a2.copy()

In [4]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51336 entries, 0 to 51335
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   PROSPECTID            51336 non-null  int64  
 1   Total_TL              51336 non-null  int64  
 2   Tot_Closed_TL         51336 non-null  int64  
 3   Tot_Active_TL         51336 non-null  int64  
 4   Total_TL_opened_L6M   51336 non-null  int64  
 5   Tot_TL_closed_L6M     51336 non-null  int64  
 6   pct_tl_open_L6M       51336 non-null  float64
 7   pct_tl_closed_L6M     51336 non-null  float64
 8   pct_active_tl         51336 non-null  float64
 9   pct_closed_tl         51336 non-null  float64
 10  Total_TL_opened_L12M  51336 non-null  int64  
 11  Tot_TL_closed_L12M    51336 non-null  int64  
 12  pct_tl_open_L12M      51336 non-null  float64
 13  pct_tl_closed_L12M    51336 non-null  float64
 14  Tot_Missed_Pmnt       51336 non-null  int64  
 15  Auto_TL            

In [5]:
df1.shape

(51336, 26)

In [6]:
df2.shape

(51336, 62)

In [7]:
df1.head()

Unnamed: 0,PROSPECTID,Total_TL,Tot_Closed_TL,Tot_Active_TL,Total_TL_opened_L6M,Tot_TL_closed_L6M,pct_tl_open_L6M,pct_tl_closed_L6M,pct_active_tl,pct_closed_tl,...,CC_TL,Consumer_TL,Gold_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,Other_TL,Age_Oldest_TL,Age_Newest_TL
0,1,5,4,1,0,0,0.0,0.0,0.2,0.8,...,0,0,1,0,4,1,4,0,72,18
1,2,1,0,1,0,0,0.0,0.0,1.0,0.0,...,0,1,0,0,0,0,1,0,7,7
2,3,8,0,8,1,0,0.125,0.0,1.0,0.0,...,0,6,1,0,0,2,6,0,47,2
3,4,1,0,1,1,0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,1,1,5,5
4,5,3,2,1,0,0,0.0,0.0,0.333,0.667,...,0,0,0,0,0,3,0,2,131,32


In [8]:
df2.head()

Unnamed: 0,PROSPECTID,time_since_recent_payment,time_since_first_deliquency,time_since_recent_deliquency,num_times_delinquent,max_delinquency_level,max_recent_level_of_deliq,num_deliq_6mts,num_deliq_12mts,num_deliq_6_12mts,...,pct_CC_enq_L6m_of_L12m,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,max_unsec_exposure_inPct,HL_Flag,GL_Flag,last_prod_enq2,first_prod_enq2,Credit_Score,Approved_Flag
0,1,549,35,15,11,29,29,0,0,0,...,0.0,0.0,0.0,13.333,1,0,PL,PL,696,P2
1,2,47,-99999,-99999,0,-99999,0,0,0,0,...,0.0,0.0,0.0,0.86,0,0,ConsumerLoan,ConsumerLoan,685,P2
2,3,302,11,3,9,25,25,1,9,8,...,0.0,0.0,0.0,5741.667,1,0,ConsumerLoan,others,693,P2
3,4,-99999,-99999,-99999,0,-99999,0,0,0,0,...,0.0,0.0,0.0,9.9,0,0,others,others,673,P2
4,5,583,-99999,-99999,0,-99999,0,0,0,0,...,0.0,0.0,0.0,-99999.0,0,0,AL,AL,753,P1


### Remove nulls (missing values are encoded as the sentinel -99999)

In [9]:
# Drop the rows whose Age_Oldest_TL holds the -99999 sentinel.
age_is_known = df1['Age_Oldest_TL'].ne(-99999)
df1 = df1.loc[age_is_known]

In [10]:
# Count the -99999 sentinel in every df2 column and collect the columns
# where it appears in more than 10,000 rows.
sentinel_counts = (df2 == -99999).sum()
columns_to_be_removed = [name for name in df2.columns if sentinel_counts[name] > 10000]

In [11]:
len(columns_to_be_removed)

8

In [12]:
# Remove the columns flagged above as mostly-missing.
df2 = df2.drop(columns=columns_to_be_removed)

In [13]:
df2.shape

(51336, 54)

In [14]:
# Keep only the rows in which none of the remaining columns holds the -99999 sentinel.
row_has_no_sentinel = (df2 != -99999).all(axis=1)
df2 = df2.loc[row_has_no_sentinel]

In [15]:
df2.shape

(42066, 54)

In [16]:
df2.isna().sum()

PROSPECTID                    0
time_since_recent_payment     0
num_times_delinquent          0
max_recent_level_of_deliq     0
num_deliq_6mts                0
num_deliq_12mts               0
num_deliq_6_12mts             0
num_times_30p_dpd             0
num_times_60p_dpd             0
num_std                       0
num_std_6mts                  0
num_std_12mts                 0
num_sub                       0
num_sub_6mts                  0
num_sub_12mts                 0
num_dbt                       0
num_dbt_6mts                  0
num_dbt_12mts                 0
num_lss                       0
num_lss_6mts                  0
num_lss_12mts                 0
recent_level_of_deliq         0
tot_enq                       0
CC_enq                        0
CC_enq_L6m                    0
CC_enq_L12m                   0
PL_enq                        0
PL_enq_L6m                    0
PL_enq_L12m                   0
time_since_recent_enq         0
enq_L12m                      0
enq_L6m 

In [17]:
df1.isna().sum()

PROSPECTID              0
Total_TL                0
Tot_Closed_TL           0
Tot_Active_TL           0
Total_TL_opened_L6M     0
Tot_TL_closed_L6M       0
pct_tl_open_L6M         0
pct_tl_closed_L6M       0
pct_active_tl           0
pct_closed_tl           0
Total_TL_opened_L12M    0
Tot_TL_closed_L12M      0
pct_tl_open_L12M        0
pct_tl_closed_L12M      0
Tot_Missed_Pmnt         0
Auto_TL                 0
CC_TL                   0
Consumer_TL             0
Gold_TL                 0
Home_TL                 0
PL_TL                   0
Secured_TL              0
Unsecured_TL            0
Other_TL                0
Age_Oldest_TL           0
Age_Newest_TL           0
dtype: int64

### checking common columns name

In [18]:
# Print every column name present in both frames, in df1's column order.
df2_column_names = set(df2.columns)
for name in df1.columns:
    if name in df2_column_names:
        print(name)

PROSPECTID


### Merge the two dataframes, inner join so that no nulls are present

In [19]:
# Inner join on the shared key: only prospects that survived the cleaning of both frames remain.
df = df1.merge(df2, on='PROSPECTID', how='inner')

In [20]:
df.shape

(42064, 79)

In [21]:
df.head()

Unnamed: 0,PROSPECTID,Total_TL,Tot_Closed_TL,Tot_Active_TL,Total_TL_opened_L6M,Tot_TL_closed_L6M,pct_tl_open_L6M,pct_tl_closed_L6M,pct_active_tl,pct_closed_tl,...,pct_PL_enq_L6m_of_L12m,pct_CC_enq_L6m_of_L12m,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,HL_Flag,GL_Flag,last_prod_enq2,first_prod_enq2,Credit_Score,Approved_Flag
0,1,5,4,1,0,0,0.0,0.0,0.2,0.8,...,0.0,0.0,0.0,0.0,1,0,PL,PL,696,P2
1,2,1,0,1,0,0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0,0,ConsumerLoan,ConsumerLoan,685,P2
2,3,8,0,8,1,0,0.125,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1,0,ConsumerLoan,others,693,P2
3,5,3,2,1,0,0,0.0,0.0,0.333,0.667,...,0.0,0.0,0.0,0.0,0,0,AL,AL,753,P1
4,6,6,5,1,0,0,0.0,0.0,0.167,0.833,...,1.0,0.0,0.429,0.0,1,0,ConsumerLoan,PL,668,P3


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42064 entries, 0 to 42063
Data columns (total 79 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   PROSPECTID                  42064 non-null  int64  
 1   Total_TL                    42064 non-null  int64  
 2   Tot_Closed_TL               42064 non-null  int64  
 3   Tot_Active_TL               42064 non-null  int64  
 4   Total_TL_opened_L6M         42064 non-null  int64  
 5   Tot_TL_closed_L6M           42064 non-null  int64  
 6   pct_tl_open_L6M             42064 non-null  float64
 7   pct_tl_closed_L6M           42064 non-null  float64
 8   pct_active_tl               42064 non-null  float64
 9   pct_closed_tl               42064 non-null  float64
 10  Total_TL_opened_L12M        42064 non-null  int64  
 11  Tot_TL_closed_L12M          42064 non-null  int64  
 12  pct_tl_open_L12M            42064 non-null  float64
 13  pct_tl_closed_L12M          420

In [23]:
df.isna().sum()

PROSPECTID             0
Total_TL               0
Tot_Closed_TL          0
Tot_Active_TL          0
Total_TL_opened_L6M    0
                      ..
GL_Flag                0
last_prod_enq2         0
first_prod_enq2        0
Credit_Score           0
Approved_Flag          0
Length: 79, dtype: int64

### check how many columns are categorical

In [24]:
# Print the names of the columns stored with the 'object' dtype.
is_object_column = df.dtypes == 'object'
for name in df.columns[is_object_column]:
    print(name)

MARITALSTATUS
EDUCATION
GENDER
last_prod_enq2
first_prod_enq2
Approved_Flag


In [25]:
df['MARITALSTATUS'].value_counts()

Married    30886
Single     11178
Name: MARITALSTATUS, dtype: int64

In [26]:
df['first_prod_enq2'].value_counts()

others          20640
ConsumerLoan    11075
PL               4431
AL               2641
CC               1988
HL               1289
Name: first_prod_enq2, dtype: int64

#### Are these two associated - MARITALSTATUS vs Approved_Flag
#### Hypothesis testing
1.
H0: NULL HYPOTHESIS
    Not associated
2.
H1: Alternate Hypothesis
    Associated
3.Alpha (Assume)
Significance level
Strictness level
Margin error
5% = 0.05
0.00001
Less risky projects = High alpha
More risky projects = less alpha

4.
Confidence level = 1 - alpha

5.
calculate the evidence against H0
p-value
calculate using tests
T-test, Chisquare, Anova
Degree of freedom

6.
p-value <= alpha:
    Reject H0
p-value > alpha
    Fail to reject H0
    
what are these tests:
    Chisquare = Cat vs Cat
    T-test = Cat vs Num (categorical variable with 2 categories)
    ANOVA = Cat vs Num (categorical variable with >= 3 categories)

### Chi-square Test

In [27]:
# Chi-square test of independence between each categorical feature and Approved_Flag;
# only the p-value is reported.
categorical_features = ['MARITALSTATUS','EDUCATION','GENDER','last_prod_enq2','first_prod_enq2']
for feature in categorical_features:
    contingency_table = pd.crosstab(df[feature], df['Approved_Flag'])
    pval = chi2_contingency(contingency_table)[1]
    print(feature, '---', pval)

MARITALSTATUS --- 3.578180861038862e-233
EDUCATION --- 2.6942265249737532e-30
GENDER --- 1.907936100186563e-05
last_prod_enq2 --- 0.0
first_prod_enq2 --- 7.84997610555419e-287


Since every categorical feature has a p-value <= 0.05, we reject H0 (no association with Approved_Flag) for each of them and keep them all.

### VIF for numerical features

In [28]:
# Every non-object column except the identifier (and the target, should it ever be numeric)
# is a candidate for the VIF check.
excluded_columns = ['PROSPECTID','Approved_Flag']
numeric_columns = [
    name
    for name in df.columns
    if df[name].dtype != 'object' and name not in excluded_columns
]

In [29]:
len(numeric_columns)

72

### Multicollinearity vs Correlation
Multicollinearity = Predictability of each features by other features
### IMPORTANT
Correlation is specific to linear relationships between columns
For non-linear relationships (e.g. a convex, U-shaped one), correlation gives misleading values
#### VIF = Parallel vs Sequential

 ### Variance Inflation Factor (VIF): sequential check

In [30]:
# Sequential VIF filter: walk the numeric columns in their original order, compute each
# column's VIF against the columns still present in vif_data, and drop it immediately when
# the VIF exceeds 6. Columns that pass are collected in columns_to_be_kept.
vif_data = df[numeric_columns]
total_columns = vif_data.shape[1]
columns_to_be_kept = []
# Position, inside the shrinking vif_data, of the column tested on the current iteration.
column_index = 0

for i in range(0,total_columns):
    vif_value = variance_inflation_factor(vif_data.values, column_index)
    print(column_index, '----', vif_value)
    
    if vif_value <= 6:
        columns_to_be_kept.append(numeric_columns[i])
        # Kept: the next candidate sits one position further along.
        column_index = column_index + 1
    else:
        # Dropped: the next candidate slides into this same position, so column_index stays put.
        vif_data = vif_data.drop([numeric_columns[i]],axis = 1)


  vif = 1. / (1. - r_squared_i)


0 ---- inf
0 ---- inf
0 ---- 11.320180023967996
0 ---- 8.363698035000336
0 ---- 6.520647877790928
0 ---- 5.149501618212625
1 ---- 2.611111040579735
2 ---- inf
2 ---- 1788.7926256209232
2 ---- 8.601028256477228
2 ---- 3.8328007921530785
3 ---- 6.0996533816467355
3 ---- 5.5813520096427585
4 ---- 1.985584353098778
5 ---- inf
5 ---- 4.809538302819343
6 ---- 23.270628983464636
6 ---- 30.595522588100053
6 ---- 4.3843464059655854
7 ---- 3.0646584155234238
8 ---- 2.898639771299252
9 ---- 4.377876915347322
10 ---- 2.2078535836958433
11 ---- 4.916914200506864
12 ---- 5.214702030064725
13 ---- 3.3861625024231476
14 ---- 7.840583309478997
14 ---- 5.255034641721438
15 ---- inf
15 ---- 7.380634506427232
15 ---- 1.4210050015175733
16 ---- 8.083255010190323
16 ---- 1.6241227524040114
17 ---- 7.257811920140003
17 ---- 15.59624383268298
17 ---- 1.825857047132431
18 ---- 1.5080839450032666
19 ---- 2.172088834824578
20 ---- 2.623397553527229
21 ---- 2.2959970812106167
22 ---- 7.360578319196446
22 ---- 2.1

In [31]:
len(columns_to_be_kept)

39

### Check ANOVA for columns_to_be_kept

In [32]:
from scipy.stats import f_oneway

In [33]:
# Earlier draft of the ANOVA filter, kept commented out.
# NOTE(review): it compares Approved_Flag against lower-case 'p1'..'p4', whereas the values
# shown by df2.head() are upper-case ('P1', 'P2', ...), so as written every group would be empty.
# columns_to_be_kept_numerical = []

# for i in columns_to_be_kept:
#     a = list(df[i])
#     b = list(df['Approved_Flag'])
    
#     group_p1 = [value for value, group in zip(a,b) if group == 'p1']
#     group_p2 = [value for value, group in zip(a,b) if group == 'p2']
#     group_p3 = [value for value, group in zip(a,b) if group == 'p3']
#     group_p4 = [value for value, group in zip(a,b) if group == 'p4']
    
#     f_statistic, p_value = f_oneway(group_p1, group_p2, group_p3, group_p4)
    
#     if p_value <= 0.05:
#         columns_to_be_kept_numerical.append(i)

In [34]:
from scipy.stats import f_oneway
import pandas as pd

# One-way ANOVA of every VIF-surviving column across the Approved_Flag classes;
# a column is kept when its p-value is at most 0.05.
columns_to_be_kept_numerical = []

for column in columns_to_be_kept:
    # One list of values per Approved_Flag class.
    samples = [list(values) for _, values in df.groupby('Approved_Flag')[column]]

    # Skip the column when some class is constant or empty
    # (an empty sample also has fewer than two distinct values).
    if any(len(set(sample)) < 2 for sample in samples):
        continue

    _, p_value = f_oneway(*samples)

    if p_value <= 0.05:
        columns_to_be_kept_numerical.append(column)

print(columns_to_be_kept_numerical)


['pct_tl_open_L6M', 'pct_tl_closed_L6M', 'Tot_TL_closed_L12M', 'pct_tl_closed_L12M', 'Tot_Missed_Pmnt', 'CC_TL', 'Home_TL', 'PL_TL', 'Secured_TL', 'Unsecured_TL', 'Other_TL', 'Age_Oldest_TL', 'Age_Newest_TL', 'time_since_recent_payment', 'max_recent_level_of_deliq', 'num_deliq_6_12mts', 'num_times_60p_dpd', 'num_std_12mts', 'num_sub', 'num_sub_6mts', 'num_sub_12mts', 'num_dbt', 'num_dbt_12mts', 'num_lss', 'recent_level_of_deliq', 'CC_enq_L12m', 'PL_enq_L12m', 'time_since_recent_enq', 'enq_L3m', 'NETMONTHLYINCOME', 'Time_With_Curr_Empr', 'CC_Flag', 'PL_Flag', 'pct_PL_enq_L6m_of_ever', 'pct_CC_enq_L6m_of_ever', 'HL_Flag', 'GL_Flag']


In [35]:
len(columns_to_be_kept_numerical)

37

In [36]:
# Drop, in one call, every non-object column that did not pass the ANOVA filter.
unselected_numeric = [
    name
    for name in df.columns
    if df[name].dtype != 'object' and name not in columns_to_be_kept_numerical
]
df = df.drop(columns=unselected_numeric)

In [37]:
df.shape

(42064, 43)

 #### Feature selection is done for cat and num features

#### Encoding for the categorical features
['MARITALSTATUS','EDUCATION','GENDER','last_prod_enq2','first_prod_enq2']

In [38]:
df['MARITALSTATUS'].unique()

array(['Married', 'Single'], dtype=object)

In [39]:
df['EDUCATION'].unique()

array(['12TH', 'GRADUATE', 'SSC', 'POST-GRADUATE', 'UNDER GRADUATE',
       'OTHERS', 'PROFESSIONAL'], dtype=object)

In [40]:
df['GENDER'].unique()

array(['M', 'F'], dtype=object)

In [41]:
df['last_prod_enq2'].unique()

array(['PL', 'ConsumerLoan', 'AL', 'CC', 'others', 'HL'], dtype=object)

In [42]:
df['first_prod_enq2'].unique()

array(['PL', 'ConsumerLoan', 'others', 'AL', 'HL', 'CC'], dtype=object)

In [43]:
# Ordinal encoding of EDUCATION: each level is overwritten in place with its rank.
# NOTE(review): OTHERS shares rank 1 with SSC, and UNDER GRADUATE / PROFESSIONAL share rank 3
# with GRADUATE - confirm this ordering is intended.
education_rank = {
    'SSC': 1,
    '12TH': 2,
    'GRADUATE': 3,
    'UNDER GRADUATE': 3,
    'POST-GRADUATE': 4,
    'OTHERS': 1,
    'PROFESSIONAL': 3,
}
for level, rank in education_rank.items():
    df.loc[df['EDUCATION'] == level, ['EDUCATION']] = rank

In [44]:
df['EDUCATION'].value_counts()

3    18931
2    11703
1     9532
4     1898
Name: EDUCATION, dtype: int64

In [45]:
df['EDUCATION'] = df['EDUCATION'].astype(int)

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42064 entries, 0 to 42063
Data columns (total 43 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   pct_tl_open_L6M            42064 non-null  float64
 1   pct_tl_closed_L6M          42064 non-null  float64
 2   Tot_TL_closed_L12M         42064 non-null  int64  
 3   pct_tl_closed_L12M         42064 non-null  float64
 4   Tot_Missed_Pmnt            42064 non-null  int64  
 5   CC_TL                      42064 non-null  int64  
 6   Home_TL                    42064 non-null  int64  
 7   PL_TL                      42064 non-null  int64  
 8   Secured_TL                 42064 non-null  int64  
 9   Unsecured_TL               42064 non-null  int64  
 10  Other_TL                   42064 non-null  int64  
 11  Age_Oldest_TL              42064 non-null  int64  
 12  Age_Newest_TL              42064 non-null  int64  
 13  time_since_recent_payment  42064 non-null  int

In [47]:
# One-hot encode the nominal features; drop_first removes one dummy per feature.
nominal_features = ['MARITALSTATUS','GENDER','last_prod_enq2','first_prod_enq2']
df_encoded = pd.get_dummies(df, columns=nominal_features, drop_first=True)

In [48]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42064 entries, 0 to 42063
Data columns (total 51 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pct_tl_open_L6M               42064 non-null  float64
 1   pct_tl_closed_L6M             42064 non-null  float64
 2   Tot_TL_closed_L12M            42064 non-null  int64  
 3   pct_tl_closed_L12M            42064 non-null  float64
 4   Tot_Missed_Pmnt               42064 non-null  int64  
 5   CC_TL                         42064 non-null  int64  
 6   Home_TL                       42064 non-null  int64  
 7   PL_TL                         42064 non-null  int64  
 8   Secured_TL                    42064 non-null  int64  
 9   Unsecured_TL                  42064 non-null  int64  
 10  Other_TL                      42064 non-null  int64  
 11  Age_Oldest_TL                 42064 non-null  int64  
 12  Age_Newest_TL                 42064 non-null  int64  
 13  t

In [49]:
k = df_encoded.describe()

#### Machine Learning Model fitting

In [50]:
df_encoded.head()

Unnamed: 0,pct_tl_open_L6M,pct_tl_closed_L6M,Tot_TL_closed_L12M,pct_tl_closed_L12M,Tot_Missed_Pmnt,CC_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,...,last_prod_enq2_CC,last_prod_enq2_ConsumerLoan,last_prod_enq2_HL,last_prod_enq2_PL,last_prod_enq2_others,first_prod_enq2_CC,first_prod_enq2_ConsumerLoan,first_prod_enq2_HL,first_prod_enq2_PL,first_prod_enq2_others
0,0.0,0.0,0,0.0,0,0,0,4,1,4,...,0,0,0,1,0,0,0,0,1,0
1,0.0,0.0,0,0.0,0,0,0,0,0,1,...,0,1,0,0,0,0,1,0,0,0
2,0.125,0.0,0,0.0,1,0,0,0,2,6,...,0,1,0,0,0,0,0,0,0,1
3,0.0,0.0,0,0.0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,1,0.167,0,0,0,0,6,0,...,0,1,0,0,0,0,0,0,1,0


#### Data processing
#### 1. Random forest

In [51]:
x = df_encoded.drop(['Approved_Flag'], axis = 1)
y = df_encoded['Approved_Flag']

In [52]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [53]:
rf_classifier = RandomForestClassifier(n_estimators = 200, random_state = 42)

In [54]:
rf_classifier.fit(x_train, y_train)

RandomForestClassifier(n_estimators=200, random_state=42)

In [55]:
y_pred = rf_classifier.predict(x_test)

In [56]:
# Overall accuracy and per-class precision / recall / F1 for the random-forest predictions.
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print()
print(f'Accuracy: {accuracy}')
print()


Accuracy: 0.7660763104718887



In [57]:
# Per-class report for the random forest.
# precision_recall_fscore_support was called without `labels`, so its arrays follow the sorted
# labels found in y_test and y_pred. The names are derived the same way instead of being
# hard-coded, which keeps them in step with the data (the classes are 'P1'..'P4', not 'p1'..'p4').
class_labels = sorted(set(y_test) | set(y_pred))
for i, v in enumerate(class_labels):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")

Class p1:
Precision: 0.8400926998841252
Recall: 0.7149901380670611
F1 Score: 0.7725093233883857
Class p2:
Precision: 0.7988731432473963
Recall: 0.9274529236868186
F1 Score: 0.8583746101632729
Class p3:
Precision: 0.44661654135338347
Recall: 0.2241509433962264
F1 Score: 0.29849246231155774
Class p4:
Precision: 0.7237354085603113
Recall: 0.7230320699708455
F1 Score: 0.7233835683033544


#### 2. xgBoost

In [58]:
!pip install xgboost

Collecting xgboost
  Using cached xgboost-1.6.2-py3-none-win_amd64.whl (125.4 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.6.2


Keyring is skipped due to an exception: 'keyring.backends'


In [59]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Features and target; the string target is label-encoded to integer class ids before fitting.
x = df_encoded.drop(['Approved_Flag'],axis=1)
y = df_encoded['Approved_Flag']

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

# Baseline XGBoost model with default hyperparameters.
xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=4)
xgb_classifier.fit(x_train, y_train)
y_pred = xgb_classifier.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

print()
print(f"Accuracy: {accuracy:.2f}")
print()


Accuracy: 0.78



In [60]:
# Per-class report for XGBoost.
# The metric arrays follow the sorted encoded class ids found in y_test and y_pred; those ids
# are mapped back to the original Approved_Flag names through label_encoder instead of relying
# on a hard-coded list (whose 'p1'..'p4' spelling does not match the real 'P1'..'P4' values).
class_ids = sorted({int(c) for c in set(y_test) | set(y_pred)})
class_labels = label_encoder.inverse_transform(class_ids)
for i, v in enumerate(class_labels):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")

Class p1:
Precision: 0.8313641245972073
Recall: 0.7633136094674556
F1 Score: 0.7958868894601543
Class p2:
Precision: 0.8247275326067536
Recall: 0.9149653121902874
F1 Score: 0.8675061078744597
Class p3:
Precision: 0.46565774155995343
Recall: 0.3018867924528302
F1 Score: 0.3663003663003663
Class p4:
Precision: 0.7368421052631579
Recall: 0.7346938775510204
F1 Score: 0.7357664233576642


#### Decision Tree

In [61]:
from sklearn.tree import DecisionTreeClassifier

In [62]:
# Decision-tree baseline on the same 80/20 split used for the other models.
x = df_encoded.drop(['Approved_Flag'], axis = 1)
y = df_encoded['Approved_Flag']

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

# random_state is fixed (42, as elsewhere in this notebook): scikit-learn permutes the features
# at every split, so without a seed the fitted tree - and the scores below - can change between runs.
dt_model = DecisionTreeClassifier(max_depth=20, min_samples_split=10, random_state=42)
dt_model.fit(x_train, y_train)
y_pred = dt_model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print()
print(f"Accuracy: {accuracy:.2f}")
print()
precision, recall, f1_score, _= precision_recall_fscore_support(y_test,y_pred)


Accurac y: 0.71



In [63]:
# Per-class report for the decision tree.
# precision_recall_fscore_support was called without `labels`, so its arrays follow the sorted
# labels found in y_test and y_pred. The names are derived the same way instead of being
# hard-coded, which keeps them in step with the data (the classes are 'P1'..'P4', not 'p1'..'p4').
class_labels = sorted(set(y_test) | set(y_pred))
for i, v in enumerate(class_labels):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")

Class p1:
Precision: 0.716374269005848
Recall: 0.7248520710059172
F1 Score: 0.7205882352941176
Class p2:
Precision: 0.8066524022563704
Recall: 0.822001982160555
F1 Score: 0.8142548596112311
Class p3:
Precision: 0.3394863563402889
Recall: 0.3192452830188679
F1 Score: 0.3290548424737456
Class p4:
Precision: 0.646
Recall: 0.6277939747327502
F1 Score: 0.6367668802365697


#### HyperParameter tuning for xgboost
#### Define the hyperparameter grid

In [64]:
# Candidate values for each XGBoost hyperparameter explored by the grid search.
param_grid = dict(
    colsample_bytree=[0.1, 0.3, 0.5, 0.7, 0.9],
    learning_rate=[0.001, 0.01, 0.1, 1],
    max_depth=[3, 5, 8, 10],
    alpha=[1, 10, 100],
    n_estimators=[10, 50, 100],
)

# Running counter of the combinations evaluated so far.
index = 0

# One list per reported field; each evaluated combination appends one entry to every list.
answers_grid = {
    field: []
    for field in ('combination', 'train_Accuracy', 'test_Accuracy', *param_grid)
}

In [65]:
# Exhaustive grid search: fit one XGBoost model per hyperparameter combination and record
# its train / test accuracy in answers_grid.
# NOTE(review): the combinations are scored on the same test split used for the final
# evaluation; if the best one is picked by test_Accuracy, that score is optimistic.
# Consider a separate validation split or cross-validation.
from itertools import product

# Start from a clean slate so that re-running this cell does not append duplicate rows
# or continue the combination counter from the previous run.
index = 0
for recorded_values in answers_grid.values():
    recorded_values.clear()

# The data, the label encoding and the split do not depend on the hyperparameters,
# so they are prepared once instead of once per combination.
y = df_encoded['Approved_Flag']
x = df_encoded.drop(['Approved_Flag'], axis = 1)

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

x_train, x_test, y_train, y_test = train_test_split(x,y_encoded,test_size=0.2,random_state=42)

# product() visits the grid in the same order as nested loops would
# (colsample_bytree outermost, n_estimators innermost).
for colsample_bytree, learning_rate, max_depth, alpha, n_estimators in product(
        param_grid['colsample_bytree'],
        param_grid['learning_rate'],
        param_grid['max_depth'],
        param_grid['alpha'],
        param_grid['n_estimators']):
    index = index + 1

    # Define and train the XGBoost model
    model = xgb.XGBClassifier(objective = 'multi:softmax',
                              num_class = 4,
                              colsample_bytree = colsample_bytree,
                              learning_rate = learning_rate,
                              max_depth = max_depth,
                              alpha = alpha,
                              n_estimators = n_estimators)
    model.fit(x_train, y_train)

    # Predict on training and testing sets
    y_pred_train = model.predict(x_train)
    y_pred_test = model.predict(x_test)

    # Calculate train and test results
    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Record the result together with the hyperparameters that produced it
    answers_grid['combination'].append(index)
    answers_grid['train_Accuracy'].append(train_accuracy)
    answers_grid['test_Accuracy'].append(test_accuracy)
    answers_grid['colsample_bytree'].append(colsample_bytree)
    answers_grid['learning_rate'].append(learning_rate)
    answers_grid['max_depth'].append(max_depth)
    answers_grid['alpha'].append(alpha)
    answers_grid['n_estimators'].append(n_estimators)

    # Print results for this combination; all five hyperparameters are shown so that
    # every combination is distinguishable in the log.
    print(f"Combination {index}")
    print(f"colsample_bytree: {colsample_bytree}, learning_rate: {learning_rate}, max_depth: {max_depth}, "
          f"alpha: {alpha}, n_estimators: {n_estimators}")

    print(f"Train Accuracy: {train_accuracy:.2f}")
    print(f"Test Accuracy: {test_accuracy:.2f}")
    print("-"*30)

Combination 1
colsample_bytree: 0.1, learning_rate: 0.001, max_depth: 3
Train Accuracy: 0.61
Test Accuracy: 0.60
------------------------------
Combination 2
colsample_bytree: 0.1, learning_rate: 0.001, max_depth: 3
Train Accuracy: 0.61
Test Accuracy: 0.60
------------------------------
Combination 3
colsample_bytree: 0.1, learning_rate: 0.001, max_depth: 3
Train Accuracy: 0.61
Test Accuracy: 0.60
------------------------------
Combination 4
colsample_bytree: 0.1, learning_rate: 0.001, max_depth: 3
Train Accuracy: 0.61
Test Accuracy: 0.60
------------------------------
Combination 5
colsample_bytree: 0.1, learning_rate: 0.001, max_depth: 3
Train Accuracy: 0.61
Test Accuracy: 0.60
------------------------------
Combination 6
colsample_bytree: 0.1, learning_rate: 0.001, max_depth: 3
Train Accuracy: 0.61
Test Accuracy: 0.60
------------------------------
Combination 7
colsample_bytree: 0.1, learning_rate: 0.001, max_depth: 3
Train Accuracy: 0.61
Test Accuracy: 0.60
-----------------------

Combination 58
colsample_bytree: 0.1, learning_rate: 0.01, max_depth: 8
Train Accuracy: 0.61
Test Accuracy: 0.60
------------------------------
Combination 59
colsample_bytree: 0.1, learning_rate: 0.01, max_depth: 8
Train Accuracy: 0.61
Test Accuracy: 0.60
------------------------------
Combination 60
colsample_bytree: 0.1, learning_rate: 0.01, max_depth: 8
Train Accuracy: 0.61
Test Accuracy: 0.61
------------------------------
Combination 61
colsample_bytree: 0.1, learning_rate: 0.01, max_depth: 8
Train Accuracy: 0.61
Test Accuracy: 0.60
------------------------------
Combination 62
colsample_bytree: 0.1, learning_rate: 0.01, max_depth: 8
Train Accuracy: 0.61
Test Accuracy: 0.60
------------------------------
Combination 63
colsample_bytree: 0.1, learning_rate: 0.01, max_depth: 8
Train Accuracy: 0.61
Test Accuracy: 0.60
------------------------------
Combination 64
colsample_bytree: 0.1, learning_rate: 0.01, max_depth: 10
Train Accuracy: 0.61
Test Accuracy: 0.60
----------------------

Combination 116
colsample_bytree: 0.1, learning_rate: 1, max_depth: 3
Train Accuracy: 0.75
Test Accuracy: 0.75
------------------------------
Combination 117
colsample_bytree: 0.1, learning_rate: 1, max_depth: 3
Train Accuracy: 0.77
Test Accuracy: 0.76
------------------------------
Combination 118
colsample_bytree: 0.1, learning_rate: 1, max_depth: 5
Train Accuracy: 0.72
Test Accuracy: 0.70
------------------------------
Combination 119
colsample_bytree: 0.1, learning_rate: 1, max_depth: 5
Train Accuracy: 0.82
Test Accuracy: 0.76
------------------------------
Combination 120
colsample_bytree: 0.1, learning_rate: 1, max_depth: 5
Train Accuracy: 0.86
Test Accuracy: 0.76
------------------------------
Combination 121
colsample_bytree: 0.1, learning_rate: 1, max_depth: 5
Train Accuracy: 0.71
Test Accuracy: 0.70
------------------------------
Combination 122
colsample_bytree: 0.1, learning_rate: 1, max_depth: 5
Train Accuracy: 0.79
Test Accuracy: 0.76
------------------------------
Combin

Combination 173
colsample_bytree: 0.3, learning_rate: 0.001, max_depth: 10
Train Accuracy: 0.73
Test Accuracy: 0.69
------------------------------
Combination 174
colsample_bytree: 0.3, learning_rate: 0.001, max_depth: 10
Train Accuracy: 0.73
Test Accuracy: 0.69
------------------------------
Combination 175
colsample_bytree: 0.3, learning_rate: 0.001, max_depth: 10
Train Accuracy: 0.69
Test Accuracy: 0.68
------------------------------
Combination 176
colsample_bytree: 0.3, learning_rate: 0.001, max_depth: 10
Train Accuracy: 0.69
Test Accuracy: 0.68
------------------------------
Combination 177
colsample_bytree: 0.3, learning_rate: 0.001, max_depth: 10
Train Accuracy: 0.69
Test Accuracy: 0.68
------------------------------
Combination 178
colsample_bytree: 0.3, learning_rate: 0.001, max_depth: 10
Train Accuracy: 0.66
Test Accuracy: 0.66
------------------------------
Combination 179
colsample_bytree: 0.3, learning_rate: 0.001, max_depth: 10
Train Accuracy: 0.66
Test Accuracy: 0.65
--

Combination 230
colsample_bytree: 0.3, learning_rate: 0.1, max_depth: 5
Train Accuracy: 0.76
Test Accuracy: 0.75
------------------------------
Combination 231
colsample_bytree: 0.3, learning_rate: 0.1, max_depth: 5
Train Accuracy: 0.79
Test Accuracy: 0.77
------------------------------
Combination 232
colsample_bytree: 0.3, learning_rate: 0.1, max_depth: 5
Train Accuracy: 0.67
Test Accuracy: 0.67
------------------------------
Combination 233
colsample_bytree: 0.3, learning_rate: 0.1, max_depth: 5
Train Accuracy: 0.73
Test Accuracy: 0.72
------------------------------
Combination 234
colsample_bytree: 0.3, learning_rate: 0.1, max_depth: 5
Train Accuracy: 0.76
Test Accuracy: 0.75
------------------------------
Combination 235
colsample_bytree: 0.3, learning_rate: 0.1, max_depth: 8
Train Accuracy: 0.73
Test Accuracy: 0.70
------------------------------
Combination 236
colsample_bytree: 0.3, learning_rate: 0.1, max_depth: 8
Train Accuracy: 0.82
Test Accuracy: 0.75
-----------------------

Combination 288
colsample_bytree: 0.3, learning_rate: 1, max_depth: 10
Train Accuracy: 0.78
Test Accuracy: 0.76
------------------------------
Combination 289
colsample_bytree: 0.5, learning_rate: 0.001, max_depth: 3
Train Accuracy: 0.69
Test Accuracy: 0.69
------------------------------
Combination 290
colsample_bytree: 0.5, learning_rate: 0.001, max_depth: 3
Train Accuracy: 0.69
Test Accuracy: 0.69
------------------------------
Combination 291
colsample_bytree: 0.5, learning_rate: 0.001, max_depth: 3
Train Accuracy: 0.70
Test Accuracy: 0.69
------------------------------
Combination 292
colsample_bytree: 0.5, learning_rate: 0.001, max_depth: 3
Train Accuracy: 0.69
Test Accuracy: 0.69
------------------------------
Combination 293
colsample_bytree: 0.5, learning_rate: 0.001, max_depth: 3
Train Accuracy: 0.69
Test Accuracy: 0.69
------------------------------
Combination 294
colsample_bytree: 0.5, learning_rate: 0.001, max_depth: 3
Train Accuracy: 0.69
Test Accuracy: 0.69
------------

Combination 345
colsample_bytree: 0.5, learning_rate: 0.01, max_depth: 8
Train Accuracy: 0.79
Test Accuracy: 0.75
------------------------------
Combination 346
colsample_bytree: 0.5, learning_rate: 0.01, max_depth: 8
Train Accuracy: 0.74
Test Accuracy: 0.72
------------------------------
Combination 347
colsample_bytree: 0.5, learning_rate: 0.01, max_depth: 8
Train Accuracy: 0.76
Test Accuracy: 0.74
------------------------------
Combination 348
colsample_bytree: 0.5, learning_rate: 0.01, max_depth: 8
Train Accuracy: 0.77
Test Accuracy: 0.75
------------------------------
Combination 349
colsample_bytree: 0.5, learning_rate: 0.01, max_depth: 8
Train Accuracy: 0.70
Test Accuracy: 0.69
------------------------------
Combination 350
colsample_bytree: 0.5, learning_rate: 0.01, max_depth: 8
Train Accuracy: 0.71
Test Accuracy: 0.70
------------------------------
Combination 351
colsample_bytree: 0.5, learning_rate: 0.01, max_depth: 8
Train Accuracy: 0.72
Test Accuracy: 0.71
----------------

Combination 402
colsample_bytree: 0.5, learning_rate: 1, max_depth: 3
Train Accuracy: 0.81
Test Accuracy: 0.78
------------------------------
Combination 403
colsample_bytree: 0.5, learning_rate: 1, max_depth: 3
Train Accuracy: 0.76
Test Accuracy: 0.75
------------------------------
Combination 404
colsample_bytree: 0.5, learning_rate: 1, max_depth: 3
Train Accuracy: 0.77
Test Accuracy: 0.77
------------------------------
Combination 405
colsample_bytree: 0.5, learning_rate: 1, max_depth: 3
Train Accuracy: 0.77
Test Accuracy: 0.77
------------------------------
Combination 406
colsample_bytree: 0.5, learning_rate: 1, max_depth: 5
Train Accuracy: 0.79
Test Accuracy: 0.76
------------------------------
Combination 407
colsample_bytree: 0.5, learning_rate: 1, max_depth: 5
Train Accuracy: 0.87
Test Accuracy: 0.77
------------------------------
Combination 408
colsample_bytree: 0.5, learning_rate: 1, max_depth: 5
Train Accuracy: 0.92
Test Accuracy: 0.77
------------------------------
Combin

Combination 459
colsample_bytree: 0.7, learning_rate: 0.001, max_depth: 8
Train Accuracy: 0.74
Test Accuracy: 0.73
------------------------------
Combination 460
colsample_bytree: 0.7, learning_rate: 0.001, max_depth: 10
Train Accuracy: 0.82
Test Accuracy: 0.76
------------------------------
Combination 461
colsample_bytree: 0.7, learning_rate: 0.001, max_depth: 10
Train Accuracy: 0.83
Test Accuracy: 0.77
------------------------------
Combination 462
colsample_bytree: 0.7, learning_rate: 0.001, max_depth: 10
Train Accuracy: 0.83
Test Accuracy: 0.77
------------------------------
Combination 463
colsample_bytree: 0.7, learning_rate: 0.001, max_depth: 10
Train Accuracy: 0.78
Test Accuracy: 0.75
------------------------------
Combination 464
colsample_bytree: 0.7, learning_rate: 0.001, max_depth: 10
Train Accuracy: 0.79
Test Accuracy: 0.76
------------------------------
Combination 465
colsample_bytree: 0.7, learning_rate: 0.001, max_depth: 10
Train Accuracy: 0.79
Test Accuracy: 0.76
---

Combination 516
colsample_bytree: 0.7, learning_rate: 0.1, max_depth: 5
Train Accuracy: 0.81
Test Accuracy: 0.78
------------------------------
Combination 517
colsample_bytree: 0.7, learning_rate: 0.1, max_depth: 5
Train Accuracy: 0.75
Test Accuracy: 0.74
------------------------------
Combination 518
colsample_bytree: 0.7, learning_rate: 0.1, max_depth: 5
Train Accuracy: 0.79
Test Accuracy: 0.77
------------------------------
Combination 519
colsample_bytree: 0.7, learning_rate: 0.1, max_depth: 5
Train Accuracy: 0.80
Test Accuracy: 0.78
------------------------------
Combination 520
colsample_bytree: 0.7, learning_rate: 0.1, max_depth: 5
Train Accuracy: 0.73
Test Accuracy: 0.72
------------------------------
Combination 521
colsample_bytree: 0.7, learning_rate: 0.1, max_depth: 5
Train Accuracy: 0.77
Test Accuracy: 0.76
------------------------------
Combination 522
colsample_bytree: 0.7, learning_rate: 0.1, max_depth: 5
Train Accuracy: 0.78
Test Accuracy: 0.77
-----------------------

Combination 574
colsample_bytree: 0.7, learning_rate: 1, max_depth: 10
Train Accuracy: 0.78
Test Accuracy: 0.76
------------------------------
Combination 575
colsample_bytree: 0.7, learning_rate: 1, max_depth: 10
Train Accuracy: 0.79
Test Accuracy: 0.77
------------------------------
Combination 576
colsample_bytree: 0.7, learning_rate: 1, max_depth: 10
Train Accuracy: 0.79
Test Accuracy: 0.77
------------------------------
Combination 577
colsample_bytree: 0.9, learning_rate: 0.001, max_depth: 3
Train Accuracy: 0.71
Test Accuracy: 0.71
------------------------------
Combination 578
colsample_bytree: 0.9, learning_rate: 0.001, max_depth: 3
Train Accuracy: 0.71
Test Accuracy: 0.71
------------------------------
Combination 579
colsample_bytree: 0.9, learning_rate: 0.001, max_depth: 3
Train Accuracy: 0.72
Test Accuracy: 0.71
------------------------------
Combination 580
colsample_bytree: 0.9, learning_rate: 0.001, max_depth: 3
Train Accuracy: 0.71
Test Accuracy: 0.71
------------------

Combination 631
colsample_bytree: 0.9, learning_rate: 0.01, max_depth: 8
Train Accuracy: 0.79
Test Accuracy: 0.76
------------------------------
Combination 632
colsample_bytree: 0.9, learning_rate: 0.01, max_depth: 8
Train Accuracy: 0.80
Test Accuracy: 0.77
------------------------------
Combination 633
colsample_bytree: 0.9, learning_rate: 0.01, max_depth: 8
Train Accuracy: 0.81
Test Accuracy: 0.77
------------------------------
Combination 634
colsample_bytree: 0.9, learning_rate: 0.01, max_depth: 8
Train Accuracy: 0.78
Test Accuracy: 0.76
------------------------------
Combination 635
colsample_bytree: 0.9, learning_rate: 0.01, max_depth: 8
Train Accuracy: 0.78
Test Accuracy: 0.76
------------------------------
Combination 636
colsample_bytree: 0.9, learning_rate: 0.01, max_depth: 8
Train Accuracy: 0.79
Test Accuracy: 0.77
------------------------------
Combination 637
colsample_bytree: 0.9, learning_rate: 0.01, max_depth: 8
Train Accuracy: 0.75
Test Accuracy: 0.74
----------------

Combination 688
colsample_bytree: 0.9, learning_rate: 1, max_depth: 3
Train Accuracy: 0.78
Test Accuracy: 0.77
------------------------------
Combination 689
colsample_bytree: 0.9, learning_rate: 1, max_depth: 3
Train Accuracy: 0.80
Test Accuracy: 0.78
------------------------------
Combination 690
colsample_bytree: 0.9, learning_rate: 1, max_depth: 3
Train Accuracy: 0.81
Test Accuracy: 0.78
------------------------------
Combination 691
colsample_bytree: 0.9, learning_rate: 1, max_depth: 3
Train Accuracy: 0.77
Test Accuracy: 0.76
------------------------------
Combination 692
colsample_bytree: 0.9, learning_rate: 1, max_depth: 3
Train Accuracy: 0.77
Test Accuracy: 0.77
------------------------------
Combination 693
colsample_bytree: 0.9, learning_rate: 1, max_depth: 3
Train Accuracy: 0.77
Test Accuracy: 0.77
------------------------------
Combination 694
colsample_bytree: 0.9, learning_rate: 1, max_depth: 5
Train Accuracy: 0.80
Test Accuracy: 0.77
------------------------------
Combin

In [66]:
# Final 4-class XGBoost classifier. The colsample_bytree / learning_rate /
# max_depth values correspond to one of the settings reported in the search
# log printed above (colsample_bytree 0.9, learning_rate 1, max_depth 3).
# NOTE(review): `alpha` is forwarded to the booster as the L1 regularisation
# term (XGBoost documents it as an alias of `reg_alpha`) -- confirm against
# the installed xgboost version.
model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=1,
    colsample_bytree=0.9,
    alpha=10,
    objective='multi:softmax',
    num_class=4,
)

In [67]:
# Train the classifier on the training split. fit() is left as the cell's
# last expression, so the notebook displays the fitted estimator's repr.
model.fit(x_train, y_train)

XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.9,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='', learning_rate=1,
              max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=3,
              max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_class=4, num_parallel_tree=1, objective='multi:softmax',
              predictor='auto', ...)

In [69]:
# Score the fitted model on the test split: fraction of rows in x_test whose
# predicted class equals the label in y_test.
accuracy = accuracy_score(
    y_true=y_test,
    y_pred=model.predict(x_test),
)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7767740401759182
