### Import Libraries

In [75]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [4]:
df = pd.read_excel('input_model.xlsx')

In [5]:
df.head()

Unnamed: 0,log_limit_bal,log_age,sex,marriage,education,sep_status,aug_status,jul_status,jun_status,may_status,apr_status,default
0,9.903488,3.178054,2,1,2,2,2,0,0,0,0,1
1,11.695247,3.258097,2,2,2,0,2,0,0,0,2,1
2,11.407565,3.526361,2,2,2,0,0,0,0,0,0,0
3,10.819778,3.610918,2,1,2,0,0,0,0,0,0,0
4,10.819778,4.043051,1,1,2,0,0,0,0,0,0,0


In [6]:
convert_to_object = ['sex','marriage','education',
                    'sep_status','aug_status','jul_status','jun_status','may_status','apr_status'
                    ]
for c_o in convert_to_object:
    df[c_o] = df[c_o].astype(str)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29998 entries, 0 to 29997
Data columns (total 12 columns):
log_limit_bal    29998 non-null float64
log_age          29998 non-null float64
sex              29998 non-null object
marriage         29998 non-null object
education        29998 non-null object
sep_status       29998 non-null object
aug_status       29998 non-null object
jul_status       29998 non-null object
jun_status       29998 non-null object
may_status       29998 non-null object
apr_status       29998 non-null object
default          29998 non-null int64
dtypes: float64(2), int64(1), object(9)
memory usage: 2.7+ MB


#### Converting the catgorical variables to dummy varibales

In [8]:
df = pd.get_dummies(df,prefix=['sex','marriage','education',
                               'sep_status','aug_status','jul_status','jun_status','may_status','apr_status'
                              ],
                    columns=['sex','marriage','education','sep_status','aug_status','jul_status','jun_status',
                             'may_status','apr_status'
                            ]
                   )

In [9]:
x = df.drop(columns='default')
x.head()

Unnamed: 0,log_limit_bal,log_age,sex_1,sex_2,marriage_1,marriage_2,marriage_3,education_1,education_2,education_3,...,may_status_7,may_status_8,apr_status_0,apr_status_2,apr_status_3,apr_status_4,apr_status_5,apr_status_6,apr_status_7,apr_status_8
0,9.903488,3.178054,0,1,1,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
1,11.695247,3.258097,0,1,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
2,11.407565,3.526361,0,1,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
3,10.819778,3.610918,0,1,1,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
4,10.819778,4.043051,1,0,1,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0


#### Ensemble learning methods can perform feature importance and fit the model with best features.But Logistic regression works well if we feed the model with best features.As a first step,we will find the columns with less variance and remove those features.Less variance contributes less to the model

In [10]:
from sklearn.feature_selection import VarianceThreshold
var_thresh = VarianceThreshold(threshold=0.1)
trans = var_thresh.fit(x)

In [11]:
trans.get_support()

array([ True, False,  True,  True,  True,  True, False,  True,  True,
        True, False,  True,  True, False, False, False, False, False,
       False, False,  True, False,  True, False, False, False, False,
       False, False,  True, False,  True, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False])

In [13]:
concol = [column for column in x.columns 
          if column not in x.columns[trans.get_support()]]

for features in concol:
    print(features)


log_age
marriage_3
education_4
sep_status_2
sep_status_3
sep_status_4
sep_status_5
sep_status_6
sep_status_7
sep_status_8
aug_status_1
aug_status_3
aug_status_4
aug_status_5
aug_status_6
aug_status_7
aug_status_8
jul_status_1
jul_status_3
jul_status_4
jul_status_5
jul_status_6
jul_status_7
jul_status_8
jun_status_1
jun_status_2
jun_status_3
jun_status_4
jun_status_5
jun_status_6
jun_status_7
jun_status_8
may_status_0
may_status_2
may_status_3
may_status_4
may_status_5
may_status_6
may_status_7
may_status_8
apr_status_0
apr_status_2
apr_status_3
apr_status_4
apr_status_5
apr_status_6
apr_status_7
apr_status_8


In [14]:
new_x = x.drop(concol,axis=1)

#### After performing the test,we removed the columns with less variance

In [15]:
new_x.head()

Unnamed: 0,log_limit_bal,sex_1,sex_2,marriage_1,marriage_2,education_1,education_2,education_3,sep_status_0,sep_status_1,aug_status_0,aug_status_2,jul_status_0,jul_status_2,jun_status_0
0,9.903488,0,1,1,0,0,1,0,0,0,0,1,1,0,1
1,11.695247,0,1,0,1,0,1,0,1,0,0,1,1,0,1
2,11.407565,0,1,0,1,0,1,0,1,0,1,0,1,0,1
3,10.819778,0,1,1,0,0,1,0,1,0,1,0,1,0,1
4,10.819778,1,0,1,0,0,1,0,1,0,1,0,1,0,1


In [17]:
y = df['default']
y.head()

0    1
1    1
2    0
3    0
4    0
Name: default, dtype: int64

#### Modeling

In [43]:
x_train,x_test,y_train,y_test = train_test_split(new_x,y,test_size=0.2,random_state=42)

In [44]:
import statsmodels.api as sm
x_train = sm.add_constant(x_train)
x_test = sm.add_constant(x_test)


  return ptp(axis=axis, out=out, **kwargs)


In [45]:
x_train.shape

(23998, 16)

#### First,we will use logit regression and remove the features greater than our significance level.Here our significance level will be 0.05

In [46]:
logit_1 = sm.Logit(y_train,x_train)
result = logit_1.fit()
result.summary()

0,1,2,3
Dep. Variable:,default,No. Observations:,23998.0
Model:,Logit,Df Residuals:,23983.0
Method:,MLE,Df Model:,14.0
Date:,"Tue, 27 Sep 2022",Pseudo R-squ.:,0.1677
Time:,16:14:41,Log-Likelihood:,-10555.0
converged:,True,LL-Null:,-12681.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.6195,9.69e+05,1.67e-06,1.000,-1.9e+06,1.9e+06
log_limit_bal,-0.2490,0.020,-12.649,0.000,-0.288,-0.210
sex_1,0.8761,9.69e+05,9.04e-07,1.000,-1.9e+06,1.9e+06
sex_2,0.7434,9.69e+05,7.67e-07,1.000,-1.9e+06,1.9e+06
marriage_1,0.0914,0.152,0.602,0.547,-0.206,0.389
marriage_2,-0.1244,0.152,-0.820,0.412,-0.422,0.173
education_1,1.1895,0.219,5.442,0.000,0.761,1.618
education_2,1.1649,0.218,5.346,0.000,0.738,1.592
education_3,1.1072,0.221,5.018,0.000,0.675,1.540


#### We see sex_1 has high p value.we will remove the feature with highest p value and repeat the same process through backward elimination

In [51]:
x_train = x_train.drop('sex_1',axis=1)
logit_2 = sm.Logit(y_train,x_train)
result_2= logit_2.fit()
result_2.summary()

Optimization terminated successfully.
         Current function value: 0.439809
         Iterations 7


0,1,2,3
Dep. Variable:,default,No. Observations:,23998.0
Model:,Logit,Df Residuals:,23983.0
Method:,MLE,Df Model:,14.0
Date:,"Tue, 27 Sep 2022",Pseudo R-squ.:,0.1677
Time:,16:14:48,Log-Likelihood:,-10555.0
converged:,True,LL-Null:,-12681.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,2.4956,0.363,6.882,0.000,1.785,3.206
log_limit_bal,-0.2490,0.020,-12.649,0.000,-0.288,-0.210
sex_2,-0.1327,0.036,-3.732,0.000,-0.202,-0.063
marriage_1,0.0914,0.152,0.602,0.547,-0.206,0.389
marriage_2,-0.1244,0.152,-0.820,0.412,-0.422,0.173
education_1,1.1895,0.219,5.442,0.000,0.761,1.618
education_2,1.1649,0.218,5.346,0.000,0.738,1.592
education_3,1.1072,0.221,5.018,0.000,0.675,1.540
sep_status_0,-2.0831,0.060,-34.738,0.000,-2.201,-1.966


In [52]:
x_train = x_train.drop('jul_status_0',axis=1)
logit_3 = sm.Logit(y_train,x_train)
result_3= logit_3.fit()
result_3.summary()

Optimization terminated successfully.
         Current function value: 0.439812
         Iterations 7


0,1,2,3
Dep. Variable:,default,No. Observations:,23998.0
Model:,Logit,Df Residuals:,23984.0
Method:,MLE,Df Model:,13.0
Date:,"Tue, 27 Sep 2022",Pseudo R-squ.:,0.1677
Time:,16:14:49,Log-Likelihood:,-10555.0
converged:,True,LL-Null:,-12681.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,2.5197,0.357,7.052,0.000,1.819,3.220
log_limit_bal,-0.2487,0.020,-12.643,0.000,-0.287,-0.210
sex_2,-0.1325,0.036,-3.727,0.000,-0.202,-0.063
marriage_1,0.0923,0.152,0.608,0.543,-0.205,0.390
marriage_2,-0.1236,0.152,-0.814,0.415,-0.421,0.174
education_1,1.1889,0.219,5.440,0.000,0.761,1.617
education_2,1.1644,0.218,5.344,0.000,0.737,1.591
education_3,1.1068,0.221,5.017,0.000,0.674,1.539
sep_status_0,-2.0824,0.060,-34.745,0.000,-2.200,-1.965


In [53]:
x_train = x_train.drop('aug_status_0',axis=1)
logit_4 = sm.Logit(y_train,x_train)
result_4= logit_4.fit()
result_4.summary()

Optimization terminated successfully.
         Current function value: 0.439814
         Iterations 7


0,1,2,3
Dep. Variable:,default,No. Observations:,23998.0
Model:,Logit,Df Residuals:,23985.0
Method:,MLE,Df Model:,12.0
Date:,"Tue, 27 Sep 2022",Pseudo R-squ.:,0.1677
Time:,16:14:55,Log-Likelihood:,-10555.0
converged:,True,LL-Null:,-12681.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,2.4948,0.347,7.194,0.000,1.815,3.174
log_limit_bal,-0.2490,0.020,-12.680,0.000,-0.288,-0.211
sex_2,-0.1328,0.036,-3.738,0.000,-0.202,-0.063
marriage_1,0.0923,0.152,0.608,0.543,-0.205,0.390
marriage_2,-0.1235,0.152,-0.814,0.416,-0.421,0.174
education_1,1.1896,0.219,5.442,0.000,0.761,1.618
education_2,1.1654,0.218,5.349,0.000,0.738,1.592
education_3,1.1077,0.221,5.020,0.000,0.675,1.540
sep_status_0,-2.0867,0.058,-35.947,0.000,-2.200,-1.973


x_train = x_train.drop('marriage_1',axis=1)
logit_5 = sm.Logit(y_train,x_train)
result_5= logit_5.fit()
result_5.summary()

#### We see that the following features are significant
#### limit_bal,Female,married or single in marriage,(Grad,University degree or high school in education column),and the last 2 or 3 month pay status

#### Finally,We see that the p values are less than 0.05 and it signals,we have the important features for our model

In [57]:
drop_list =['sex_1','jul_status_0','aug_status_0','marriage_1']

In [58]:
x_test = x_test.drop(drop_list,axis=1)

In [62]:
x_train.columns

Index(['const', 'log_limit_bal', 'sex_2', 'marriage_2', 'education_1',
       'education_2', 'education_3', 'sep_status_0', 'sep_status_1',
       'aug_status_2', 'jul_status_2', 'jun_status_0'],
      dtype='object')

In [65]:
new_x_train = x_train.drop('const',axis=1)
new_x_test = x_test[new_x_train.columns]

In [66]:
new_x_train.shape

(23998, 11)

In [67]:
new_x_test.shape

(6000, 11)

#### Lets build a new model with the important feature and evaluate it

In [71]:
logistic_model =  LogisticRegression(class_weight='balanced')
logisitc_model = logistic_model.fit(new_x_train,y_train)



In [73]:
logisitc_model.score(new_x_train,y_train)

0.7805650470872573

In [74]:
predicted = logisitc_model.predict(new_x_test)

In [76]:
metrics.recall_score(y_test,predicted)

0.5474397590361446

In [77]:
metrics.roc_auc_score(y_test,predicted)

0.6952738178742367

In [78]:
metrics.f1_score(y_test,predicted)

0.5215208034433285

#### The metrics with this model is somewhat good but does not beat our Xgboost model.I am not doing the oversampling and downsampling here as i have time crunch issues