In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import statsmodels.api as sm
import matplotlib.pyplot as plt
import itertools
import time

In [3]:
# get Personal Loan data
# Data dictionary for the Personal Loan dataset. English translation of the
# Korean note kept verbatim in the string below (the note misspells two column
# names: "Famliy" is the Family column and "CreidtCard" is the CreditCard column):
#   Experience          work experience
#   Income              income
#   Family              family unit / household size
#   CCAvg               monthly credit-card usage
#   Education           education level (1: undergrad, 2: graduate, 3: advanced)
#   Mortgage            household (mortgage) loan
#   Securities Account  whether the customer has a securities account
#   CD Account          whether the customer has a certificate-of-deposit account
#   Online              whether the customer has an online account
#   CreditCard          whether the customer has a credit card
'''
Experience 경력
Income 수입
Famliy 가족단위
CCAvg 월 카드사용량 
Education 교육수준 (1: undergrad; 2, Graduate; 3; Advance )
Mortgage 가계대출
Securities account 유가증권계좌유무
CD account 양도예금증서 계좌 유무
Online 온라인계좌유무
CreidtCard 신용카드유무 

'''

'\nExperience 경력\nIncome 수입\nFamliy 가족단위\nCCAvg 월 카드사용량 \nEducation 교육수준 (1: undergrad; 2, Graduate; 3; Advance )\nMortgage 가계대출\nSecurities account 유가증권계좌유무\nCD account 양도예금증서 계좌 유무\nOnline 온라인계좌유무\nCreidtCard 신용카드유무 \n\n'

In [4]:
# Read the raw Personal Loan dataset from the CSV in the working directory,
# then preview its first five rows.
ploan = pd.read_csv(filepath_or_buffer='Personal Loan.csv')
ploan.head(n=5)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [6]:
# Discard rows containing missing values, then remove the ID and ZIP Code
# columns, which this analysis treats as uninformative for the model.
ploan_processed = (
    ploan
    .dropna()
    .drop(columns=['ID', 'ZIP Code'])
)

In [7]:
# Prepend an intercept column named 'const' so the statsmodels Logit fit
# includes a constant term.
# The guard keeps this cell idempotent: has_constant='add' always inserts a new
# constant column, so re-running the cell without the guard would stack a second
# 'const' column onto ploan_processed. That duplicated intercept would then be
# selected into X and make the design matrix singular when the model is fitted.
if 'const' not in ploan_processed.columns:
    ploan_processed = sm.add_constant(ploan_processed, has_constant='add')
ploan_processed.head()

  return ptp(axis=axis, out=out, **kwargs)


Unnamed: 0,const,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1.0,25,1,49,4,1.6,1,0,0,1,0,0,0
1,1.0,45,19,34,3,1.5,1,0,0,1,0,0,0
2,1.0,39,15,11,1,1.0,1,0,0,0,0,0,0
3,1.0,35,9,100,1,2.7,2,0,0,0,0,0,0
4,1.0,35,8,45,4,1.0,2,0,0,0,0,0,1


### Separate the feature variables (X) from the target variable (y)

In [10]:
# loan
# Every column except the target is used as a predictor. Index.difference
# returns the remaining labels in sorted order, which fixes the column order of X.
feature_columns = ploan_processed.columns.difference(['Personal Loan'])
y = ploan_processed.loc[:, 'Personal Loan']
X = ploan_processed.loc[:, feature_columns]

In [11]:
# 70/30 split, stratified on the target so both partitions keep the same
# class balance; the fixed random_state makes the split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
# Sanity check: show the shape of each partition on one line.
print(*(part.shape for part in (train_x, test_x, train_y, test_y)))

(1750, 12) (750, 12) (1750,) (750,)


### logistic regression modeling y = f(x)

In [12]:
# Logistic regression of the loan indicator (endog) on the training
# predictors (exog), estimated with the Newton solver.
model = sm.Logit(endog=train_y, exog=train_x)
results = model.fit(method='newton')


Optimization terminated successfully.
         Current function value: 0.131055
         Iterations 9


In [13]:
# Show the fitted model's summary table (fit statistics and per-coefficient
# estimates, standard errors, z-scores, p-values and confidence intervals).
results.summary()

0,1,2,3
Dep. Variable:,Personal Loan,No. Observations:,1750.0
Model:,Logit,Df Residuals:,1738.0
Method:,MLE,Df Model:,11.0
Date:,"Wed, 19 Aug 2020",Pseudo R-squ.:,0.603
Time:,04:38:08,Log-Likelihood:,-229.35
converged:,True,LL-Null:,-577.63
Covariance Type:,nonrobust,LLR p-value:,2.927e-142

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Age,0.0245,0.102,0.240,0.810,-0.175,0.224
CCAvg,0.0985,0.063,1.562,0.118,-0.025,0.222
CD Account,4.3726,0.568,7.703,0.000,3.260,5.485
CreditCard,-1.2374,0.337,-3.667,0.000,-1.899,-0.576
Education,1.5203,0.190,7.999,0.000,1.148,1.893
Experience,-0.0070,0.102,-0.069,0.945,-0.206,0.192
Family,0.7579,0.128,5.914,0.000,0.507,1.009
Income,0.0547,0.004,12.659,0.000,0.046,0.063
Mortgage,-0.0001,0.001,-0.144,0.885,-0.002,0.002


In [14]:
# Fitted coefficients, indexed by predictor name; these are on the log-odds
# scale because the model is a Logit.
results.params

Age                    0.024471
CCAvg                  0.098468
CD Account             4.372577
CreditCard            -1.237447
Education              1.520329
Experience            -0.007032
Family                 0.757911
Income                 0.054695
Mortgage              -0.000133
Online                -0.440746
Securities Account    -1.852006
const                -13.920298
dtype: float64

In [15]:
np.exp(results.params)
## Exponentiating a logit coefficient gives an odds ratio: the factor by which the
## ODDS (not the probability) of taking a personal loan are multiplied for a
## one-unit increase in that variable, with the other variables held fixed.
## Age: each additional year multiplies the odds of taking a loan by about 1.024.
## Income: each additional unit of income multiplies the odds by about 1.05.
## Family: each additional family member multiplies the odds by about 2.13.
## Experience: each additional unit multiplies the odds by about 0.99, i.e. a slight
## decrease (original note: the null hypothesis is not rejected, so no significant effect).

Age                   1.024773e+00
CCAvg                 1.103479e+00
CD Account            7.924761e+01
CreditCard            2.901239e-01
Education             4.573729e+00
Experience            9.929928e-01
Family                2.133814e+00
Income                1.056218e+00
Mortgage              9.998665e-01
Online                6.435563e-01
Securities Account    1.569221e-01
const                 9.005163e-07
dtype: float64

In [16]:
# y hat: the fitted model's predicted probability for every row of the test set
pred_y = results.predict(exog=test_x)
pred_y

1065    0.012968
487     0.023841
2157    0.001210
1765    0.196245
525     0.006610
          ...   
277     0.019982
914     0.959460
542     0.005239
32      0.011344
2360    0.084464
Length: 750, dtype: float64

In [17]:
def cut_off(y,threshold):
    """Binarize scores: 1 where a score is strictly above ``threshold``, else 0.

    The comparison is done in a single vectorized step on the original values.
    (Applying two in-place masks one after the other re-tests the freshly
    written 1s against the threshold, so any threshold >= 1 would turn every
    positive back into 0.)

    Parameters
    ----------
    y : pandas Series/DataFrame, numpy array, or other array-like of numbers
        Scores or predicted probabilities. The input is never modified.
    threshold : float
        Cut-off value; scores equal to the threshold are labelled 0.

    Returns
    -------
    Same container type as ``y`` (pandas objects keep their index), holding
    integer 0/1 labels. Plain sequences are returned as a numpy array.

    Raises
    ------
    ValueError
        If ``y`` contains NaN, since a missing score has no class label.
    """
    # Plain lists/tuples have no vectorized comparison; promote them to an array.
    scores = y if hasattr(y, "astype") else np.asarray(y, dtype=float)
    # Fail loudly instead of silently mapping a missing score to class 0.
    if np.any(pd.isna(scores)):
        raise ValueError("cut_off: y contains NaN values; cannot assign class labels")
    return (scores > threshold).astype(int)

# Turn the predicted probabilities into 0/1 class labels with a 0.5 cut-off.
pred_Y = cut_off(y=pred_y, threshold=0.5)
pred_Y

1065    0
487     0
2157    0
1765    0
525     0
       ..
277     0
914     1
542     0
32      0
2360    0
Length: 750, dtype: int64