# Credit defaulters identification using Logistic Regression

Introduction to the data set

German credit rating dataset contains categorical/symbolic attributes of the persons who availed the credit and the current status of the credit. The status of the credit is indicated by 1 for good credits and 2 for bad credits.

The dataset can be downloaded from UCI Repository.

https://archive.ics.uci.edu/ml/datasets/Statlog+%28German+Credit+Data%29

The detailed description of variables can be found at the same link.

### Read the data set

In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.linear_model import LogisticRegression

# importing ploting libraries
import matplotlib.pyplot as plt   

#importing seaborn for statistical plots
import seaborn as sns

#Let us break the X and y dataframes into training set and test set. For this we will use
#Sklearn package's data splitting function which is based on random function

from sklearn.model_selection import train_test_split

import numpy as np


# calculate accuracy measures and confusion matrix
from sklearn import metrics


In [2]:
data = pd.read_csv( "germanCreditDefault.csv" )

In [3]:
data.head(2)

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
0,< 0 DM,6,critical,furniture/appliances,1169,unknown,> 7 years,4,4,67,none,own,2,skilled,1,yes,no
1,1 - 200 DM,48,good,furniture/appliances,5951,< 100 DM,1 - 4 years,2,2,22,none,own,1,skilled,1,no,yes


In [4]:
columns = ['checkin_acc', 'duration', 'credit_history', 'purpose', 'amount', 'svaing_acc', 'present_emp_since', 
           'inst_rate', 'personal_status', 'other_debtors', 'residing_since', 'property', 'age','inst_plans', 'housing', 
           'num_credits', 'job', 'dependents', 'telephone', 'foreign_worker', 'status']

In [5]:
#data.columns = columns

In [6]:
data.head(2)

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
0,< 0 DM,6,critical,furniture/appliances,1169,unknown,> 7 years,4,4,67,none,own,2,skilled,1,yes,no
1,1 - 200 DM,48,good,furniture/appliances,5951,< 100 DM,1 - 4 years,2,2,22,none,own,1,skilled,1,no,yes


In [7]:
data.head().T

Unnamed: 0,0,1,2,3,4
checking_balance,< 0 DM,1 - 200 DM,unknown,< 0 DM,< 0 DM
months_loan_duration,6,48,12,42,24
credit_history,critical,good,critical,good,poor
purpose,furniture/appliances,furniture/appliances,education,furniture/appliances,car
amount,1169,5951,2096,7882,4870
savings_balance,unknown,< 100 DM,< 100 DM,< 100 DM,< 100 DM
employment_duration,> 7 years,1 - 4 years,4 - 7 years,4 - 7 years,1 - 4 years
percent_of_income,4,2,2,2,3
years_at_residence,4,2,3,4,4
age,67,22,49,45,53


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
checking_balance        1000 non-null object
months_loan_duration    1000 non-null int64
credit_history          1000 non-null object
purpose                 1000 non-null object
amount                  1000 non-null int64
savings_balance         1000 non-null object
employment_duration     1000 non-null object
percent_of_income       1000 non-null int64
years_at_residence      1000 non-null int64
age                     1000 non-null int64
other_credit            1000 non-null object
housing                 1000 non-null object
existing_loans_count    1000 non-null int64
job                     1000 non-null object
dependents              1000 non-null int64
phone                   1000 non-null object
default                 1000 non-null object
dtypes: int64(7), object(10)
memory usage: 132.9+ KB


#### How many default and no default observations in the sample

In [9]:
data.default.value_counts()

no     700
yes    300
Name: default, dtype: int64

There are about 300 defaults and 700 non-default observations

### Create dummy variables for the categorical features

In [10]:
data.columns

Index(['checking_balance', 'months_loan_duration', 'credit_history', 'purpose',
       'amount', 'savings_balance', 'employment_duration', 'percent_of_income',
       'years_at_residence', 'age', 'other_credit', 'housing',
       'existing_loans_count', 'job', 'dependents', 'phone', 'default'],
      dtype='object')

In [11]:
list(data.columns )

['checking_balance',
 'months_loan_duration',
 'credit_history',
 'purpose',
 'amount',
 'savings_balance',
 'employment_duration',
 'percent_of_income',
 'years_at_residence',
 'age',
 'other_credit',
 'housing',
 'existing_loans_count',
 'job',
 'dependents',
 'phone',
 'default']

#### Remove the response variable from the dataset

In [12]:
X_features = list(data.columns )
X_features

['checking_balance',
 'months_loan_duration',
 'credit_history',
 'purpose',
 'amount',
 'savings_balance',
 'employment_duration',
 'percent_of_income',
 'years_at_residence',
 'age',
 'other_credit',
 'housing',
 'existing_loans_count',
 'job',
 'dependents',
 'phone',
 'default']

In [13]:
data_complete = pd.get_dummies(data[X_features], drop_first = True )

In [14]:
len(data_complete.columns )

36

In [15]:
data_complete.head().T

Unnamed: 0,0,1,2,3,4
months_loan_duration,6,48,12,42,24
amount,1169,5951,2096,7882,4870
percent_of_income,4,2,2,2,3
years_at_residence,4,2,3,4,4
age,67,22,49,45,53
existing_loans_count,2,1,1,1,2
dependents,1,1,2,2,2
checking_balance_< 0 DM,1,0,0,1,1
checking_balance_> 200 DM,0,0,0,0,0
checking_balance_unknown,0,0,1,0,0


In [16]:
data_complete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 36 columns):
months_loan_duration               1000 non-null int64
amount                             1000 non-null int64
percent_of_income                  1000 non-null int64
years_at_residence                 1000 non-null int64
age                                1000 non-null int64
existing_loans_count               1000 non-null int64
dependents                         1000 non-null int64
checking_balance_< 0 DM            1000 non-null uint8
checking_balance_> 200 DM          1000 non-null uint8
checking_balance_unknown           1000 non-null uint8
credit_history_good                1000 non-null uint8
credit_history_perfect             1000 non-null uint8
credit_history_poor                1000 non-null uint8
credit_history_very good           1000 non-null uint8
purpose_car                        1000 non-null uint8
purpose_car0                       1000 non-null uint8
purpose_educat

### Specify x ( independent) and y (target/dependent) features

In [17]:
x = data_complete.drop("default_yes", axis=1)
y = data_complete[["default_yes"]]

### Split datasets into train and test datasets

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
x_train, x_test, y_train, y_test = train_test_split( x, y, test_size = 0.3)

### Build a logistic regression model

In [22]:
import statsmodels.api as sm

In [23]:
logit = sm.Logit( y_train, sm.add_constant( x_train ) )

In [24]:
lg = logit.fit()

         Current function value: inf
         Iterations: 35




In [25]:
lg.summary()



0,1,2,3
Dep. Variable:,default_yes,No. Observations:,700.0
Model:,Logit,Df Residuals:,664.0
Method:,MLE,Df Model:,35.0
Date:,"Fri, 16 Nov 2018",Pseudo R-squ.:,inf
Time:,23:21:13,Log-Likelihood:,-inf
converged:,False,LL-Null:,0.0
,,LLR p-value:,1.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.6949,1.163,-1.458,0.145,-3.974,0.584
months_loan_duration,0.0349,0.011,3.131,0.002,0.013,0.057
amount,8.651e-05,5.3e-05,1.632,0.103,-1.74e-05,0.000
percent_of_income,0.4570,0.106,4.317,0.000,0.249,0.664
years_at_residence,-0.0243,0.104,-0.234,0.815,-0.228,0.179
age,-0.0202,0.011,-1.758,0.079,-0.043,0.002
existing_loans_count,0.0059,0.228,0.026,0.979,-0.440,0.452
dependents,0.4338,0.284,1.530,0.126,-0.122,0.990
checking_balance_< 0 DM,0.5306,0.251,2.111,0.035,0.038,1.023


### Find significant variables

In [26]:
def get_significant_vars( lm ):
    var_p_vals_df = pd.DataFrame( lm.pvalues )
    var_p_vals_df['vars'] = var_p_vals_df.index
    var_p_vals_df.columns = ['pvals', 'vars']
    return list( var_p_vals_df[var_p_vals_df.pvals <= 0.05]['vars'] )

In [27]:
significant_vars = get_significant_vars( lg )

In [28]:
significant_vars

['months_loan_duration',
 'percent_of_income',
 'checking_balance_< 0 DM',
 'checking_balance_unknown',
 'employment_duration_4 - 7 years',
 'other_credit_none']

### Predict Test Data and Measure Accuracy

In [29]:
from sklearn import metrics

In [30]:
def get_predictions( y_test, model ):
    y_pred_df = pd.DataFrame( { 'actual': y_test,
                               "predicted_prob": model.predict( sm.add_constant( x_test ) ) } )
    return y_pred_df

In [31]:
y_pred_df = get_predictions( y_test.default_yes, lg )

In [32]:
y_pred_df.head(2)

Unnamed: 0,actual,predicted_prob
237,1,0.52427
552,1,0.294605


#### Status_1 columns provides the predicted probability for the default classes

In [33]:
y_pred_df[0:10]

Unnamed: 0,actual,predicted_prob
237,1,0.5242697
552,1,0.2946053
752,0,0.1243079
695,0,0.03981464
229,0,0.9237071
148,0,0.6296307
928,0,0.03097879
721,1,8.9194e-07
86,0,0.4703625
840,1,0.5762131


In [34]:
x = y_pred_df.predicted_prob

In [35]:
x = x[0:5]

In [36]:
x

237    0.524270
552    0.294605
752    0.124308
695    0.039815
229    0.923707
Name: predicted_prob, dtype: float64

In [37]:
x.map(lambda x: 1 if x > 0.1 else 0)

237    1
552    1
752    1
695    0
229    1
Name: predicted_prob, dtype: int64

In [140]:
y_pred_df['predicted'] = y_pred_df.predicted_prob.map( lambda x: 1 if x > 0.15 else 0)

In [141]:
y_pred_df[0:10]

Unnamed: 0,actual,predicted_prob,predicted
237,1,0.5242697,1
552,1,0.2946053,1
752,0,0.1243079,0
695,0,0.03981464,0
229,0,0.9237071,1
148,0,0.6296307,1
928,0,0.03097879,0
721,1,8.9194e-07,0
86,0,0.4703625,1
840,1,0.5762131,1


### Create confusion matrix

In [142]:
cMatrix = metrics.confusion_matrix( y_pred_df.actual, y_pred_df.predicted )
cMatrix

array([[111,  97],
       [ 21,  71]], dtype=int64)

Note: the model could only predict very few default classes.

#### Overall accuracy of the model 

In [143]:
print( 'Total Accuracy : ',np.round( metrics.accuracy_score( y_test, y_pred_df.predicted ), 2 ) )

Total Accuracy :  0.61


In [146]:
FN = ((cMatrix[1][0])/(cMatrix.sum()))*100
print('False Negative Percentage: %.2f' %(FN))

FP = ((cMatrix[0][1])/(cMatrix.sum()))*100
print('False Positive Percentage: %.2f' %(FP))

False Negative Percentage: 7.00
False Positive Percentage: 32.33
