# Import libraries to get started

In [33]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# Import Test and Train data

In [34]:
# Locations of the training and validation CSV files.
train_path = "C://Sunny//Softtantra//Workshop//Data//Credit Risk//Credit_Risk_Train_data.csv"
test_path = "C://Sunny//Softtantra//Workshop//Data//Credit Risk//Credit_Risk_Validate_data.csv"

# Read each file into its own DataFrame.
train_input = pd.read_csv(train_path)
test_input = pd.read_csv(test_path)

# Print the first few rows of the train and test datasets to check that they were imported correctly

In [35]:
# Show the first five rows of the train frame, then of the test frame.
for frame in (train_input, test_input):
    print(frame.head())

    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0   

In [36]:
# List the column labels of both frames so they can be compared.
for frame in (train_input, test_input):
    print(frame.columns)


Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')
Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'outcome'],
      dtype='object')


# The column names differ between the train and test datasets: the target is `Loan_Status` in train but `outcome` in test

In [37]:
# Give the test target column the same name it has in the train data.
test_input = test_input.rename(columns={"outcome": "Loan_Status"})

In [38]:
# Confirm the rename: the last column should now read Loan_Status.
print(test_input.columns)

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')


In [39]:
# Report (rows, columns) for the train frame and then the test frame.
for frame in (train_input, test_input):
    print(frame.shape)

(614, 13)
(367, 13)


# CONCATENATE train and test data into one so that we can fill in the missing values simultaneously.

In [40]:
# Stack the train rows on top of the test rows; each frame keeps its own index labels.
# NOTE: the name `all` shadows Python's built-in all(); it is kept because later cells refer to it.
all = pd.concat((train_input, test_input))
all.shape

(981, 13)

In [41]:
# First five rows of the combined frame.
print(all.head(n=5))


    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0   

In [42]:
# Last five rows of the combined frame; their index labels still come from the test frame.
print(all.tail(n=5))

      Loan_ID Gender Married Dependents     Education Self_Employed  \
362  LP002971   Male     Yes         3+  Not Graduate           Yes   
363  LP002975   Male     Yes          0      Graduate            No   
364  LP002980   Male      No          0      Graduate            No   
365  LP002986   Male     Yes          0      Graduate            No   
366  LP002989   Male      No          0      Graduate           Yes   

     ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
362             4009             1777.0       113.0             360.0   
363             4158              709.0       115.0             360.0   
364             3250             1993.0       126.0             360.0   
365             5000             2393.0       158.0             360.0   
366             9200                0.0        98.0             180.0   

     Credit_History Property_Area Loan_Status  
362             1.0         Urban           Y  
363             1.0         Urban     

# After concatenation the combined dataset keeps the original index of each dataset, so index labels are duplicated. This would cause problems when merging and modeling, so we reset the index and drop the previous one.


In [43]:
# Replace the duplicated labels with a fresh 0..n-1 index; the old labels are discarded.
all.index = pd.RangeIndex(len(all))

In [44]:
# Summarise dtypes and non-null counts per column to spot columns with missing values.
all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 981 entries, 0 to 980
Data columns (total 13 columns):
Loan_ID              981 non-null object
Gender               957 non-null object
Married              978 non-null object
Dependents           956 non-null object
Education            981 non-null object
Self_Employed        926 non-null object
ApplicantIncome      981 non-null int64
CoapplicantIncome    981 non-null float64
LoanAmount           954 non-null float64
Loan_Amount_Term     961 non-null float64
Credit_History       902 non-null float64
Property_Area        981 non-null object
Loan_Status          981 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 99.7+ KB


# There are missing values in a few columns

In [45]:
# Number of missing values in each column.
all.isna().sum()

Loan_ID               0
Gender               24
Married               3
Dependents           25
Education             0
Self_Employed        55
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           27
Loan_Amount_Term     20
Credit_History       79
Property_Area         0
Loan_Status           0
dtype: int64

# Missing values are generally filled with the mean, median or mode of the column.

In [46]:
# Tally each Gender value, including the missing ones.
Counter(all['Gender'])

Counter({'Male': 775, 'Female': 182, nan: 24})

In [47]:
#Males are by far the larger group, so missing Gender values will be filled with the mode, "Male".
#Collect the index labels of the rows whose Gender is missing, then show them.
gender_null = all.index[all['Gender'].isnull()].tolist()
print(gender_null)

[23, 126, 171, 188, 314, 334, 460, 467, 477, 507, 576, 588, 592, 636, 665, 720, 752, 823, 845, 859, 893, 910, 917, 932]


In [48]:
#Fill the missing Gender values with the mode, "Male".
#gender_null holds index labels, so .loc is the matching indexer. A single .loc[rows, column]
#assignment writes into `all` itself; the earlier all['Gender'].iloc[...] = ... form was chained
#assignment, which pandas does not guarantee to update the frame (it is a no-op under copy-on-write).
all.loc[gender_null, 'Gender'] = "Male"

In [49]:
# Re-tally Gender to confirm that no missing values remain.
Counter(all['Gender'])

Counter({'Male': 799, 'Female': 182})

In [50]:
#Next column to fill: Married. Tally its values, including the missing ones.
Counter(all['Married'])

Counter({'No': 347, 'Yes': 631, nan: 3})

In [51]:
#Idea: fill Married with Yes when the applicant has dependents, otherwise No.
#First check whether Dependents is known for the rows where Married is missing.
married_missing = all['Married'].isnull()
dependents_missing = all['Dependents'].isnull()
pd.crosstab(married_missing, dependents_missing)

Dependents,False,True
Married,Unnamed: 1_level_1,Unnamed: 2_level_1
False,956,22
True,0,3


In [52]:
#Missing Married values will be filled with the mode of the column, i.e. Yes.
#Collect the index labels of the affected rows and display them.
married_null = all.index[all['Married'].isnull()].tolist()
married_null

[104, 228, 435]

In [53]:
#Fill the missing Married values with the mode, "Yes".
#A scalar is broadcast to every selected row, so no repeated array is needed.
#.loc[rows, column] writes into `all` directly; the earlier all['Married'].iloc[...] = ... form was
#chained assignment, which is not guaranteed to update the frame (a no-op under copy-on-write).
all.loc[married_null, 'Married'] = "Yes"

In [54]:
# Missing values per column after filling Gender and Married.
all.isna().sum()

Loan_ID               0
Gender                0
Married               0
Dependents           25
Education             0
Self_Employed        55
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           27
Loan_Amount_Term     20
Credit_History       79
Property_Area         0
Loan_Status           0
dtype: int64

In [55]:
# Tally each Dependents value, including the missing ones.
Counter(all['Dependents'])

Counter({'0': 545, '1': 160, '2': 160, '3+': 91, nan: 25})

In [56]:
#How are the missing Dependents values spread across married / unmarried applicants?
pd.crosstab(index=all['Married'], columns=all['Dependents'].isnull())

Dependents,False,True
Married,Unnamed: 1_level_1,Unnamed: 2_level_1
No,338,9
Yes,618,16


In [57]:
# Number of applicants for each Dependents value, split by marital status.
pd.crosstab(index=all['Dependents'], columns=all['Married'])

Married,No,Yes
Dependents,Unnamed: 1_level_1,Unnamed: 2_level_1
0,276,269
1,36,124
2,14,146
3+,12,79


In [59]:
#Most unmarried applicants have 0 dependents, so the unmarried applicants
#with a missing Dependents value will be filled with 0.
#Collect the index labels of rows with Married == "No" and Dependents missing.
is_unmarried = all['Married'] == "No"
dependents_missing = all['Dependents'].isnull()
bachelor_nulldependent = all.index[is_unmarried & dependents_missing].tolist()
print(bachelor_nulldependent)

[293, 332, 355, 597, 684, 752, 879, 916, 926]


In [60]:
#Unmarried applicants with a missing Dependents value get '0' (kept as a string, like the other values).
#.loc[rows, column] writes into `all` directly; the earlier all['Dependents'].iloc[...] = ... form was
#chained assignment, which is not guaranteed to update the frame (a no-op under copy-on-write).
all.loc[bachelor_nulldependent, 'Dependents'] = '0'

In [61]:
# Re-tally Dependents to see how many missing values are left.
Counter(all['Dependents'])

Counter({'0': 554, '1': 160, '2': 160, '3+': 91, nan: 16})

In [62]:
#Handle the Dependents values that are still missing.
#Compare the number of dependents for male and female applicants.
pd.crosstab(index=all['Gender'], columns=all['Dependents'])

Dependents,0,1,2,3+
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,127,32,13,9
Male,427,128,147,82


In [63]:
#The table above shows that females tend to have fewer dependents than males.
#Look up the Gender of the rows whose Dependents value is still missing.
dependents_null = all.index[all['Dependents'].isnull()].tolist()
all['Gender'].iloc[dependents_null]

102      Male
104      Male
120      Male
226      Male
228      Male
301      Male
335      Male
346      Male
435    Female
517      Male
571      Male
660      Male
725      Male
816      Male
861      Male
865      Male
Name: Gender, dtype: object

In [64]:
#Almost all of these rows are males, with one exception.
#The crosstab shows the most common value for males is 0 dependents, so fill the rest with "0".
#A boolean mask with .loc[mask, column] writes into `all` directly; the earlier
#all['Dependents'].iloc[...] = ... form was chained assignment, which is not guaranteed
#to update the frame (a no-op under copy-on-write).
all.loc[all['Dependents'].isnull(), 'Dependents'] = "0"

In [65]:
# Missing values per column after filling Dependents.
all.isna().sum()

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed        55
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           27
Loan_Amount_Term     20
Credit_History       79
Property_Area         0
Loan_Status           0
dtype: int64

In [66]:
# Tally each Self_Employed value, including the missing ones.
Counter(all['Self_Employed'])

Counter({'No': 807, 'Yes': 119, nan: 55})

In [67]:
# Index labels of the rows whose Self_Employed value is missing.
self_emp_null = all.index[all['Self_Employed'].isnull()].tolist()

In [68]:
#Fill the missing Self_Employed values with "No".
#.loc[rows, column] writes into `all` directly; the earlier all['Self_Employed'].iloc[...] = ... form
#was chained assignment, which is not guaranteed to update the frame (a no-op under copy-on-write).
all.loc[self_emp_null, 'Self_Employed'] = "No"

In [69]:
# Missing values per column after filling Self_Employed.
all.isna().sum()

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           27
Loan_Amount_Term     20
Credit_History       79
Property_Area         0
Loan_Status           0
dtype: int64

In [70]:
# For each loan term, count the rows with a known (False) and a missing (True) LoanAmount.
pd.crosstab(index=all['LoanAmount'].isnull(), columns=all['Loan_Amount_Term'])

Loan_Amount_Term,6.0,12.0,36.0,60.0,84.0,120.0,180.0,240.0,300.0,350.0,360.0,480.0
LoanAmount,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
False,1,2,3,3,7,4,64,7,20,1,800,22
True,0,0,0,0,0,0,2,1,0,0,23,1


In [71]:
# Average LoanAmount for each loan term.
all.groupby('Loan_Amount_Term')['LoanAmount'].mean()

Loan_Amount_Term
6.0       95.000000
12.0     185.500000
36.0     117.666667
60.0     139.666667
84.0     121.142857
120.0     36.750000
180.0    131.125000
240.0    128.857143
300.0    166.250000
350.0    133.000000
360.0    144.420000
480.0    137.181818
Name: LoanAmount, dtype: float64

In [72]:
#Fill the missing LoanAmount values with the mean amount of their loan term.
#Terms 180 and 240 have nearly the same mean (about 128 to 131), so they will be filled with 130.
#Term 360 is filled with 144.
#Term 480 is also filled with 130.
#This cell handles term 360. One .loc[mask, column] assignment writes into `all` directly; the earlier
#all['LoanAmount'][mask] = ... form was chained assignment, which is not guaranteed to update the
#frame (a no-op under copy-on-write).
all.loc[all['LoanAmount'].isnull() & (all['Loan_Amount_Term']==360), 'LoanAmount'] = 144


In [73]:
#Every LoanAmount that is still missing is filled with 130.
#.loc[mask, column] writes into `all` directly; the earlier all['LoanAmount'][mask] = ... form was
#chained assignment, which is not guaranteed to update the frame (a no-op under copy-on-write).
all.loc[all['LoanAmount'].isnull(), 'LoanAmount'] = 130

In [74]:
# Missing values per column after filling LoanAmount.
all.isna().sum()

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     20
Credit_History       79
Property_Area         0
Loan_Status           0
dtype: int64

In [75]:
#Next: the missing Loan_Amount_Term values.
#Count how often each term occurs; 360 is by far the most frequent.
all['Loan_Amount_Term'].value_counts()

360.0    823
180.0     66
480.0     23
300.0     20
240.0      8
84.0       7
120.0      4
36.0       3
60.0       3
12.0       2
350.0      1
6.0        1
Name: Loan_Amount_Term, dtype: int64

In [76]:
#Fill the missing Loan_Amount_Term values with the most frequent term, 360.
#.loc[mask, column] writes into `all` directly; the earlier all['Loan_Amount_Term'][mask] = ... form
#was chained assignment, which is not guaranteed to update the frame (a no-op under copy-on-write).
all.loc[all['Loan_Amount_Term'].isnull(), 'Loan_Amount_Term'] = 360

In [77]:
# Missing values per column after filling Loan_Amount_Term.
all.isna().sum()

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History       79
Property_Area         0
Loan_Status           0
dtype: int64

In [78]:
#Last column with missing values: Credit_History. Count its non-missing values.
all['Credit_History'].value_counts()

1.0    754
0.0    148
Name: Credit_History, dtype: int64

In [79]:
#Credit_History split by Gender: the proportions look similar for both genders.
pd.crosstab(index=all['Gender'], columns=all['Credit_History'])

Credit_History,0.0,1.0
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,30,135
Male,118,619


In [80]:
#Credit_History split by Self_Employed: the proportions look similar for both groups.
pd.crosstab(index=all['Self_Employed'], columns=all['Credit_History'])

Credit_History,0.0,1.0
Self_Employed,Unnamed: 1_level_1,Unnamed: 2_level_1
No,134,658
Yes,14,96


In [81]:
#Credit_History split by Education: the proportions look similar for both groups.
pd.crosstab(index=all['Education'], columns=all['Credit_History'])

Credit_History,0.0,1.0
Education,Unnamed: 1_level_1,Unnamed: 2_level_1
Graduate,106,596
Not Graduate,42,158


In [82]:
#Credit_History split by Married: the proportions look similar for both groups.
pd.crosstab(index=all['Married'], columns=all['Credit_History'])

Credit_History,0.0,1.0
Married,Unnamed: 1_level_1,Unnamed: 2_level_1
No,56,263
Yes,92,491


# Drop the rows with a missing Credit_History

In [83]:
# Remove, in place, every row that still contains a missing value in any column.
all.dropna(axis=0, how='any', inplace=True)

In [84]:
# Confirm that no column has missing values any more.
all.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [85]:
#Create dummy (indicator) variables for the categorical columns before splitting the data.
#Loan_ID is left out of the encoding.
#drop_first=True drops the first level of each categorical column.
all_new=pd.get_dummies(all.drop(['Loan_ID'],axis=1),drop_first=True)

In [86]:
# First five rows of the encoded frame.
all_new.head(5)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y
0,5849,0.0,144.0,360.0,1.0,1,0,0,0,0,0,0,0,1,1
1,4583,1508.0,128.0,360.0,1.0,1,1,1,0,0,0,0,0,0,0
2,3000,0.0,66.0,360.0,1.0,1,1,0,0,0,0,1,0,1,1
3,2583,2358.0,120.0,360.0,1.0,1,1,0,0,0,1,0,0,1,1
4,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,0,0,0,1,1


In [87]:
# Missing values per column of the encoded frame.
all_new.isna().sum()

ApplicantIncome            0
CoapplicantIncome          0
LoanAmount                 0
Loan_Amount_Term           0
Credit_History             0
Gender_Male                0
Married_Yes                0
Dependents_1               0
Dependents_2               0
Dependents_3+              0
Education_Not Graduate     0
Self_Employed_Yes          0
Property_Area_Semiurban    0
Property_Area_Urban        0
Loan_Status_Y              0
dtype: int64

In [88]:
# (rows, columns) of the encoded frame.
print(all_new.shape)

(902, 15)


# Split data into test and train

In [92]:
from sklearn.model_selection import train_test_split

In [93]:
# Target: the encoded loan status column.
Y_tag = all_new['Loan_Status_Y']
# Features: every other column except Credit_History, which is excluded along with the target.
X_tag = all_new.drop(columns=['Loan_Status_Y', 'Credit_History'])

In [94]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
split_parts = train_test_split(X_tag, Y_tag, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [95]:
# Shapes of the train features, train target, test features and test target, in that order.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

(721, 13)
(721,)
(181, 13)
(181,)


# Model Building

In [97]:
#Logistic regression with the library's default settings, fitted on the training split.
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Predict values using model object

In [98]:
# Predicted class labels for the held-out test features.
pred=log_reg.predict(X_test)

In [100]:
from sklearn.metrics import classification_report,confusion_matrix

In [101]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_true=y_test, y_pred=pred))

             precision    recall  f1-score   support

          0       0.00      0.00      0.00        41
          1       0.77      0.97      0.86       140

avg / total       0.59      0.75      0.66       181



In [103]:
# Rows are the true classes, columns the predicted classes.
print(confusion_matrix(y_true=y_test, y_pred=pred))

[[  0  41]
 [  4 136]]


# Accuracy = (TP + TN) / (TP + TN + FP + FN)

In [104]:
# Compute accuracy from the confusion matrix instead of retyping its cell values by hand,
# so the figure stays correct if the data, the split or the model changes.
# Correct predictions lie on the diagonal (TN and TP); the sum of all cells is the test-set size.
conf_mat = confusion_matrix(y_test, pred)
Accuracy = float(conf_mat.trace() / conf_mat.sum())
print(Accuracy)

0.7513812154696132
