In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
%matplotlib inline

In [59]:
# Read the training CSV straight from GitHub; the file's first column is used
# as the DataFrame index rather than as a feature.
TRAIN_CSV_URL = "https://raw.githubusercontent.com/dphi-official/Datasets/master/Loan_Data/loan_train.csv"
loan_data = pd.read_csv(TRAIN_CSV_URL, index_col=0)
loan_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP002305,Female,No,0,Graduate,No,4547,0.0,115.0,360.0,1.0,Semiurban,1
1,LP001715,Male,Yes,3+,Not Graduate,Yes,5703,0.0,130.0,360.0,1.0,Rural,1
2,LP002086,Female,Yes,0,Graduate,No,4333,2451.0,110.0,360.0,1.0,Urban,0
3,LP001136,Male,Yes,0,Not Graduate,Yes,4695,0.0,96.0,,1.0,Urban,1
4,LP002529,Male,Yes,2,Graduate,No,6700,1750.0,230.0,300.0,1.0,Semiurban,1


In [60]:
loan_data.shape

(491, 13)

In [61]:
# Normalise every column label to lower case so later cells can use one spelling.
loan_data.columns = [label.lower() for label in loan_data.columns]

In [62]:
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 491 entries, 0 to 490
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   loan_id            491 non-null    object 
 1   gender             481 non-null    object 
 2   married            490 non-null    object 
 3   dependents         482 non-null    object 
 4   education          491 non-null    object 
 5   self_employed      462 non-null    object 
 6   applicantincome    491 non-null    int64  
 7   coapplicantincome  491 non-null    float64
 8   loanamount         475 non-null    float64
 9   loan_amount_term   478 non-null    float64
 10  credit_history     448 non-null    float64
 11  property_area      491 non-null    object 
 12  loan_status        491 non-null    int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 53.7+ KB


In [63]:
loan_data.describe()

Unnamed: 0,applicantincome,coapplicantincome,loanamount,loan_amount_term,credit_history,loan_status
count,491.0,491.0,475.0,478.0,448.0,491.0
mean,5401.189409,1589.730998,145.014737,341.297071,0.848214,0.698574
std,6419.427177,2919.320624,86.310534,66.964051,0.359214,0.459345
min,150.0,0.0,17.0,12.0,0.0,0.0
25%,2923.5,0.0,100.0,360.0,1.0,0.0
50%,3865.0,1229.0,126.0,360.0,1.0,1.0
75%,5705.5,2251.5,162.0,360.0,1.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0,1.0


In [64]:
loan_data.isnull().sum()

loan_id               0
gender               10
married               1
dependents            9
education             0
self_employed        29
applicantincome       0
coapplicantincome     0
loanamount           16
loan_amount_term     13
credit_history       43
property_area         0
loan_status           0
dtype: int64

In [65]:
loan_data['loan_status'].value_counts() / len(loan_data) * 100

1    69.857434
0    30.142566
Name: loan_status, dtype: float64

In [66]:
loan_data.select_dtypes(include=['object']).head()

Unnamed: 0,loan_id,gender,married,dependents,education,self_employed,property_area
0,LP002305,Female,No,0,Graduate,No,Semiurban
1,LP001715,Male,Yes,3+,Not Graduate,Yes,Rural
2,LP002086,Female,Yes,0,Graduate,No,Urban
3,LP001136,Male,Yes,0,Not Graduate,Yes,Urban
4,LP002529,Male,Yes,2,Graduate,No,Semiurban


In [67]:
loan_data['gender'].value_counts()

Male      393
Female     88
Name: gender, dtype: int64

In [68]:
loan_data['married'].value_counts()

Yes    324
No     166
Name: married, dtype: int64

In [69]:
loan_data['dependents'].value_counts()

0     276
1      85
2      78
3+     43
Name: dependents, dtype: int64

In [70]:
loan_data['self_employed'].value_counts()

No     398
Yes     64
Name: self_employed, dtype: int64

In [71]:
loan_data['loanamount'].value_counts()

120.0    17
110.0    13
100.0    13
128.0    11
160.0    11
         ..
436.0     1
253.0     1
85.0      1
129.0     1
315.0     1
Name: loanamount, Length: 176, dtype: int64

In [72]:
loan_data['loan_amount_term'].value_counts()

360.0    404
180.0     35
480.0     13
300.0     12
84.0       4
120.0      3
240.0      3
36.0       2
60.0       1
12.0       1
Name: loan_amount_term, dtype: int64

In [74]:
loan_data['credit_history'].value_counts()

1.0    380
0.0     68
Name: credit_history, dtype: int64

In [75]:
# Impute every missing value by carrying the previous row's value forward.
# DataFrame.ffill is the direct equivalent of fillna(method='ffill'); the
# `method` argument of fillna is deprecated in recent pandas releases.
# Note: forward fill cannot fill a gap in the very first row, so the null
# count below doubles as a check that nothing was left unfilled.
loan_data.ffill(inplace=True)
loan_data.isnull().sum()

loan_id              0
gender               0
married              0
dependents           0
education            0
self_employed        0
applicantincome      0
coapplicantincome    0
loanamount           0
loan_amount_term     0
credit_history       0
property_area        0
loan_status          0
dtype: int64

### Convert categorical features into numeric

In [76]:
loan_data['education'].value_counts()

Graduate        388
Not Graduate    103
Name: education, dtype: int64

In [78]:
# loan_id is dropped before modelling. errors='ignore' keeps this cell
# idempotent: re-running it after the column is already gone no longer
# raises KeyError.
loan_data.drop(columns=['loan_id'], inplace=True, errors='ignore')

In [79]:
def gender(x):
    """Encode a gender label as a digit string.

    Strings have 'Male' replaced by '0' and then 'Female' replaced by '1';
    any other text is left as it is. Non-string values are returned unchanged.
    """
    if not isinstance(x, str):
        return x
    encoded = x
    for label, code in (('Male', '0'), ('Female', '1')):
        encoded = encoded.replace(label, code)
    return encoded
# Encode the column element-wise, then cast the resulting '0'/'1' strings to integers.
gender_codes = loan_data['gender'].apply(gender)
loan_data['gender'] = gender_codes.astype('int')
loan_data['gender']

0      1
1      0
2      1
3      0
4      0
      ..
486    0
487    1
488    0
489    0
490    0
Name: gender, Length: 491, dtype: int32

In [80]:
def self_employed(x):
    """Encode a self-employment flag as a digit string.

    For strings, 'No' becomes '0' and then 'Yes' becomes '1'. Values that are
    not strings are passed through untouched.
    """
    if isinstance(x, str):
        without_no = x.replace('No', '0')
        return without_no.replace('Yes', '1')
    return x
# Encode the column element-wise, then cast the resulting '0'/'1' strings to integers.
self_employed_codes = loan_data['self_employed'].apply(self_employed)
loan_data['self_employed'] = self_employed_codes.astype('int')
loan_data['self_employed']

0      0
1      1
2      0
3      1
4      0
      ..
486    1
487    0
488    0
489    0
490    0
Name: self_employed, Length: 491, dtype: int32

In [81]:
def married(x):
    """Encode a marital-status flag as a digit string.

    For strings, 'No' is replaced by '0' first and 'Yes' by '1' second.
    Anything that is not a string is returned as given.
    """
    return x.replace('No', '0').replace('Yes', '1') if isinstance(x, str) else x
# Encode the column element-wise, then cast the resulting '0'/'1' strings to integers.
married_codes = loan_data['married'].apply(married)
loan_data['married'] = married_codes.astype('int')
loan_data['married']

0      0
1      1
2      1
3      1
4      1
      ..
486    1
487    0
488    1
489    0
490    1
Name: married, Length: 491, dtype: int32

In [82]:
def edu(x):
    """Encode an education label as a digit string.

    'Not Graduate' must be replaced before 'Graduate', because the latter is
    a substring of the former. Non-string values are returned unchanged.
    """
    if not isinstance(x, str):
        return x
    replacements = [('Not Graduate', '0'), ('Graduate', '1')]
    result = x
    for label, code in replacements:
        result = result.replace(label, code)
    return result
# Encode the column element-wise, then cast the resulting '0'/'1' strings to integers.
education_codes = loan_data['education'].apply(edu)
loan_data['education'] = education_codes.astype('int')
loan_data['education']

0      1
1      0
2      1
3      0
4      1
      ..
486    1
487    1
488    1
489    1
490    1
Name: education, Length: 491, dtype: int32

In [83]:
def dependents(x):
    """Make a dependents label castable to an integer.

    The open-ended bucket '3+' is rewritten as '4'; other strings are left as
    they are. Non-string values are returned unchanged.
    """
    return x.replace('3+', '4') if isinstance(x, str) else x
# Rewrite the '3+' bucket element-wise, then cast the digit strings to integers.
dependents_codes = loan_data['dependents'].apply(dependents)
loan_data['dependents'] = dependents_codes.astype('int')
loan_data['dependents']

0      0
1      4
2      0
3      0
4      2
      ..
486    1
487    1
488    1
489    0
490    0
Name: dependents, Length: 491, dtype: int32

In [84]:
# One-hot encode property_area and append the indicator columns after the
# remaining features; the original text column is dropped.
area_dummies = pd.get_dummies(loan_data['property_area'])
feature = pd.concat([loan_data.drop('property_area', axis=1), area_dummies], axis=1)
feature.head()

Unnamed: 0,gender,married,dependents,education,self_employed,applicantincome,coapplicantincome,loanamount,loan_amount_term,credit_history,loan_status,Rural,Semiurban,Urban
0,1,0,0,1,0,4547,0.0,115.0,360.0,1.0,1,0,1,0
1,0,1,4,0,1,5703,0.0,130.0,360.0,1.0,1,1,0,0
2,1,1,0,1,0,4333,2451.0,110.0,360.0,1.0,0,0,0,1
3,0,1,0,0,1,4695,0.0,96.0,360.0,1.0,1,0,0,1
4,0,1,2,1,0,6700,1750.0,230.0,300.0,1.0,1,0,1,0


In [85]:
feature.dtypes

gender                 int32
married                int32
dependents             int32
education              int32
self_employed          int32
applicantincome        int64
coapplicantincome    float64
loanamount           float64
loan_amount_term     float64
credit_history       float64
loan_status            int64
Rural                  uint8
Semiurban              uint8
Urban                  uint8
dtype: object

In [87]:
# Target is loan_status; every other column of `feature` is a predictor.
y = feature['loan_status']
X = feature.drop(columns='loan_status')

In [88]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [89]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
# Fit a logistic-regression classifier on the training split using the
# liblinear solver; fit() returns the estimator itself.
lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
lr

LogisticRegression(solver='liblinear')

In [90]:
X.columns

Index(['gender', 'married', 'dependents', 'education', 'self_employed',
       'applicantincome', 'coapplicantincome', 'loanamount',
       'loan_amount_term', 'credit_history', 'Rural', 'Semiurban', 'Urban'],
      dtype='object')

In [91]:
np.where(X.columns=='Rural')[0][0]

10

In [94]:
def predict_status(gender,married,dependents,education,self_employed,applicantincome,coapplicantincome,loanamount,loan_amount_term,credit_history,property_area):
    """Predict the loan status of a single applicant with the fitted model ``lr``.

    The first ten arguments are written to positions 0-9 of the feature
    vector in the order given, which is the order of the first ten entries
    of ``X.columns``. They must already be numeric, encoded the same way as
    the training columns.

    Parameters
    ----------
    gender, married, dependents, education, self_employed : numeric
        Encoded categorical features.
    applicantincome, coapplicantincome, loanamount, loan_amount_term, credit_history : numeric
        Numeric features, passed through unchanged.
    property_area : str
        Name of the one-hot column to switch on. It is compared
        case-sensitively against the columns of ``X`` that follow the ten
        fixed features. An unknown name leaves all area indicators at 0.

    Returns
    -------
    The first (and only) element of ``lr.predict`` for the assembled row.
    """
    fixed_values = [
        gender, married, dependents, education, self_employed,
        applicantincome, coapplicantincome, loanamount,
        loan_amount_term, credit_history,
    ]
    n_fixed = len(fixed_values)

    x = np.zeros(len(X.columns))
    x[:n_fixed] = fixed_values

    # Search only the dummy columns, so a property_area that happens to equal
    # a numeric feature name (e.g. 'gender') cannot overwrite that feature.
    matches = np.where(X.columns[n_fixed:] == property_area)[0]
    if matches.size > 0:
        # One-hot indicator: the training data holds 0/1 in these columns.
        x[n_fixed + matches[0]] = 1

    return lr.predict([x])[0]

In [95]:
predict_status(1,1,1,1,0,5000,0.0,510.0,360.0,1.0,'Urban')

1

In [99]:
import pickle
# Persist the fitted classifier so it can be loaded outside this notebook.
# The target directory is relative to the kernel's working directory.
with open('Documents/loan/model/loan_or_no_loan.pickle', 'wb') as model_file:
    pickle.dump(lr, model_file)

In [105]:
import json
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types."""
    def default(self, obj):
        """Return a JSON-serialisable equivalent of ``obj``.

        NumPy booleans, integers and floats become ``bool``, ``int`` and
        ``float``; arrays become nested lists. Any other type is handed to
        the base class, which raises ``TypeError``.
        """
        # np.bool_ is neither np.integer nor np.floating, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
    
# Record the training column order (lower-cased) so code outside the notebook
# can rebuild the feature vector in the same order.
# NOTE: the names are lower-cased here, whereas predict_status compares
# property_area case-sensitively against X.columns ('Rural', 'Semiurban', 'Urban').
columns={
    'data_columns':[col.lower() for col in X.columns]
}

# Serialise exactly once. Passing this already-encoded string to json.dump
# would encode it a second time, leaving a quoted JSON string in the file
# instead of an object.
dumped=json.dumps(columns,cls=NumpyEncoder)

with open('Documents/loan/model/columns.json','w',encoding='utf-8') as f:
    f.write(dumped)

# Read the file back as a round-trip check; `columns` is a dict again.
with open('Documents/loan/model/columns.json','r',encoding='utf-8') as f:
    columns=json.load(f)