## Mortgage Loans: Logistic Regression Example

In [169]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
from math import sqrt

In [170]:
df = pd.read_csv('../data/loan_data_set.csv')
df.shape

(614, 13)

## Exploratory Data Analysis

In [171]:
# declare the list of features
features = ['Credit_History','LoanAmount','Loan_Amount_Term','ApplicantIncome']

In [172]:
# recode missing values
print(df[features].isnull().sum())
for feature in ['LoanAmount','Loan_Amount_Term','ApplicantIncome']:
    df[feature].fillna(value=df[feature].mean(), inplace=True)
print(df[features].isnull().sum())

Credit_History      50
LoanAmount          22
Loan_Amount_Term    14
ApplicantIncome      0
dtype: int64
Credit_History      50
LoanAmount           0
Loan_Amount_Term     0
ApplicantIncome      0
dtype: int64


In [197]:
for feature in ['LoanAmount','Loan_Amount_Term','ApplicantIncome']:
    print(df[feature].agg(['mean', 'median', 'min', 'max']))

mean      145.137687
median    128.500000
min         9.000000
max       700.000000
Name: LoanAmount, dtype: float64
mean      342.148936
median    360.000000
min        36.000000
max       480.000000
Name: Loan_Amount_Term, dtype: float64
mean       5466.43617
median     3815.00000
min         150.00000
max       81000.00000
Name: ApplicantIncome, dtype: float64


In [176]:
# credit: missing data
print(df.shape)
print(df['Credit_History'].value_counts(dropna=False))
df.dropna(subset=['Credit_History'], inplace=True)
print(df.shape)

(564, 13)
1.0    475
0.0     89
Name: Credit_History, dtype: int64
(564, 13)


In [27]:
# recode the target variable as numeric
df['Loan_Approval']=np.where(df['Loan_Status']=="Y", 1, 0)
df['Loan_Approval'].value_counts(dropna=False)

1    422
0    192
Name: Loan_Approval, dtype: int64

## Model Building

In [28]:
# specify X and y
y = df['Loan_Approval']
X = df[features]

In [29]:
# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25, random_state=12)

In [30]:
# Fit the model to the training dataset
mymodel = LogisticRegression()
mymodel.fit(X_train, y_train)

LogisticRegression()

In [31]:
# coefficients and intercept
mymodel.intercept_
mymodel.coef_

array([[ 3.05839266e+00, -2.55748576e-03, -5.22538765e-03,
         2.21930464e-05]])

In [41]:
# Predict the y-values on the testing dataset
y_preds = mymodel.predict(X_test)
y_probs = mymodel.predict_proba(X_test)

## Model Evalution

In [74]:
# Evaluate the model
print(metrics.classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.84      0.54      0.66        48
           1       0.82      0.95      0.88       106

    accuracy                           0.82       154
   macro avg       0.83      0.75      0.77       154
weighted avg       0.83      0.82      0.81       154



In [128]:
# save your eval report as an html file
report = metrics.classification_report(y_test, y_preds, output_dict=True)
evalreport = pd.DataFrame(report).transpose()
evalreport.to_html('../assets/evalreport.html')
evalreport

Unnamed: 0,precision,recall,f1-score,support
0,0.83871,0.541667,0.658228,48.0
1,0.821138,0.95283,0.882096,106.0
accuracy,0.824675,0.824675,0.824675,0.824675
macro avg,0.829924,0.747248,0.770162,154.0
weighted avg,0.826615,0.824675,0.812319,154.0


In [79]:
# true positives, etc.
y_score = mymodel.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)

In [137]:
# display with plotly
import plotly.express as px
fig = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={metrics.auc(fpr, tpr):.3f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.write_json('../assets/rocauc.json')
fig.show()

In [145]:
# how to load that back in
import json
with open('../assets/rocauc.json', 'r') as f:
    fig=json.load(f)
fig.keys()

dict_keys(['data', 'layout'])

## Make predictions on new data

In [92]:
# check out one row of the test data
X_test.iloc[0]

Credit_History         1.0
LoanAmount           109.0
Loan_Amount_Term     360.0
ApplicantIncome     2500.0
Name: 11, dtype: float64

In [94]:
# show a prediction & probability for that value
print(mymodel.predict([X_test.iloc[0].values])[0])
print(mymodel.predict_proba([X_test.iloc[0].values]).max())

1
0.7725586178712299


In [100]:
# pickle your model
import pickle
filename = open('loan_approval_logistic_model.pkl', 'wb')
pickle.dump(mymodel, filename)
filename.close()

In [101]:
# read in our pickle file
filename = open('loan_approval_logistic_model.pkl', 'rb')
unpickled_model = pickle.load(filename)
filename.close()

In [124]:
# make predictions on new data
fake1=[[1, 1000, 180, 100]]
fake2=[[1, 300, 360, 4500]]
fake3=[[0, 100, 360, 1000]]

In [217]:
# make predictions
for data in [fake1, fake2, fake3]:
    y = unpickled_model.predict(data)
    func = lambda y: 'approved' if y[0]==1 else 'denied'
    formatted_y = func(y)
    prob=unpickled_model.predict_proba(data).max()*100
    formatted_prob = "{:,.2f}%".format(prob)
    print(y[0]==1)
    print(formatted_y)
    print(formatted_prob)

False
denied
54.20%
True
approved
68.54%
False
denied
86.36%


In [229]:
# change the threshold
Threshold=50
for data in [fake1, fake2, fake3]:
        rawprob=100*unpickled_model.predict_proba(data)[0][0]
        func = lambda rawprob: 'Denied' if rawprob>Threshold else 'Approved'
        formatted_y = func(y)
        print(rawprob)
        print(formatted_y)

54.20499417094955
Approved
31.459546412729456
Approved
86.36390513992158
Approved


In [230]:
# probability of 'denied'
print(unpickled_model.predict_proba(data)[0][0])
# probability of 'denied'
unpickled_model.predict_proba(data)[0][1]

0.8636390513992158


0.13636094860078415