In [72]:
import numpy as np
import pandas as pd
import plotly.express as px

In [6]:
# Let wide frames render up to 500 columns before pandas truncates the display.
pd.options.display.max_columns = 500

In [191]:
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_curve, precision_score, recall_score, roc_curve, roc_auc_score, classification_report

---

# <center>**Training Machine Learning Models**</center>

---

In [3]:
# Load the processed loan/borrower table used for training and evaluation.
loan_borrower_csv = '../../data/processed/loan_borrower.csv'
loan_borrower = pd.read_csv(loan_borrower_csv)

---

In [53]:
# Count loans per status code, then attach a readable label and the share of all loans (in %).
status_labels = {0: 'Current', 1: 'Default'}
loan_status_count = (
    loan_borrower
    .groupby('loan_status')
    .size()
    .reset_index(name='count')
    .assign(
        loan_status=lambda counts: counts['loan_status'].map(status_labels),
        percent=lambda counts: (100 * counts['count'] / counts['count'].sum()).round(4),
    )
)

In [56]:
# Horizontal bar chart: one bar per loan status, bar length = percentage of all loans.
px.bar(
    loan_status_count,
    y='loan_status',
    x='percent',
    color='loan_status',
    title='Percentage of Loans by Status',
    text_auto=True,
)

**The classes are highly imbalanced: the positive class (Default) is far rarer than the negative class (Current). We therefore need to upsample the positive class when building the training set.**

---

### <center>**Split the Dataset into Features and Response**</center>

In [128]:
# Response: the loan_status column, kept as a one-column DataFrame.
# Features: every other column of loan_borrower.
target_column = 'loan_status'
status = loan_borrower[[target_column]]
features = loan_borrower.drop(columns=[target_column])

### <center>**Split the Features into Training and Testing Sets**</center>

In [153]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified on loan_status — confirm that is intended
# given the class imbalance.
split_parts = train_test_split(features, status, test_size=0.3, random_state=42)
features_train, features_test, status_train, status_test = split_parts

In [154]:
# Row counts of each split part, in the order: train X, test X, train y, test y.
print(*(len(part) for part in (features_train, features_test, status_train, status_test)))

69988 29995 69988 29995


### <center>**Upsample the Positive Class**</center>

In [155]:
# Upsample the positive class (loan_status == 1) of the training split only:
# draw rows with replacement until there are as many positives as there are
# negatives (loan_status == 0). Features and labels are resampled together so
# rows stay aligned; the fixed seed keeps the draw reproducible.
default_mask = status_train.loan_status == 1
features_train_u, status_train_u = resample(
    features_train[default_mask],
    status_train[default_mask],
    replace=True,
    n_samples=int((status_train.loan_status == 0).sum()),
    random_state=42,
)

In [156]:
# Build the balanced training set: all negative rows followed by the upsampled
# positive rows.
# Features are combined with pd.concat so they remain a DataFrame with column
# names; the model is later asked to predict on the features_test DataFrame, and
# fitting on a bare ndarray would drop the feature names.
# Labels are kept as an (n, 1) ndarray because the fit cell calls
# status_train.ravel().
# NOTE: this cell rebinds features_train / status_train, so it must be run exactly
# once after the split cell (status_train has no .loan_status after the first run).
majority_mask = status_train.loan_status == 0
features_train = pd.concat([features_train[majority_mask], features_train_u], ignore_index=True)
status_train = np.concatenate((status_train[majority_mask], status_train_u))

In [157]:
# Number of training rows after upsampling.
print(len(features_train))

125894


### <center>**Train Models**</center>

- We care more about catching a loan default when it happens than about avoiding false alarms, so we aim for the highest possible recall on the Default class

In [195]:
# Logistic regression with class-balanced sample weights and a raised iteration cap.
lr = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
)

In [196]:
# Flatten the label array to 1-D before fitting on the training set.
train_labels = status_train.ravel()
lr = lr.fit(features_train, train_labels)

In [205]:
# Hard class predictions for the held-out test features.
predicted_status = lr.predict(X=features_test)

### <center>**Model Evaluation**</center>

**Confusion Matrix**

In [208]:
# Confusion matrix as a labelled table. Per scikit-learn's convention, rows are
# the true classes and columns are the predicted classes.
class_names = ['Current', 'Default']
cm_counts = confusion_matrix(status_test, predicted_status)
pd.DataFrame(cm_counts, index=class_names, columns=class_names)

Unnamed: 0,Current,Default
Current,21447,5586
Default,682,2280


In [207]:
# Per-class precision / recall / F1 on the test set.
report_text = classification_report(y_true=status_test, y_pred=predicted_status)
print(report_text)

              precision    recall  f1-score   support

           0       0.97      0.79      0.87     27033
           1       0.29      0.77      0.42      2962

    accuracy                           0.79     29995
   macro avg       0.63      0.78      0.65     29995
weighted avg       0.90      0.79      0.83     29995



In [203]:
# ROC curve points from the predicted probability of the positive class (column 1).
default_proba = lr.predict_proba(features_test)[:, 1]
fpr, tpr, thresholds = roc_curve(status_test, default_proba)

**ROC Curve**

In [206]:
# Area under the ROC curve, computed from the predicted probability of the
# positive class (column 1) — the same scores the ROC curve is drawn from.
# Passing the hard 0/1 predictions instead collapses the curve to a single
# operating point, so the reported value would not match the plotted curve.
lr_auc = roc_auc_score(status_test, lr.predict_proba(features_test)[:, 1])

In [204]:
# Plot the ROC curve; the title carries the AUC rounded to four decimals.
roc_points = {'fpr': fpr, 'tpr': tpr, 'thresholds': thresholds}
roc_axis_labels = {'fpr': 'False Positive Rate', 'tpr': 'True Positive Rate'}
px.line(
    roc_points,
    x='fpr',
    y='tpr',
    labels=roc_axis_labels,
    title=f'AUC for Logistic Prediction: {np.round(lr_auc, 4)}',
)

---

# <center>**Production**</center>

---

In [209]:
# Read the raw, tab-separated production loan and borrower files.
data_loan_prod, data_borrower_prod = (
    pd.read_csv(raw_path, sep='\t')
    for raw_path in ('../../data/raw/Loan_Prod.txt', '../../data/raw/Borrower_Prod.txt')
)

In [212]:
# Display the production loan table (pandas truncates the middle rows).
# NOTE(review): the rendered preview shows an extra 'Unnamed: 11' column, an empty
# loanStatus column, and loanId/memberId parsed as floats — presumably caused by
# trailing delimiters or blank values in the raw file; confirm and clean before scoring.
data_loan_prod

Unnamed: 0,loanId,memberId,date,purpose,isJointApplication,loanAmount,term,interestRate,monthlyPayment,grade,loanStatus,Unnamed: 11
0,10000000.0,16334480.0,7/2/2016,debtconsolidation,0.0,23765,60 months,9.38,498.0,D1,,
1,10000001.0,16334481.0,7/3/2016,debtconsolidation,0.0,24302,60 months,6.84,479.0,C3,,
2,10000002.0,16334482.0,7/4/2016,debtconsolidation,0.0,18395,60 months,15.67,444.0,E1,,
3,10000003.0,16334483.0,7/5/2016,debtconsolidation,0.0,19621,48 months,7.48,474.0,A1,,
4,10000004.0,16334484.0,7/6/2016,debtconsolidation,0.0,20577,48 months,9.26,515.0,E3,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2996,10002996.0,16337476.0,9/8/2016,debtconsolidation,0.0,30527,36 months,23.70,439.0,B3,,
2997,10002997.0,16337477.0,9/9/2016,debtconsolidation,0.0,35253,48 months,22.30,770.0,B2,,
2998,10002998.0,16337478.0,9/10/2016,debtconsolidation,0.0,18409,60 months,10.77,353.0,A2,,
2999,10002999.0,16337479.0,9/11/2016,debtconsolidation,0.0,44020,48 months,27.23,697.0,B2,,
