# Goal: predict whether a loan will end up with maximum profits or not

---
#### Target variable: `outcome` 
* Type: **Categorical** 
* Model type: Classification 
* Sourced from: `zeroBalCode`
* Data: 
    - "0" means "Closed" (i.e. a successful outcome for Fannie Mae)
    - "1" means "Default" (i.e. a negative outcome)

---
#### This Notebook:
* Input required: The output files from "Scott - Data Pre - 2 - 50 50 split train test" notebook
* Outputs generated: Decision on what model to use

#### Expected Workflow
1. Scott - Data Pre - 1 - Feature EEE
2. Scott - Data Pre - 2 - 50 50 split train test
3. Scott - Model - 1- PyCaret Setup and Create Model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pycaret.classification import *
#!pip install pycaret

from sklearn.feature_selection import VarianceThreshold

# Importing the data

In [2]:
# Load the output file of the "Scott - Data Pre - 2 - 50 50 split train test" notebook.
df = pd.read_csv("data/20200524/DataPre-2-5050-split.csv")

# Remove the stray 'Unnamed: 0' column (presumably an index that was written
# out with the CSV - confirm against the upstream notebook).
# The `columns=` keyword is used because passing the axis positionally
# (`drop(labels, 1)`) was deprecated in pandas 1.3 and removed in pandas 2.0.
# Reassigning instead of `inplace=True` keeps the cell safe to re-run.
df = df.drop(columns=['Unnamed: 0'])

print(df.shape)

# Show a random handful of rows as a sanity check.
df.sample(5)

(138412, 19)


Unnamed: 0,origChannel,origIntRate,origUPB,origLTV,numBorrowers,origDebtIncRatio,loanPurp,zipCode,pMIperct,worstCreditScore,bankNumber,stateNumber,mSA,fmacRateMin,fredRate,rateDiffAbovePct,origYear,origMonth,outcome
130979,3,4.625,183000,59,2,40,2,834,0.0,718,4,14,0,4.87,3.21,-0.080517,2009,10,1
106286,3,4.875,150000,56,2,35,1,734,0.0,683,45,37,0,4.36,2.94,0.085746,2010,8,1
118964,1,5.125,272000,80,1,38,2,897,0.0,717,45,34,0,4.85,3.02,-0.004854,2009,3,1
96837,3,5.5,160000,80,1,32,1,834,0.0,736,15,14,26820,4.85,3.02,0.067961,2009,3,1
38125,1,3.75,79000,65,1,34,1,446,0.0,703,54,36,0,3.88,2.03,-0.080882,2012,3,0


# Changing this to limited inputs

In [7]:
# Narrow the frame to a limited set of model inputs plus the target ('outcome').
# Author notes on the rate / location columns considered:
#    - fredRate: Treasury rate (not selected below)
#    - fmacRate: Freddie Mac releases weekly data. Kip took the lowest avg of
#      the four weeks for this loan, which allows binning all loans for a
#      given month
#    - rateDiffAbovePct: Kip reported that Tableau showed this as highly predictive
#    - mSA: a value of 0 references a US territory (Guam, etc.); maybe lots of
#      defaults there
dfTest = df.loc[:, [
    'origIntRate',
    'origUPB',
    'origLTV',
    'origDebtIncRatio',
    'worstCreditScore',
    'bankNumber',
    'stateNumber',
    'rateDiffAbovePct',
    'mSA',
    'outcome',
]].copy()

# Random sample of the reduced frame for a quick visual check.
dfTest.sample(5)

Unnamed: 0,origIntRate,origUPB,origLTV,origDebtIncRatio,worstCreditScore,bankNumber,stateNumber,rateDiffAbovePct,mSA,outcome
50671,4.5,272000,75,46,735,54,33,-0.142857,42140,0
84771,5.375,52000,80,28,668,0,25,0.068588,44180,1
22492,4.5,408000,80,32,748,54,22,0.053864,38860,0
3407,5.125,220000,76,53,652,80,9,-0.004854,27260,1
60296,4.875,174000,47,47,699,4,50,0.001027,42660,0


In [8]:
# Two sets: one with all mSAs and another with dropping the "0"
dfStatesOnly = dfTest[dfTest["mSA"] > 0].copy()

rows, cols = dfStatesOnly.shape
print(f'dfStatesOnly: {rows} rows')

print(dfStatesOnly.columns.to_list())

dfStatesOnly: 114094 rows
['origIntRate', 'origUPB', 'origLTV', 'origDebtIncRatio', 'worstCreditScore', 'bankNumber', 'stateNumber', 'rateDiffAbovePct', 'mSA', 'outcome']


# Pycaret - Setup with categorical definition

#### Normalization
https://pycaret.org/normalization/

> `normalize: bool, default = False` - When set to True, the feature space is transformed using the normalized_method param. **Generally, linear algorithms perform better with normalized data** however, the results may vary and it is advised to run multiple experiments to evaluate the benefit of normalization.

In [None]:
# Fixed seed handed to setup() so the train/test split (and therefore every
# downstream score) is the same on each run of the notebook.
SESSION_ID = 42

# Initialise the PyCaret classification experiment on the mSA > 0 data set.
# NOTE(review): the recorded output of this cell shows setup() stopping at an
# interactive "sample % of data" prompt; PyCaret 1.x documents a
# `sampling=False` argument to skip it - confirm against the installed
# version before relying on Restart & Run All.
model_setup = setup(
    dfStatesOnly,
    target='outcome',  # PyCaret will list this as "Label"
    session_id=SESSION_ID,  # reuse the same value to reproduce the same test/train split
    pca=False,
    # Variance is calculated using the ratio of unique values to the number of
    # samples, and the ratio of the most common value to the frequency of the
    # second most common value.
    ignore_low_variance=True,
    normalize=True,
    ignore_features=None,
    # Outliers from the training data are removed using PCA linear
    # dimensionality reduction using the Singular Value Decomposition technique.
    remove_outliers=True,
    silent=True,
    profile=False,
    # Integer-coded identifiers that must be treated as categories, not magnitudes.
    categorical_features=[
        'bankNumber',
        'stateNumber',
        'mSA',
    ],
    numeric_features=[
        'origIntRate',
        'origUPB',
        'origLTV',
        # 'pMIperct',  (disabled by the author)
        'origDebtIncRatio',
        'worstCreditScore',
        'rateDiffAbovePct',
    ],
)

# session_id - to reprint the results later, pass the same session_id to setup()
#      and it will run the setup using the same split of test/train

IntProgress(value=0, description='Processing: ', max=13)

Please Enter the sample % of data you would like to use for modeling. Example: Enter 0.3 for 30%.
Press Enter if you would like to use 100% of the data.
 


# Decide which model to use

In [15]:
# Can influence performance by reducing # of folds (10 is def.) or adding blacklists/exclusions
# Regression has about 21 models
# Classification has about 15 models
# Logistic Regression - ‘lr’
# K Nearest Neighbour - ‘knn’
# Naives Bayes - ‘nb’
# Decision Tree - ‘dt’
# SVM (Linear) - ‘svm’
# Gaussian Process - ‘gpc’
# Ridge Classifier - ‘ridge’
# Random Forest - ‘rf’
# Quadratic Disc. Analysis - ‘qda’
# AdaBoost - ‘ada’
# Linear Disc. Analysis - ‘lda’
# Extra Trees Classifier - ‘et’
# Extreme Gradient Boosting - ‘xgboost’
# Light Gradient Boosting - ‘lightgbm’
# Cat Boost Classifier - ‘catboost’
####### Off by default (enable w "turbo = False")
# Multi Level Perceptron - ‘mlp’
# Gradient Boosting Classifier - ‘gbc’
# SVM (RBF) - ‘rbfsvm’
# 
# compare_models(blacklist=['catboost', 'xgboost', 'lightgbm'])
# 
# Slow: 
#    - Gaussian Process Classifier
#    - 
%%time

model_results=compare_models(
    fold=2
    , blacklist = None
    ,  round = 4 # number of decimal places to round to. 4 is default
    ,  sort = 'Recall'
    , turbo = True # True by default. Auto blacklists models that have longer training times. When True, rbfsvm, gpc and mlp are excluded due to longer training times. If you set to False, could take a long time
)
model_results

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Gaussian Process Classifier,0.7437,0.8117,0.769,0.7098,0.7382,0.4879
1,CatBoost Classifier,0.7409,0.816,0.7602,0.7093,0.7338,0.4819
2,Light Gradient Boosting Machine,0.7337,0.8107,0.7494,0.7035,0.7256,0.4675
3,Extreme Gradient Boosting,0.7194,0.7922,0.739,0.6875,0.7123,0.4392
4,SVM - Radial Kernel,0.6998,0.7754,0.7386,0.6619,0.698,0.4013
5,Extra Trees Classifier,0.7555,0.8458,0.7285,0.7455,0.7369,0.5086
6,Gradient Boosting Classifier,0.7174,0.7924,0.7275,0.6887,0.7075,0.4345
7,MLP Classifier,0.7208,0.7769,0.7226,0.6956,0.7087,0.4408
8,Decision Tree Classifier,0.6991,0.7,0.7156,0.6679,0.6909,0.3984
9,Ada Boost Classifier,0.7088,0.7775,0.7132,0.6819,0.6972,0.4171


# Results
Full test set: Extra Trees Classifier

In [16]:
%%time

model_results=compare_models(
    fold=10
    , blacklist = [
            'nb'
            , 'knn'
            , 'svm'
            , 'qda'
            , 'rf'
            , 'lda'
            , 'ridge'
        ] 
    ,  round = 4 # number of decimal places to round to. 4 is default
    ,  sort = 'Recall'
    , turbo = False # True by default. Auto blacklists models that have longer training times. When True, rbfsvm, gpc and mlp are excluded due to longer training times. If you set to False, could take a long time
)
model_results

Wall time: 2h 40min 26s


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Gaussian Process Classifier,0.7664,0.8409,0.7971,0.7304,0.7622,0.5334
1,Decision Tree Classifier,0.7583,0.7604,0.7969,0.7193,0.756,0.5177
2,MLP Classifier,0.7674,0.8232,0.7965,0.7325,0.7631,0.5355
3,Extra Trees Classifier,0.8061,0.899,0.7916,0.7951,0.7932,0.6107
4,CatBoost Classifier,0.7497,0.829,0.7672,0.7191,0.7423,0.4995
5,Light Gradient Boosting Machine,0.7397,0.8194,0.7594,0.7081,0.7327,0.4797
6,SVM - Radial Kernel,0.7078,0.7815,0.747,0.6695,0.706,0.4172
7,Extreme Gradient Boosting,0.7201,0.7948,0.741,0.6876,0.7132,0.4406
8,Gradient Boosting Classifier,0.7227,0.797,0.7392,0.6919,0.7146,0.4455
9,Logistic Regression,0.7174,0.7881,0.7176,0.6923,0.7046,0.4339


In [9]:
# Separate the predictors (every column except 'outcome') from the target.
X_train = dfStatesOnly.drop(columns='outcome')
y_train = dfStatesOnly.loc[:, 'outcome']

# Build an Extra Trees ('et') model wrapped in a boosting ensemble.
# NOTE(review): X_train / y_train are not passed to create_model() here, and
# the recorded output of this cell is "NameError: name 'X_train' is not
# defined" - presumably raised inside PyCaret because setup() had not finished
# in that kernel session; confirm, and check whether the split above is needed.
et = create_model('et', ensemble=True, method='Boosting')

IntProgress(value=0, description='Processing: ', max=14)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa


NameError: name 'X_train' is not defined

# F1 score
The F1 score is the harmonic mean of precision and recall: $F_1 = 2 \cdot \dfrac{\text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}$

In [None]:
%%time

et = create_model(
    'et'
    , ensemble = True
    , method = 'Boosting'
)