# Lending Tree Loan Status - GridSearchCV Best Models

# Set Up Environment for Spark

In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files there
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work from the Drive folder that the Spark archive is downloaded and unpacked into
%cd /content/drive/MyDrive/Spark/

/content/drive/MyDrive/Spark


In [None]:
# Set up environment for Spark
# Install the headless Java 8 JDK quietly (-qq); apt's stdout is discarded
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [None]:
# Download the Spark 3.3.0 (Hadoop 3 build) archive into the current directory
!wget -q https://archive.apache.org/dist/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz

In [None]:
# Unpack the downloaded Spark archive in place
!tar xf spark-3.3.0-bin-hadoop3.tgz

In [None]:
# Point the process environment at the Java 8 install and the unpacked Spark folder.
import os

os.environ.update({
    'JAVA_HOME': '/usr/lib/jvm/java-8-openjdk-amd64',
    'SPARK_HOME': '/content/drive/MyDrive/Spark/spark-3.3.0-bin-hadoop3',
})

In [None]:
# Install findspark using pip, and pin PySpark to the 3.3 line so it matches
# the Spark 3.3.0 distribution downloaded for SPARK_HOME
!pip install -q findspark
!pip install -U pyspark==3.3
import findspark
# NOTE(review): presumably locates Spark through the SPARK_HOME variable set earlier — confirm
findspark.init()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark==3.3
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 59 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 64.6 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=88c92d7a991b03264e76d480c170456e4388f806c01df5e99b5d246fd393dac6
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [None]:
# Pyspark Session for Colab
from pyspark.sql import SparkSession

# Session settings, applied to the builder in the order listed
spark_settings = {
    'spark.driver.memory': '24g',
    'spark.executor.pyspark.memory': '18g',
    'spark.executor.cores': '4',
    'spark.python.worker.memory': '18g',
    'spark.sql.execution.arrow.pyspark.enabled': 'True',
    'spark.sql.debug.maxToStringFields': '1000',
    'spark.sql.autoBroadcastJoinThreshold': '-1',
    'spark.ui.port': '4050',
}

session_builder = SparkSession.builder.master('local').appName('Colab')
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

# Show the session summary as the cell output
spark

In [None]:
# Remove warnings: restrict Spark's log output to the ERROR level
spark.sparkContext.setLogLevel('ERROR')

# Install & Import Packages and Set Seed

In [None]:
!pip install --upgrade mlflow 
!pip install hyperopt
import random
import numpy as np
import warnings
from pyspark.sql.functions import col, round
from pyspark.sql.types import IntegerType, FloatType
from pyspark.ml.feature import VectorAssembler, MinMaxScaler, StandardScaler
from pyspark.ml import PipelineModel
from pyspark.ml.classification import LogisticRegression, LinearSVC
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics
import time
from datetime import datetime, timedelta
from timeit import default_timer as timer
# Enable MLflow autologging for pyspark.ml when the installed client supports it.
# mlflow is imported on its own first: if only the `import mlflow.pyspark.ml`
# statement were used and it failed, the name `mlflow` would never be bound and
# the version message below would raise NameError instead of being printed.
import mlflow
try:
  import mlflow.pyspark.ml
  mlflow.pyspark.ml.autolog()
except Exception:
  # Best effort: keep the notebook running without autologging, but do not
  # swallow KeyboardInterrupt/SystemExit as a bare `except:` would.
  print(f'Your version of MLflow ({mlflow.__version__}) does not support pyspark.ml for autologging. To use autologging, upgrade your MLflow client version or use Databricks Runtime for ML 8.3 or above.')
# Silence Python warnings for the rest of the notebook
warnings.filterwarnings('ignore')  

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mlflow
  Downloading mlflow-1.29.0-py3-none-any.whl (16.9 MB)
[K     |████████████████████████████████| 16.9 MB 25.9 MB/s 
[?25hCollecting alembic<2
  Downloading alembic-1.8.1-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 92.4 MB/s 
Collecting importlib-metadata!=4.7.0,<5,>=3.7.0
  Downloading importlib_metadata-4.13.0-py3-none-any.whl (23 kB)
Collecting docker<7,>=4.0.0
  Downloading docker-6.0.0-py3-none-any.whl (147 kB)
[K     |████████████████████████████████| 147 kB 103.4 MB/s 
Collecting databricks-cli<1,>=0.8.7
  Downloading databricks-cli-0.17.3.tar.gz (77 kB)
[K     |████████████████████████████████| 77 kB 7.0 MB/s 
Collecting gunicorn<21
  Downloading gunicorn-20.1.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 8.8 MB/s 
[?25hCollecting prometheus-flask-exporter<1
  Downloading prometheus_flask_exporter-

In [None]:
# Fix the seed for Python's and NumPy's random number generators
seed_value = 42
for seeder in (random.seed, np.random.seed):
    seeder(seed_value)
# The seed is also exported through the 'SparkML_HPO' environment variable
os.environ['SparkML_HPO'] = str(seed_value)

# Upsampling - Oversample Minority Class 

## Read Data and View Schema

In [None]:
# Upsampled training split: read with a header row and inferred column types,
# then mark the DataFrame for caching
train_us_csv = '/content/drive/MyDrive/LoanStatus/Data/trainDF_US.csv'
trainDF_US = spark.read.csv(train_us_csv, header=True, inferSchema=True).cache()
print('\nTrain Schema')
trainDF_US.printSchema()

# Matching test split, loaded the same way
test_us_csv = '/content/drive/MyDrive/LoanStatus/Data/testDF_US.csv'
testDF_US = spark.read.csv(test_us_csv, header=True, inferSchema=True).cache()
print('\nTest Schema')
testDF_US.printSchema()


Train Schema
root
 |-- loan_amnt: integer (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- installment: double (nullable = true)
 |-- annual_inc: double (nullable = true)
 |-- inq_last_6mths: double (nullable = true)
 |-- pub_rec: double (nullable = true)
 |-- revol_bal: integer (nullable = true)
 |-- out_prncp: double (nullable = true)
 |-- total_pymnt: double (nullable = true)
 |-- total_rec_int: double (nullable = true)
 |-- total_rec_late_fee: double (nullable = true)
 |-- recoveries: double (nullable = true)
 |-- last_pymnt_amnt: double (nullable = true)
 |-- collections_12_mths_ex_med: double (nullable = true)
 |-- acc_open_past_24mths: double (nullable = true)
 |-- bc_open_to_buy: double (nullable = true)
 |-- chargeoff_within_12_mths: double (nullable = true)
 |-- delinq_amnt: double (nullable = true)
 |-- mths_since_recent_bc: double (nullable = true)
 |-- num_bc_sats: double (nullable = true)
 |-- num_bc_tl: double (nullable = true)
 |-- num_sats: double (nulla

## Set up Vector Assembler, Scalers and Evaluators

In [None]:
# Define features and label for train data.
# Features are every column except the target, selected by name rather than by
# position, so 'loan_status' cannot leak into the feature list if it is not the
# last column of the CSV.
features = [name for name in trainDF_US.columns if name != 'loan_status']
# Put the target first, renamed to the 'label' column used by the evaluators
trainDF_US = trainDF_US.select(col('loan_status').alias('label'), *features)

In [None]:
# VectorAssembler: combine the feature columns into the single vector column
# 'unscaledFeatures'; handleInvalid='skip' filters out rows with invalid values
vecAssembler = VectorAssembler(
    inputCols=features,
    outputCol='unscaledFeatures',
    handleInvalid='skip',
)

# Transform train data
trainDF_US = vecAssembler.transform(trainDF_US)

In [None]:
# Define features and label for test data.
# Features are every column except the target, selected by name rather than by
# position, so 'loan_status' cannot leak into the feature list if it is not the
# last column of the CSV.
features = [name for name in testDF_US.columns if name != 'loan_status']
# Put the target first, renamed to the 'label' column used by the evaluators
testDF_US = testDF_US.select(col('loan_status').alias('label'), *features)

# Transform test data
testDF_US = vecAssembler.transform(testDF_US)

In [None]:
# MinMaxScaler: rescale 'unscaledFeatures' into the [0, 1] range,
# writing the result to 'scaledFeatures'
mmScaler = MinMaxScaler(
    min=0,
    max=1,
    inputCol='unscaledFeatures',
    outputCol='scaledFeatures',
)

In [None]:
# Standard scaler: scale by standard deviation without centering on the mean;
# it writes to the same 'scaledFeatures' column name as the MinMaxScaler
stdScaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol='unscaledFeatures',
    outputCol='scaledFeatures',
)

In [None]:
# Define model evaluation - AUROC (area under the ROC curve against 'label')
evaluator_auroc = BinaryClassificationEvaluator(
    metricName='areaUnderROC',
    labelCol='label',
)

# Define model evaluation - Accuracy (computed against 'label')
evaluator_acc = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='label',
)

## Load Saved Models - Upsampling

In [None]:
# Folder holding the saved GridSearchCV pipeline models
grid_models_dir_us = '/content/drive/MyDrive/LoanStatus/Python/Models/ML/SparkML/Models/GridSearchCV/'

# Load the tuned pipelines that were fit on the upsampled training set
pipelineModel_lr_hpo_US = PipelineModel.load(grid_models_dir_us + 'pipelineModel_lr_us_hpo_grid/')
pipelineModel_lsvc_hpo_US = PipelineModel.load(grid_models_dir_us + 'pipelineModel_lsvc_us_hpo_grid/')
pipelineModel_dt_hpo_US = PipelineModel.load(grid_models_dir_us + 'pipelineModel_dt_us_hpo_grid/')
pipelineModel_rf_hpo_US = PipelineModel.load(grid_models_dir_us + 'pipelineModel_rf_us_hpo_grid/')
pipelineModel_gbt_hpo_US = PipelineModel.load(grid_models_dir_us + 'pipelineModel_gbt_us_hpo_grid/')

## Predict and Model Metrics using testDF of Upsampling Set

In [None]:
# Score the upsampling test set with each upsampling-trained pipeline
prediction_lr = pipelineModel_lr_hpo_US.transform(testDF_US)
prediction_lsvc = pipelineModel_lsvc_hpo_US.transform(testDF_US)
prediction_dt = pipelineModel_dt_hpo_US.transform(testDF_US)
prediction_rf = pipelineModel_rf_hpo_US.transform(testDF_US)
prediction_gbt = pipelineModel_gbt_hpo_US.transform(testDF_US)

# Report label paired with its predictions, in printing order
scored_models = [
    ('Logistic Regression', prediction_lr),
    ('LinearSVC', prediction_lsvc),
    ('Decision Trees', prediction_dt),
    ('Random Forest', prediction_rf),
    ('Gradient Boosted Trees', prediction_gbt),
]

print('GridSearchCV Best Models Metrics: Upsampling')
# One section per metric: blank spacer, heading, then one line per model
for section_title, section_evaluator in [('Area Under ROC Curve:', evaluator_auroc),
                                         ('Accuracy:', evaluator_acc)]:
    print('\n')
    print(section_title)
    for model_label, scored in scored_models:
        print(f'{model_label}:', section_evaluator.evaluate(scored))

GridSearchCV Best Models Metrics: Upsampling


Area Under ROC Curve:
Logistic Regression: 0.9716420492988681
LinearSVC: 0.9802039596564845
Decision Trees: 0.9602220402746184
Random Forest: 0.9811805363162989
Gradient Boosted Trees: 0.9849316561714048


Accuracy:
Logistic Regression: 0.9740423101557785
LinearSVC: 0.9816335355039505
Decision Trees: 0.9810508401680568
Random Forest: 0.9791455189110072
Gradient Boosted Trees: 0.9823503432584231


In [None]:
print('GridSearchCV Best Models Metrics: Upsampling')
# Pair each report name with its predictions explicitly rather than resolving
# variable names through globals().
for model, df in [('prediction_lr', prediction_lr),
                  ('prediction_lsvc', prediction_lsvc),
                  ('prediction_dt', prediction_dt),
                  ('prediction_rf', prediction_rf),
                  ('prediction_gbt', prediction_gbt)]:
    # Confusion-matrix cells from one grouped aggregation instead of a separate
    # count() action per cell. Both columns are cast to double so the integer
    # lookups below match (1 == 1.0 and both hash alike in Python).
    grouped = df.groupBy(col('label').cast('double').alias('label'),
                         col('prediction').cast('double').alias('prediction'))
    cells = {(row['label'], row['prediction']): row['count']
             for row in grouped.count().collect()}

    tp = cells.get((1, 1), 0)
    tn = cells.get((0, 0), 0)
    fp = cells.get((0, 1), 0)
    fn = cells.get((1, 0), 0)
    # Every row belongs to exactly one group, so this equals df.count()
    total = sum(cells.values())

    # Each ratio is guarded on its own denominator; previously a frame with no
    # positive labels and no positive predictions raised ZeroDivisionError.
    a = (tp + tn) / total if total else 0.0
    r = tp / (tp + fn) if (tp + fn) else 0.0
    p = tp / (tp + fp) if (tp + fp) else 0.0
    f1 = 2 * ((p * r) / (p + r)) if (p + r) else 0.0

    print('\nModel:', model)
    print('True Positives:', tp)
    print('True Negatives:', tn)
    print('False Positives:', fp)
    print('False Negatives:', fn)
    print('Total:', total)
    print('Accuracy:', a)
    print('Recall:', r)
    print('Precision: ', p)
    print('F1 score:', f1)
    print('\n')
print('\n')

GridSearchCV Best Models Metrics: Upsampling

Model: prediction_lr
True Positives: 47818
True Negatives: 373429
False Positives: 4419
False Negatives: 6807
Total: 432473
Accuracy: 0.9740423101557785
Recall: 0.8753867276887872
Precision:  0.9154047897084442
F1 score: 0.8949486253298647



Model: prediction_lsvc
True Positives: 48966
True Negatives: 375564
False Positives: 2284
False Negatives: 5659
Total: 432473
Accuracy: 0.9816335355039505
Recall: 0.8964027459954234
Precision:  0.9554341463414634
F1 score: 0.9249775678866589



Model: prediction_dt
True Positives: 50419
True Negatives: 373859
False Positives: 3989
False Negatives: 4206
Total: 432473
Accuracy: 0.9810508401680568
Recall: 0.9230022883295195
Precision:  0.9266835759447141
F1 score: 0.924839268845212



Model: prediction_rf
True Positives: 49771
True Negatives: 373683
False Positives: 4165
False Negatives: 4854
Total: 432473
Accuracy: 0.9791455189110072
Recall: 0.9111395881006865
Precision:  0.9227788490062296
F1 score: 0.9

## Load Saved Models - SMOTE

In [None]:
# Folder holding the saved GridSearchCV pipeline models
grid_models_dir_smote = '/content/drive/MyDrive/LoanStatus/Python/Models/ML/SparkML/Models/GridSearchCV/'

# Load the tuned pipelines that were fit on the SMOTE training set
pipelineModel_lr_hpo_SMOTE = PipelineModel.load(grid_models_dir_smote + 'pipelineModel_lr_smote_hpo_grid/')
pipelineModel_lsvc_hpo_SMOTE = PipelineModel.load(grid_models_dir_smote + 'pipelineModel_lsvc_smote_hpo_grid/')
pipelineModel_rf_hpo_SMOTE = PipelineModel.load(grid_models_dir_smote + 'pipelineModel_rf_smote_hpo_grid/')
pipelineModel_gbt_hpo_SMOTE = PipelineModel.load(grid_models_dir_smote + 'pipelineModel_gbt_smote_hpo_grid/')
pipelineModel_dt_hpo_SMOTE = PipelineModel.load(grid_models_dir_smote + 'pipelineModel_dt_smote_hpo_grid/')

## Predict and Compute Metrics for SMOTE Models using testDF of Upsampling Set

In [None]:
# Score the upsampling test set with each SMOTE-trained pipeline
prediction_lr = pipelineModel_lr_hpo_SMOTE.transform(testDF_US)
prediction_lsvc = pipelineModel_lsvc_hpo_SMOTE.transform(testDF_US)
prediction_dt = pipelineModel_dt_hpo_SMOTE.transform(testDF_US)
prediction_rf = pipelineModel_rf_hpo_SMOTE.transform(testDF_US)
prediction_gbt = pipelineModel_gbt_hpo_SMOTE.transform(testDF_US)

# Report label paired with its predictions, in printing order
scored_models = [
    ('Logistic Regression', prediction_lr),
    ('LinearSVC', prediction_lsvc),
    ('Decision Trees', prediction_dt),
    ('Random Forest', prediction_rf),
    ('Gradient Boosted Trees', prediction_gbt),
]

print('GridSearchCV Best Models Metrics: SMOTE Models using Upsampling Data')
# One section per metric: blank spacer, heading, then one line per model
for section_title, section_evaluator in [('Area Under ROC Curve:', evaluator_auroc),
                                         ('Accuracy:', evaluator_acc)]:
    print('\n')
    print(section_title)
    for model_label, scored in scored_models:
        print(f'{model_label}:', section_evaluator.evaluate(scored))

GridSearchCV Best Models Metrics: SMOTE Models using Upsampling Data


Area Under ROC Curve:
Logistic Regression: 0.9705261720148816
LinearSVC: 0.979112214992606
Decision Trees: 0.9493941365983161
Random Forest: 0.9780902581532774
Gradient Boosted Trees: 0.9864679978587151


Accuracy:
Logistic Regression: 0.9745810721131724
LinearSVC: 0.9819711288334763
Decision Trees: 0.9851320198023923
Random Forest: 0.975866701505065
Gradient Boosted Trees: 0.9858603889722596


In [None]:
print('GridSearchCV Best Models Metrics: SMOTE Models using Upsampling Data')
# Pair each report name with its predictions explicitly rather than resolving
# variable names through globals().
for model, df in [('prediction_lr', prediction_lr),
                  ('prediction_lsvc', prediction_lsvc),
                  ('prediction_dt', prediction_dt),
                  ('prediction_rf', prediction_rf),
                  ('prediction_gbt', prediction_gbt)]:
    # Confusion-matrix cells from one grouped aggregation instead of a separate
    # count() action per cell. Both columns are cast to double so the integer
    # lookups below match (1 == 1.0 and both hash alike in Python).
    grouped = df.groupBy(col('label').cast('double').alias('label'),
                         col('prediction').cast('double').alias('prediction'))
    cells = {(row['label'], row['prediction']): row['count']
             for row in grouped.count().collect()}

    tp = cells.get((1, 1), 0)
    tn = cells.get((0, 0), 0)
    fp = cells.get((0, 1), 0)
    fn = cells.get((1, 0), 0)
    # Every row belongs to exactly one group, so this equals df.count()
    total = sum(cells.values())

    # Each ratio is guarded on its own denominator; previously a frame with no
    # positive labels and no positive predictions raised ZeroDivisionError.
    a = (tp + tn) / total if total else 0.0
    r = tp / (tp + fn) if (tp + fn) else 0.0
    p = tp / (tp + fp) if (tp + fp) else 0.0
    f1 = 2 * ((p * r) / (p + r)) if (p + r) else 0.0

    print('\nModel:', model)
    print('True Positives:', tp)
    print('True Negatives:', tn)
    print('False Positives:', fp)
    print('False Negatives:', fn)
    print('Total:', total)
    print('Accuracy:', a)
    print('Recall:', r)
    print('Precision: ', p)
    print('F1 score:', f1)
    print('\n')

GridSearchCV Best Models Metrics: SMOTE Models using Upsampling Data

Model: prediction_lr
True Positives: 46901
True Negatives: 374579
False Positives: 3269
False Negatives: 7724
Total: 432473
Accuracy: 0.9745810721131724
Recall: 0.8585995423340961
Precision:  0.9348415387681882
F1 score: 0.89509995705902



Model: prediction_lsvc
True Positives: 48314
True Negatives: 376362
False Positives: 1486
False Negatives: 6311
Total: 432473
Accuracy: 0.9819711288334763
Recall: 0.884466819221968
Precision:  0.9701606425702811
F1 score: 0.9253339717500598



Model: prediction_dt
True Positives: 49542
True Negatives: 376501
False Positives: 1347
False Negatives: 5083
Total: 432473
Accuracy: 0.9851320198023923
Recall: 0.9069473684210526
Precision:  0.9735306254789837
F1 score: 0.9390602194969387



Model: prediction_rf
True Positives: 44961
True Negatives: 377075
False Positives: 773
False Negatives: 9664
Total: 432473
Accuracy: 0.975866701505065
Recall: 0.8230846681922197
Precision:  0.9830979140

# SMOTE - Split Over Upsampling 

## Read Data and View Schema

In [None]:
# SMOTE training split: read with a header row and inferred column types,
# then mark the DataFrame for caching
train_smote_csv = '/content/drive/MyDrive/LoanStatus/Data/trainDF_SMOTE.csv'
trainDF_SMOTE = spark.read.csv(train_smote_csv, header=True, inferSchema=True).cache()

# Columns cast to integer after loading
# (presumably they are read back as doubles from the SMOTE export — TODO confirm)
smote_integer_columns = [
    'loan_amnt',
    'revol_bal',
    'term_ 60 months',
    'grade_B',
    'grade_C',
    'grade_D',
    'home_ownership_MORTGAGE',
    'home_ownership_OWN',
    'home_ownership_RENT',
    'verification_status_Source Verified',
    'verification_status_Verified',
    'purpose_credit_card',
    'initial_list_status_w',
    'application_type_Joint App',
    'disbursement_method_DirectPay',
]
for column_name in smote_integer_columns:
    trainDF_SMOTE = trainDF_SMOTE.withColumn(
        column_name, trainDF_SMOTE[column_name].cast(IntegerType()))
print('\nTrain Schema')
trainDF_SMOTE.printSchema()

# Matching test split; no casts are applied to it
test_smote_csv = '/content/drive/MyDrive/LoanStatus/Data/testDF_SMOTE.csv'
testDF_SMOTE = spark.read.csv(test_smote_csv, header=True, inferSchema=True).cache()
print('\nTest Schema')
testDF_SMOTE.printSchema()


Train Schema
root
 |-- loan_amnt: integer (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- installment: double (nullable = true)
 |-- annual_inc: double (nullable = true)
 |-- inq_last_6mths: double (nullable = true)
 |-- pub_rec: double (nullable = true)
 |-- revol_bal: integer (nullable = true)
 |-- out_prncp: double (nullable = true)
 |-- total_pymnt: double (nullable = true)
 |-- total_rec_int: double (nullable = true)
 |-- total_rec_late_fee: double (nullable = true)
 |-- recoveries: double (nullable = true)
 |-- last_pymnt_amnt: double (nullable = true)
 |-- collections_12_mths_ex_med: double (nullable = true)
 |-- acc_open_past_24mths: double (nullable = true)
 |-- bc_open_to_buy: double (nullable = true)
 |-- chargeoff_within_12_mths: double (nullable = true)
 |-- delinq_amnt: double (nullable = true)
 |-- mths_since_recent_bc: double (nullable = true)
 |-- num_bc_sats: double (nullable = true)
 |-- num_bc_tl: double (nullable = true)
 |-- num_sats: double (nulla

## Set up Vector Assembler

In [None]:
# Define features and label for train data.
# Features are every column except the target, selected by name rather than by
# position, so 'loan_status' cannot leak into the feature list if it is not the
# last column of the CSV.
features = [name for name in trainDF_SMOTE.columns if name != 'loan_status']
# Put the target first, renamed to the 'label' column used by the evaluators
trainDF_SMOTE = trainDF_SMOTE.select(col('loan_status').alias('label'), *features)

# Transform train data
trainDF_SMOTE = vecAssembler.transform(trainDF_SMOTE)

In [None]:
# Define features and label for test data.
# Features are every column except the target, selected by name rather than by
# position, so 'loan_status' cannot leak into the feature list if it is not the
# last column of the CSV.
features = [name for name in testDF_SMOTE.columns if name != 'loan_status']
# Put the target first, renamed to the 'label' column used by the evaluators
testDF_SMOTE = testDF_SMOTE.select(col('loan_status').alias('label'), *features)

# Transform test data
testDF_SMOTE = vecAssembler.transform(testDF_SMOTE)

## Predict and Model Metrics using testDF of SMOTE Set

In [None]:
# Score the SMOTE test set with each SMOTE-trained pipeline
prediction_lr = pipelineModel_lr_hpo_SMOTE.transform(testDF_SMOTE)
prediction_lsvc = pipelineModel_lsvc_hpo_SMOTE.transform(testDF_SMOTE)
prediction_dt = pipelineModel_dt_hpo_SMOTE.transform(testDF_SMOTE)
prediction_rf = pipelineModel_rf_hpo_SMOTE.transform(testDF_SMOTE)
prediction_gbt = pipelineModel_gbt_hpo_SMOTE.transform(testDF_SMOTE)

# Report label paired with its predictions, in printing order
scored_models = [
    ('Logistic Regression', prediction_lr),
    ('LinearSVC', prediction_lsvc),
    ('Decision Trees', prediction_dt),
    ('Random Forest', prediction_rf),
    ('Gradient Boosted Trees', prediction_gbt),
]

print('GridSearchCV Best Models Metrics: SMOTE')
# One section per metric: blank spacer, heading, then one line per model
for section_title, section_evaluator in [('Area Under ROC Curve:', evaluator_auroc),
                                         ('Accuracy:', evaluator_acc)]:
    print('\n')
    print(section_title)
    for model_label, scored in scored_models:
        print(f'{model_label}:', section_evaluator.evaluate(scored))

GridSearchCV Best Models Metrics: SMOTE


Area Under ROC Curve:
Logistic Regression: 0.9705261720148816
LinearSVC: 0.979112214992606
Decision Trees: 0.9493941365983161
Random Forest: 0.9780902581532774
Gradient Boosted Trees: 0.9864679978587151


Accuracy:
Logistic Regression: 0.9745810721131724
LinearSVC: 0.9819711288334763
Decision Trees: 0.9851320198023923
Random Forest: 0.975866701505065
Gradient Boosted Trees: 0.9858603889722596


In [None]:
print('GridSearchCV Best Models Metrics: SMOTE')
# Pair each report name with its predictions explicitly rather than resolving
# variable names through globals().
for model, df in [('prediction_lr', prediction_lr),
                  ('prediction_lsvc', prediction_lsvc),
                  ('prediction_dt', prediction_dt),
                  ('prediction_rf', prediction_rf),
                  ('prediction_gbt', prediction_gbt)]:
    # Confusion-matrix cells from one grouped aggregation instead of a separate
    # count() action per cell. Both columns are cast to double so the integer
    # lookups below match (1 == 1.0 and both hash alike in Python).
    grouped = df.groupBy(col('label').cast('double').alias('label'),
                         col('prediction').cast('double').alias('prediction'))
    cells = {(row['label'], row['prediction']): row['count']
             for row in grouped.count().collect()}

    tp = cells.get((1, 1), 0)
    tn = cells.get((0, 0), 0)
    fp = cells.get((0, 1), 0)
    fn = cells.get((1, 0), 0)
    # Every row belongs to exactly one group, so this equals df.count()
    total = sum(cells.values())

    # Each ratio is guarded on its own denominator; previously a frame with no
    # positive labels and no positive predictions raised ZeroDivisionError.
    a = (tp + tn) / total if total else 0.0
    r = tp / (tp + fn) if (tp + fn) else 0.0
    p = tp / (tp + fp) if (tp + fp) else 0.0
    f1 = 2 * ((p * r) / (p + r)) if (p + r) else 0.0

    print('\nModel:', model)
    print('True Positives:', tp)
    print('True Negatives:', tn)
    print('False Positives:', fp)
    print('False Negatives:', fn)
    print('Total:', total)
    print('Accuracy:', a)
    print('Recall:', r)
    print('Precision: ', p)
    print('F1 score:', f1)
    print('\n')

GridSearchCV Best Models Metrics: SMOTE

Model: prediction_lr
True Positives: 46901
True Negatives: 374579
False Positives: 3269
False Negatives: 7724
Total: 432473
Accuracy: 0.9745810721131724
Recall: 0.8585995423340961
Precision:  0.9348415387681882
F1 score: 0.89509995705902



Model: prediction_lsvc
True Positives: 48314
True Negatives: 376362
False Positives: 1486
False Negatives: 6311
Total: 432473
Accuracy: 0.9819711288334763
Recall: 0.884466819221968
Precision:  0.9701606425702811
F1 score: 0.9253339717500598



Model: prediction_dt
True Positives: 49542
True Negatives: 376501
False Positives: 1347
False Negatives: 5083
Total: 432473
Accuracy: 0.9851320198023923
Recall: 0.9069473684210526
Precision:  0.9735306254789837
F1 score: 0.9390602194969387



Model: prediction_rf
True Positives: 44961
True Negatives: 377075
False Positives: 773
False Negatives: 9664
Total: 432473
Accuracy: 0.975866701505065
Recall: 0.8230846681922197
Precision:  0.9830979140245769
F1 score: 0.8960033479

## Predict and Compute Metrics for Upsampling Models using testDF of SMOTE Set

In [None]:
# Score the SMOTE test set with each upsampling-trained pipeline
prediction_lr = pipelineModel_lr_hpo_US.transform(testDF_SMOTE)
prediction_lsvc = pipelineModel_lsvc_hpo_US.transform(testDF_SMOTE)
prediction_dt = pipelineModel_dt_hpo_US.transform(testDF_SMOTE)
prediction_rf = pipelineModel_rf_hpo_US.transform(testDF_SMOTE)
prediction_gbt = pipelineModel_gbt_hpo_US.transform(testDF_SMOTE)

# Report label paired with its predictions, in printing order
scored_models = [
    ('Logistic Regression', prediction_lr),
    ('LinearSVC', prediction_lsvc),
    ('Decision Trees', prediction_dt),
    ('Random Forest', prediction_rf),
    ('Gradient Boosted Trees', prediction_gbt),
]

print('GridSearchCV Best Models Metrics: US Models using SMOTE Data')
# One section per metric: blank spacer, heading, then one line per model
for section_title, section_evaluator in [('Area Under ROC Curve:', evaluator_auroc),
                                         ('Accuracy:', evaluator_acc)]:
    print('\n')
    print(section_title)
    for model_label, scored in scored_models:
        print(f'{model_label}:', section_evaluator.evaluate(scored))

GridSearchCV Best Models Metrics: US Models using SMOTE Data


Area Under ROC Curve:
Logistic Regression: 0.9716420492988681
LinearSVC: 0.9802039596564845
Decision Trees: 0.9602220402746184
Random Forest: 0.9811805363162989
Gradient Boosted Trees: 0.9849316561714048


Accuracy:
Logistic Regression: 0.9740423101557785
LinearSVC: 0.9816335355039505
Decision Trees: 0.9810508401680568
Random Forest: 0.9791455189110072
Gradient Boosted Trees: 0.9823503432584231


In [None]:
print('GridSearchCV Best Models Metrics: US Models using SMOTE Data')
# Pair each report name with its predictions explicitly rather than resolving
# variable names through globals().
for model, df in [('prediction_lr', prediction_lr),
                  ('prediction_lsvc', prediction_lsvc),
                  ('prediction_dt', prediction_dt),
                  ('prediction_rf', prediction_rf),
                  ('prediction_gbt', prediction_gbt)]:
    # Confusion-matrix cells from one grouped aggregation instead of a separate
    # count() action per cell. Both columns are cast to double so the integer
    # lookups below match (1 == 1.0 and both hash alike in Python).
    grouped = df.groupBy(col('label').cast('double').alias('label'),
                         col('prediction').cast('double').alias('prediction'))
    cells = {(row['label'], row['prediction']): row['count']
             for row in grouped.count().collect()}

    tp = cells.get((1, 1), 0)
    tn = cells.get((0, 0), 0)
    fp = cells.get((0, 1), 0)
    fn = cells.get((1, 0), 0)
    # Every row belongs to exactly one group, so this equals df.count()
    total = sum(cells.values())

    # Each ratio is guarded on its own denominator; previously a frame with no
    # positive labels and no positive predictions raised ZeroDivisionError.
    a = (tp + tn) / total if total else 0.0
    r = tp / (tp + fn) if (tp + fn) else 0.0
    p = tp / (tp + fp) if (tp + fp) else 0.0
    f1 = 2 * ((p * r) / (p + r)) if (p + r) else 0.0

    print('\nModel:', model)
    print('True Positives:', tp)
    print('True Negatives:', tn)
    print('False Positives:', fp)
    print('False Negatives:', fn)
    print('Total:', total)
    print('Accuracy:', a)
    print('Recall:', r)
    print('Precision: ', p)
    print('F1 score:', f1)
    print('\n')

GridSearchCV Best Models Metrics: US Models using SMOTE Data

Model: prediction_lr
True Positives: 47818
True Negatives: 373429
False Positives: 4419
False Negatives: 6807
Total: 432473
Accuracy: 0.9740423101557785
Recall: 0.8753867276887872
Precision:  0.9154047897084442
F1 score: 0.8949486253298647



Model: prediction_lsvc
True Positives: 48966
True Negatives: 375564
False Positives: 2284
False Negatives: 5659
Total: 432473
Accuracy: 0.9816335355039505
Recall: 0.8964027459954234
Precision:  0.9554341463414634
F1 score: 0.9249775678866589



Model: prediction_dt
True Positives: 50419
True Negatives: 373859
False Positives: 3989
False Negatives: 4206
Total: 432473
Accuracy: 0.9810508401680568
Recall: 0.9230022883295195
Precision:  0.9266835759447141
F1 score: 0.924839268845212



Model: prediction_rf
True Positives: 49771
True Negatives: 373683
False Positives: 4165
False Negatives: 4854
Total: 432473
Accuracy: 0.9791455189110072
Recall: 0.9111395881006865
Precision:  0.92277884900622