# Lending Tree Loan Status - Hyperopt Best Models

# Set Up Environment for Spark

In [None]:
# Mount Google Drive so that paths under /content/drive are available to the
# cells below (data, saved models and the Spark distribution all live there).
from google.colab import drive

_DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(_DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Switch the working directory to the Spark folder on the mounted Drive; the
# download/extract commands that follow use paths relative to this folder.
%cd /content/drive/MyDrive/Spark/

/content/drive/MyDrive/Spark


In [None]:
# Set up environment for Spark
# Install a headless Java 8 JDK with apt; -qq together with the redirect to
# /dev/null keeps the installer output out of the notebook.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [None]:
!wget -q https://archive.apache.org/dist/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz

In [None]:
# Unpack the downloaded Spark archive into the current working directory.
!tar xf spark-3.3.0-bin-hadoop3.tgz

In [None]:
# Tell this Python process where Java and the Spark distribution live by
# exporting JAVA_HOME and SPARK_HOME.
import os

os.environ.update({
    'JAVA_HOME': '/usr/lib/jvm/java-8-openjdk-amd64',
    'SPARK_HOME': '/content/drive/MyDrive/Spark/spark-3.3.0-bin-hadoop3',
})

In [None]:
# Install findspark using pip
!pip install -q findspark
# Pin PySpark to the 3.3 line so it matches the spark-3.3.0 distribution
# referenced by SPARK_HOME.
!pip install -U pyspark==3.3
import findspark
# Presumably locates the Spark install through SPARK_HOME and makes pyspark
# importable - confirm against the findspark documentation.
findspark.init()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark==3.3
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 43 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 63.5 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=2ac9238506d28218a2c824c8c7bc44df580e10a8b1f116a72abe92b2f2219e0d
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [None]:
# Pyspark Session for Colab
# Build (or reuse) the SparkSession named 'Colab'. The settings are applied in
# the order listed below.
from pyspark.sql import SparkSession

_session_settings = {
    'spark.driver.memory': '24g',
    'spark.executor.pyspark.memory': '18g',
    'spark.executor.cores': '4',
    'spark.python.worker.memory': '18g',
    'spark.sql.execution.arrow.pyspark.enabled': 'True',
    'spark.sql.debug.maxToStringFields': '1000',
    'spark.sql.autoBroadcastJoinThreshold': '-1',
    'spark.ui.port': '4050',
}

# NOTE(review): master('local') normally means a single worker thread, in which
# case 'spark.executor.cores' may have no effect - confirm whether 'local[4]'
# was intended.
_session_builder = SparkSession.builder.master('local').appName('Colab')
for _setting, _value in _session_settings.items():
    _session_builder = _session_builder.config(_setting, _value)
spark = _session_builder.getOrCreate()

# Display the session summary as the cell output.
spark

In [None]:
# Remove warnings: restrict Spark's logging to ERROR level.
_spark_context = spark.sparkContext
_spark_context.setLogLevel('ERROR')

# Install & Import Packages and Set Seed

In [None]:
!pip install --upgrade mlflow 
!pip install hyperopt
import random
import numpy as np
import warnings
from pyspark.sql.functions import col, round
from pyspark.sql.types import IntegerType, FloatType
from pyspark.ml.feature import VectorAssembler, MinMaxScaler, StandardScaler
from pyspark.ml import PipelineModel
from pyspark.ml.classification import LogisticRegression, LinearSVC
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics
import time
from datetime import datetime, timedelta
from timeit import default_timer as timer
# Enable MLflow autologging for pyspark.ml on a best-effort basis: the notebook
# keeps running when MLflow is missing or too old.
#
# `import mlflow` is done on its own first. A failed `import mlflow.pyspark.ml`
# never binds the name `mlflow`, so the version message below would otherwise
# raise NameError from inside the handler.
try:
  import mlflow
except ImportError as mlflow_import_error:
  print(f'MLflow could not be imported ({mlflow_import_error}); continuing without autologging.')
else:
  try:
    import mlflow.pyspark.ml
    mlflow.pyspark.ml.autolog()
  except Exception as autolog_error:
    # `Exception` rather than a bare `except:` so Ctrl-C / kernel interrupts
    # are not swallowed; the underlying error is reported instead of hidden.
    print(f'Your version of MLflow ({mlflow.__version__}) does not support pyspark.ml for autologging. To use autologging, upgrade your MLflow client version or use Databricks Runtime for ML 8.3 or above.')
    print(f'Autologging error: {type(autolog_error).__name__}: {autolog_error}')
# Suppress Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mlflow
  Downloading mlflow-1.29.0-py3-none-any.whl (16.9 MB)
[K     |████████████████████████████████| 16.9 MB 4.3 MB/s 
Collecting prometheus-flask-exporter<1
  Downloading prometheus_flask_exporter-0.20.3-py3-none-any.whl (18 kB)
Collecting databricks-cli<1,>=0.8.7
  Downloading databricks-cli-0.17.3.tar.gz (77 kB)
[K     |████████████████████████████████| 77 kB 6.2 MB/s 
Collecting importlib-metadata!=4.7.0,<5,>=3.7.0
  Downloading importlib_metadata-4.13.0-py3-none-any.whl (23 kB)
Collecting alembic<2
  Downloading alembic-1.8.1-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 51.6 MB/s 
Collecting querystring-parser<2
  Downloading querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)
Collecting docker<7,>=4.0.0
  Downloading docker-6.0.0-py3-none-any.whl (147 kB)
[K     |████████████████████████████████| 147 kB 42.1 MB/s 
[?25hCollecting g

In [None]:
# Set seed
# One shared value seeds both Python's `random` module and NumPy's global RNG.
seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)
# NOTE(review): the value is also exported under the 'SparkML_HPO' environment
# variable; nothing visible here reads it - confirm whether 'PYTHONHASHSEED'
# was intended.
os.environ['SparkML_HPO'] = f'{seed_value}'

# Upsampling - Oversample Minority Class 

## Read Data and View Schema

In [None]:
# Load the upsampled train/test splits from Drive (header row, inferred column
# types), mark each for caching, and print its schema.
_TRAIN_US_CSV = '/content/drive/MyDrive/LoanStatus/Data/trainDF_US.csv'
_TEST_US_CSV = '/content/drive/MyDrive/LoanStatus/Data/testDF_US.csv'

trainDF_US = spark.read.csv(_TRAIN_US_CSV, header=True, inferSchema=True)
trainDF_US.cache()
print('\nTrain Schema')
trainDF_US.printSchema()

testDF_US = spark.read.csv(_TEST_US_CSV, header=True, inferSchema=True)
testDF_US.cache()
print('\nTest Schema')
testDF_US.printSchema()


Train Schema
root
 |-- loan_amnt: integer (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- installment: double (nullable = true)
 |-- annual_inc: double (nullable = true)
 |-- inq_last_6mths: double (nullable = true)
 |-- pub_rec: double (nullable = true)
 |-- revol_bal: integer (nullable = true)
 |-- out_prncp: double (nullable = true)
 |-- total_pymnt: double (nullable = true)
 |-- total_rec_int: double (nullable = true)
 |-- total_rec_late_fee: double (nullable = true)
 |-- recoveries: double (nullable = true)
 |-- last_pymnt_amnt: double (nullable = true)
 |-- collections_12_mths_ex_med: double (nullable = true)
 |-- acc_open_past_24mths: double (nullable = true)
 |-- bc_open_to_buy: double (nullable = true)
 |-- chargeoff_within_12_mths: double (nullable = true)
 |-- delinq_amnt: double (nullable = true)
 |-- mths_since_recent_bc: double (nullable = true)
 |-- num_bc_sats: double (nullable = true)
 |-- num_bc_tl: double (nullable = true)
 |-- num_sats: double (nulla

## Set up Vector Assembler, Scalers and Evaluators

In [None]:
# Define features and label for train data
# Every column except the last is treated as a feature (assumes the target is
# the final column - TODO confirm). 'loan_status' is renamed to 'label' and
# placed first.
features = trainDF_US.columns[:-1]
_label_column = col('loan_status').alias('label')
trainDF_US = trainDF_US.select(_label_column, *features)

In [None]:
# VectorAssembler
# Packs the feature columns into a single 'unscaledFeatures' vector column.
# handleInvalid='skip' is passed through unchanged (per the Spark docs, rows
# with invalid values are filtered out).
_assembler_options = dict(
    inputCols=features,
    outputCol='unscaledFeatures',
    handleInvalid='skip',
)
vecAssembler = VectorAssembler(**_assembler_options)

# Transform train data
trainDF_US = vecAssembler.transform(trainDF_US)

In [None]:
# Define features and label for test data
# Same convention as the train split: every column but the last is a feature
# (assumes the target is the final column - TODO confirm), and 'loan_status'
# becomes the leading 'label' column.
features = testDF_US.columns[:-1]
_label_column = col('loan_status').alias('label')
testDF_US = testDF_US.select(_label_column, *features)

# Transform test data with the assembler built for the train split
testDF_US = vecAssembler.transform(testDF_US)

In [None]:
# MinMaxScaler
# Rescales 'unscaledFeatures' into the [0, 1] range and writes the result to
# 'scaledFeatures'.
_minmax_options = {
    'inputCol': 'unscaledFeatures',
    'outputCol': 'scaledFeatures',
    'min': 0,
    'max': 1,
}
mmScaler = MinMaxScaler(**_minmax_options)

In [None]:
# Standard scaler
# Scaling by standard deviation is on (withStd=True) and mean-centering is off
# (withMean=False). It writes to the same 'scaledFeatures' column name as the
# min-max scaler.
_standard_options = {
    'inputCol': 'unscaledFeatures',
    'outputCol': 'scaledFeatures',
    'withStd': True,
    'withMean': False,
}
stdScaler = StandardScaler(**_standard_options)

In [None]:
# Evaluators shared by every metrics cell below; both read the 'label' column.
# Define model evaluation - AUROC
evaluator_auroc = BinaryClassificationEvaluator(
    metricName='areaUnderROC', labelCol='label')
# Define model evaluation - Accuracy
evaluator_acc = MulticlassClassificationEvaluator(
    metricName='accuracy', labelCol='label')

## Load Saved Models - Upsampling

In [None]:
# Load the saved Hyperopt-tuned pipelines trained on the upsampled data.
# All models live under the same Drive folder; only the leaf directory differs.
_HYPEROPT_DIR_US = '/content/drive/MyDrive/LoanStatus/Python/Models/ML/SparkML/Models/Hyperopt/'

# AUROC
pipelineModel_lr_hyperopt_auroc_US = PipelineModel.load(
    _HYPEROPT_DIR_US + 'pipelineModel_lr_hyperopt_us_auroc_100trials/')
pipelineModel_lsvc_hyperopt_auroc_US = PipelineModel.load(
    _HYPEROPT_DIR_US + 'pipelineModel_lsvc_hyperopt_us_auroc/')
pipelineModel_dt_hyperopt_auroc_US = PipelineModel.load(
    _HYPEROPT_DIR_US + 'pipelineModel_dt_hyperopt_us_auroc/')
pipelineModel_rf_hyperopt_auroc_US = PipelineModel.load(
    _HYPEROPT_DIR_US + 'pipelineModel_rf_hyperopt_us_auroc_30trials/')
pipelineModel_rf_hyperopt_auroc1_US = PipelineModel.load(
    _HYPEROPT_DIR_US + 'pipelineModel_rf_hyperopt_us_auroc_moreParams/')
pipelineModel_gbt_hyperopt_auroc_US = PipelineModel.load(
    _HYPEROPT_DIR_US + 'pipelineModel_gbt_hyperopt_us_auroc/')

# F1
pipelineModel_lr_hyperopt_f1_US = PipelineModel.load(
    _HYPEROPT_DIR_US + 'pipelineModel_lr_hyperopt_us_f1_100trials/')
pipelineModel_lsvc_hyperopt_f1_US = PipelineModel.load(
    _HYPEROPT_DIR_US + 'pipelineModel_lsvc_hyperopt_us_f1/')
pipelineModel_dt_hyperopt_f1_US = PipelineModel.load(
    _HYPEROPT_DIR_US + 'pipelineModel_dt_hyperopt_us_f1/')
pipelineModel_rf_hyperopt_f1_US = PipelineModel.load(
    _HYPEROPT_DIR_US + 'pipelineModel_rf_hyperopt_us_f1_30trials/')
pipelineModel_rf_hyperopt_f1_1_US = PipelineModel.load(
    _HYPEROPT_DIR_US + 'pipelineModel_rf_hyperopt_us_f1_moreParams/')
pipelineModel_gbt_hyperopt_f1_US = PipelineModel.load(
    _HYPEROPT_DIR_US + 'pipelineModel_gbt_hyperopt_us_f1/')

## Predict and AUROC Model Metrics using testDF of Upsampling Set

In [None]:
# Score the upsampled test set with each AUROC-tuned upsampling model.
prediction_lr = pipelineModel_lr_hyperopt_auroc_US.transform(testDF_US)
prediction_lsvc = pipelineModel_lsvc_hyperopt_auroc_US.transform(testDF_US)
prediction_dt = pipelineModel_dt_hyperopt_auroc_US.transform(testDF_US)
prediction_rf = pipelineModel_rf_hyperopt_auroc_US.transform(testDF_US)
prediction_rf1 = pipelineModel_rf_hyperopt_auroc1_US.transform(testDF_US)
prediction_gbt = pipelineModel_gbt_hyperopt_auroc_US.transform(testDF_US)

# (printed label, scored DataFrame) pairs, in report order.
_report_rows = (
    ('Logistic Regression:', prediction_lr),
    ('LinearSVC:', prediction_lsvc),
    ('Decision Trees:', prediction_dt),
    ('Random Forest:', prediction_rf),
    ('Random Forest - More Params:', prediction_rf1),
    ('Gradient Boosted Trees:', prediction_gbt),
)

print('Hyperopt Best Models AUROC Metrics: Upsampling')
print('\n')
print('Area Under ROC Curve:')
for _row_label, _scored in _report_rows:
    print(_row_label, evaluator_auroc.evaluate(_scored))
print('\n')
print('Accuracy:')
for _row_label, _scored in _report_rows:
    print(_row_label, evaluator_acc.evaluate(_scored))

Hyperopt Best Models AUROC Metrics: Upsampling


Area Under ROC Curve:
Logistic Regression: 0.9870101538778137
LinearSVC: 0.980184661932512
Decision Trees: 0.9625074740986497
Random Forest: 0.9845460058836394
Random Forest - More Params: 0.9849490161239277
Gradient Boosted Trees: 0.9866776626412849


Accuracy:
Logistic Regression: 0.9858303292922332
LinearSVC: 0.9812543210790038
Decision Trees: 0.974555636999304
Random Forest: 0.9845169525033933
Random Forest - More Params: 0.9847504930943666
Gradient Boosted Trees: 0.9810554647342146


In [None]:
# Confusion-matrix report (TP/TN/FP/FN, accuracy, recall, precision, F1) for
# each scored DataFrame produced by the preceding prediction cell.
print('Hyperopt Best Models AUROC Metrics: Upsampling')
# Name the DataFrames explicitly instead of looking them up through globals().
predictions_by_name = {
    'prediction_lr': prediction_lr,
    'prediction_lsvc': prediction_lsvc,
    'prediction_dt': prediction_dt,
    'prediction_rf': prediction_rf,
    'prediction_rf1': prediction_rf1,
    'prediction_gbt': prediction_gbt,
}
for model, df in predictions_by_name.items():
    # One grouped count replaces six separate count() jobs per model.
    # Keys are (label, prediction); (1, 1) also matches (1, 1.0) because equal
    # ints and floats hash alike, so the column dtypes do not matter.
    cell_counts = {
        (row['label'], row['prediction']): row['count']
        for row in df.groupBy('label', 'prediction').count().collect()
    }
    tp = cell_counts.get((1, 1), 0)
    tn = cell_counts.get((0, 0), 0)
    fp = cell_counts.get((0, 1), 0)
    fn = cell_counts.get((1, 0), 0)
    total = sum(cell_counts.values())

    # Each ratio is guarded on its own denominator, so an empty frame, no
    # actual positives, or no predicted positives yields 0 instead of a
    # ZeroDivisionError.
    a = (tp + tn) / total if total else 0.0
    r = float(tp) / (tp + fn) if (tp + fn) else 0.0
    p = float(tp) / (tp + fp) if (tp + fp) else 0.0

    if(p + r == 0):
        f1 = 0
    else:
        f1 = 2 * ((p * r)/(p + r))

    print('\nModel:', model)
    print('True Positives:', tp)
    print('True Negatives:', tn)
    print('False Positives:', fp)
    print('False Negatives:', fn)
    print('Total:', total)
    print('Accuracy:', a)
    print('Recall:', r)
    print('Precision: ', p)
    print('F1 score:', f1)
    print('\n')
print('\n')

Hyperopt Best Models AUROC Metrics: Upsampling

Model: prediction_lr
True Positives: 50199
True Negatives: 376146
False Positives: 1702
False Negatives: 4426
Total: 432473
Accuracy: 0.9858303292922332
Recall: 0.918974828375286
Precision:  0.9672067975568871
F1 score: 0.9424741377691831



Model: prediction_lsvc
True Positives: 48829
True Negatives: 375537
False Positives: 2311
False Negatives: 5796
Total: 432473
Accuracy: 0.9812543210790038
Recall: 0.8938947368421053
Precision:  0.9548103245991396
F1 score: 0.9233489339573584



Model: prediction_dt
True Positives: 50116
True Negatives: 371353
False Positives: 6495
False Negatives: 4509
Total: 432473
Accuracy: 0.974555636999304
Recall: 0.9174553775743707
Precision:  0.8852696472417021
F1 score: 0.9010751914847711



Model: prediction_rf
True Positives: 49231
True Negatives: 376546
False Positives: 1302
False Negatives: 5394
Total: 432473
Accuracy: 0.9845169525033933
Recall: 0.901254004576659
Precision:  0.9742346585399639
F1 score: 0.9

## Predict and F1 Model Metrics using testDF of Upsampling Set

In [None]:
# Score the upsampled test set with each F1-tuned upsampling model.
prediction_lr = pipelineModel_lr_hyperopt_f1_US.transform(testDF_US)
prediction_lsvc = pipelineModel_lsvc_hyperopt_f1_US.transform(testDF_US)
prediction_dt = pipelineModel_dt_hyperopt_f1_US.transform(testDF_US)
prediction_rf = pipelineModel_rf_hyperopt_f1_US.transform(testDF_US)
prediction_rf1 = pipelineModel_rf_hyperopt_f1_1_US.transform(testDF_US)
prediction_gbt = pipelineModel_gbt_hyperopt_f1_US.transform(testDF_US)

# (printed label, scored DataFrame) pairs, in report order.
_report_rows = (
    ('Logistic Regression:', prediction_lr),
    ('LinearSVC:', prediction_lsvc),
    ('Decision Trees:', prediction_dt),
    ('Random Forest:', prediction_rf),
    ('Random Forest - More Params:', prediction_rf1),
    ('Gradient Boosted Trees:', prediction_gbt),
)

print('Hyperopt Best Models F1 Metrics: Upsampling')
print('\n')
print('Area Under ROC Curve:')
for _row_label, _scored in _report_rows:
    print(_row_label, evaluator_auroc.evaluate(_scored))
print('\n')
print('Accuracy:')
for _row_label, _scored in _report_rows:
    print(_row_label, evaluator_acc.evaluate(_scored))

Hyperopt Best Models F1 Metrics: Upsampling


Area Under ROC Curve:
Logistic Regression: 0.9870101538778137
LinearSVC: 0.9801636688553507
Decision Trees: 0.9564704240277363
Random Forest: 0.9845886468846079
Random Forest - More Params: 0.9832680256640207
Gradient Boosted Trees: 0.9848670786073231


Accuracy:
Logistic Regression: 0.9858303292922332
LinearSVC: 0.9814716756884245
Decision Trees: 0.9809121031833201
Random Forest: 0.9849354757406821
Random Forest - More Params: 0.9861586734894433
Gradient Boosted Trees: 0.983296067037711


In [None]:
# Confusion-matrix report (TP/TN/FP/FN, accuracy, recall, precision, F1) for
# each scored DataFrame produced by the preceding prediction cell.
print('Hyperopt Best Models F1 Metrics: Upsampling')
# Name the DataFrames explicitly instead of looking them up through globals().
predictions_by_name = {
    'prediction_lr': prediction_lr,
    'prediction_lsvc': prediction_lsvc,
    'prediction_dt': prediction_dt,
    'prediction_rf': prediction_rf,
    'prediction_rf1': prediction_rf1,
    'prediction_gbt': prediction_gbt,
}
for model, df in predictions_by_name.items():
    # One grouped count replaces six separate count() jobs per model.
    # Keys are (label, prediction); (1, 1) also matches (1, 1.0) because equal
    # ints and floats hash alike, so the column dtypes do not matter.
    cell_counts = {
        (row['label'], row['prediction']): row['count']
        for row in df.groupBy('label', 'prediction').count().collect()
    }
    tp = cell_counts.get((1, 1), 0)
    tn = cell_counts.get((0, 0), 0)
    fp = cell_counts.get((0, 1), 0)
    fn = cell_counts.get((1, 0), 0)
    total = sum(cell_counts.values())

    # Each ratio is guarded on its own denominator, so an empty frame, no
    # actual positives, or no predicted positives yields 0 instead of a
    # ZeroDivisionError.
    a = (tp + tn) / total if total else 0.0
    r = float(tp) / (tp + fn) if (tp + fn) else 0.0
    p = float(tp) / (tp + fp) if (tp + fp) else 0.0

    if(p + r == 0):
        f1 = 0
    else:
        f1 = 2 * ((p * r)/(p + r))

    print('\nModel:', model)
    print('True Positives:', tp)
    print('True Negatives:', tn)
    print('False Positives:', fp)
    print('False Negatives:', fn)
    print('Total:', total)
    print('Accuracy:', a)
    print('Recall:', r)
    print('Precision: ', p)
    print('F1 score:', f1)
    print('\n')
print('\n')

Hyperopt Best Models F1 Metrics: Upsampling

Model: prediction_lr
True Positives: 50199
True Negatives: 376146
False Positives: 1702
False Negatives: 4426
Total: 432473
Accuracy: 0.9858303292922332
Recall: 0.918974828375286
Precision:  0.9672067975568871
F1 score: 0.9424741377691831



Model: prediction_lsvc
True Positives: 48978
True Negatives: 375482
False Positives: 2366
False Negatives: 5647
Total: 432473
Accuracy: 0.9814716756884245
Recall: 0.8966224256292906
Precision:  0.9539186662511686
F1 score: 0.9243835461314158



Model: prediction_dt
True Positives: 50417
True Negatives: 373801
False Positives: 4047
False Negatives: 4208
Total: 432473
Accuracy: 0.9809121031833201
Recall: 0.9229656750572083
Precision:  0.9256940364277321
F1 score: 0.9243278424039088



Model: prediction_rf
True Positives: 49291
True Negatives: 376667
False Positives: 1181
False Negatives: 5334
Total: 432473
Accuracy: 0.9849354757406821
Recall: 0.9023524027459954
Precision:  0.9766008876208591
F1 score: 0.93

## Load Saved Models - SMOTE

In [None]:
# AUROC
pipelineModel_lr_hyperopt_auroc_SMOTE = PipelineModel.load('/content/drive/MyDrive/LoanStatus/Python/Models/ML/SparkML/Models/Hyperopt/pipelineModel_lr_hyperopt_smote_auroc_100trials/')
pipelineModel_lsvc_hyperopt_auroc_SMOTE = PipelineModel.load('/content/drive/MyDrive/LoanStatus/Python/Models/ML/SparkML/Models/Hyperopt/pipelineModel_lsvc_hyperopt_smote_auroc/')
pipelineModel_dt_hyperopt_auroc_SMOTE = PipelineModel.load('/content/drive/MyDrive/LoanStatus/Python/Models/ML/SparkML/Models/Hyperopt/pipelineModel_dt_hyperopt_smote_auroc/')
pipelineModel_rf_hyperopt_auroc_SMOTE = PipelineModel.load('/content/drive/MyDrive/LoanStatus/Python/Models/ML/SparkML/Models/Hyperopt/pipelineModel_rf_hyperopt_smote_auroc_30trials/')
pipelineModel_rf_hyperopt_auroc1_SMOTE = PipelineModel.load('/content/drive/MyDrive/LoanStatus/Python/Models/ML/SparkML/Models/Hyperopt/pipelineModel_rf_hyperopt_smote_auroc_moreParams/')
pipelineModel_gbt_hyperopt_auroc_SMOTE = PipelineModel.load('/content/drive/MyDrive/LoanStatus/Python/Models/ML/SparkML/Models/Hyperopt/pipelineModel_gbt_hyperopt_smote_auroc/')

# F1
pipelineModel_lr_hyperopt_f1_SMOTE = PipelineModel.load('/content/drive/MyDrive/LoanStatus/Python/Models/ML/SparkML/Models/Hyperopt/pipelineModel_lr_hyperopt_smote_f1_100trials/')
pipelineModel_lsvc_hyperopt_f1_SMOTE = PipelineModel.load('/content/drive/MyDrive/LoanStatus/Python/Models/ML/SparkML/Models/Hyperopt/pipelineModel_lsvc_hyperopt_smote_f1/')
pipelineModel_dt_hyperopt_f1_SMOTE = PipelineModel.load('/content/drive/MyDrive/LoanStatus/Python/Models/ML/SparkML/Models/Hyperopt/pipelineModel_dt_hyperopt_smote_f1/')
pipelineModel_rf_hyperopt_f1_SMOTE = PipelineModel.load('/content/drive/MyDrive/LoanStatus/Python/Models/ML/SparkML/Models/Hyperopt/pipelineModel_rf_hyperopt_smote_f1_29trials/')
pipelineModel_rf_hyperopt_f1_1_SMOTE = PipelineModel.load('/content/drive/MyDrive/LoanStatus/Python/Models/ML/SparkML/Models/Hyperopt/pipelineModel_rf_hyperopt_smote_f1_moreParams/')
pipelineModel_gbt_hyperopt_f1_SMOTE = PipelineModel.load('/content/drive/MyDrive/LoanStatus/Python/Models/ML/SparkML/Models/Hyperopt/pipelineModel_gbt_hyperopt_smote_f1/')

## Predict and AUROC Model Metrics for SMOTE Models using testDF of Upsampling Set

In [None]:
# Score the upsampled test set with each AUROC-tuned SMOTE model.
prediction_lr = pipelineModel_lr_hyperopt_auroc_SMOTE.transform(testDF_US)
prediction_lsvc = pipelineModel_lsvc_hyperopt_auroc_SMOTE.transform(testDF_US)
prediction_dt = pipelineModel_dt_hyperopt_auroc_SMOTE.transform(testDF_US)
prediction_rf = pipelineModel_rf_hyperopt_auroc_SMOTE.transform(testDF_US)
prediction_rf1 = pipelineModel_rf_hyperopt_auroc1_SMOTE.transform(testDF_US)
prediction_gbt = pipelineModel_gbt_hyperopt_auroc_SMOTE.transform(testDF_US)

# (printed label, scored DataFrame) pairs, in report order.
_report_rows = (
    ('Logistic Regression:', prediction_lr),
    ('LinearSVC:', prediction_lsvc),
    ('Decision Trees:', prediction_dt),
    ('Random Forest:', prediction_rf),
    ('Random Forest - More Params:', prediction_rf1),
    ('Gradient Boosted Trees:', prediction_gbt),
)

print('Hyperopt Best Models AUROC Metrics: SMOTE Models using Upsampling Data')
print('\n')
print('Area Under ROC Curve:')
for _row_label, _scored in _report_rows:
    print(_row_label, evaluator_auroc.evaluate(_scored))
print('\n')
print('Accuracy:')
for _row_label, _scored in _report_rows:
    print(_row_label, evaluator_acc.evaluate(_scored))

Hyperopt Best Models AUROC Metrics: SMOTE Models using Upsampling Data


Area Under ROC Curve:
Logistic Regression: 0.9860501025753575
LinearSVC: 0.9792529293074257
Decision Trees: 0.9619760183492718
Random Forest: 0.9807582878240927
Random Forest - More Params: 0.9809293771442331
Gradient Boosted Trees: 0.9871254350362428


Accuracy:
Logistic Regression: 0.9866072564067584
LinearSVC: 0.9824312731661861
Decision Trees: 0.980546762456847
Random Forest: 0.9816127249562401
Random Forest - More Params: 0.9837770219181313
Gradient Boosted Trees: 0.9866095686898373


In [None]:
# Confusion-matrix report (TP/TN/FP/FN, accuracy, recall, precision, F1) for
# each scored DataFrame produced by the preceding prediction cell.
print('Hyperopt Best Models AUROC Metrics: SMOTE Models using Upsampling Data')
# Name the DataFrames explicitly instead of looking them up through globals().
predictions_by_name = {
    'prediction_lr': prediction_lr,
    'prediction_lsvc': prediction_lsvc,
    'prediction_dt': prediction_dt,
    'prediction_rf': prediction_rf,
    'prediction_rf1': prediction_rf1,
    'prediction_gbt': prediction_gbt,
}
for model, df in predictions_by_name.items():
    # One grouped count replaces six separate count() jobs per model.
    # Keys are (label, prediction); (1, 1) also matches (1, 1.0) because equal
    # ints and floats hash alike, so the column dtypes do not matter.
    cell_counts = {
        (row['label'], row['prediction']): row['count']
        for row in df.groupBy('label', 'prediction').count().collect()
    }
    tp = cell_counts.get((1, 1), 0)
    tn = cell_counts.get((0, 0), 0)
    fp = cell_counts.get((0, 1), 0)
    fn = cell_counts.get((1, 0), 0)
    total = sum(cell_counts.values())

    # Each ratio is guarded on its own denominator, so an empty frame, no
    # actual positives, or no predicted positives yields 0 instead of a
    # ZeroDivisionError.
    a = (tp + tn) / total if total else 0.0
    r = float(tp) / (tp + fn) if (tp + fn) else 0.0
    p = float(tp) / (tp + fp) if (tp + fp) else 0.0

    if(p + r == 0):
        f1 = 0
    else:
        f1 = 2 * ((p * r)/(p + r))

    print('\nModel:', model)
    print('True Positives:', tp)
    print('True Negatives:', tn)
    print('False Positives:', fp)
    print('False Negatives:', fn)
    print('Total:', total)
    print('Accuracy:', a)
    print('Recall:', r)
    print('Precision: ', p)
    print('F1 score:', f1)
    print('\n')

Hyperopt Best Models AUROC Metrics: SMOTE Models using Upsampling Data

Model: prediction_lr
True Positives: 50014
True Negatives: 376667
False Positives: 1181
False Negatives: 4611
Total: 432473
Accuracy: 0.9866072564067584
Recall: 0.9155881006864989
Precision:  0.9769313409512648
F1 score: 0.9452655452655452



Model: prediction_lsvc
True Positives: 48353
True Negatives: 376522
False Positives: 1326
False Negatives: 6272
Total: 432473
Accuracy: 0.9824312731661861
Recall: 0.8851807780320367
Precision:  0.9733086414782907
F1 score: 0.9271552385335173



Model: prediction_dt
True Positives: 49885
True Negatives: 374175
False Positives: 3673
False Negatives: 4740
Total: 432473
Accuracy: 0.980546762456847
Recall: 0.9132265446224256
Precision:  0.9314201426490907
F1 score: 0.9222336226579038



Model: prediction_rf
True Positives: 46904
True Negatives: 377617
False Positives: 231
False Negatives: 7721
Total: 432473
Accuracy: 0.9816127249562401
Recall: 0.8586544622425629
Precision:  0.99509

## Predict and F1 Model Metrics for SMOTE Models using testDF of Upsampling Set

In [None]:
# Score the upsampled test set with each F1-tuned SMOTE model.
prediction_lr = pipelineModel_lr_hyperopt_f1_SMOTE.transform(testDF_US)
prediction_lsvc = pipelineModel_lsvc_hyperopt_f1_SMOTE.transform(testDF_US)
prediction_dt = pipelineModel_dt_hyperopt_f1_SMOTE.transform(testDF_US)
prediction_rf = pipelineModel_rf_hyperopt_f1_SMOTE.transform(testDF_US)
prediction_rf1 = pipelineModel_rf_hyperopt_f1_1_SMOTE.transform(testDF_US)
prediction_gbt = pipelineModel_gbt_hyperopt_f1_SMOTE.transform(testDF_US)

# (printed label, scored DataFrame) pairs, in report order.
_report_rows = (
    ('Logistic Regression:', prediction_lr),
    ('LinearSVC:', prediction_lsvc),
    ('Decision Trees:', prediction_dt),
    ('Random Forest:', prediction_rf),
    ('Random Forest - More Params:', prediction_rf1),
    ('Gradient Boosted Trees:', prediction_gbt),
)

print('Hyperopt Best Models F1 Metrics: SMOTE Models using Upsampling Data')
print('\n')
print('Area Under ROC Curve:')
for _row_label, _scored in _report_rows:
    print(_row_label, evaluator_auroc.evaluate(_scored))
print('\n')
print('Accuracy:')
for _row_label, _scored in _report_rows:
    print(_row_label, evaluator_acc.evaluate(_scored))

Hyperopt Best Models F1 Metrics: SMOTE Models using Upsampling Data


Area Under ROC Curve:
Logistic Regression: 0.9860501025753575
LinearSVC: 0.981959571141341
Decision Trees: 0.9494460525988754
Random Forest: 0.9805702355001202
Random Forest - More Params: 0.9804896037281486
Gradient Boosted Trees: 0.9869668327152193


Accuracy:
Logistic Regression: 0.9866072564067584
LinearSVC: 0.9841793591738675
Decision Trees: 0.9862858490587851
Random Forest: 0.9815433564638717
Random Forest - More Params: 0.9834856742501844
Gradient Boosted Trees: 0.9867483056745739


In [None]:
# Confusion-matrix report (TP/TN/FP/FN, accuracy, recall, precision, F1) for
# each scored DataFrame produced by the preceding prediction cell.
print('Hyperopt Best Models F1 Metrics: SMOTE Models using Upsampling Data')
# Name the DataFrames explicitly instead of looking them up through globals().
predictions_by_name = {
    'prediction_lr': prediction_lr,
    'prediction_lsvc': prediction_lsvc,
    'prediction_dt': prediction_dt,
    'prediction_rf': prediction_rf,
    'prediction_rf1': prediction_rf1,
    'prediction_gbt': prediction_gbt,
}
for model, df in predictions_by_name.items():
    # One grouped count replaces six separate count() jobs per model.
    # Keys are (label, prediction); (1, 1) also matches (1, 1.0) because equal
    # ints and floats hash alike, so the column dtypes do not matter.
    cell_counts = {
        (row['label'], row['prediction']): row['count']
        for row in df.groupBy('label', 'prediction').count().collect()
    }
    tp = cell_counts.get((1, 1), 0)
    tn = cell_counts.get((0, 0), 0)
    fp = cell_counts.get((0, 1), 0)
    fn = cell_counts.get((1, 0), 0)
    total = sum(cell_counts.values())

    # Each ratio is guarded on its own denominator, so an empty frame, no
    # actual positives, or no predicted positives yields 0 instead of a
    # ZeroDivisionError.
    a = (tp + tn) / total if total else 0.0
    r = float(tp) / (tp + fn) if (tp + fn) else 0.0
    p = float(tp) / (tp + fp) if (tp + fp) else 0.0

    if(p + r == 0):
        f1 = 0
    else:
        f1 = 2 * ((p * r)/(p + r))

    print('\nModel:', model)
    print('True Positives:', tp)
    print('True Negatives:', tn)
    print('False Positives:', fp)
    print('False Negatives:', fn)
    print('Total:', total)
    print('Accuracy:', a)
    print('Recall:', r)
    print('Precision: ', p)
    print('F1 score:', f1)
    print('\n')

Hyperopt Best Models F1 Metrics: SMOTE Models using Upsampling Data

Model: prediction_lr
True Positives: 50014
True Negatives: 376667
False Positives: 1181
False Negatives: 4611
Total: 432473
Accuracy: 0.9866072564067584
Recall: 0.9155881006864989
Precision:  0.9769313409512648
F1 score: 0.9452655452655452



Model: prediction_lsvc
True Positives: 48896
True Negatives: 376735
False Positives: 1113
False Negatives: 5729
Total: 432473
Accuracy: 0.9841793591738675
Recall: 0.8951212814645308
Precision:  0.9777440060789058
F1 score: 0.9346101649559416



Model: prediction_dt
True Positives: 49561
True Negatives: 376981
False Positives: 867
False Negatives: 5064
Total: 432473
Accuracy: 0.9862858490587851
Recall: 0.9072951945080091
Precision:  0.9828071706194971
F1 score: 0.943542783166592



Model: prediction_rf
True Positives: 46869
True Negatives: 377622
False Positives: 226
False Negatives: 7756
Total: 432473
Accuracy: 0.9815433564638717
Recall: 0.8580137299771167
Precision:  0.995201189

# SMOTE - Split Over Upsampling 

## Read Data and View Schema

In [None]:
# Load the SMOTE train/test splits from Drive (header row, inferred types).
# The listed train columns are cast to integer after loading; the test split is
# used as read.
_TRAIN_SMOTE_CSV = '/content/drive/MyDrive/LoanStatus/Data/trainDF_SMOTE.csv'
_TEST_SMOTE_CSV = '/content/drive/MyDrive/LoanStatus/Data/testDF_SMOTE.csv'

# Train columns that are re-cast to IntegerType, in the order they are applied.
_SMOTE_INTEGER_COLUMNS = [
    'loan_amnt',
    'revol_bal',
    'term_ 60 months',
    'grade_B',
    'grade_C',
    'grade_D',
    'home_ownership_MORTGAGE',
    'home_ownership_OWN',
    'home_ownership_RENT',
    'verification_status_Source Verified',
    'verification_status_Verified',
    'purpose_credit_card',
    'initial_list_status_w',
    'application_type_Joint App',
    'disbursement_method_DirectPay',
]

_smote_train_raw = spark.read.csv(_TRAIN_SMOTE_CSV, header=True, inferSchema=True)
_smote_train_raw.cache()

# Each cast reads the column from the frame as loaded and replaces the
# same-named column in the running result.
trainDF_SMOTE = _smote_train_raw
for _column_name in _SMOTE_INTEGER_COLUMNS:
    trainDF_SMOTE = trainDF_SMOTE.withColumn(
        _column_name, _smote_train_raw[_column_name].cast(IntegerType()))
print('\nTrain Schema')
trainDF_SMOTE.printSchema()

testDF_SMOTE = spark.read.csv(_TEST_SMOTE_CSV, header=True, inferSchema=True)
testDF_SMOTE.cache()
print('\nTest Schema')
testDF_SMOTE.printSchema()


Train Schema
root
 |-- loan_amnt: integer (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- installment: double (nullable = true)
 |-- annual_inc: double (nullable = true)
 |-- inq_last_6mths: double (nullable = true)
 |-- pub_rec: double (nullable = true)
 |-- revol_bal: integer (nullable = true)
 |-- out_prncp: double (nullable = true)
 |-- total_pymnt: double (nullable = true)
 |-- total_rec_int: double (nullable = true)
 |-- total_rec_late_fee: double (nullable = true)
 |-- recoveries: double (nullable = true)
 |-- last_pymnt_amnt: double (nullable = true)
 |-- collections_12_mths_ex_med: double (nullable = true)
 |-- acc_open_past_24mths: double (nullable = true)
 |-- bc_open_to_buy: double (nullable = true)
 |-- chargeoff_within_12_mths: double (nullable = true)
 |-- delinq_amnt: double (nullable = true)
 |-- mths_since_recent_bc: double (nullable = true)
 |-- num_bc_sats: double (nullable = true)
 |-- num_bc_tl: double (nullable = true)
 |-- num_sats: double (nulla

## Set up Vector Assembler

In [None]:
# Define features and label for train data.
# The feature list is every column except the label, chosen by name so the
# result does not depend on 'loan_status' happening to be the last column
# (a positional slice would otherwise leak the label into the features and
# drop a real feature). Column order is preserved.
features = [name for name in trainDF_SMOTE.columns if name != 'loan_status']
trainDF_SMOTE = trainDF_SMOTE.select(col('loan_status').alias('label'), *features)

# Transform train data with vecAssembler (defined elsewhere in the notebook).
# NOTE: this cell rebinds trainDF_SMOTE and renames 'loan_status' to 'label',
# so it cannot be re-run without reloading the data first.
trainDF_SMOTE = vecAssembler.transform(trainDF_SMOTE)

In [None]:
# Define features and label for test data.
# The feature list is every column except the label, chosen by name so the
# result does not depend on 'loan_status' happening to be the last column
# (a positional slice would otherwise leak the label into the features and
# drop a real feature). Column order is preserved.
features = [name for name in testDF_SMOTE.columns if name != 'loan_status']
testDF_SMOTE = testDF_SMOTE.select(col('loan_status').alias('label'), *features)

# Transform test data with vecAssembler (defined elsewhere in the notebook).
# NOTE: this cell rebinds testDF_SMOTE and renames 'loan_status' to 'label',
# so it cannot be re-run without reloading the data first.
testDF_SMOTE = vecAssembler.transform(testDF_SMOTE)

## Predictions and Metrics for the Hyperopt AUROC SMOTE Models on the SMOTE Test Set

In [None]:
# Score the SMOTE test set with each Hyperopt AUROC pipeline fitted on SMOTE data.
# The prediction_* names stay as notebook globals because the next cell reads them.
prediction_lr = pipelineModel_lr_hyperopt_auroc_SMOTE.transform(testDF_SMOTE)
prediction_lsvc = pipelineModel_lsvc_hyperopt_auroc_SMOTE.transform(testDF_SMOTE)
prediction_dt = pipelineModel_dt_hyperopt_auroc_SMOTE.transform(testDF_SMOTE)
prediction_rf = pipelineModel_rf_hyperopt_auroc_SMOTE.transform(testDF_SMOTE)
prediction_rf1 = pipelineModel_rf_hyperopt_auroc1_SMOTE.transform(testDF_SMOTE)
prediction_gbt = pipelineModel_gbt_hyperopt_auroc_SMOTE.transform(testDF_SMOTE)

# Display name -> scored DataFrame, in the order the report lists them.
named_predictions = [
    ('Logistic Regression', prediction_lr),
    ('LinearSVC', prediction_lsvc),
    ('Decision Trees', prediction_dt),
    ('Random Forest', prediction_rf),
    ('Random Forest - More Params', prediction_rf1),
    ('Gradient Boosted Trees', prediction_gbt),
]

# Section heading -> evaluator used for every model in that section.
metric_sections = [
    ('Area Under ROC Curve:', evaluator_auroc),
    ('Accuracy:', evaluator_acc),
]

print('Hyperopt Best Models AUROC Metrics: SMOTE')
for section_title, section_evaluator in metric_sections:
    print('\n')
    print(section_title)
    for display_name, scored_df in named_predictions:
        print(display_name + ':', section_evaluator.evaluate(scored_df))

Hyperopt Best Models AUROC Metrics: SMOTE


Area Under ROC Curve:
Logistic Regression: 0.9860501025753575
LinearSVC: 0.9792529293074257
Decision Trees: 0.9619760183492718
Random Forest: 0.9807582878240927
Random Forest - More Params: 0.9809293771442331
Gradient Boosted Trees: 0.9871254350362428


Accuracy:
Logistic Regression: 0.9866072564067584
LinearSVC: 0.9824312731661861
Decision Trees: 0.980546762456847
Random Forest: 0.9816127249562401
Random Forest - More Params: 0.9837770219181313
Gradient Boosted Trees: 0.9866095686898373


In [None]:
print('Hyperopt Best Models AUROC Metrics: SMOTE')
# Report the confusion-matrix counts and derived metrics for each prediction
# DataFrame built in the previous cell. The frames are referenced directly
# rather than looked up by name through globals().
for model, df in [('prediction_lr', prediction_lr),
                  ('prediction_lsvc', prediction_lsvc),
                  ('prediction_dt', prediction_dt),
                  ('prediction_rf', prediction_rf),
                  ('prediction_rf1', prediction_rf1),
                  ('prediction_gbt', prediction_gbt)]:
    # A single aggregation returns the row count of every (label, prediction)
    # pair, replacing four filtered counts plus two full counts per model.
    confusion_counts = {
        (row['label'], row['prediction']): row['count']
        for row in df.groupBy('label', 'prediction').count().collect()
    }

    # Integer keys match whether Spark returns the values as int or float,
    # because 1 == 1.0 and both hash identically in Python.
    tp = confusion_counts.get((1, 1), 0)
    tn = confusion_counts.get((0, 0), 0)
    fp = confusion_counts.get((0, 1), 0)
    fn = confusion_counts.get((1, 0), 0)
    total = sum(confusion_counts.values())  # equals df.count()

    # Guard each ratio on its own denominator. The earlier branch structure
    # divided by (tp + fp) even when it was zero, raising ZeroDivisionError
    # for a frame with no positive labels and no positive predictions.
    a = (tp + tn) / total if total else 0.0
    r = float(tp) / (tp + fn) if (tp + fn) else 0.0
    p = float(tp) / (tp + fp) if (tp + fp) else 0.0
    f1 = 2 * ((p * r) / (p + r)) if (p + r) else 0

    print('\nModel:', model)
    print('True Positives:', tp)
    print('True Negatives:', tn)
    print('False Positives:', fp)
    print('False Negatives:', fn)
    print('Total:', total)
    print('Accuracy:', a)
    print('Recall:', r)
    print('Precision: ', p)
    print('F1 score:', f1)
    print('\n')

Hyperopt Best Models AUROC Metrics: SMOTE

Model: prediction_lr
True Positives: 50014
True Negatives: 376667
False Positives: 1181
False Negatives: 4611
Total: 432473
Accuracy: 0.9866072564067584
Recall: 0.9155881006864989
Precision:  0.9769313409512648
F1 score: 0.9452655452655452



Model: prediction_lsvc
True Positives: 48353
True Negatives: 376522
False Positives: 1326
False Negatives: 6272
Total: 432473
Accuracy: 0.9824312731661861
Recall: 0.8851807780320367
Precision:  0.9733086414782907
F1 score: 0.9271552385335173



Model: prediction_dt
True Positives: 49885
True Negatives: 374175
False Positives: 3673
False Negatives: 4740
Total: 432473
Accuracy: 0.980546762456847
Recall: 0.9132265446224256
Precision:  0.9314201426490907
F1 score: 0.9222336226579038



Model: prediction_rf
True Positives: 46904
True Negatives: 377617
False Positives: 231
False Negatives: 7721
Total: 432473
Accuracy: 0.9816127249562401
Recall: 0.8586544622425629
Precision:  0.9950991831971996
F1 score: 0.92185

## Predictions and Metrics for the Hyperopt F1 SMOTE Models on the SMOTE Test Set

In [None]:
# Score the SMOTE test set with each Hyperopt F1 pipeline fitted on SMOTE data.
# The prediction_* names stay as notebook globals because the next cell reads them.
prediction_lr = pipelineModel_lr_hyperopt_f1_SMOTE.transform(testDF_SMOTE)
prediction_lsvc = pipelineModel_lsvc_hyperopt_f1_SMOTE.transform(testDF_SMOTE)
prediction_dt = pipelineModel_dt_hyperopt_f1_SMOTE.transform(testDF_SMOTE)
prediction_rf = pipelineModel_rf_hyperopt_f1_SMOTE.transform(testDF_SMOTE)
prediction_rf1 = pipelineModel_rf_hyperopt_f1_1_SMOTE.transform(testDF_SMOTE)
prediction_gbt = pipelineModel_gbt_hyperopt_f1_SMOTE.transform(testDF_SMOTE)

# Display name -> scored DataFrame, in the order the report lists them.
named_predictions = [
    ('Logistic Regression', prediction_lr),
    ('LinearSVC', prediction_lsvc),
    ('Decision Trees', prediction_dt),
    ('Random Forest', prediction_rf),
    ('Random Forest - More Params', prediction_rf1),
    ('Gradient Boosted Trees', prediction_gbt),
]

# Section heading -> evaluator used for every model in that section.
metric_sections = [
    ('Area Under ROC Curve:', evaluator_auroc),
    ('Accuracy:', evaluator_acc),
]

print('Hyperopt Best Models F1 Metrics: SMOTE')
for section_title, section_evaluator in metric_sections:
    print('\n')
    print(section_title)
    for display_name, scored_df in named_predictions:
        print(display_name + ':', section_evaluator.evaluate(scored_df))

Hyperopt Best Models F1 Metrics: SMOTE


Area Under ROC Curve:
Logistic Regression: 0.9860501025753575
LinearSVC: 0.981959571141341
Decision Trees: 0.9494460525988754
Random Forest: 0.9805702355001202
Random Forest - More Params: 0.9804896037281486
Gradient Boosted Trees: 0.9869668327152193


Accuracy:
Logistic Regression: 0.9866072564067584
LinearSVC: 0.9841793591738675
Decision Trees: 0.9862858490587851
Random Forest: 0.9815433564638717
Random Forest - More Params: 0.9834856742501844
Gradient Boosted Trees: 0.9867483056745739


In [None]:
print('Hyperopt Best Models F1 Metrics: SMOTE')
# Report the confusion-matrix counts and derived metrics for each prediction
# DataFrame built in the previous cell. The frames are referenced directly
# rather than looked up by name through globals().
for model, df in [('prediction_lr', prediction_lr),
                  ('prediction_lsvc', prediction_lsvc),
                  ('prediction_dt', prediction_dt),
                  ('prediction_rf', prediction_rf),
                  ('prediction_rf1', prediction_rf1),
                  ('prediction_gbt', prediction_gbt)]:
    # A single aggregation returns the row count of every (label, prediction)
    # pair, replacing four filtered counts plus two full counts per model.
    confusion_counts = {
        (row['label'], row['prediction']): row['count']
        for row in df.groupBy('label', 'prediction').count().collect()
    }

    # Integer keys match whether Spark returns the values as int or float,
    # because 1 == 1.0 and both hash identically in Python.
    tp = confusion_counts.get((1, 1), 0)
    tn = confusion_counts.get((0, 0), 0)
    fp = confusion_counts.get((0, 1), 0)
    fn = confusion_counts.get((1, 0), 0)
    total = sum(confusion_counts.values())  # equals df.count()

    # Guard each ratio on its own denominator. The earlier branch structure
    # divided by (tp + fp) even when it was zero, raising ZeroDivisionError
    # for a frame with no positive labels and no positive predictions.
    a = (tp + tn) / total if total else 0.0
    r = float(tp) / (tp + fn) if (tp + fn) else 0.0
    p = float(tp) / (tp + fp) if (tp + fp) else 0.0
    f1 = 2 * ((p * r) / (p + r)) if (p + r) else 0

    print('\nModel:', model)
    print('True Positives:', tp)
    print('True Negatives:', tn)
    print('False Positives:', fp)
    print('False Negatives:', fn)
    print('Total:', total)
    print('Accuracy:', a)
    print('Recall:', r)
    print('Precision: ', p)
    print('F1 score:', f1)
    print('\n')

Hyperopt Best Models F1 Metrics: SMOTE

Model: prediction_lr
True Positives: 50014
True Negatives: 376667
False Positives: 1181
False Negatives: 4611
Total: 432473
Accuracy: 0.9866072564067584
Recall: 0.9155881006864989
Precision:  0.9769313409512648
F1 score: 0.9452655452655452



Model: prediction_lsvc
True Positives: 48896
True Negatives: 376735
False Positives: 1113
False Negatives: 5729
Total: 432473
Accuracy: 0.9841793591738675
Recall: 0.8951212814645308
Precision:  0.9777440060789058
F1 score: 0.9346101649559416



Model: prediction_dt
True Positives: 49561
True Negatives: 376981
False Positives: 867
False Negatives: 5064
Total: 432473
Accuracy: 0.9862858490587851
Recall: 0.9072951945080091
Precision:  0.9828071706194971
F1 score: 0.943542783166592



Model: prediction_rf
True Positives: 46869
True Negatives: 377622
False Positives: 226
False Negatives: 7756
Total: 432473
Accuracy: 0.9815433564638717
Recall: 0.8580137299771167
Precision:  0.9952011890858902
F1 score: 0.921529689

## Predictions and Metrics for the Hyperopt AUROC Upsampling Models on the SMOTE Test Set

In [None]:
# Score the SMOTE test set with each Hyperopt AUROC pipeline from the upsampling (US) run.
# The prediction_* names stay as notebook globals because the next cell reads them.
prediction_lr = pipelineModel_lr_hyperopt_auroc_US.transform(testDF_SMOTE)
prediction_lsvc = pipelineModel_lsvc_hyperopt_auroc_US.transform(testDF_SMOTE)
prediction_dt = pipelineModel_dt_hyperopt_auroc_US.transform(testDF_SMOTE)
prediction_rf = pipelineModel_rf_hyperopt_auroc_US.transform(testDF_SMOTE)
prediction_rf1 = pipelineModel_rf_hyperopt_auroc1_US.transform(testDF_SMOTE)
prediction_gbt = pipelineModel_gbt_hyperopt_auroc_US.transform(testDF_SMOTE)

# Display name -> scored DataFrame, in the order the report lists them.
named_predictions = [
    ('Logistic Regression', prediction_lr),
    ('LinearSVC', prediction_lsvc),
    ('Decision Trees', prediction_dt),
    ('Random Forest', prediction_rf),
    ('Random Forest - More Params', prediction_rf1),
    ('Gradient Boosted Trees', prediction_gbt),
]

# Section heading -> evaluator used for every model in that section.
metric_sections = [
    ('Area Under ROC Curve:', evaluator_auroc),
    ('Accuracy:', evaluator_acc),
]

print('Hyperopt Best Models AUROC Metrics: Upsampling Models using SMOTE Data')
for section_title, section_evaluator in metric_sections:
    print('\n')
    print(section_title)
    for display_name, scored_df in named_predictions:
        print(display_name + ':', section_evaluator.evaluate(scored_df))

Hyperopt Best Models AUROC Metrics: Upsampling Models using SMOTE Data


Area Under ROC Curve:
Logistic Regression: 0.9870101538778137
LinearSVC: 0.980184661932512
Decision Trees: 0.9625074740986497
Random Forest: 0.9845460058836394
Random Forest - More Params: 0.9849490161239277
Gradient Boosted Trees: 0.9866776626412849


Accuracy:
Logistic Regression: 0.9858303292922332
LinearSVC: 0.9812543210790038
Decision Trees: 0.974555636999304
Random Forest: 0.9845169525033933
Random Forest - More Params: 0.9847504930943666
Gradient Boosted Trees: 0.9810554647342146


In [None]:
print('Hyperopt Best Models AUROC Metrics: Upsampling Models using SMOTE Data')
# Report the confusion-matrix counts and derived metrics for each prediction
# DataFrame built in the previous cell. The frames are referenced directly
# rather than looked up by name through globals().
for model, df in [('prediction_lr', prediction_lr),
                  ('prediction_lsvc', prediction_lsvc),
                  ('prediction_dt', prediction_dt),
                  ('prediction_rf', prediction_rf),
                  ('prediction_rf1', prediction_rf1),
                  ('prediction_gbt', prediction_gbt)]:
    # A single aggregation returns the row count of every (label, prediction)
    # pair, replacing four filtered counts plus two full counts per model.
    confusion_counts = {
        (row['label'], row['prediction']): row['count']
        for row in df.groupBy('label', 'prediction').count().collect()
    }

    # Integer keys match whether Spark returns the values as int or float,
    # because 1 == 1.0 and both hash identically in Python.
    tp = confusion_counts.get((1, 1), 0)
    tn = confusion_counts.get((0, 0), 0)
    fp = confusion_counts.get((0, 1), 0)
    fn = confusion_counts.get((1, 0), 0)
    total = sum(confusion_counts.values())  # equals df.count()

    # Guard each ratio on its own denominator. The earlier branch structure
    # divided by (tp + fp) even when it was zero, raising ZeroDivisionError
    # for a frame with no positive labels and no positive predictions.
    a = (tp + tn) / total if total else 0.0
    r = float(tp) / (tp + fn) if (tp + fn) else 0.0
    p = float(tp) / (tp + fp) if (tp + fp) else 0.0
    f1 = 2 * ((p * r) / (p + r)) if (p + r) else 0

    print('\nModel:', model)
    print('True Positives:', tp)
    print('True Negatives:', tn)
    print('False Positives:', fp)
    print('False Negatives:', fn)
    print('Total:', total)
    print('Accuracy:', a)
    print('Recall:', r)
    print('Precision: ', p)
    print('F1 score:', f1)
    print('\n')

Hyperopt Best Models AUROC Metrics: Upsampling Models using SMOTE Data

Model: prediction_lr
True Positives: 50199
True Negatives: 376146
False Positives: 1702
False Negatives: 4426
Total: 432473
Accuracy: 0.9858303292922332
Recall: 0.918974828375286
Precision:  0.9672067975568871
F1 score: 0.9424741377691831



Model: prediction_lsvc
True Positives: 48829
True Negatives: 375537
False Positives: 2311
False Negatives: 5796
Total: 432473
Accuracy: 0.9812543210790038
Recall: 0.8938947368421053
Precision:  0.9548103245991396
F1 score: 0.9233489339573584



Model: prediction_dt
True Positives: 50116
True Negatives: 371353
False Positives: 6495
False Negatives: 4509
Total: 432473
Accuracy: 0.974555636999304
Recall: 0.9174553775743707
Precision:  0.8852696472417021
F1 score: 0.9010751914847711



Model: prediction_rf
True Positives: 49231
True Negatives: 376546
False Positives: 1302
False Negatives: 5394
Total: 432473
Accuracy: 0.9845169525033933
Recall: 0.901254004576659
Precision:  0.974234

## Predictions and Metrics for the Hyperopt F1 Upsampling Models on the SMOTE Test Set

In [None]:
# Score the SMOTE test set with each Hyperopt F1 pipeline from the upsampling (US) run.
# The prediction_* names stay as notebook globals because the next cell reads them.
prediction_lr = pipelineModel_lr_hyperopt_f1_US.transform(testDF_SMOTE)
prediction_lsvc = pipelineModel_lsvc_hyperopt_f1_US.transform(testDF_SMOTE)
prediction_dt = pipelineModel_dt_hyperopt_f1_US.transform(testDF_SMOTE)
prediction_rf = pipelineModel_rf_hyperopt_f1_US.transform(testDF_SMOTE)
prediction_rf1 = pipelineModel_rf_hyperopt_f1_1_US.transform(testDF_SMOTE)
prediction_gbt = pipelineModel_gbt_hyperopt_f1_US.transform(testDF_SMOTE)

# Display name -> scored DataFrame, in the order the report lists them.
named_predictions = [
    ('Logistic Regression', prediction_lr),
    ('LinearSVC', prediction_lsvc),
    ('Decision Trees', prediction_dt),
    ('Random Forest', prediction_rf),
    ('Random Forest - More Params', prediction_rf1),
    ('Gradient Boosted Trees', prediction_gbt),
]

# Section heading -> evaluator used for every model in that section.
metric_sections = [
    ('Area Under ROC Curve:', evaluator_auroc),
    ('Accuracy:', evaluator_acc),
]

print('Hyperopt Best Models F1 Metrics: Upsampling Models using SMOTE Data')
for section_title, section_evaluator in metric_sections:
    print('\n')
    print(section_title)
    for display_name, scored_df in named_predictions:
        print(display_name + ':', section_evaluator.evaluate(scored_df))

Hyperopt Best Models F1 Metrics: Upsampling Models using SMOTE Data


Area Under ROC Curve:
Logistic Regression: 0.9870101538778137
LinearSVC: 0.9801636688553507
Decision Trees: 0.9564704240277363
Random Forest: 0.9845886468846079
Random Forest - More Params: 0.9832680256640207
Gradient Boosted Trees: 0.9848670786073231


Accuracy:
Logistic Regression: 0.9858303292922332
LinearSVC: 0.9814716756884245
Decision Trees: 0.9809121031833201
Random Forest: 0.9849354757406821
Random Forest - More Params: 0.9861586734894433
Gradient Boosted Trees: 0.983296067037711


In [None]:
print('Hyperopt Best Models F1 Metrics: Upsampling Models using SMOTE Data')
# Report the confusion-matrix counts and derived metrics for each prediction
# DataFrame built in the previous cell. The frames are referenced directly
# rather than looked up by name through globals().
for model, df in [('prediction_lr', prediction_lr),
                  ('prediction_lsvc', prediction_lsvc),
                  ('prediction_dt', prediction_dt),
                  ('prediction_rf', prediction_rf),
                  ('prediction_rf1', prediction_rf1),
                  ('prediction_gbt', prediction_gbt)]:
    # A single aggregation returns the row count of every (label, prediction)
    # pair, replacing four filtered counts plus two full counts per model.
    confusion_counts = {
        (row['label'], row['prediction']): row['count']
        for row in df.groupBy('label', 'prediction').count().collect()
    }

    # Integer keys match whether Spark returns the values as int or float,
    # because 1 == 1.0 and both hash identically in Python.
    tp = confusion_counts.get((1, 1), 0)
    tn = confusion_counts.get((0, 0), 0)
    fp = confusion_counts.get((0, 1), 0)
    fn = confusion_counts.get((1, 0), 0)
    total = sum(confusion_counts.values())  # equals df.count()

    # Guard each ratio on its own denominator. The earlier branch structure
    # divided by (tp + fp) even when it was zero, raising ZeroDivisionError
    # for a frame with no positive labels and no positive predictions.
    a = (tp + tn) / total if total else 0.0
    r = float(tp) / (tp + fn) if (tp + fn) else 0.0
    p = float(tp) / (tp + fp) if (tp + fp) else 0.0
    f1 = 2 * ((p * r) / (p + r)) if (p + r) else 0

    print('\nModel:', model)
    print('True Positives:', tp)
    print('True Negatives:', tn)
    print('False Positives:', fp)
    print('False Negatives:', fn)
    print('Total:', total)
    print('Accuracy:', a)
    print('Recall:', r)
    print('Precision: ', p)
    print('F1 score:', f1)
    print('\n')

Hyperopt Best Models F1 Metrics: Upsampling Models using SMOTE Data

Model: prediction_lr
True Positives: 50199
True Negatives: 376146
False Positives: 1702
False Negatives: 4426
Total: 432473
Accuracy: 0.9858303292922332
Recall: 0.918974828375286
Precision:  0.9672067975568871
F1 score: 0.9424741377691831



Model: prediction_lsvc
True Positives: 48978
True Negatives: 375482
False Positives: 2366
False Negatives: 5647
Total: 432473
Accuracy: 0.9814716756884245
Recall: 0.8966224256292906
Precision:  0.9539186662511686
F1 score: 0.9243835461314158



Model: prediction_dt
True Positives: 50417
True Negatives: 373801
False Positives: 4047
False Negatives: 4208
Total: 432473
Accuracy: 0.9809121031833201
Recall: 0.9229656750572083
Precision:  0.9256940364277321
F1 score: 0.9243278424039088



Model: prediction_rf
True Positives: 49291
True Negatives: 376667
False Positives: 1181
False Negatives: 5334
Total: 432473
Accuracy: 0.9849354757406821
Recall: 0.9023524027459954
Precision:  0.9766008