In [1]:
# By default, the notebook prints the models comparing table as an output 
# only. If it is necessary to check preliminary print output, change value 
# of this parameter to True.
show_intermediate_output = False

In [2]:
# show print output which was used for analysis, feature engineering and
# regression adjustments, and is not very useful in terms of explanation, 
# but can be used for future adjustments and development (activation will
# significantly increase the time of runnig, so is NOT RECOMMENDED)
show_dev_output = False

In [3]:
# importing library to control time of running
import time

# activating timer
start_time = time.time()

In [4]:
# importing libraries

# data processing and analysis essentials
import pandas as pd

# essential graphical output
import matplotlib.pyplot as plt

# enhanced graphical output
import seaborn as sns

# mathematical essentials
import numpy as np 

# train/test split
from sklearn.model_selection import train_test_split 

# k nearest neighbors for regression
from sklearn.neighbors import KNeighborsRegressor

# logistic regression
from sklearn.linear_model import LogisticRegression  

# auc score
from sklearn.metrics import roc_auc_score            

# KNN for classification
from sklearn.neighbors import KNeighborsClassifier   

# standard scaler
from sklearn.preprocessing import StandardScaler     
  
# customizable scorer
from sklearn.metrics import make_scorer              

# hyperparameter tuning
from sklearn.model_selection import GridSearchCV     

# confusion matrix
from sklearn.metrics import confusion_matrix

# classification trees
from sklearn.tree import DecisionTreeClassifier 

# exports graphics
from sklearn.tree import export_graphviz

# saves objects in memory
from six import StringIO

# displays on frontend
from IPython.display import Image 

# interprets dot objects
import pydotplus                                     

# random forest classifier
from sklearn.ensemble import RandomForestClassifier

# gradient boosting classifier
from sklearn.ensemble import GradientBoostingClassifier


# deactivating warnings
import warnings
warnings.filterwarnings('ignore')

In [5]:
# setting visualization style 
sns.set_style('whitegrid')

# setting palette
my_palette = sns.color_palette("gist_earth")
sns.set_palette(my_palette)

In [6]:
# setting pandas output option
pd.options.display.max_columns = 150
pd.options.display.max_rows = 500

In [7]:
# setting a path to the file with data
file = './datasets/Apprentice_Chef_Dataset.xlsx'

# reading the file
data = pd.read_excel(file)

In [8]:
#################### creating features from object values ####################

In [9]:
######################## creating features from EMAIL ########################

In [10]:
# email domain types

# domain names of professional emails
domain_professional = [
    '@mmm.com', '@amex.com,' '@apple.com', '@boeing.com', 
    '@caterpillar.com', '@chevron.com', '@cisco.com', 
    '@cocacola.com','@disney.com', '@dupont.com', 
    '@exxon.com', '@ge.org', '@goldmansacs.com', 
    '@homedepot.com', '@ibm.com', '@intel.com', '@jnj.com', 
    '@jpmorgan.com', '@mcdonalds.com', '@merck.com', 
    '@microsoft.com', '@nike.com', '@pfizer.com', 
    '@pg.com', '@travelers.com', '@unitedtech.com', 
    '@unitedhealth.com', '@verizon.com', '@visa.com', 
    '@walmart.com', '@amex.com', '@apple.com']

# domain names of personal emails
domain_personal = ['@gmail.com', '@yahoo.com', '@protonmail.com']

# domain names of junk emails
domain_junk = [
    '@me.com', '@aol.com', '@hotmail.com', '@live.com', 
    '@msn.com', '@passport.com']

In [11]:
# placeholders of dummy variables for domain types
data['domain_personal'] = 0
data['domain_professional'] = 0
data['domain_junk'] = 0


# looping over each email address
for index, col in data.iterrows():
    
    # splitting email domain at '@' and add '@' at the beginning
    domain = '@' + data.loc[index, 'EMAIL'].split(sep='@')[1]
    
    # filling out personal domain column
    if domain in domain_personal:
        data.loc[index, 'domain_personal'] = 1

    # filling out professional domain column
    elif domain in domain_professional:
        data.loc[index, 'domain_professional'] = 1

    # filling out junk domain column
    elif domain in domain_junk:
        data.loc[index, 'domain_junk'] = 1
    
    # print message if a domain is missing in all lists
    else:
        print('Unknown')

        
# preventing print output if it is not needed
if show_dev_output:
    
    # checking results
    print(f"""
domain_personal {data['domain_personal'].sum():12}
domain_professional {data['domain_professional'].sum():8}
domain_junk {data['domain_junk'].sum():16}""")

In [12]:
##################### creating features from FIRST_NAME ######################

In [13]:
# list of the results of gender guesser running from part I
genders_initial = [
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'female', 
    'male', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'male', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 
    'unknown', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 
    'male', 'unknown', 'male', 'unknown', 'male', 'unknown', 'unknown', 
    'unknown', 'male', 'male', 'unknown', 'unknown', 'male', 'male', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 
    'male', 'unknown', 'unknown', 'mostly_male', 'female', 'unknown', 'male', 
    'male', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'male', 
    'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 
    'mostly_male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'male', 'male', 'male', 'unknown', 'male', 'unknown', 'unknown', 
    'male', 'male', 'male', 'female', 'unknown', 'unknown', 'male', 'unknown', 
    'unknown', 'unknown', 'female', 'unknown', 'male', 'unknown', 'male', 
    'unknown', 'female', 'male', 'unknown', 'unknown', 'male', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'female', 'male', 'female', 
    'female', 'unknown', 'male', 'unknown', 'male', 'unknown', 'male', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 
    'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'male', 'female', 'unknown', 'andy', 'unknown', 'unknown', 'unknown', 
    'unknown', 'male', 'mostly_male', 'unknown', 'unknown', 'male', 'unknown', 
    'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'mostly_male', 
    'unknown', 'mostly_male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'male', 'male', 'unknown', 'unknown', 'female', 
    'unknown', 'male', 'unknown', 'unknown', 'male', 'unknown', 'male', 
    'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 
    'unknown', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 
    'male', 'unknown', 'male', 'unknown', 'female', 'unknown', 'unknown', 
    'unknown', 'male', 'unknown', 'mostly_male', 'unknown', 'unknown', 'male', 
    'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 
    'unknown', 'male', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'male', 'unknown', 'female', 'unknown', 
    'unknown', 'male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 
    'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'female', 'unknown', 'male', 'unknown', 'male', 
    'unknown', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'male', 'male', 'unknown', 'male', 'unknown', 
    'unknown', 'unknown', 'unknown', 'male', 'female', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'male', 'male', 'unknown', 'unknown', 'mostly_male', 
    'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 
    'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'mostly_female', 'unknown', 'unknown', 
    'andy', 'unknown', 'unknown', 'female', 'male', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'mostly_male', 'unknown', 'unknown', 'male', 
    'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 
    'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'mostly_female', 
    'unknown', 'male', 'male', 'unknown', 'unknown', 'male', 'unknown', 
    'unknown', 'unknown', 'andy', 'male', 'unknown', 'unknown', 'male', 'male', 
    'female', 'female', 'unknown', 'unknown', 'male', 'unknown', 'male', 
    'unknown', 'unknown', 'male', 'male', 'unknown', 'unknown', 'unknown', 
    'unknown', 'female', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'mostly_female', 'female', 
    'unknown', 'male', 'female', 'unknown', 'unknown', 'unknown', 'female', 
    'male', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'male', 'male', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'female', 'female', 'unknown', 'male', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'female', 'male', 'unknown', 'unknown', 'male', 'male', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 
    'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 
    'unknown', 'mostly_female', 'unknown', 'unknown', 'unknown', 'unknown', 
    'mostly_female', 'mostly_female', 'male', 'unknown', 'unknown', 'unknown', 
    'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'female', 'female', 'female', 'male', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'male', 'unknown', 'male', 'female', 'unknown', 
    'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 
    'male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 
    'unknown', 'female', 'male', 'male', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 
    'male', 'unknown', 'male', 'male', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 
    'unknown', 'unknown', 'male', 'male', 'male', 'male', 'male', 'male', 
    'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'male', 'female', 'unknown', 
    'unknown', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'mostly_female', 'mostly_female', 'unknown', 'male', 'unknown', 
    'female', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'male', 
    'female', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 
    'unknown', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'male', 'unknown', 'unknown', 'male', 'male', 'unknown', 'female', 
    'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'female', 'unknown', 'unknown', 'male', 'male', 'male', 'male', 'unknown', 
    'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'male', 'unknown', 'male', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'male', 'male', 'female', 'male', 'male', 
    'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'male', 'male', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'female', 'unknown', 'male', 'unknown', 'unknown', 'male', 'unknown', 
    'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 
    'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'male', 'male', 
    'male', 'mostly_male', 'male', 'male', 'male', 'male', 'mostly_male', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'male', 
    'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 
    'male', 'male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 
    'unknown', 'unknown', 'male', 'male', 'unknown', 'male', 'unknown', 
    'unknown', 'andy', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 
    'male', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'male', 
    'unknown', 'unknown', 'unknown', 'mostly_male', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 
    'female', 'unknown', 'male', 'unknown', 'male', 'unknown', 'unknown', 
    'unknown', 'male', 'male', 'mostly_male', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'male', 'male', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'female', 'male', 'unknown', 'unknown', 
    'unknown', 'unknown', 'female', 'unknown', 'male', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'female', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 
    'unknown', 'male', 'male', 'male', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 
    'unknown', 'unknown', 'male', 'unknown', 'male', 'unknown', 'male', 
    'unknown', 'male', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'male', 'unknown', 'female', 'female', 'male', 'male', 'female', 
    'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'mostly_male', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'mostly_male', 'unknown', 'unknown', 
    'unknown', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 
    'male', 'andy', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'male', 'unknown', 'female', 'unknown', 'male', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 
    'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'male', 
    'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'male', 'male', 'male', 'male', 'unknown', 'female', 'unknown', 'unknown', 
    'male', 'male', 'unknown', 'unknown', 'unknown', 'andy', 'unknown', 'female', 
    'male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 
    'unknown', 'male', 'male', 'male', 'unknown', 'unknown', 'unknown', 
    'unknown', 'male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'female', 'female', 'unknown', 'unknown', 'male', 'unknown', 'male', 
    'unknown', 'unknown', 'unknown', 'mostly_male', 'male', 'male', 'male', 
    'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'male', 'unknown', 'unknown', 'unknown', 'male', 'male', 'male', 'male', 
    'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'male', 
    'unknown', 'male', 'unknown', 'unknown', 'unknown', 'mostly_male', 'male', 
    'unknown', 'male', 'unknown', 'unknown', 'female', 'unknown', 'male', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'female', 'male', 'male', 'unknown', 'male', 'male', 'male', 'male', 
    'unknown', 'unknown', 'unknown', 'female', 'male', 'male', 'unknown', 'male', 
    'unknown', 'mostly_female', 'male', 'unknown', 'unknown', 'female', 'male', 
    'unknown', 'male', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'male', 'male', 'unknown', 'unknown', 
    'unknown', 'unknown', 'female', 'male', 'unknown', 'unknown', 'unknown', 
    'female', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'mostly_female', 
    'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'male', 'male', 
    'female', 'unknown', 'unknown', 'male', 'male', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'male', 'unknown', 'male', 'male', 
    'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 
    'female', 'unknown', 'unknown', 'male', 'male', 'unknown', 'unknown', 
    'unknown', 'male', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 
    'female', 'unknown', 'mostly_male', 'mostly_male', 'male', 'unknown', 
    'unknown', 'unknown', 'unknown', 'male', 'unknown', 'male', 'unknown', 
    'female', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 
    'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'male', 'male', 'unknown', 'unknown', 'male', 'male', 'male', 'mostly_male', 
    'unknown', 'unknown', 'male', 'andy', 'unknown', 'unknown', 'unknown', 
    'male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'male', 
    'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 
    'male', 'unknown', 'unknown', 'unknown', 'mostly_female', 'unknown', 'male', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 
    'unknown', 'unknown', 'male', 'male', 'unknown', 'unknown', 'male', 'female', 
    'unknown', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'female', 
    'unknown', 'unknown', 'mostly_female', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'female', 'unknown', 'female', 'male', 
    'female', 'mostly_female', 'unknown', 'unknown', 'male', 'unknown', 
    'unknown', 'male', 'unknown', 'male', 'male', 'unknown', 'unknown', 'male', 
    'male', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'mostly_female', 
    'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 
    'female', 'unknown', 'unknown', 'unknown', 'female', 'female', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 
    'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'male', 'unknown', 'female', 'female', 'female', 'unknown', 'unknown', 
    'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 
    'mostly_female', 'male', 'unknown', 'unknown', 'female', 'unknown', 
    'unknown', 'unknown', 'unknown', 'male', 'mostly_female', 'unknown', 
    'unknown', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'female', 'mostly_female', 
    'female', 'female', 'male', 'male', 'male', 'unknown', 'unknown', 
    'mostly_female', 'unknown', 'unknown', 'male', 'male', 'unknown', 'male', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 
    'unknown', 'female', 'unknown', 'male', 'unknown', 'unknown', 'male', 
    'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'male', 'female', 'unknown', 
    'unknown', 'unknown', 'unknown', 'male', 'male', 'female', 'unknown', 'male', 
    'unknown', 'unknown', 'male', 'male', 'unknown', 'unknown', 'male', 
    'unknown', 'andy', 'unknown', 'unknown', 'male', 'male', 'male', 'unknown', 
    'male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 
    'female', 'unknown', 'unknown', 'unknown', 'mostly_male', 'male', 'male', 
    'unknown', 'male', 'unknown', 'mostly_male', 'female', 'unknown', 'male', 
    'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'male', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 
    'male', 'male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 
    'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'mostly_male', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'mostly_female', 'unknown', 'unknown', 'unknown', 'female', 'female', 
    'unknown', 'unknown', 'unknown', 'mostly_male', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 
    'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 
    'mostly_female', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 
    'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'male', 
    'unknown', 'unknown', 'male', 'male', 'unknown', 'female', 'unknown', 
    'unknown', 'male', 'male', 'male', 'male', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'mostly_female', 'male', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'andy', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 
    'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 
    'unknown', 'unknown', 'male', 'mostly_male', 'unknown', 'male', 'male', 
    'unknown', 'unknown', 'male', 'male', 'male', 'male', 'andy', 'unknown', 
    'unknown', 'female', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'male', 'female', 'female', 'unknown', 
    'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 
    'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
    'male', 'unknown', 'unknown', 'female', 'unknown', 'unknown']

In [14]:
# converting list of gender guesses into a series and add to the data 
# dataframe
data['gender_guess'] = pd.Series(genders_initial)

In [15]:
# replacing mostly_male by male
data.loc[:, 'gender_guess'][data['gender_guess'] == 'mostly_male'] = 'male'

In [16]:
# replacing mostly_female by female
data.loc[:, 'gender_guess'][data['gender_guess'] == 'mostly_female'] = \
                                                                     'female'

In [17]:
# replacing andy by unknown
data.loc[:, 'gender_guess'][data['gender_guess'] == 'andy'] = 'unknown'

In [18]:
# creating dummy variables from the gender_guess column
data = data.join(pd.get_dummies(data['gender_guess'], prefix = 'gender'))

# dropping the gender_guess column
data.drop('gender_guess', axis=1, inplace=True)

In [19]:
## creating new variables as a result of visualisations analysis in part I ###

In [20]:
############# creating variable from PRODUCT_CATEGORIES_VIEWED ###############

In [21]:
# creating a dummy variable for PRODUCT_CATEGORIES_VIEWED <= 3
data['product_categories_viewed_up_to_3'] = \
    data['PRODUCT_CATEGORIES_VIEWED'].apply(lambda x: 1 if x <= 3 else 0)

In [22]:
# creating a dummy variable for PRODUCT_CATEGORIES_VIEWED > 3 and <= 9
data['product_categories_viewed_4_9'] = \
    data['PRODUCT_CATEGORIES_VIEWED'].apply(lambda x: 1 if x <= 9 and 
                                                                x > 3 else 0)

In [23]:
# creating a dummy variable for PRODUCT_CATEGORIES_VIEWED > 9
data['product_categories_viewed_10plus'] = \
    data['PRODUCT_CATEGORIES_VIEWED'].apply(lambda x: 1 if x > 9 else 0)

In [24]:
################## log(x + 1) transforming count variables ###################

In [25]:
data['log_LATE_DELIVERIES'] = np.log1p(data['LATE_DELIVERIES'])

data['log_TOTAL_MEALS_ORDERED'] = np.log1p(data['TOTAL_MEALS_ORDERED'])

data['log_TOTAL_PHOTOS_VIEWED'] = np.log1p(data['TOTAL_PHOTOS_VIEWED'])

In [26]:
###################### additional features engineering #######################

In [27]:
# total logins
data['total_logins'] = data['PC_LOGINS'] + data['MOBILE_LOGINS']

In [28]:
# as we know from the dictionary, LARGEST_ORDER_SIZE in reality stands for 
# average number of meals ordered per customer. This raises questions,
# but at least the following division operation might make sense
data['term'] = data['TOTAL_MEALS_ORDERED'] / (data['LARGEST_ORDER_SIZE'] + 1)

# logarithm transformed parameter
data['log_term'] = np.log1p(data['term'])

In [29]:
# since it is necessary to prove the age to get alcohol beverages, customers 
# who want to purchase alcohol have to submit a picture of a government-issued 
# ID card, so we can assume that family names of people who submitted pictures
# of their IDs are known

# to filter customers who did not submitted their IDs we will take into 
# account customers with missing family names or those ones whose first and
# last names are equal

# we will also tale into accout those customers whose family names don't 
# appear to be real family names, that is start with 'the' or 'of'

# replacing NaNs with empty strings
data['FAMILY_NAME'].fillna('', inplace=True)

# creating the column with a feature of no last name and filling it with 0s
data['no_family_name'] = 0
# filling the column with 1s for customers with no family name or customers 
# whose first and last names are equal
data.loc[:, 'no_family_name'][data['FAMILY_NAME'] == ''] = 1
data.loc[:, 'no_family_name'][data['FIRST_NAME'] == data['FAMILY_NAME']] = 1

# filling the column for customers whose family names start with 'the' or 'of'
for i in range(0, len(data['FAMILY_NAME'])):
    try:
        if data.loc[i, 'FAMILY_NAME'].split()[0] == 'the' \
                            or data.loc[i, 'FAMILY_NAME'].split()[0] == 'of':
            data.loc[i, 'no_family_name'] = 1
    except:
        pass

In [30]:
# creating features of quantity of words and letters in customers' names
# creating the columns and filling them with zeros
data['name_length_words'] = 0
data['name_length_letters'] = 0

# filling the columns with quantity of words and letters respectively
try:
    for i in range(len(data)):
        data.loc[i, 'name_length_words'] = \
            len(data.loc[i, 'FIRST_NAME'].split()) \
                    + len(data.loc[i, 'FAMILY_NAME'].split())
        data.loc[i, 'name_length_letters'] = \
            len(data.loc[i, 'FIRST_NAME']) + len(data.loc[i, 'FAMILY_NAME'])
except:
    pass

In [31]:
# creating a feature of family names frequency
family_dict = dict(data['FAMILY_NAME'].value_counts())

for i in range(len(data)):
    data.loc[i, 'family_freq'] = family_dict[data.loc[i, 'FAMILY_NAME']]

In [32]:
# creating columns for 25th, 50th, and 75th percentiles of revenue
percentiles = np.percentile(data['REVENUE'], [25, 50, 75])

data['rev_perc_25'] = data['REVENUE'].\
    apply(lambda x: 1 if x <= percentiles[0] else 0)

data['rev_perc_50'] = data['REVENUE'].\
    apply(lambda x: 1 if x > percentiles[0] and x <= percentiles[1] else 0)

data['rev_perc_75'] = data['REVENUE'].\
    apply(lambda x: 1 if x > percentiles[1] and x <= percentiles[2] else 0)

In [33]:
# creating a feature of contacts with customer service frequence in relation 
# with total quantity of orders
data['contacts_freq'] = data['CONTACTS_W_CUSTOMER_SERVICE'] \
                                            / data['TOTAL_MEALS_ORDERED']

In [34]:
# creating a feature of average photos viewed per session
data['avg_photos_per_visit'] = data['TOTAL_PHOTOS_VIEWED'] \
                                            / data['total_logins']

In [35]:
# creating a feature of total clicks
data['total_clicks'] = data['AVG_CLICKS_PER_VISIT'] * data['total_logins']

In [36]:
# creating a feature of total time spent on the company's website
data['total_time_on_site'] = data['AVG_TIME_PER_SITE_VISIT'] \
                                            * data['total_logins']

In [37]:
# # checking results
# data.head(3)

In [38]:
############### creating a list of all explanatory variables #################

In [39]:
features_all = pd.Series(data.columns)
features_all = features_all.drop([1, 2, 3, 4, 5]).to_list()

In [40]:
#################### checking correlation coefficients #######################

In [41]:
# preventing print output if it is not needed
if show_dev_output:
    
    # creating a correlation matrix
    df_corr = data.corr().round(2)

    # printing correlations with log_REVENUE
    print(df_corr.loc['CROSS_SELL_SUCCESS']\
          [abs(df_corr.loc['CROSS_SELL_SUCCESS']) > 0.00]\
                                                .sort_values(ascending=False))

In [42]:
# preventing print output if it is not needed
if show_dev_output:
    
    feat_corr = list(df_corr.loc['CROSS_SELL_SUCCESS']\
          [((abs(df_corr.loc['CROSS_SELL_SUCCESS']) > 0.069) & \
            (df_corr.index != 'CROSS_SELL_SUCCESS'))].index)
    print(feat_corr)

In [43]:
############# building and adjusting a models in scikit-learn ################

In [44]:
############################# preparing data #################################

In [45]:
# creating a dictionary to store candidate models

candidate_dict = {

'final': 
            ['domain_junk',
             'no_family_name',
             'WEEKLY_PLAN',
             'AVG_PREP_VID_TIME',
             'CANCELLATIONS_BEFORE_NOON',
             'total_time_on_site',
             'LARGEST_ORDER_SIZE',
             'log_term',
             'gender_unknown',
             'avg_photos_per_visit',
             'log_LATE_DELIVERIES',
             'contacts_freq',
             'product_categories_viewed_4_9',
             'EARLY_DELIVERIES',
             'total_logins',
             'rev_perc_50',
             'log_TOTAL_PHOTOS_VIEWED',
             'MOBILE_NUMBER',
             'log_TOTAL_MEALS_ORDERED',
             'CANCELLATIONS_AFTER_NOON',
             'MOBILE_LOGINS',
             'gender_female',
             'REFRIGERATED_LOCKER',
             'domain_personal',
             'REVENUE',
             'total_clicks',
             'TASTES_AND_PREFERENCES',
             'name_length_words',
             'family_freq',
             'name_length_letters'], 
    
'sig_dt4': 
            ['domain_junk',
             'no_family_name',
             'WEEKLY_PLAN',
             'AVG_PREP_VID_TIME',
             'CANCELLATIONS_BEFORE_NOON',
             'total_time_on_site',
             'LARGEST_ORDER_SIZE',
             'log_term',
             'gender_unknown',
             'avg_photos_per_visit',
             'log_LATE_DELIVERIES',
             'contacts_freq',
             'product_categories_viewed_4_9',
             'EARLY_DELIVERIES',
             'total_logins',
             'rev_perc_50',
             'log_TOTAL_PHOTOS_VIEWED',
             'MOBILE_NUMBER',
             'log_TOTAL_MEALS_ORDERED',
             'CANCELLATIONS_AFTER_NOON',
             'MOBILE_LOGINS',
             'gender_female',
             'REFRIGERATED_LOCKER',
             'domain_personal',
             'REVENUE',
             'total_clicks',
             'TASTES_AND_PREFERENCES'], 
    
'007_plus_l':
            ['MOBILE_NUMBER',
             'TASTES_AND_PREFERENCES',
             'REFRIGERATED_LOCKER',
             'domain_professional',
             'domain_junk',
             'gender_female',
             'gender_male',
             'log_CANCELLATIONS_BEFORE_NOON',
             'log_total_cancellations',
             'cancelled_before_noon',
             'cancelled_orders',
             'no_family_name'], 

}

In [46]:
# preparing the full dataset
X_data = data[features_all]
y_data = data.loc[:, 'CROSS_SELL_SUCCESS']

# subsetting
x_data = X_data.loc[:, candidate_dict['final']] 

# train/test splitting of the subset
x_train, x_test, y_train, y_test = train_test_split(
                                                    x_data,
                                                    y_data,
                                                    test_size=0.25,
                                                    random_state=219,
                                                    stratify=y_data)

In [47]:
# instantiating StandardScaler()
scaler = StandardScaler()

# fitting the data
scaler.fit(x_data)

# transforming the data
X_scaled = scaler.transform(x_data)

# converting to a DataFrame
x_data_scaled = pd.DataFrame(X_scaled)

# train-test split with the scaled data
x_train_scaled, x_test_scaled, y_train, y_test = train_test_split(
                                                        x_data_scaled,
                                                        y_data,
                                                        test_size=0.25,
                                                        random_state=219,
                                                        stratify=y_data)

In [48]:
############################ Logistic Regression #############################

In [49]:
# instantiating a logistic regression model
lr = LogisticRegression(solver='lbfgs',
                        C=1.0,
                        warm_start=False,
                        max_iter=100,
                        random_state=219)

# fitting the training data
lr_fit = lr.fit(x_train_scaled, y_train)

# predicting based on the testing set
lr_pred = lr_fit.predict(x_test_scaled)

# saving scoring data for future use
lr_train_score = lr_fit.score(x_train_scaled, y_train).round(4) 
lr_test_score = lr_fit.score(x_test_scaled, y_test).round(4)   
lr_auc = roc_auc_score(y_true=y_test, y_score=lr_pred).round(4) 

# preventing print output if it is not needed
if show_intermediate_output:
    
    # printing scoring data
    print('Training ACCURACY:', lr_train_score)
    print('Testing ACCURACY :', lr_test_score)
    print('AUC Score        :', lr_auc)

In [50]:
# unpacking the confusion matrix
lr_tn, \
lr_fp, \
lr_fn, \
lr_tp = confusion_matrix(y_true=y_test, y_pred=lr_pred).ravel()

# preventing print output if it is not needed
if show_intermediate_output:
    
    # printing each result one-by-one
    print(f"""
True Negatives : {lr_tn}
False Positives: {lr_fp}
False Negatives: {lr_fn}
True Positives : {lr_tp}
""")

In [51]:
# GridSearchCV

# declaring a hyperparameter space
C_space = [5.1]
warm_start_space = [True]
solver_space = ['lbfgs']
# C_space = pd.np.arange(1.4, 5.5, 0.1)
# warm_start_space = [True, False]
# solver_space = ['newton-cg', 'sag', 'lbfgs']

# creating a hyperparameter grid
param_grid = {'C': C_space,
              'warm_start': warm_start_space,
              'solver': solver_space}

# instantiating the model object without hyperparameters
lr_tuned = LogisticRegression(random_state=219,
                              max_iter=1000)

# GridSearchCV object
lr_tuned_cv_scaled = GridSearchCV(estimator=lr_tuned, 
                                  param_grid=param_grid, 
                                  cv=3,         
                                  n_jobs=-1,
                                  scoring=make_scorer(
                                                      roc_auc_score,
                                                      needs_threshold=False)) 

# fitting to the full dataset
lr_tuned_cv_scaled.fit(x_data_scaled, y_data)

# preventing print output if it is not needed
if show_dev_output:
    
    # printing the optimal parameters and best score
    print("Tuned Parameters:", lr_tuned_cv_scaled.best_params_)
    print("Tuned CV AUC    :", lr_tuned_cv_scaled.best_score_.round(4))

In [52]:
# preventing print output if it is not needed
if show_dev_output:
    
    # checking the best estimator for the model
    print(lr_tuned_cv_scaled.best_estimator_)

In [53]:
# building a model based on hyperparameter tuning results

# instantiating a logistic regression model with tuned values
lr_tuned = lr_tuned_cv_scaled.best_estimator_

# predicting based on the testing set
lr_tuned_pred = lr_tuned.predict(x_test_scaled)

# saving scoring data for future use
lr_tuned_train_score = lr_tuned.score(x_train_scaled, y_train).round(4) 
lr_tuned_test_score = lr_tuned.score(x_test_scaled, y_test).round(4) 
lr_tuned_auc = roc_auc_score(y_true=y_test, y_score=lr_tuned_pred).round(4)

# preventing print output if it is not needed
if show_intermediate_output:
    
    # printing scoring data
    print('Training ACCURACY:', lr_tuned_train_score)
    print('Testing ACCURACY :', lr_tuned_test_score)
    print('AUC Score        :', lr_tuned_auc)

In [54]:
# unpacking the confusion matrix
lr_tuned_tn, \
lr_tuned_fp, \
lr_tuned_fn, \
lr_tuned_tp = confusion_matrix(y_true=y_test, y_pred=lr_tuned_pred).ravel()

# preventing print output if it is not needed
if show_intermediate_output:
    
    # printing each result one-by-one
    print(f"""
True Negatives : {lr_tuned_tn}
False Positives: {lr_tuned_fp}
False Negatives: {lr_tuned_fn}
True Positives : {lr_tuned_tp}
""")

In [55]:
def model_adjusting(model, x_train, y_train, x_test, y_test, threshold, 
                    print_output=False):
    """
    Changes the threshold of classification model from 50/50 to the given 
    value, computes and returns training and testing scores, roc-auc metric, 
    and confusion matrix parameters, if necessary, prints scores and confusion
    matrix parameters. 
    
    PARAMETERS
    ----------
    model        : model object, classification model
    x_train      : dataframe, explanatory variable training data
    y_train      : series, response variable training data
    x_test       : dataframe, explanatory variable testing data
    y_train      : series, response variable testing data
    threshold    : float, the threshold which the function uses to predict 
                   classes 0 and 1
    print_output : bool, if the result should be printed, default False
    """

    # getting probabilities for train subset
    pred_proba_train = model.predict_proba(x_train)

    # placeholder for the list of predictions
    predictions_train = []
    # predicting for train subset
    for prob in pred_proba_train[:, 1]:
        if prob >= threshold:
            predictions_train.append(1)
        else:
            predictions_train.append(0)


    # getting probabilities for test subset
    pred_proba_test = model.predict_proba(x_test)

    # placeholder for the list of predictions
    predictions_test = []
    # predicting for test subset
    for prob in pred_proba_test[:, 1]:
        if prob >= threshold:
            predictions_test.append(1)
        else:
            predictions_test.append(0)


    # unpacking the confusion matrix for train subset
    tn_train, \
    fp_train, \
    fn_train, \
    tp_train = confusion_matrix(y_true=y_train, 
                                y_pred=predictions_train).ravel()

    # unpacking the confusion matrix for test subset
    tn_test, \
    fp_test, \
    fn_test, \
    tp_test = confusion_matrix(y_true=y_test, 
                               y_pred=predictions_test).ravel()


    # saving scoring results
    # train_score = round((tn_train + tp_train) \
    #                   / (tn_train + tp_train + fp_train + fn_train), 4)
    train_score = model.score(x_train, predictions_train).round(4)

    # test_score = round((tn_test + tp_test) \
    #                  / (tn_test + tp_test + fp_test + fn_test), 4)
    test_score = model.score(x_test, predictions_test).round(4)

    auc = roc_auc_score(y_true=y_test, y_score=predictions_test).round(4)


    # preventing print output if it is not needed
    if print_output:

        # printing scores
        print('Training ACCURACY:', train_score)
        print('Testing ACCURACY :', test_score)
        print('AUC Score        :', auc)

        # printing values of confusion matrix
        print(f"""
True Negatives : {tn_test}
False Positives: {fp_test}
False Negatives: {fn_test}
True Positives : {tp_test}
    """)
        
    return train_score, test_score, auc, tn_test, fp_test, fn_test, tp_test

In [56]:
# adjusting the model
    
# setting threshold
threshold = 0.713

# calling the function and getting accuracy and auc scores as well as 
# confusion matrix parameters
adj_lr_train_score, \
adj_lr_test_score, \
adj_lr_auc, \
adj_lr_tn_test, \
adj_lr_fp_test, \
adj_lr_fn_test, \
adj_lr_tp_test = model_adjusting(lr_tuned, 
                                 x_train_scaled, y_train, 
                                 x_test_scaled, y_test, threshold, 
                                 print_output=show_intermediate_output)

In [57]:
############################ Classification Trees ############################

In [58]:
# plot feature importances function
def plot_feature_importances(model, train, export=False):
    """
    Plots the importance of features from a CART model.
    
    PARAMETERS
    ----------
    model  : CART model
    train  : explanatory variable training data
    export : whether or not to export as a .png image, default False
    """
    
    # declaring the number
    n_features = x_train.shape[1]
    
    # setting plot window
    fig, ax = plt.subplots(figsize=(12,9))
    
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(pd.np.arange(n_features), train.columns)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    
    if export == True:
        plt.savefig('Tree_Leaf_50_Feature_Importance.png')

In [59]:
# preventing running if it is not needed
if show_dev_output:

    # instantiating a classification tree object
    dt = DecisionTreeClassifier(max_depth=8)

    # fitting the training data
    dt_fit = dt.fit(x_train, y_train)

    # predicting on new data
    dt_pred = dt_fit.predict(x_test)

    # saving scoring data for future use
    dt_train_score = dt_fit.score(x_train, y_train).round(4) 
    dt_test_score = dt_fit.score(x_test, y_test).round(4)
    dt_auc_score = roc_auc_score(y_true=y_test, y_score=dt_pred).round(4) 

    # printing scoring data
    print('Training ACCURACY:', dt_train_score)
    print('Testing ACCURACY :', dt_test_score)
    print('AUC Score        :', dt_auc_score)

In [60]:
# preventing print output if it is not needed
if show_dev_output:
    
    # instantiating a classification tree object
    dt_scaled = DecisionTreeClassifier(max_depth=8)

    # fitting the training data
    dt_fit_scaled = dt_scaled.fit(x_train_scaled, y_train)

    # predicting on new data
    dt_pred_scaled = dt_fit_scaled.predict(x_test_scaled)

    # saving scoring data for future use
    dt_scaled_train_score = dt_fit_scaled.\
                                    score(x_train_scaled, y_train).round(4) 
    dt_scaled_test_score = dt_fit_scaled.\
                                    score(x_test_scaled, y_test).round(4)   
    dt_scaled_auc_score = roc_auc_score(y_true=y_test,
                                        y_score=dt_pred_scaled).round(4)

    # printing scoring data
    print('Training ACCURACY:', dt_scaled_train_score)
    print('Testing ACCURACY :', dt_scaled_test_score)
    print('AUC Score        :', dt_scaled_auc_score)

In [61]:
# preventing print output if it is not needed
if show_dev_output:

    # plotting feature importance
    plot_feature_importances(dt_fit,
                             train=x_train,
                             export=False)

In [62]:
# # this chunk of code might be used to get features in descending order 
# # of their significance
# df = pd.DataFrame(zip(x_train.columns, 
#                       full_tree_fit.feature_importances_)).\
#                           sort_values(1, ascending=False)
# sig_dt = df[df[1]>0][0].to_list()
# sig_dt

In [63]:
# GridSearchCV

# declaring a hyperparameter space
criterion = ["gini"]
splitter = ["best"]
max_depth = [5]
min_samples_split = [2]
min_samples_leaf = [20]
max_features = [None]
class_weight = ['balanced']
min_weight_fraction_leaf = [0.0]
max_leaf_nodes = [None]
min_impurity_decrease = [0.0]
ccp_alpha = [0.0]
# criterion = ["gini"]
# splitter = ["best"]
# max_depth = range(3, 9)
# min_samples_split = range(2, 5)
# min_samples_leaf = range(15, 31)
# max_features = [None]
# class_weight = ['balanced']
# min_weight_fraction_leaf = [0.0, 0.0001, 0.001, 0.01, 0.1]
# max_leaf_nodes = [None]
# min_impurity_decrease = [0.0, 0.0001, 0.001, 0.01, 0.1]
# ccp_alpha = [0.0, 0.0001, 0.001, 0.01, 0.1]

# creating a hyperparameter grid
param_grid = {'criterion': criterion,
              'splitter': splitter,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf,
              'max_features': max_features,
              'class_weight': class_weight,
              'min_weight_fraction_leaf': min_weight_fraction_leaf, 
              'max_leaf_nodes': max_leaf_nodes, 
              'min_impurity_decrease': min_impurity_decrease, 
              'ccp_alpha': ccp_alpha}

# instantiating the model object without hyperparameters
dt_tuned = DecisionTreeClassifier(random_state=219)

# GridSearchCV object
dt_tuned_cv_scaled = GridSearchCV(estimator=dt_tuned,   
                                  param_grid=param_grid, 
                                  cv=3, 
                                  n_jobs=-1,
                                  scoring=make_scorer(roc_auc_score,
                                                      needs_threshold=False))

# fitting to the full dataset
dt_tuned_cv_scaled.fit(x_data_scaled, y_data)

# preventing print output if it is not needed
if show_dev_output:
    
    # printing the optimal parameters and best score
    print("Tuned Parameters:", dt_tuned_cv_scaled.best_params_)
    print("Tuned CV AUC    :", dt_tuned_cv_scaled.best_score_.round(4))

In [64]:
# preventing print output if it is not needed
if show_dev_output:

    # checking the best estimator for the model
    print(dt_tuned_cv_scaled.best_estimator_)

In [65]:
# building a model based on hyperparameter tuning results

# instantiating a logistic regression model with tuned values
dt_tuned = dt_tuned_cv_scaled.best_estimator_

# predicting based on the testing set
dt_tuned_pred = dt_tuned.predict(x_test_scaled)

# saving scoring data for future use
dt_tuned_train_score = dt_tuned.score(x_train_scaled, y_train).round(4) 
dt_tuned_test_score = dt_tuned.score(x_test_scaled, y_test).round(4)
dt_tuned_auc = roc_auc_score(y_true=y_test, y_score=dt_tuned_pred).round(4)

# preventing print output if it is not needed
if show_intermediate_output:
    
    # printing scoring data
    print('Training ACCURACY:', dt_tuned_train_score)
    print('Testing ACCURACY :', dt_tuned_test_score)
    print('AUC Score        :', dt_tuned_auc)

In [66]:
# unpacking the confusion matrix
dt_tuned_tn, \
dt_tuned_fp, \
dt_tuned_fn, \
dt_tuned_tp = confusion_matrix(y_true=y_test, y_pred=dt_tuned_pred).ravel()

# preventing print output if it is not needed
if show_intermediate_output:
    
    # printing each result one-by-one
    print(f"""
True Negatives : {dt_tuned_tn}
False Positives: {dt_tuned_fp}
False Negatives: {dt_tuned_fn}
True Positives : {dt_tuned_tp}
""")

In [67]:
# adjusting the model

# setting threshold
threshold = 0.6

# calling the function and getting accuracy and auc scores as well as 
# confusion matrix parameters
adj_dt_train_score, \
adj_dt_test_score, \
adj_dt_auc, \
adj_dt_tn_test, \
adj_dt_fp_test, \
adj_dt_fn_test, \
adj_dt_tp_test = model_adjusting(dt_tuned, 
                                 x_train_scaled, y_train, 
                                 x_test_scaled, y_test, threshold, 
                                 print_output=show_intermediate_output)

In [68]:
################## K-Nearest Neighbors Classification (KNN) ##################

In [69]:
# optimal_neighbors function
def optimal_neighbors(X_data,
                      y_data,
                      standardize=True,
                      pct_test=0.25,
                      seed=219,
                      response_type='reg',
                      max_neighbors=20,
                      show_viz=True):
    
    """
    Exhaustively compute training and testing results for KNN across
    [1, max_neighbors]. Outputs the maximum test score and (by default) a
    visualization of the results.
    PARAMETERS
    ----------
    X_data        : explanatory variable data
    y_data        : response variable
    standardize   : whether or not to standardize the X data, default True
    pct_test      : test size for training and validation from (0,1), 
                    default 0.25
    seed          : random seed to be used in algorithm, default 219
    response_type : type of neighbors algorithm to use, default 'reg'
                    Use 'reg' for regression (KNeighborsRegressor)
                    Use 'class' for classification (KNeighborsClassifier)
    max_neighbors : maximum number of neighbors in exhaustive search, 
                    default 20
    show_viz      : display or surpress k-neigbors visualization, default True
    """    
    
    if standardize == True:
        # optionally standardizing X_data
        scaler = StandardScaler()
        scaler.fit(X_data)
        X_scaled = scaler.transform(X_data)
        X_scaled_df = pd.DataFrame(X_scaled)
        X_data = X_scaled_df

    # train-test split
    x_train, x_test, y_train, y_test = train_test_split(X_data,
                                                        y_data,
                                                        test_size=pct_test,
                                                        random_state=seed)

    # creating lists for training set accuracy and test set accuracy
    training_accuracy = []
    test_accuracy = []
    
    # setting neighbor range
    neighbors_settings = range(1, max_neighbors + 1)

    
    for n_neighbors in neighbors_settings:
        # building the model based on response variable type
        if response_type == 'reg':
            clf = KNeighborsRegressor(n_neighbors=n_neighbors)
            clf.fit(x_train, y_train)
            
        elif response_type == 'class':
            clf = KNeighborsClassifier(n_neighbors=n_neighbors)
            clf.fit(x_train, y_train)            
            
        else:
            print("Error: response_type must be 'reg' or 'class'")
        
        
        # recording the training set accuracy
        training_accuracy.append(clf.score(x_train, y_train))
    
        # recording the generalization accuracy
        test_accuracy.append(clf.score(x_test, y_test))


    # optionally displaying visualization
    if show_viz == True:
        # plotting the visualization
        fig, ax = plt.subplots(figsize=(12,8))
        plt.plot(neighbors_settings, training_accuracy, \
                                                 label="training accuracy")
        plt.plot(neighbors_settings, test_accuracy, label="test accuracy")
        plt.ylabel("Accuracy")
        plt.xlabel("n_neighbors")
        plt.legend()
        plt.show()
    
    
    # preventing print output if it is not needed
    if show_dev_output:
        
        # returning optimal number of neighbors
        print(f"The optimal number of neighbors is: {test_accuracy.index(max(test_accuracy)) + 1}")
    
    return test_accuracy.index(max(test_accuracy)) + 1

In [70]:
# determining the optimal number of neighbors
opt_neighbors = optimal_neighbors(x_data,
                                  y_data,
                                  standardize=True,
                                  pct_test=0.25,
                                  seed=219,
                                  response_type='class',
                                  max_neighbors=50,
                                  show_viz=False)

In [71]:
# instantiating a KNN classification model with optimal neighbors
knn_opt = KNeighborsClassifier(n_neighbors=opt_neighbors)

# fitting the training data
knn_fit = knn_opt.fit(x_train_scaled, y_train)

# predicting based on the testing set
knn_pred = knn_fit.predict(x_test_scaled)

# saving scoring data
knn_train_score = knn_fit.score(x_train_scaled, y_train).round(4)
knn_test_score = knn_fit.score(x_test_scaled, y_test).round(4)
knn_auc_score = roc_auc_score(y_true=y_test, y_score=knn_pred).round(4)

# preventing print output if it is not needed
if show_intermediate_output:
    
    # printing scoring data
    print('Training ACCURACY:', knn_train_score)
    print('Testing ACCURACY :', knn_test_score)
    print('AUC Score        :', knn_auc_score)

In [72]:
# unpacking the confusion matrix
knn_tn, \
knn_fp, \
knn_fn, \
knn_tp = confusion_matrix(y_true=y_test, y_pred=knn_pred).ravel()

# preventing print output if it is not needed
if show_intermediate_output:
    
    # printing each result one-by-one
    print(f"""
True Negatives : {knn_tn}
False Positives: {knn_fp}
False Negatives: {knn_fn}
True Positives : {knn_tp}
""")

In [73]:
# adjusting the model

# setting threshold
threshold = 0.68

# calling the function and getting accuracy and auc scores as well as 
# confusion matrix parameters
adj_knn_train_score, \
adj_knn_test_score, \
adj_knn_auc, \
adj_knn_tn_test, \
adj_knn_fp_test, \
adj_knn_fn_test, \
adj_knn_tp_test = model_adjusting(knn_fit, 
                                  x_train_scaled, y_train, 
                                  x_test_scaled, y_test, threshold, 
                                  print_output=show_intermediate_output)

In [74]:
####################### Random Forest (Classification) #######################

In [75]:
# preventing print output if it is not needed
if show_dev_output:

    # instantiating, fitting, predicting
    rf = RandomForestClassifier(random_state=219, max_depth=8) 
    rf_fit = rf.fit(x_train_scaled, y_train)
    y_pred = rf_fit.predict(x_test_scaled)

    # saving scoring data
    rf_train_score = rf_fit.score(x_train_scaled, y_train).round(4)
    rf_test_score = rf_fit.score(x_test_scaled, y_test).round(4)
    rf_auc_score = roc_auc_score(y_true=y_test, y_score=y_pred).round(4)

    # printing scoring data
    print('Training ACCURACY:', rf_train_score)
    print('Testing ACCURACY :', rf_test_score)
    print('AUC Score        :', rf_auc_score)

In [76]:
# GridSearchCV

# declaring a hyperparameter space
estimator_space = pd.np.arange(850, 900, 50)
leaf_space = pd.np.arange(1, 2, 1)
criterion_space = ['entropy']
bootstrap_space = [False]
warm_start_space = [True]
# estimator_space = pd.np.arange(100, 1100, 250)
# leaf_space = pd.np.arange(1, 31, 10)
# criterion_space = ['gini', 'entropy']
# bootstrap_space = [True, False]
# warm_start_space = [True, False]


# creating a hyperparameter grid
param_grid = {'n_estimators': estimator_space,
              'min_samples_leaf': leaf_space,
              'criterion': criterion_space,
              'bootstrap': bootstrap_space,
              'warm_start': warm_start_space}

# instantiating the model object without hyperparameters
forest_grid = RandomForestClassifier(random_state=219)

# GridSearchCV object
forest_cv = GridSearchCV(estimator=forest_grid,   
                         param_grid=param_grid, 
                         cv=3, 
                         n_jobs=-1,
                         scoring=make_scorer(
                                             roc_auc_score,
                                             needs_threshold=False)) 

# fitting to the full dataset
forest_cv.fit(x_data, y_data)

# preventing print output if it is not needed
if show_intermediate_output:
    
    # printing the optimal parameters and best score
    print("Tuned Parameters:", forest_cv.best_params_)
    print("Tuned CV AUC    :", forest_cv.best_score_.round(4))

In [77]:
# preventing print output if it is not needed
if show_dev_output:
    
    # best estimators based on RandomizedSearchCV
    print(forest_cv.best_estimator_)

In [78]:
# building a model based on hyperparameter tuning results

# instantiating with best_estimator
forest_tuned = forest_cv.best_estimator_

# predicting based on the testing set
forest_tuned_pred = forest_tuned.predict(x_test)


# saving scoring data for future use
forest_tuned_train_score = forest_tuned.score(x_train, y_train).round(4)
forest_tuned_test_score = forest_tuned.score(x_test, y_test).round(4)
forest_tuned_auc = roc_auc_score(y_true=y_test,
                                 y_score=forest_tuned_pred).round(4)

# preventing print output if it is not needed
if show_dev_output:
    
    # printing scoring data
    print('Training ACCURACY:', forest_tuned_train_score)
    print('Testing ACCURACY :', forest_tuned_test_score)
    print('AUC Score        :', forest_tuned_auc)

In [79]:
# preventing print output if it is not needed
if show_dev_output:
    
    # unpacking the confusion matrix
    tuned_rf_tn, \
    tuned_rf_fp, \
    tuned_rf_fn, \
    tuned_rf_tp = confusion_matrix(y_true=y_test, 
                                   y_pred=forest_tuned_pred).ravel()


    # printing each result one-by-one
    print(f"""
True Negatives : {tuned_rf_tn}
False Positives: {tuned_rf_fp}
False Negatives: {tuned_rf_fn}
True Positives : {tuned_rf_tp}
    """)

In [80]:
# adjusting the model

# setting threshold
threshold = 0.46

# calling the function and getting accuracy and auc scores as well as 
# confusion matrix parameters
adj_rf_tuned_train_score, \
adj_rf_tuned_test_score, \
adj_rf_tuned_auc, \
adj_rf_tuned_tn_test, \
adj_rf_tuned_fp_test, \
adj_rf_tuned_fn_test, \
adj_rf_tuned_tp_test = \
                    model_adjusting(forest_tuned, 
                                    x_train_scaled, y_train, 
                                    x_test_scaled, y_test, threshold, 
                                    print_output=show_intermediate_output)

In [81]:
####################### Gradient Boosted Models (GBM) ########################

In [82]:
# preventing print output if it is not needed
if show_dev_output:
    
    # instantiating, fitting, predicting
    gb = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.01,\
                                    random_state=219, max_depth=8)
    gb_fit = gb.fit(x_train, y_train)
    y_pred = gb_fit.predict(x_test)

    # saving scoring data
    gb_train_score = gb_fit.score(x_train, y_train).round(4)
    gb_test_score = gb_fit.score(x_test, y_test).round(4)
    gb_auc_score = roc_auc_score(y_true=y_test, y_score=y_pred).round(4)

    # printing scoring data
    print('Training ACCURACY:', gb_train_score)
    print('Testing ACCURACY :', gb_test_score)
    print('AUC Score        :', gb_auc_score)

In [83]:
# preventing print output if it is not needed
if show_dev_output:
    
    # GridSearchCV

    # declaring a hyperparameter space
    learn_space = [0.1]
    estimator_space = range(100, 150, 50)
    depth_space = range(3, 4)
    max_features_space = [20]
#     learn_space = [0.001, 0.01, 0.1, 1]
#     estimator_space = range(50, 350, 50)
#     depth_space = range(2, 6)
#     max_features_space = [None, 20, 30]

    # creating a hyperparameter grid
    param_grid = {'learning_rate': learn_space,
                  'max_depth': depth_space,
                  'n_estimators': estimator_space,
                  'max_features': max_features_space}

    # instantiating the model object without hyperparameters
    full_gbm_grid = GradientBoostingClassifier(random_state=219)

    # GridSearchCV object
    full_gbm_cv = GridSearchCV(estimator=full_gbm_grid,
                               param_grid=param_grid, 
                               cv=3, 
                               n_jobs=-1,
                               scoring=make_scorer(
                                                   roc_auc_score,
                                                   needs_threshold=False))

    # fitting to the full dataset
    full_gbm_cv.fit(x_data, y_data)

    # printing the optimal parameters and best score
    print("Tuned Parameters:", full_gbm_cv.best_params_)
    print("Tuned CV AUC    :", full_gbm_cv.best_score_.round(4))

In [84]:
# preventing print output if it is not needed
if show_dev_output:
    
    # best estimators based on RandomizedSearchCV
    print(full_gbm_cv.best_estimator_)

In [85]:
# preventing print output if it is not needed
if show_dev_output:
    
    # instantiating with best_estimator
    gb_tuned = full_gbm_cv.best_estimator_

    # predicting based on the testing set
    gb_tuned_pred = gb_tuned.predict(x_test)

    # saving scoring data
    gb_tuned_train_score = gb_tuned.score(x_train, y_train).round(4)
    gb_tuned_test_score = gb_tuned.score(x_test, y_test).round(4)
    gb_tuned_auc_score = \
                    roc_auc_score(y_true=y_test, y_score=gb_tuned_pred).round(4)

    # preventing print output if it is not needed
    if show_intermediate_output:

        # printing scoring data
        print('Training ACCURACY:', gb_tuned_train_score)
        print('Testing ACCURACY :', gb_tuned_test_score)
        print('AUC Score        :', gb_tuned_auc_score)

In [86]:
# preventing running if it is not needed
if show_dev_output:
    
    # adjusting the model

    # setting threshold
    threshold = 0.656

    # calling the function and getting accuracy and auc scores as well as 
    # confusion matrix parameters
    adj_gb_tuned_train_score, \
    adj_gb_tuned_test_score, \
    adj_gb_tuned_auc, \
    adj_gb_tuned_tn_test, \
    adj_gb_tuned_fp_test, \
    adj_gb_tuned_fn_test, \
    adj_gb_tuned_tp_test = \
                        model_adjusting(gb_tuned, 
                                        x_train, y_train, 
                                        x_test, y_test, threshold, 
                                        print_output=show_intermediate_output)

In [87]:
# GridSearchCV

# declaring a hyperparameter space
learn_space = [0.1, 0.11, 0.12]
estimator_space = range(100, 150, 10)
depth_space = range(2, 4)
max_features_space = [18, 19, 20] 

# creating a hyperparameter grid
param_grid = {'learning_rate': learn_space,
              'max_depth': depth_space,
              'n_estimators': estimator_space,
              'max_features': max_features_space}

# instantiating the model object without hyperparameters
full_gbm_grid = GradientBoostingClassifier(random_state=219)

# GridSearchCV object
full_gbm_cv_scaled = GridSearchCV(estimator=full_gbm_grid,
                                 param_grid=param_grid, 
                                 cv=3, 
                                 n_jobs=-1, 
                                 scoring=make_scorer(
                                                     roc_auc_score,
                                                     needs_threshold=False))

# fitting to the full dataset 
full_gbm_cv_scaled.fit(x_data_scaled, y_data)

# preventing print output if it is not needed
if show_dev_output:
    
    # printing the optimal parameters and best score
    print("Tuned Parameters:", full_gbm_cv_scaled.best_params_)
    print("Tuned CV AUC    :", full_gbm_cv_scaled.best_score_.round(4))

In [88]:
# preventing print output if it is not needed
if show_dev_output:
    
    # best estimators based on RandomizedSearchCV
    print(full_gbm_cv_scaled.best_estimator_)

In [89]:
# instantiating with best_estimator
gbm_tuned_scaled = full_gbm_cv_scaled.best_estimator_

# predicting based on the testing set
gbm_tuned_pred = gbm_tuned_scaled.predict(x_test_scaled)

# saving scoring data
gb_tuned_scaled_train_score = gbm_tuned_scaled.\
                                    score(x_train_scaled, y_train).round(4)
gb_tuned_scaled_test_score = gbm_tuned_scaled.\
                                    score(x_test_scaled, y_test).round(4)
gb_tuned_scaled_auc_score = roc_auc_score(y_true=y_test,
                                          y_score=gbm_tuned_pred).round(4)

# preventing print output if it is not needed
if show_intermediate_output:
    
    # printing scoring data
    print('Training ACCURACY:', gb_tuned_scaled_train_score)
    print('Testing ACCURACY :', gb_tuned_scaled_test_score)
    print('AUC Score        :', gb_tuned_scaled_auc_score)

In [90]:
# Tuned Parameters: {'learning_rate': 0.11, 'max_depth': 3, 
#                    'max_features': 19, 'n_estimators': 110}
# 
# GradientBoostingClassifier(learning_rate=0.11, max_features=19,
#                            n_estimators=110, random_state=219)
# 
# Training ACCURACY: 0.8444
# Testing ACCURACY : 0.8501
# AUC Score        : 0.7931

In [91]:
# unpacking the confusion matrix
gb_tuned_tn, \
gb_tuned_fp, \
gb_tuned_fn, \
gb_tuned_tp = confusion_matrix(y_true=y_test, y_pred=gbm_tuned_pred).ravel()

# preventing print output if it is not needed
if show_intermediate_output:
    
    # printing each result one-by-one
    print(f"""
True Negatives : {gb_tuned_tn}
False Positives: {gb_tuned_fp}
False Negatives: {gb_tuned_fn}
True Positives : {gb_tuned_tp}
""")

In [92]:
# adjusting the model

# setting threshold
threshold = 0.69

# calling the function and getting accuracy and auc scores as well as 
# confusion matrix parameters
adj_gbm_train_score, \
adj_gbm_test_score, \
adj_gbm_auc, \
adj_gbm_tn_test, \
adj_gbm_fp_test, \
adj_gbm_fn_test, \
adj_gbm_tp_test = model_adjusting(gbm_tuned_scaled, 
                                  x_train_scaled, y_train, 
                                  x_test_scaled, y_test, threshold, 
                                  print_output=show_intermediate_output)

In [93]:
############################# printing results ###############################

In [94]:
end_time = time.time()

# comparing results
print(f"""
        Model Name            Training Accuracy  Testing Accuracy  AUC Score    Confusion Matrix
--------------------------    -----------------  ----------------  ---------    ----------------
Logistic Regression{lr_train_score:>23}{lr_test_score:>18}{lr_auc:>15}{'':>4}True Negatives : {lr_tn}
{'':>79}False Positives: {lr_fp}
{'':>79}False Negatives: {lr_fn}
{'':>79}True Positives : {lr_tp}

Logistic Regression (tuned){lr_tuned_train_score:>15}{lr_tuned_test_score:>18}{lr_tuned_auc:>15}{'':>4}True Negatives : {lr_tuned_tn}
{'':>79}False Positives: {lr_tuned_fp}
{'':>79}False Negatives: {lr_tuned_fn}
{'':>79}True Positives : {lr_tuned_tp} 

Logistic Regression (adjusted){adj_lr_train_score:>12}{adj_lr_test_score:>18}{adj_lr_auc:>15}{'':>4}True Negatives : {adj_lr_tn_test}
{'':>79}False Positives: {adj_lr_fp_test}
{'':>79}False Negatives: {adj_lr_fn_test}
{'':>79}True Positives : {adj_lr_tp_test}
    
Decision Tree (tuned){dt_tuned_train_score:>21}{dt_tuned_test_score:>18}{dt_tuned_auc:>15}{'':>4}True Negatives : {dt_tuned_tn}
{'':>79}False Positives: {dt_tuned_fp}
{'':>79}False Negatives: {dt_tuned_fn}
{'':>79}True Positives : {dt_tuned_tp}

Decision Tree (adjusted){adj_dt_train_score:>18}{adj_dt_test_score:>18}{adj_dt_auc:>15}{'':>4}True Negatives : {adj_dt_tn_test}
{'':>79}False Positives: {adj_dt_fp_test}
{'':>79}False Negatives: {adj_dt_fn_test}
{'':>79}True Positives : {adj_dt_tp_test}
    
KNN {knn_train_score:>38}{knn_test_score:>18}{knn_auc_score:>15}{'':>4}True Negatives : {knn_tn}
{'':>79}False Positives: {knn_fp}
{'':>79}False Negatives: {knn_fn}
{'':>79}True Positives : {knn_tp}

KNN (adjusted){adj_knn_train_score:>28}{adj_knn_test_score:>18}{adj_knn_auc:>15}{'':>4}True Negatives : {adj_knn_tn_test}
{'':>79}False Positives: {adj_knn_fp_test}
{'':>79}False Negatives: {adj_knn_fn_test}
{'':>79}True Positives : {adj_knn_tp_test}

Random Forest (adjusted){adj_rf_tuned_train_score:>18}{adj_rf_tuned_test_score:>18}{adj_rf_tuned_auc:>15}{'':>4}True Negatives : {adj_rf_tuned_tn_test}
{'':>79}False Positives: {adj_rf_tuned_fp_test}
{'':>79}False Negatives: {adj_rf_tuned_fn_test}
{'':>79}True Positives : {adj_rf_tuned_tp_test}

GBM (tuned) {gb_tuned_scaled_train_score:>30}{gb_tuned_scaled_test_score:>18}{gb_tuned_scaled_auc_score:>15}{'':>4}True Negatives : {gb_tuned_tn}
{'':>79}False Positives: {gb_tuned_fp}
{'':>79}False Negatives: {gb_tuned_fn}
{'':>79}True Positives : {gb_tuned_tp}
    
GBM (adjusted)* {adj_gbm_train_score:>26}{adj_gbm_test_score:>18}{adj_gbm_auc:>15}{'':>4}True Negatives : {adj_gbm_tn_test}
{'':>79}False Positives: {adj_gbm_fp_test}
{'':>79}False Negatives: {adj_gbm_fn_test}
{'':>79}True Positives : {adj_gbm_tp_test} 


* Adjusted Gradient Boosting Model demonstrated the best result.
  
** Script running time: {round((end_time - start_time), 1)} s
""")


        Model Name            Training Accuracy  Testing Accuracy  AUC Score    Confusion Matrix
--------------------------    -----------------  ----------------  ---------    ----------------
Logistic Regression                 0.7697            0.7639          0.689    True Negatives : 75
                                                                               False Positives: 81
                                                                               False Negatives: 34
                                                                               True Positives : 297

Logistic Regression (tuned)         0.7629            0.7659         0.6922    True Negatives : 76
                                                                               False Positives: 80
                                                                               False Negatives: 34
                                                                               True Positives : 297 

Logistic