In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read in dataframe
df = pd.read_csv('./heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
from sklearn.neighbors import LocalOutlierFactor

LOF = LocalOutlierFactor(n_neighbors = 20, contamination = "auto")

lof_outliers = LOF.fit_predict(df)
print("Local Outlier Factor: " + str(lof_outliers[np.where(lof_outliers == -1)].shape[0]))

Local Outlier Factor: 10


In [4]:
outlier_list = np.where(lof_outliers == -1)[0].tolist()
df = df.drop(outlier_list)
len(df)

293

In [5]:
df_drop = df.drop(["trtbps", "thalachh", "age","chol","thall"], axis=1)

In [6]:
normalize = MinMaxScaler()

X_drop = df_drop.drop("output", axis = 1)
y_drop = df_drop["output"]

X = df.drop("output", axis = 1)
y = df["output"]


In [7]:

X_train_drop, X_test_drop, y_train_drop, y_test_drop = train_test_split(X_drop, y_drop,test_size = 0.3, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.3, random_state=42)

In [8]:
#from imblearn.over_sampling import SMOTE

#col = None
#oversample = SMOTE(k_neighbors=4)
#for col in df.drop("oldpeak", axis = 1).columns.values:
#    print(col)
#    X = df
#    y = df[col]
#    df,y = oversample.fit_resample(X,y)
#
#df.hist(figsize=(20,10))
#print(len(df))

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier, HistGradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, PassiveAggressiveClassifier
from sklearn.metrics import recall_score,precision_score, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from xgboost import XGBClassifier,XGBRFClassifier


def model_selection(X_train, y_train,X_test, y_test, estimator):
    """
    Test various estimators.
    """
    
    
    model = estimator

    # Instantiate the classification model and visualizer
    model.fit(X_train, y_train)  
    
    expected  = y_test
    predicted = model.predict(X_test)
    

    # Compute and return the F1 score (the harmonic mean of precision and recall)
    return [model.score(X_test,y_test), recall_score(expected, predicted), precision_score(expected, predicted), f1_score(expected, predicted)]

def model_selection_normalize(X_train, y_train,X_test, y_test, estimator):
    """
    Test various estimators.
    """
    
    
    model = estimator

    # Instantiate the classification model and visualizer
    model.fit(normalize.fit_transform(X_train), y_train)  
    
    expected  = y_test
    predicted = model.predict(normalize.fit_transform(X_test))
    

    # Compute and return the F1 score (the harmonic mean of precision and recall)
    return [model.score(normalize.fit_transform(X_test),y_test), recall_score(expected, predicted), precision_score(expected, predicted), f1_score(expected, predicted)]


In [10]:
estimator_lst = [
                 RandomForestClassifier(random_state = 42),
                 SVC(random_state = 42),
                 LogisticRegression(random_state = 42),
                 MLPClassifier(random_state = 42),
                 GaussianNB(),
                 KNeighborsClassifier(),
                 SGDClassifier(random_state = 42),
                 PassiveAggressiveClassifier(random_state = 42),
                 GradientBoostingClassifier(random_state = 42),
                 AdaBoostClassifier(random_state = 42),
                 HistGradientBoostingClassifier(random_state = 42),
                 GaussianProcessClassifier(random_state = 42),
                 BaggingClassifier(random_state = 42),
                 XGBClassifier(random_state = 42,use_label_encoder=False),
                 XGBRFClassifier(random_state = 42, use_label_encoder=False)
                

]

## DF with all columns not normalized

In [11]:
estimator = None
top_model = None
top_f1 = 0
print("df with dropped columns")
print("not normalized")
for estimator in estimator_lst:
    modelSelection = model_selection(X_train, y_train, X_test, y_test, estimator)
    print(estimator)
    print("-----------------------")
    print("Accuracy: " + str(modelSelection[0]) + "\n")
    print("Recall: " + str(modelSelection[1])+ "\n")
    print("Precision: " + str(modelSelection[2])+ "\n")
    print("F1 Score: " + str(modelSelection[3])+ "\n")
    if modelSelection[3] > top_f1:
        top_f1 = modelSelection[3]
        top_model = estimator
        
print("-----------------------")
print("Top Model: ", top_model)
print("Top F1 Score: ", top_f1)

df with dropped columns
not normalized
RandomForestClassifier(random_state=42)
-----------------------
Accuracy: 0.8409090909090909

Recall: 0.8727272727272727

Precision: 0.8727272727272727

F1 Score: 0.8727272727272727

SVC(random_state=42)
-----------------------
Accuracy: 0.7159090909090909

Recall: 0.8363636363636363

Precision: 0.7419354838709677

F1 Score: 0.7863247863247863

LogisticRegression(random_state=42)
-----------------------
Accuracy: 0.875

Recall: 0.9454545454545454

Precision: 0.8666666666666667

F1 Score: 0.9043478260869566



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


MLPClassifier(random_state=42)
-----------------------
Accuracy: 0.75

Recall: 0.7636363636363637

Precision: 0.8235294117647058

F1 Score: 0.7924528301886793

GaussianNB()
-----------------------
Accuracy: 0.8522727272727273

Recall: 0.9272727272727272

Precision: 0.85

F1 Score: 0.8869565217391303

KNeighborsClassifier()
-----------------------
Accuracy: 0.625

Recall: 0.6363636363636364

Precision: 0.7291666666666666

F1 Score: 0.6796116504854369

SGDClassifier(random_state=42)
-----------------------
Accuracy: 0.6704545454545454

Recall: 0.9636363636363636

Precision: 0.6625

F1 Score: 0.7851851851851853

PassiveAggressiveClassifier(random_state=42)
-----------------------
Accuracy: 0.6704545454545454

Recall: 0.9272727272727272

Precision: 0.6710526315789473

F1 Score: 0.7786259541984732

GradientBoostingClassifier(random_state=42)
-----------------------
Accuracy: 0.7954545454545454

Recall: 0.8545454545454545

Precision: 0.8245614035087719

F1 Score: 0.8392857142857144

AdaBoost

## DF with all columns normalized

In [12]:
estimator = None
top_model = None
top_f1 = 0
print("df with dropped columns")
print("normalized")
for estimator in estimator_lst:
    modelSelection = model_selection_normalize(X_train, y_train, X_test, y_test, estimator)
    print(estimator)
    print("-----------------------")
    print("Accuracy: " + str(modelSelection[0]) + "\n")
    print("Recall: " + str(modelSelection[1])+ "\n")
    print("Precision: " + str(modelSelection[2])+ "\n")
    print("F1 Score: " + str(modelSelection[3])+ "\n")
    if modelSelection[3] > top_f1:
        top_f1 = modelSelection[3]
        top_model = estimator
        
print("-----------------------")
print("Top Model: ", top_model)
print("Top F1 Score: ", top_f1)

df with dropped columns
normalized
RandomForestClassifier(random_state=42)
-----------------------
Accuracy: 0.8409090909090909

Recall: 0.8909090909090909

Precision: 0.8596491228070176

F1 Score: 0.875

SVC(random_state=42)
-----------------------
Accuracy: 0.7840909090909091

Recall: 0.8727272727272727

Precision: 0.8

F1 Score: 0.8347826086956521

LogisticRegression(random_state=42)
-----------------------
Accuracy: 0.875

Recall: 0.9636363636363636

Precision: 0.8548387096774194

F1 Score: 0.905982905982906

MLPClassifier(random_state=42)
-----------------------
Accuracy: 0.8409090909090909

Recall: 0.9090909090909091

Precision: 0.847457627118644

F1 Score: 0.8771929824561402

GaussianNB()
-----------------------
Accuracy: 0.8295454545454546

Recall: 0.8727272727272727

Precision: 0.8571428571428571

F1 Score: 0.8648648648648648

KNeighborsClassifier()
-----------------------
Accuracy: 0.8409090909090909

Recall: 0.8909090909090909

Precision: 0.8596491228070176

F1 Score: 0.875


## DF with columns removed not normalized

In [13]:
estimator = None
top_model = None
top_f1 = 0
print("df with dropped columns:")
print("not normalized:")
for estimator in estimator_lst:
    modelSelection = model_selection(X_train_drop, y_train_drop, X_test_drop, y_test_drop, estimator)
    print(estimator)
    print("-----------------------")
    print("Accuracy: " + str(modelSelection[0]) + "\n")
    print("Recall: " + str(modelSelection[1])+ "\n")
    print("Precision: " + str(modelSelection[2])+ "\n")
    print("F1 Score: " + str(modelSelection[3])+ "\n")
    if modelSelection[3] > top_f1:
        top_f1 = modelSelection[3]
        top_model = estimator
        
print("-----------------------")
print("Top Model: ", top_model)
print("Top F1 Score", top_f1)

df with dropped columns:
not normalized:
RandomForestClassifier(random_state=42)
-----------------------
Accuracy: 0.8295454545454546

Recall: 0.8909090909090909

Precision: 0.8448275862068966

F1 Score: 0.8672566371681416

SVC(random_state=42)
-----------------------
Accuracy: 0.8977272727272727

Recall: 0.9818181818181818

Precision: 0.8709677419354839

F1 Score: 0.923076923076923

LogisticRegression(random_state=42)
-----------------------
Accuracy: 0.8522727272727273

Recall: 0.9272727272727272

Precision: 0.85

F1 Score: 0.8869565217391303





MLPClassifier(random_state=42)
-----------------------
Accuracy: 0.8863636363636364

Recall: 0.9818181818181818

Precision: 0.8571428571428571

F1 Score: 0.9152542372881356

GaussianNB()
-----------------------
Accuracy: 0.8295454545454546

Recall: 0.8909090909090909

Precision: 0.8448275862068966

F1 Score: 0.8672566371681416

KNeighborsClassifier()
-----------------------
Accuracy: 0.8181818181818182

Recall: 0.8363636363636363

Precision: 0.8679245283018868

F1 Score: 0.8518518518518519

SGDClassifier(random_state=42)
-----------------------
Accuracy: 0.5909090909090909

Recall: 0.38181818181818183

Precision: 0.9130434782608695

F1 Score: 0.5384615384615384

PassiveAggressiveClassifier(random_state=42)
-----------------------
Accuracy: 0.8522727272727273

Recall: 0.9454545454545454

Precision: 0.8387096774193549

F1 Score: 0.8888888888888888

GradientBoostingClassifier(random_state=42)
-----------------------
Accuracy: 0.8522727272727273

Recall: 0.9090909090909091

Precision: 0.86

## DF with columns removed normalized

In [14]:
estimator = None
top_model = None
top_f1 = 0
print("df with dropped columns:")
print("normalized:")
for estimator in estimator_lst:
    modelSelection = model_selection_normalize(X_train_drop, y_train_drop, X_test_drop, y_test_drop, estimator)
    print(estimator)
    print("-----------------------")
    print("Accuracy: " + str(modelSelection[0]) + "\n")
    print("Recall: " + str(modelSelection[1])+ "\n")
    print("Precision: " + str(modelSelection[2])+ "\n")
    print("F1 Score: " + str(modelSelection[3])+ "\n")
    if modelSelection[3] > top_f1:
        top_f1 = modelSelection[3]
        top_model = estimator
        
print("-----------------------")
print("Top Model: ", top_model)
print("Top F1 Score", top_f1)

df with dropped columns:
normalized:
RandomForestClassifier(random_state=42)
-----------------------
Accuracy: 0.8522727272727273

Recall: 0.8909090909090909

Precision: 0.875

F1 Score: 0.8828828828828829

SVC(random_state=42)
-----------------------
Accuracy: 0.8522727272727273

Recall: 0.9090909090909091

Precision: 0.8620689655172413

F1 Score: 0.8849557522123893

LogisticRegression(random_state=42)
-----------------------
Accuracy: 0.8295454545454546

Recall: 0.8909090909090909

Precision: 0.8448275862068966

F1 Score: 0.8672566371681416





MLPClassifier(random_state=42)
-----------------------
Accuracy: 0.8295454545454546

Recall: 0.8909090909090909

Precision: 0.8448275862068966

F1 Score: 0.8672566371681416

GaussianNB()
-----------------------
Accuracy: 0.8181818181818182

Recall: 0.8727272727272727

Precision: 0.8421052631578947

F1 Score: 0.8571428571428571

KNeighborsClassifier()
-----------------------
Accuracy: 0.7954545454545454

Recall: 0.8545454545454545

Precision: 0.8245614035087719

F1 Score: 0.8392857142857144

SGDClassifier(random_state=42)
-----------------------
Accuracy: 0.8068181818181818

Recall: 0.8181818181818182

Precision: 0.8653846153846154

F1 Score: 0.8411214953271028

PassiveAggressiveClassifier(random_state=42)
-----------------------
Accuracy: 0.8409090909090909

Recall: 0.9090909090909091

Precision: 0.847457627118644

F1 Score: 0.8771929824561402

GradientBoostingClassifier(random_state=42)
-----------------------
Accuracy: 0.8409090909090909

Recall: 0.8909090909090909

Precision: 0.8596

In [15]:
from sklearn.utils import resample
X_resample_drop = pd.concat([X_train_drop, y_train_drop], axis=1)

# separate minority and majority classes
female_drop = X_resample_drop[X_resample_drop.output==0]
male_drop = X_resample_drop[X_resample_drop.output==1]

In [16]:
X_resample = pd.concat([X_train, y_train], axis=1)

# separate minority and majority classes
female= X_resample[X_resample.output==0]
male= X_resample[X_resample.output==1]

In [17]:
# upsample minority
gender_upsample_drop = resample(female_drop,
                          replace=True, # sample with replacement
                          n_samples=len(male_drop), # match number in majority class
                          random_state=27) # reproducible results

# combine majority and upsampled minority
upsample_drop = pd.concat([male_drop, gender_upsample_drop])

X_train_drop_upsample = upsample_drop.drop("output", axis = 1)
y_train_drop_upsample = upsample_drop["output"]

In [18]:
# upsample minority
gender_downsample_drop = resample(male_drop,
                          replace=True, # sample with replacement
                          n_samples=len(female_drop), # match number in majority class
                          random_state=27) # reproducible results

# combine majority and upsampled minority
downsample_drop = pd.concat([gender_downsample_drop, female_drop])

X_train_drop_downsample = downsample_drop.drop("output", axis = 1)
y_train_drop_downsample = downsample_drop["output"]

In [19]:
# upsample minority
gender_upsample = resample(female,
                          replace=True, # sample with replacement
                          n_samples=len(male), # match number in majority class
                          random_state=27) # reproducible results

# combine majority and upsampled minority
upsample = pd.concat([male, gender_upsample])
X_train_upsample = upsample.drop("output", axis = 1)
y_train_upsample = upsample["output"]

In [20]:
# downsample majority
gender_downsample = resample(male,
                                replace = False, # sample without replacement
                                n_samples = len(female), # match minority n
                                random_state = 27) # reproducible results

# combine minority and downsampled majority
downsample = pd.concat([gender_downsample, female])
X_train_downsample = downsample.drop("output", axis = 1)
y_train_downsample = downsample["output"]

In [21]:
from imblearn.over_sampling import SMOTE

#sm = SMOTE(random_state=42)
#X_smote, sex_smote = sm.fit_resample(X_resample.drop("sex", axis = 1), X_resample["sex"])
#X_smote["sex"] = sex_smote.values
#smote = X_smote

#X_train_smote = smote.drop("output", axis = 1)
#smote_cols = X_train_smote.columns.tolist()
#smote_cols = smote_cols[-1:]  + smote_cols[:-1]
#X_train_smote = X_train_smote[smote_cols]
#y_train_smote = smote["output"]

sm = SMOTE(random_state=42)
X_train_smote, y_train_smote = sm.fit_resample(X_resample.drop("output", axis = 1), X_resample["output"])


In [22]:
#smote.sex.value_counts()

In [23]:
#X_smote_drop, sex_smote_drop = sm.fit_resample(X_resample_drop.drop("sex", axis = 1), X_resample_drop["sex"])
#X_smote_drop["sex"] = sex_smote_drop.values
#smote_drop = X_smote_drop

#X_train_drop_smote = smote_drop.drop("output", axis = 1)
#smote_drop_cols = X_train_drop_smote.columns.tolist()
#smote_drop_cols = smote_drop_cols[-1:]  + smote_drop_cols[:-1]
#X_train_drop_smote = X_train_drop_smote[smote_drop_cols]
#y_train_drop_smote = smote_drop["output"]

X_train_drop_smote, y_train_drop_smote = sm.fit_resample(X_resample_drop.drop("output", axis = 1), X_resample_drop["output"])



## Dropped Columns Not Normalized Upsample

In [24]:
estimator = None
top_model = None
top_f1 = 0
print("df with dropped columns:")
print("not normalized:")
print("upsample")
for estimator in estimator_lst:
    modelSelection = model_selection(X_train_drop_upsample, y_train_drop_upsample, X_test_drop, y_test_drop, estimator)
    print(estimator)
    print("-----------------------")
    print("Accuracy: " + str(modelSelection[0]) + "\n")
    print("Recall: " + str(modelSelection[1])+ "\n")
    print("Precision: " + str(modelSelection[2])+ "\n")
    print("F1 Score: " + str(modelSelection[3])+ "\n")
    if modelSelection[3] > top_f1:
        top_f1 = modelSelection[3]
        top_model = estimator
        
print("-----------------------")
print("Top Model: ", top_model)
print("Top F1 Score", top_f1)

df with dropped columns:
not normalized:
upsample
RandomForestClassifier(random_state=42)
-----------------------
Accuracy: 0.7840909090909091

Recall: 0.8727272727272727

Precision: 0.8

F1 Score: 0.8347826086956521

SVC(random_state=42)
-----------------------
Accuracy: 0.875

Recall: 0.9454545454545454

Precision: 0.8666666666666667

F1 Score: 0.9043478260869566

LogisticRegression(random_state=42)
-----------------------
Accuracy: 0.8409090909090909

Recall: 0.9090909090909091

Precision: 0.847457627118644

F1 Score: 0.8771929824561402

MLPClassifier(random_state=42)
-----------------------
Accuracy: 0.875

Recall: 0.9818181818181818

Precision: 0.84375

F1 Score: 0.9075630252100839

GaussianNB()
-----------------------
Accuracy: 0.8181818181818182

Recall: 0.8727272727272727

Precision: 0.8421052631578947

F1 Score: 0.8571428571428571

KNeighborsClassifier()
-----------------------
Accuracy: 0.75

Recall: 0.7272727272727273

Precision: 0.851063829787234

F1 Score: 0.78431372549019

## Dropped Columns Normalized Upsample

In [25]:
estimator = None
top_model = None
top_f1 = 0
print("df with dropped columns:")
print("normalized:")
print("upsample")
for estimator in estimator_lst:
    modelSelection = model_selection_normalize(X_train_drop_upsample, y_train_drop_upsample, X_test_drop, y_test_drop, estimator)
    print(estimator)
    print("-----------------------")
    print("Accuracy: " + str(modelSelection[0]) + "\n")
    print("Recall: " + str(modelSelection[1])+ "\n")
    print("Precision: " + str(modelSelection[2])+ "\n")
    print("F1 Score: " + str(modelSelection[3])+ "\n")
    if modelSelection[3] > top_f1:
        top_f1 = modelSelection[3]
        top_model = estimator
        
print("-----------------------")
print("Top Model: ", top_model)
print("Top F1 Score", top_f1)

df with dropped columns:
normalized:
upsample
RandomForestClassifier(random_state=42)
-----------------------
Accuracy: 0.7727272727272727

Recall: 0.8545454545454545

Precision: 0.7966101694915254

F1 Score: 0.8245614035087718

SVC(random_state=42)
-----------------------
Accuracy: 0.8181818181818182

Recall: 0.8727272727272727

Precision: 0.8421052631578947

F1 Score: 0.8571428571428571

LogisticRegression(random_state=42)
-----------------------
Accuracy: 0.8181818181818182

Recall: 0.8727272727272727

Precision: 0.8421052631578947

F1 Score: 0.8571428571428571

MLPClassifier(random_state=42)
-----------------------
Accuracy: 0.8636363636363636

Recall: 0.9636363636363636

Precision: 0.8412698412698413

F1 Score: 0.8983050847457628

GaussianNB()
-----------------------
Accuracy: 0.8295454545454546

Recall: 0.8909090909090909

Precision: 0.8448275862068966

F1 Score: 0.8672566371681416

KNeighborsClassifier()
-----------------------
Accuracy: 0.7840909090909091

Recall: 0.83636363636

## No Dropped Columns Not Normalized Upsample

In [26]:
estimator = None
top_model = None
top_f1 = 0
print("df without dropped columns:")
print("not normalized:")
print("upsample")
for estimator in estimator_lst:
    modelSelection = model_selection(X_train_upsample, y_train_upsample, X_test, y_test, estimator)
    print(estimator)
    print("-----------------------")
    print("Accuracy: " + str(modelSelection[0]) + "\n")
    print("Recall: " + str(modelSelection[1])+ "\n")
    print("Precision: " + str(modelSelection[2])+ "\n")
    print("F1 Score: " + str(modelSelection[3])+ "\n")
    if modelSelection[3] > top_f1:
        top_f1 = modelSelection[3]
        top_model = estimator
        
print("-----------------------")
print("Top Model: ", top_model)
print("Top F1 Score", top_f1)

df without dropped columns:
not normalized:
upsample
RandomForestClassifier(random_state=42)
-----------------------
Accuracy: 0.8181818181818182

Recall: 0.9090909090909091

Precision: 0.819672131147541

F1 Score: 0.8620689655172413

SVC(random_state=42)
-----------------------
Accuracy: 0.6931818181818182

Recall: 0.7818181818181819

Precision: 0.7413793103448276

F1 Score: 0.7610619469026548

LogisticRegression(random_state=42)
-----------------------
Accuracy: 0.875

Recall: 0.9454545454545454

Precision: 0.8666666666666667

F1 Score: 0.9043478260869566



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


MLPClassifier(random_state=42)
-----------------------
Accuracy: 0.7954545454545454

Recall: 0.8

Precision: 0.8627450980392157

F1 Score: 0.8301886792452831

GaussianNB()
-----------------------
Accuracy: 0.8522727272727273

Recall: 0.9272727272727272

Precision: 0.85

F1 Score: 0.8869565217391303

KNeighborsClassifier()
-----------------------
Accuracy: 0.625

Recall: 0.6363636363636364

Precision: 0.7291666666666666

F1 Score: 0.6796116504854369

SGDClassifier(random_state=42)
-----------------------
Accuracy: 0.5909090909090909

Recall: 0.38181818181818183

Precision: 0.9130434782608695

F1 Score: 0.5384615384615384

PassiveAggressiveClassifier(random_state=42)
-----------------------
Accuracy: 0.6477272727272727

Recall: 1.0

Precision: 0.6395348837209303

F1 Score: 0.7801418439716312

GradientBoostingClassifier(random_state=42)
-----------------------
Accuracy: 0.7727272727272727

Recall: 0.8363636363636363

Precision: 0.8070175438596491

F1 Score: 0.8214285714285714

AdaBoostCla

## No Dropped Columns Normalized Upsample

In [27]:
estimator = None
top_model = None
top_f1 = 0
print("df without dropped columns:")
print("normalized:")
print("upsample")
for estimator in estimator_lst:
    modelSelection = model_selection_normalize(X_train_upsample, y_train_upsample, X_test, y_test, estimator)
    print(estimator)
    print("-----------------------")
    print("Accuracy: " + str(modelSelection[0]) + "\n")
    print("Recall: " + str(modelSelection[1])+ "\n")
    print("Precision: " + str(modelSelection[2])+ "\n")
    print("F1 Score: " + str(modelSelection[3])+ "\n")
    if modelSelection[3] > top_f1:
        top_f1 = modelSelection[3]
        top_model = estimator
        
print("-----------------------")
print("Top Model: ", top_model)
print("Top F1 Score", top_f1)

df without dropped columns:
normalized:
upsample
RandomForestClassifier(random_state=42)
-----------------------
Accuracy: 0.8181818181818182

Recall: 0.9090909090909091

Precision: 0.819672131147541

F1 Score: 0.8620689655172413

SVC(random_state=42)
-----------------------
Accuracy: 0.8068181818181818

Recall: 0.8909090909090909

Precision: 0.8166666666666667

F1 Score: 0.8521739130434782

LogisticRegression(random_state=42)
-----------------------
Accuracy: 0.8636363636363636

Recall: 0.9636363636363636

Precision: 0.8412698412698413

F1 Score: 0.8983050847457628





MLPClassifier(random_state=42)
-----------------------
Accuracy: 0.8409090909090909

Recall: 0.9272727272727272

Precision: 0.8360655737704918

F1 Score: 0.8793103448275862

GaussianNB()
-----------------------
Accuracy: 0.8522727272727273

Recall: 0.9272727272727272

Precision: 0.85

F1 Score: 0.8869565217391303

KNeighborsClassifier()
-----------------------
Accuracy: 0.8068181818181818

Recall: 0.8727272727272727

Precision: 0.8275862068965517

F1 Score: 0.8495575221238938

SGDClassifier(random_state=42)
-----------------------
Accuracy: 0.8636363636363636

Recall: 0.9636363636363636

Precision: 0.8412698412698413

F1 Score: 0.8983050847457628

PassiveAggressiveClassifier(random_state=42)
-----------------------
Accuracy: 0.8181818181818182

Recall: 0.8545454545454545

Precision: 0.8545454545454545

F1 Score: 0.8545454545454545

GradientBoostingClassifier(random_state=42)
-----------------------
Accuracy: 0.8181818181818182

Recall: 0.8909090909090909

Precision: 0.8305084745762712


## Dropped Columns Not Normalized Downsample

In [28]:
estimator = None
top_model = None
top_f1 = 0
print("df with dropped columns:")
print("not normalized:")
print("downsample")
for estimator in estimator_lst:
    modelSelection = model_selection(X_train_drop_downsample, y_train_drop_downsample, X_test_drop, y_test_drop, estimator)
    print(estimator)
    print("-----------------------")
    print("Accuracy: " + str(modelSelection[0]) + "\n")
    print("Recall: " + str(modelSelection[1])+ "\n")
    print("Precision: " + str(modelSelection[2])+ "\n")
    print("F1 Score: " + str(modelSelection[3])+ "\n")
    if modelSelection[3] > top_f1:
        top_f1 = modelSelection[3]
        top_model = estimator
        
print("-----------------------")
print("Top Model: ", top_model)
print("Top F1 Score", top_f1)

df with dropped columns:
not normalized:
downsample
RandomForestClassifier(random_state=42)
-----------------------
Accuracy: 0.7613636363636364

Recall: 0.7636363636363637

Precision: 0.84

F1 Score: 0.8000000000000002

SVC(random_state=42)
-----------------------
Accuracy: 0.8409090909090909

Recall: 0.8727272727272727

Precision: 0.8727272727272727

F1 Score: 0.8727272727272727

LogisticRegression(random_state=42)
-----------------------
Accuracy: 0.8181818181818182

Recall: 0.8727272727272727

Precision: 0.8421052631578947

F1 Score: 0.8571428571428571





MLPClassifier(random_state=42)
-----------------------
Accuracy: 0.8522727272727273

Recall: 0.9090909090909091

Precision: 0.8620689655172413

F1 Score: 0.8849557522123893

GaussianNB()
-----------------------
Accuracy: 0.7954545454545454

Recall: 0.8363636363636363

Precision: 0.8363636363636363

F1 Score: 0.8363636363636363

KNeighborsClassifier()
-----------------------
Accuracy: 0.8181818181818182

Recall: 0.8181818181818182

Precision: 0.8823529411764706

F1 Score: 0.8490566037735848

SGDClassifier(random_state=42)
-----------------------
Accuracy: 0.8636363636363636

Recall: 1.0

Precision: 0.8208955223880597

F1 Score: 0.9016393442622952

PassiveAggressiveClassifier(random_state=42)
-----------------------
Accuracy: 0.8522727272727273

Recall: 0.8909090909090909

Precision: 0.875

F1 Score: 0.8828828828828829

GradientBoostingClassifier(random_state=42)
-----------------------
Accuracy: 0.7386363636363636

Recall: 0.7636363636363637

Precision: 0.8076923076923077

F1 Score: 0.7

## Dropped Columns Normalized Downsample

In [29]:
estimator = None
top_model = None
top_f1 = 0
print("df with dropped columns:")
print("normalized:")
print("downsample")
for estimator in estimator_lst:
    modelSelection = model_selection_normalize(X_train_drop_downsample, y_train_drop_downsample, X_test_drop, y_test_drop, estimator)
    print(estimator)
    print("-----------------------")
    print("Accuracy: " + str(modelSelection[0]) + "\n")
    print("Recall: " + str(modelSelection[1])+ "\n")
    print("Precision: " + str(modelSelection[2])+ "\n")
    print("F1 Score: " + str(modelSelection[3])+ "\n")
    if modelSelection[3] > top_f1:
        top_f1 = modelSelection[3]
        top_model = estimator
        
print("-----------------------")
print("Top Model: ", top_model)
print("Top F1 Score", top_f1)

df with dropped columns:
normalized:
downsample
RandomForestClassifier(random_state=42)
-----------------------
Accuracy: 0.7727272727272727

Recall: 0.7636363636363637

Precision: 0.8571428571428571

F1 Score: 0.8076923076923076

SVC(random_state=42)
-----------------------
Accuracy: 0.75

Recall: 0.7454545454545455

Precision: 0.8367346938775511

F1 Score: 0.7884615384615385

LogisticRegression(random_state=42)
-----------------------
Accuracy: 0.8295454545454546

Recall: 0.8909090909090909

Precision: 0.8448275862068966

F1 Score: 0.8672566371681416





MLPClassifier(random_state=42)
-----------------------
Accuracy: 0.8295454545454546

Recall: 0.8909090909090909

Precision: 0.8448275862068966

F1 Score: 0.8672566371681416

GaussianNB()
-----------------------
Accuracy: 0.7840909090909091

Recall: 0.8181818181818182

Precision: 0.8333333333333334

F1 Score: 0.8256880733944955

KNeighborsClassifier()
-----------------------
Accuracy: 0.75

Recall: 0.7636363636363637

Precision: 0.8235294117647058

F1 Score: 0.7924528301886793

SGDClassifier(random_state=42)
-----------------------
Accuracy: 0.8181818181818182

Recall: 0.8727272727272727

Precision: 0.8421052631578947

F1 Score: 0.8571428571428571

PassiveAggressiveClassifier(random_state=42)
-----------------------
Accuracy: 0.875

Recall: 0.9818181818181818

Precision: 0.84375

F1 Score: 0.9075630252100839

GradientBoostingClassifier(random_state=42)
-----------------------
Accuracy: 0.7386363636363636

Recall: 0.7454545454545455

Precision: 0.82

F1 Score: 0.780952380952381

AdaBoost

## No Dropped Columns Not Normalized Downsample

In [30]:
estimator = None
top_model = None
top_f1 = 0
print("df without dropped columns:")
print("not normalized:")
print("downsample")
for estimator in estimator_lst:
    modelSelection = model_selection(X_train_downsample, y_train_downsample, X_test, y_test, estimator)
    print(estimator)
    print("-----------------------")
    print("Accuracy: " + str(modelSelection[0]) + "\n")
    print("Recall: " + str(modelSelection[1])+ "\n")
    print("Precision: " + str(modelSelection[2])+ "\n")
    print("F1 Score: " + str(modelSelection[3])+ "\n")
    if modelSelection[3] > top_f1:
        top_f1 = modelSelection[3]
        top_model = estimator
        
print("-----------------------")
print("Top Model: ", top_model)
print("Top F1 Score", top_f1)

df without dropped columns:
not normalized:
downsample
RandomForestClassifier(random_state=42)
-----------------------
Accuracy: 0.8522727272727273

Recall: 0.8909090909090909

Precision: 0.875

F1 Score: 0.8828828828828829

SVC(random_state=42)
-----------------------
Accuracy: 0.7045454545454546

Recall: 0.8

Precision: 0.7457627118644068

F1 Score: 0.7719298245614035

LogisticRegression(random_state=42)
-----------------------
Accuracy: 0.875

Recall: 0.9454545454545454

Precision: 0.8666666666666667

F1 Score: 0.9043478260869566

MLPClassifier(random_state=42)
-----------------------
Accuracy: 0.7386363636363636

Recall: 0.9090909090909091

Precision: 0.7352941176470589

F1 Score: 0.8130081300813008

GaussianNB()
-----------------------
Accuracy: 0.8522727272727273

Recall: 0.9272727272727272

Precision: 0.85

F1 Score: 0.8869565217391303

KNeighborsClassifier()
-----------------------
Accuracy: 0.625

Recall: 0.6363636363636364

Precision: 0.7291666666666666

F1 Score: 0.679611650

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


GradientBoostingClassifier(random_state=42)
-----------------------
Accuracy: 0.8068181818181818

Recall: 0.8545454545454545

Precision: 0.8392857142857143

F1 Score: 0.8468468468468467

AdaBoostClassifier(random_state=42)
-----------------------
Accuracy: 0.8068181818181818

Recall: 0.8545454545454545

Precision: 0.8392857142857143

F1 Score: 0.8468468468468467

HistGradientBoostingClassifier(random_state=42)
-----------------------
Accuracy: 0.7840909090909091

Recall: 0.8545454545454545

Precision: 0.8103448275862069

F1 Score: 0.831858407079646

GaussianProcessClassifier(random_state=42)
-----------------------
Accuracy: 0.6022727272727273

Recall: 0.6

Precision: 0.717391304347826

F1 Score: 0.6534653465346534

BaggingClassifier(random_state=42)
-----------------------
Accuracy: 0.8068181818181818

Recall: 0.8545454545454545

Precision: 0.8392857142857143

F1 Score: 0.8468468468468467

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_byn

## No Dropped Columns Normalized Downsample

In [31]:
estimator = None
top_model = None
top_f1 = 0
print("df without dropped columns:")
print("normalized:")
print("downsample")
for estimator in estimator_lst:
    modelSelection = model_selection_normalize(X_train_downsample, y_train_downsample, X_test, y_test, estimator)
    print(estimator)
    print("-----------------------")
    print("Accuracy: " + str(modelSelection[0]) + "\n")
    print("Recall: " + str(modelSelection[1])+ "\n")
    print("Precision: " + str(modelSelection[2])+ "\n")
    print("F1 Score: " + str(modelSelection[3])+ "\n")
    if modelSelection[3] > top_f1:
        top_f1 = modelSelection[3]
        top_model = estimator
        
print("-----------------------")
print("Top Model: ", top_model)
print("Top F1 Score", top_f1)

df without dropped columns:
normalized:
downsample
RandomForestClassifier(random_state=42)
-----------------------
Accuracy: 0.8181818181818182

Recall: 0.8545454545454545

Precision: 0.8545454545454545

F1 Score: 0.8545454545454545

SVC(random_state=42)
-----------------------
Accuracy: 0.7840909090909091

Recall: 0.8545454545454545

Precision: 0.8103448275862069

F1 Score: 0.831858407079646

LogisticRegression(random_state=42)
-----------------------
Accuracy: 0.8636363636363636

Recall: 0.9454545454545454

Precision: 0.8524590163934426

F1 Score: 0.8965517241379309





MLPClassifier(random_state=42)
-----------------------
Accuracy: 0.8636363636363636

Recall: 0.9454545454545454

Precision: 0.8524590163934426

F1 Score: 0.8965517241379309

GaussianNB()
-----------------------
Accuracy: 0.8295454545454546

Recall: 0.8727272727272727

Precision: 0.8571428571428571

F1 Score: 0.8648648648648648

KNeighborsClassifier()
-----------------------
Accuracy: 0.8295454545454546

Recall: 0.8727272727272727

Precision: 0.8571428571428571

F1 Score: 0.8648648648648648

SGDClassifier(random_state=42)
-----------------------
Accuracy: 0.875

Recall: 0.9818181818181818

Precision: 0.84375

F1 Score: 0.9075630252100839

PassiveAggressiveClassifier(random_state=42)
-----------------------
Accuracy: 0.8409090909090909

Recall: 0.9090909090909091

Precision: 0.847457627118644

F1 Score: 0.8771929824561402

GradientBoostingClassifier(random_state=42)
-----------------------
Accuracy: 0.8181818181818182

Recall: 0.8181818181818182

Precision: 0.8823529411764706

F1 Score: 

## Dropped Columns Not Normalized SMOTE

In [32]:
estimator = None
top_model = None
top_f1 = 0
print("df with dropped columns:")
print("not normalized:")
print("SMOTE")
for estimator in estimator_lst:
    modelSelection = model_selection(X_train_drop_smote, y_train_drop_smote, X_test_drop, y_test_drop, estimator)
    print(estimator)
    print("-----------------------")
    print("Accuracy: " + str(modelSelection[0]) + "\n")
    print("Recall: " + str(modelSelection[1])+ "\n")
    print("Precision: " + str(modelSelection[2])+ "\n")
    print("F1 Score: " + str(modelSelection[3])+ "\n")
    if modelSelection[3] > top_f1:
        top_f1 = modelSelection[3]
        top_model = estimator
        
print("-----------------------")
print("Top Model: ", top_model)
print("Top F1 Score", top_f1)

df with dropped columns:
not normalized:
SMOTE
RandomForestClassifier(random_state=42)
-----------------------
Accuracy: 0.8295454545454546

Recall: 0.9090909090909091

Precision: 0.8333333333333334

F1 Score: 0.8695652173913043

SVC(random_state=42)
-----------------------
Accuracy: 0.8863636363636364

Recall: 0.9636363636363636

Precision: 0.8688524590163934

F1 Score: 0.9137931034482758

LogisticRegression(random_state=42)
-----------------------
Accuracy: 0.8522727272727273

Recall: 0.9272727272727272

Precision: 0.85

F1 Score: 0.8869565217391303





MLPClassifier(random_state=42)
-----------------------
Accuracy: 0.8636363636363636

Recall: 0.9454545454545454

Precision: 0.8524590163934426

F1 Score: 0.8965517241379309

GaussianNB()
-----------------------
Accuracy: 0.8295454545454546

Recall: 0.8909090909090909

Precision: 0.8448275862068966

F1 Score: 0.8672566371681416

KNeighborsClassifier()
-----------------------
Accuracy: 0.8295454545454546

Recall: 0.8363636363636363

Precision: 0.8846153846153846

F1 Score: 0.8598130841121494

SGDClassifier(random_state=42)
-----------------------
Accuracy: 0.6022727272727273

Recall: 0.43636363636363634

Precision: 0.8571428571428571

F1 Score: 0.5783132530120482

PassiveAggressiveClassifier(random_state=42)
-----------------------
Accuracy: 0.4431818181818182

Recall: 0.10909090909090909

Precision: 1.0

F1 Score: 0.19672131147540983

GradientBoostingClassifier(random_state=42)
-----------------------
Accuracy: 0.875

Recall: 0.9272727272727272

Precision: 0.8793103448275862

F1 Score: 

## Dropped Columns Normalized SMOTE

In [33]:
estimator = None
top_model = None
top_f1 = 0
print("df with dropped columns:")
print("normalized:")
print("SMOTE")
for estimator in estimator_lst:
    modelSelection = model_selection_normalize(X_train_drop_smote, y_train_drop_smote, X_test_drop, y_test_drop, estimator)
    print(estimator)
    print("-----------------------")
    print("Accuracy: " + str(modelSelection[0]) + "\n")
    print("Recall: " + str(modelSelection[1])+ "\n")
    print("Precision: " + str(modelSelection[2])+ "\n")
    print("F1 Score: " + str(modelSelection[3])+ "\n")
    if modelSelection[3] > top_f1:
        top_f1 = modelSelection[3]
        top_model = estimator
        
print("-----------------------")
print("Top Model: ", top_model)
print("Top F1 Score", top_f1)

df with dropped columns:
normalized:
SMOTE
RandomForestClassifier(random_state=42)
-----------------------
Accuracy: 0.8409090909090909

Recall: 0.8909090909090909

Precision: 0.8596491228070176

F1 Score: 0.875

SVC(random_state=42)
-----------------------
Accuracy: 0.8181818181818182

Recall: 0.8545454545454545

Precision: 0.8545454545454545

F1 Score: 0.8545454545454545

LogisticRegression(random_state=42)
-----------------------
Accuracy: 0.8295454545454546

Recall: 0.8909090909090909

Precision: 0.8448275862068966

F1 Score: 0.8672566371681416





MLPClassifier(random_state=42)
-----------------------
Accuracy: 0.8181818181818182

Recall: 0.8727272727272727

Precision: 0.8421052631578947

F1 Score: 0.8571428571428571

GaussianNB()
-----------------------
Accuracy: 0.8181818181818182

Recall: 0.8727272727272727

Precision: 0.8421052631578947

F1 Score: 0.8571428571428571

KNeighborsClassifier()
-----------------------
Accuracy: 0.7954545454545454

Recall: 0.8545454545454545

Precision: 0.8245614035087719

F1 Score: 0.8392857142857144

SGDClassifier(random_state=42)
-----------------------
Accuracy: 0.8522727272727273

Recall: 1.0

Precision: 0.8088235294117647

F1 Score: 0.8943089430894309

PassiveAggressiveClassifier(random_state=42)
-----------------------
Accuracy: 0.45454545454545453

Recall: 0.12727272727272726

Precision: 1.0

F1 Score: 0.22580645161290322

GradientBoostingClassifier(random_state=42)
-----------------------
Accuracy: 0.8636363636363636

Recall: 0.9272727272727272

Precision: 0.864406779661017

F1 Score: 0.8

In [38]:
X_train_smote_cols = X_train_smote.columns.tolist()
X_train_smote_cols.insert(0, X_train_smote_cols.pop(1))
X_train_smote = X_train_smote[X_train_smote_cols]

## No  Dropped Columns Not Normalized SMOTE

In [39]:
estimator = None
top_model = None
top_f1 = 0
print("df without dropped columns:")
print("not normalized:")
print("SMOTE")
for estimator in estimator_lst:
    modelSelection = model_selection(X_train_smote, y_train_smote, X_test, y_test, estimator)
    print(estimator)
    print("-----------------------")
    print("Accuracy: " + str(modelSelection[0]) + "\n")
    print("Recall: " + str(modelSelection[1])+ "\n")
    print("Precision: " + str(modelSelection[2])+ "\n")
    print("F1 Score: " + str(modelSelection[3])+ "\n")
    if modelSelection[3] > top_f1:
        top_f1 = modelSelection[3]
        top_model = estimator
        
print("-----------------------")
print("Top Model: ", top_model)
print("Top F1 Score", top_f1)

df without dropped columns:
not normalized:
SMOTE
RandomForestClassifier(random_state=42)
-----------------------
Accuracy: 0.8409090909090909

Recall: 0.8727272727272727

Precision: 0.8727272727272727

F1 Score: 0.8727272727272727

SVC(random_state=42)
-----------------------
Accuracy: 0.7045454545454546

Recall: 0.7636363636363637

Precision: 0.7636363636363637

F1 Score: 0.7636363636363637

LogisticRegression(random_state=42)
-----------------------
Accuracy: 0.875

Recall: 0.9454545454545454

Precision: 0.8666666666666667

F1 Score: 0.9043478260869566



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


MLPClassifier(random_state=42)
-----------------------
Accuracy: 0.8522727272727273

Recall: 0.9636363636363636

Precision: 0.828125

F1 Score: 0.8907563025210083

GaussianNB()
-----------------------
Accuracy: 0.8636363636363636

Recall: 0.9454545454545454

Precision: 0.8524590163934426

F1 Score: 0.8965517241379309

KNeighborsClassifier()
-----------------------
Accuracy: 0.6477272727272727

Recall: 0.6363636363636364

Precision: 0.7608695652173914

F1 Score: 0.693069306930693

SGDClassifier(random_state=42)
-----------------------
Accuracy: 0.7045454545454546

Recall: 0.7272727272727273

Precision: 0.7843137254901961

F1 Score: 0.7547169811320754

PassiveAggressiveClassifier(random_state=42)
-----------------------
Accuracy: 0.6477272727272727

Recall: 0.8545454545454545

Precision: 0.6714285714285714

F1 Score: 0.752

GradientBoostingClassifier(random_state=42)
-----------------------
Accuracy: 0.8068181818181818

Recall: 0.8545454545454545

Precision: 0.8392857142857143

F1 Score:

## No Dropped Columns Normalized SMOTE

In [40]:
estimator = None
top_model = None
top_f1 = 0
print("df without dropped columns:")
print("normalized:")
print("SMOTE")
for estimator in estimator_lst:
    modelSelection = model_selection_normalize(X_train_smote, y_train_smote, X_test, y_test, estimator)
    print(estimator)
    print("-----------------------")
    print("Accuracy: " + str(modelSelection[0]) + "\n")
    print("Recall: " + str(modelSelection[1])+ "\n")
    print("Precision: " + str(modelSelection[2])+ "\n")
    print("F1 Score: " + str(modelSelection[3])+ "\n")
    if modelSelection[3] > top_f1:
        top_f1 = modelSelection[3]
        top_model = estimator
        
print("-----------------------")
print("Top Model: ", top_model)
print("Top F1 Score", top_f1)

df without dropped columns:
normalized:
SMOTE
RandomForestClassifier(random_state=42)
-----------------------
Accuracy: 0.8295454545454546

Recall: 0.8727272727272727

Precision: 0.8571428571428571

F1 Score: 0.8648648648648648

SVC(random_state=42)
-----------------------
Accuracy: 0.7954545454545454

Recall: 0.8727272727272727

Precision: 0.8135593220338984

F1 Score: 0.8421052631578948

LogisticRegression(random_state=42)
-----------------------
Accuracy: 0.8636363636363636

Recall: 0.9454545454545454

Precision: 0.8524590163934426

F1 Score: 0.8965517241379309





MLPClassifier(random_state=42)
-----------------------
Accuracy: 0.8522727272727273

Recall: 0.9090909090909091

Precision: 0.8620689655172413

F1 Score: 0.8849557522123893

GaussianNB()
-----------------------
Accuracy: 0.8409090909090909

Recall: 0.8909090909090909

Precision: 0.8596491228070176

F1 Score: 0.875

KNeighborsClassifier()
-----------------------
Accuracy: 0.8409090909090909

Recall: 0.8909090909090909

Precision: 0.8596491228070176

F1 Score: 0.875

SGDClassifier(random_state=42)
-----------------------
Accuracy: 0.8409090909090909

Recall: 0.9818181818181818

Precision: 0.8059701492537313

F1 Score: 0.8852459016393442

PassiveAggressiveClassifier(random_state=42)
-----------------------
Accuracy: 0.4318181818181818

Recall: 0.09090909090909091

Precision: 1.0

F1 Score: 0.16666666666666669

GradientBoostingClassifier(random_state=42)
-----------------------
Accuracy: 0.8522727272727273

Recall: 0.8909090909090909

Precision: 0.875

F1 Score: 0.8828828828828829

AdaBoos

# Passive Aggressive Columns Dropped Not Normalized 

In [41]:
clf = PassiveAggressiveClassifier(random_state=42)
clf.fit(X_train_drop, y_train_drop)

predictions = clf.predict(X_test_drop)
actual = y_test_drop


print("Accuracy: " + str(clf.score(X_test_drop, y_test_drop)))
print("Recall: " + str(recall_score(actual, predictions)))
print("Precision: " + str(precision_score(actual, predictions)))
print("F1 Score: " + str(f1_score(actual, predictions)))


Accuracy: 0.8522727272727273
Recall: 0.9454545454545454
Precision: 0.8387096774193549
F1 Score: 0.8888888888888888


In [None]:
from sklearn.metrics import plot_confusion_matrix
plot_confusion_matrix(clf, X_test_drop, y_test_drop)  
plt.show() 