### Packages

In [None]:
# import packages
import copy
from math import log2, sqrt
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve, RocCurveDisplay, precision_recall_curve, PrecisionRecallDisplay, average_precision_score, auc

### Import data file and look for missing records

In [None]:
# read the space-separated spam dataset into a DataFrame
# (the path is relative to the notebook's working directory)
file_path = "spam.data"
df = pd.read_csv(filepath_or_buffer=file_path, sep=" ")

In [None]:
# determine if any cell of the dataset is missing
# np.where on a 2-D frame yields one index array per axis (rows, columns);
# both are non-empty together, so check them in a single test to avoid
# printing the same warning once per axis.
missing_records = np.where(pd.isnull(df))
if any(axis_indices.size > 0 for axis_indices in missing_records):
  print("There is at least one missing features' values")

In [None]:
# largest value in each column (quick look at feature ranges before scaling)
df.max()

make            4.54
address        14.28
all              5.1
3d             42.81
our             10.0
over            5.88
remove          7.27
internet       11.11
order           5.26
mail           18.18
receive         2.61
will            9.67
people          5.55
report          10.0
addresses       4.41
free            20.0
business        7.14
email           9.09
you            18.75
credit         18.18
your           11.11
font            17.1
000             5.45
money           12.5
hp             20.83
hpl            16.66
george         33.33
650             9.09
lab            14.28
labs            5.88
telnet          12.5
857             4.76
data           18.18
415             4.76
85              20.0
technology      7.69
1999            6.89
parts           8.33
pm             11.11
direct          4.76
cs              7.14
meeting        14.28
original        3.57
project         20.0
re             21.42
edu            22.05
table           2.17
conference   

In [None]:
# smallest value in each column (quick look at feature ranges before scaling)
df.min()

make          0.0
address       0.0
all           0.0
3d            0.0
our           0.0
over          0.0
remove        0.0
internet      0.0
order         0.0
mail          0.0
receive       0.0
will          0.0
people        0.0
report        0.0
addresses     0.0
free          0.0
business      0.0
email         0.0
you           0.0
credit        0.0
your          0.0
font          0.0
000           0.0
money         0.0
hp            0.0
hpl           0.0
george        0.0
650           0.0
lab           0.0
labs          0.0
telnet        0.0
857           0.0
data          0.0
415           0.0
85            0.0
technology    0.0
1999          0.0
parts         0.0
pm            0.0
direct        0.0
cs            0.0
meeting       0.0
original      0.0
project       0.0
re            0.0
edu           0.0
table         0.0
conference    0.0
semicol       0.0
paren         0.0
bracket       0.0
bang          0.0
dollar        0.0
pound         0.0
cap_avg       1.0
cap_long  

### Data - Raw, Normalized, Standardized

In [None]:
# raw data: every column except the 'Class' label, left unscaled
df_raw = df.drop(columns='Class')

# NOTE(review): both scalers below are fit on the full dataset; if the
# train/test split happens afterwards, test rows influence the scaling
# statistics -- confirm this is intended.

# normalization of features (MinMaxScaler with its default settings)
norm_scaler = MinMaxScaler()
normalized = norm_scaler.fit_transform(df_raw)
df_normalized = pd.DataFrame(data=normalized, columns=df_raw.columns)

# standardization of features (StandardScaler with its default settings)
standard_scaler = StandardScaler()
standardized = standard_scaler.fit_transform(df_raw)
df_standardized = pd.DataFrame(data=standardized, columns=df_raw.columns)


In [None]:
def data_selection(selection):
    """Return the feature DataFrame named by `selection`.

    Parameters
    ----------
    selection : str
        One of "df_raw", "df_normalized" or "df_standardized".

    Returns
    -------
    The matching module-level DataFrame, or None when `selection` is not
    one of the three recognised names.
    """
    # Compare by value. `is` tests object identity, which only matched here
    # when the caller's string happened to be the same interned literal; a
    # name built at runtime (user input, concatenation) silently fell through.
    if selection == "df_raw":
        return df_raw

    if selection == "df_normalized":
        return df_normalized

    if selection == "df_standardized":
        return df_standardized

    return None

### Parameters for each classifier

#### (1) Decision Tree ([Library](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier))

In [None]:
legend_label_DTC = ['log(N)', 'sqrt(N)', 'N/2', 'N'] #number of features to consider at each split (plot legend)
N_DTC = len(df_raw.columns) # total features in dataset
# `max_features` values, in the same order as legend_label_DTC.
# 'sqrt' is used instead of 'auto': for classifiers 'auto' was an alias of
# 'sqrt', and it is rejected by scikit-learn >= 1.3.
considered_features_DTC = ['log2', 'sqrt', round(N_DTC/2), None] #number of features to consider at each split
# Feature counts reported for each configuration. scikit-learn truncates
# log2(N) and sqrt(N) with int() (minimum 1) rather than rounding, so the
# same rule is applied here to report the counts the models really use.
features_per_config_DTC = [max(1, int(log2(N_DTC))), max(1, int(sqrt(N_DTC))), round(N_DTC/2), N_DTC] # num of features per configuration

predictions_DTC = [] # stores predictions
accuracies_by_num_features_DTC = [] # stores accuracies
cms_DTC = [] # stores confusion matrices
probs_DTC = [] # stores score probabilities of the test data

#### (2) Random Forest ([Library](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html?highlight=random%20forest#sklearn.ensemble.RandomForestClassifier))

In [None]:
legend_label_RFC = ['log(N)', 'sqrt(N)', 'N/2', 'N'] #number of features to consider at each split (plot legend)
N_RFC = len(df_raw.columns) # total features in dataset
# `max_features` values, in the same order as legend_label_RFC.
# 'sqrt' is used instead of 'auto': for classifiers 'auto' was an alias of
# 'sqrt', and it is rejected by scikit-learn >= 1.3.
considered_features_RFC = ['log2', 'sqrt', round(N_RFC/2), None] #number of features to consider at each split
# Feature counts reported for each configuration. scikit-learn truncates
# log2(N) and sqrt(N) with int() (minimum 1) rather than rounding, so the
# same rule is applied here to report the counts the models really use.
features_per_config_RFC = [max(1, int(log2(N_RFC))), max(1, int(sqrt(N_RFC))), round(N_RFC/2), N_RFC] # num of features per configuration
n_of_base_learners_RFC = [1, 10, 50, 100, 500, 1000, 5000] #number of base learners

predictions_RFC = [] # stores predictions for each RandomForestClassifier model
accuracies_by_num_features_RFC = [] # stores accuracies for each RandomForestClassifier model
cms_RFC = [] # stores confusion matrices for each RandomForestClassifier model
probs_RFC = [] # stores score probabilities of the test data for each RandomForestClassifier model

#### (3) & (4) Boosting (Ada) Ensemble with logistic regression classifier as the base learner, AdaBoost Ensemble with Decision Tree as the base learner ([Library](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html?highlight=adaboost#sklearn.ensemble.AdaBoostClassifier))

In [None]:
legend_label_Boost = ['log(N)', 'sqrt(N)', 'N/2', 'N'] #number of features to consider at each split (plot legend)
N_Boost = len(df_raw.columns) # total features in dataset
# `max_features` values, in the same order as legend_label_Boost.
# 'sqrt' is used instead of 'auto': for classifiers 'auto' was an alias of
# 'sqrt', and it is rejected by scikit-learn >= 1.3.
considered_features_Boost = ['log2', 'sqrt', round(N_Boost/2), None] #number of features to consider at each split
# Feature counts reported for each configuration. scikit-learn truncates
# log2(N) and sqrt(N) with int() (minimum 1) rather than rounding, so the
# same rule is applied here to report the counts the models really use.
features_per_config_Boost = [max(1, int(log2(N_Boost))), max(1, int(sqrt(N_Boost))), round(N_Boost/2), N_Boost] # num of features per configuration
n_of_base_learners_Boost = [1, 10, 50, 100, 500, 1000, 5000] #number of base learners

# NOTE: these four lists are shared by the Logistic Regression and the
# Decision Tree boosting runs.
predictions_Boost = [] # stores predictions
accuracies_by_num_base_learners_Boost = [] # stores accuracies
cms_Boost = [] # stores confusion matrices
probs_Boost = [] # stores score probabilities of the test data

### Functions for each classifier

#### (1) Decision Trees

In [None]:
def initialize_DTC():
  """Empty the four module-level Decision Tree result lists in place."""
  for stored in (predictions_DTC, accuracies_by_num_features_DTC, cms_DTC, probs_DTC):
    del stored[:]

In [None]:
def DTC_predictions():
  """Train and evaluate one Decision Tree per `max_features` configuration.

  For every entry of `considered_features_DTC` an entropy-criterion tree is
  fit on (X_train, Y_train) and applied to X_test. Predictions, accuracy,
  class probabilities and the confusion matrix are appended to the
  module-level `*_DTC` lists, and the confusion matrix, rate metrics and
  ROC / precision-recall curves are printed and plotted.

  Uses names defined outside this function: X_train, Y_train, X_test,
  Y_test, rng_seed and class_labels.
  """

  print("------BEGIN: Decision Tree Classifiers------")
  # loop through each tunable model
  for index, feature_config in enumerate(tqdm(considered_features_DTC, ascii=True, unit='Feature Config')):

    print('')
    print("---------------------------")
    print('')
    print("Feature Config:", legend_label_DTC[index], "= {} Features".format(features_per_config_DTC[index]))

    # train the decision tree on the training split
    dtc = DecisionTreeClassifier(criterion="entropy", random_state=rng_seed, max_features=feature_config)
    dtc.fit(X_train, Y_train)

    # apply model to test split
    Y_pred = dtc.predict(X_test)
    Y_score = dtc.predict_proba(X_test)
    accuracy = accuracy_score(Y_test, Y_pred)
    cm = confusion_matrix(Y_test, Y_pred, labels=class_labels)

    # store predictions, accuracy, probability scores and confusion matrix
    predictions_DTC.append(Y_pred)
    accuracies_by_num_features_DTC.append(accuracy)
    probs_DTC.append(Y_score)
    cms_DTC.append(cm)

    # Report from the values just computed rather than reading the lists
    # back by `index`: if the lists were not cleared before this call,
    # position `index` holds an entry from an earlier run.
    print("\nAccuracy:", round(accuracy*100,4), "%\n")

    TN, FP, FN, TP = cm.ravel()

    print('(TN, FP, FN, TP) =', (TN, FP, FN, TP),'\n')

    ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels).plot()
    plt.show()

    # True Positive Rate TPR = TP/(TP+FN)
    TPR = TP/(TP+FN)
    print("\nTrue Positive Rate: TPR = TP/(TP+FN) =",round(TPR*100, 4), "%")

    # True Negative Rate TNR = TN/(TN+FP)
    TNR = TN/(FP+TN)
    print("\nTrue Negative Rate: TNR = TN/(TN+FP) =",round(TNR*100, 4), "%")

    # False Positive Rate FPR = FP/(FP+TN)
    FPR = FP/(FP+TN)
    print("\nFalse Positive Rate: FPR = FP/(FP+TN) =",round(FPR*100, 4), "%")

    # False Negative Rate FNR = FN/(FN+TP)
    FNR = FN/(FN+TP)
    print("\nFalse Negative Rate: FNR = FN/(FN+TP) =",round(FNR*100, 4), "%\n")

    # Precision
    PRECISION = TP/(TP+FP)
    print("Precision Rate =",round(PRECISION*100, 4), "%")

    # Recall is the same quantity as the true positive rate
    RECALL = TPR
    print("Recall Rate =",round(RECALL*100, 4), "%\n")

    # ROC and precision-recall curves with their areas under the curve
    # NOTE(review): column 1 of predict_proba is taken to be the 'spam'
    # class -- confirm against dtc.classes_.
    fpr, tpr, _ = roc_curve(Y_test, Y_score[:,1], pos_label='spam')
    roc_auc = auc(fpr, tpr)
    prec, recall, _ = precision_recall_curve(Y_test, Y_score[:,1], pos_label='spam')
    pr_auc = auc(recall, prec)

    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(8, 5)

    ax1.plot(fpr, tpr, lw=1)
    # red dot: the (FPR, TPR) pair obtained from the confusion matrix above
    ax1.plot(FPR, TPR, marker="o", ms = 2, color = 'red')
    ax1.set_title('ROC\nAUC = {}'.format(round(roc_auc,4)))
    ax1.set(xlabel = 'False Positive Rate', ylabel = 'True Positive Rate')
    ax1.set_xlim([-.1, 1.1])

    ax2.plot(recall, prec, lw=1)
    # red dot: the (recall, precision) pair obtained from the confusion matrix
    ax2.plot(RECALL, PRECISION, marker="o", ms = 2, color = 'red')
    ax2.set_title('PR\nAUC = {}'.format(round(pr_auc,4)))
    ax2.set(xlabel = 'Recall', ylabel = 'Precision')
    ax2.set_xlim([-.1, 1.1])

    plt.tight_layout()
    plt.show()

  print("\n-----END-----")

In [None]:
# Plot graphs (DTC)
def plot_all_graphs_DTC():
  """Plot the stored Decision Tree accuracies against the feature-count configurations.

  The points are drawn at evenly spaced positions and the x ticks are
  labelled with the entries of `features_per_config_DTC`.
  """
  tick_positions = range(len(features_per_config_DTC))

  # accuracy of each configuration, in the order the models were evaluated
  plt.plot(tick_positions, accuracies_by_num_features_DTC, marker="o")

  print('\n-------DTCs SUMMARY-------\n')

  plt.xticks(tick_positions, features_per_config_DTC)
  plt.xlabel("Number of Features")
  plt.ylabel("Accuracies")
  plt.title("Accuracies of DTCs vs Number of Features")

  plt.show()

In [None]:
def best_accuracy_DTC():
  """Report the stored Decision Tree configuration with the highest accuracy.

  Picks the first maximum of `accuracies_by_num_features_DTC`, then prints
  and plots that model's confusion matrix, rate metrics and ROC /
  precision-recall curves from the stored `cms_DTC` and `probs_DTC` entries.

  Uses names defined outside this function: Y_test and class_labels.
  """
  max_accuracy = max(accuracies_by_num_features_DTC)
  max_index = accuracies_by_num_features_DTC.index(max_accuracy)

  print('\n-----BEST PERFORMANCE-----\n')
  print("Feature Config:", legend_label_DTC[max_index], "= {} Features with accuracy:".format(features_per_config_DTC[max_index]), round(max_accuracy*100, 4), "%\n")

  cm = cms_DTC[max_index]
  scores = probs_DTC[max_index]

  TN, FP, FN, TP = cm.ravel()

  print('(TN, FP, FN, TP) =', (TN, FP, FN, TP),'\n')

  ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels).plot()
  plt.show()

  # True Positive Rate TPR = TP/(TP+FN)
  TPR = TP/(TP+FN)
  print("\nTrue Positive Rate: TPR = TP/(TP+FN) =",round(TPR*100, 4), "%")

  # True Negative Rate TNR = TN/(TN+FP)
  TNR = TN/(FP+TN)
  print("\nTrue Negative Rate: TNR = TN/(TN+FP) =",round(TNR*100, 4), "%")

  # False Positive Rate FPR = FP/(FP+TN)
  FPR = FP/(FP+TN)
  print("\nFalse Positive Rate: FPR = FP/(FP+TN) =",round(FPR*100, 4), "%")

  # False Negative Rate FNR = FN/(FN+TP)
  FNR = FN/(FN+TP)
  print("\nFalse Negative Rate: FNR = FN/(FN+TP) =",round(FNR*100, 4), "%\n")

  # Precision
  PRECISION = TP/(TP+FP)
  print("Precision Rate =",round(PRECISION*100, 4), "%")

  # Recall is the same quantity as the true positive rate
  RECALL = TPR
  print("Recall Rate =",round(RECALL*100, 4), "%\n")

  # ROC and precision-recall curves with their areas under the curve
  # NOTE(review): column 1 of the stored probabilities is taken to be the
  # 'spam' class -- confirm against the fitted model's classes_.
  fpr, tpr, _ = roc_curve(Y_test, scores[:,1], pos_label='spam')
  roc_auc = auc(fpr, tpr)
  prec, recall, _ = precision_recall_curve(Y_test, scores[:,1], pos_label='spam')
  pr_auc = auc(recall, prec)

  fig, (ax1, ax2) = plt.subplots(1, 2)
  fig.set_size_inches(8, 5)

  ax1.plot(fpr, tpr, lw=1)
  # red dot: the (FPR, TPR) pair obtained from the confusion matrix above
  ax1.plot(FPR, TPR, marker="o", ms = 2, color = 'red')
  ax1.set_title('ROC\nAUC = {}'.format(round(roc_auc,4)))
  ax1.set(xlabel = 'False Positive Rate', ylabel = 'True Positive Rate')
  ax1.set_xlim([-.1, 1.1])

  ax2.plot(recall, prec, lw=1)
  # red dot: the (recall, precision) pair obtained from the confusion matrix
  ax2.plot(RECALL, PRECISION, marker="o", ms = 2, color = 'red')
  ax2.set_title('PR\nAUC = {}'.format(round(pr_auc,4)))
  ax2.set(xlabel = 'Recall', ylabel = 'Precision')
  ax2.set_xlim([-.1, 1.1])

  plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
  plt.show()

In [None]:
def DTC_Evaluation():
  """Run the Decision Tree pipeline: reset results, train/evaluate, plot, report best."""
  pipeline = (initialize_DTC, DTC_predictions, plot_all_graphs_DTC, best_accuracy_DTC)
  for stage in pipeline:
    stage()

#### (2) Random Forest

In [None]:
def initialize_RFC():
  """Empty the four module-level Random Forest result lists in place."""
  for stored in (predictions_RFC, accuracies_by_num_features_RFC, cms_RFC, probs_RFC):
    del stored[:]

In [None]:
def RFC_predictions():
  """Train and evaluate Random Forests over feature and ensemble-size settings.

  For every `max_features` entry of `considered_features_RFC`, one forest is
  fit per ensemble size in `n_of_base_learners_RFC`. The per-size
  predictions, accuracies, probabilities and confusion matrices are
  collected in lists, and one such list per feature configuration is
  appended to the module-level `*_RFC` lists. For each feature
  configuration the most accurate ensemble size (first one on ties) is
  printed and plotted.

  Uses names defined outside this function: X_train, Y_train, X_test,
  Y_test, rng_seed and class_labels.
  """

  print("------BEGIN: Random Forest Classifiers------")
  # loop through each tunable model
  for index, feature_config in enumerate(tqdm(considered_features_RFC, ascii=True, unit='Feature Config')):

    print('')
    print("---------------------------")
    print('')
    print("Feature Config:", legend_label_RFC[index], "= {} Features".format(features_per_config_RFC[index]))

    # results for this feature configuration, one entry per ensemble size
    predictions = []
    accuracies = []
    cms = []
    probs = []

    for base_learner in tqdm(n_of_base_learners_RFC, ascii=True, unit='Base Learners Config'):

      # train the forest on the training split
      rfc = RandomForestClassifier(n_estimators=base_learner, criterion="entropy", bootstrap=True, random_state=rng_seed, max_features=feature_config)
      rfc.fit(X_train, Y_train)

      # apply model to test split
      Y_pred = rfc.predict(X_test)
      Y_score = rfc.predict_proba(X_test)

      # store predictions, accuracy, probability scores and confusion matrix
      predictions.append(Y_pred)
      accuracies.append(accuracy_score(Y_test, Y_pred))
      probs.append(Y_score)
      cms.append(confusion_matrix(Y_test, Y_pred, labels=class_labels))

    predictions_RFC.append(predictions)
    accuracies_by_num_features_RFC.append(accuracies)
    cms_RFC.append(cms)
    probs_RFC.append(probs)

    # display stats of best classifier by Base Learner Config
    max_accuracy = max(accuracies)
    max_index = accuracies.index(max_accuracy)
    print("\nMax Accuracy ({} base learners):".format(n_of_base_learners_RFC[max_index]), round(max_accuracy*100,4), "%\n")

    best_cm = cms[max_index]
    best_scores = probs[max_index]

    TN, FP, FN, TP = best_cm.ravel()

    print('(TN, FP, FN, TP) =', (TN, FP, FN, TP),'\n')

    ConfusionMatrixDisplay(confusion_matrix=best_cm, display_labels=class_labels).plot()
    plt.show()

    # True Positive Rate TPR = TP/(TP+FN)
    TPR = TP/(TP+FN)
    print("\nTrue Positive Rate: TPR = TP/(TP+FN) =",round(TPR*100, 4), "%")

    # True Negative Rate TNR = TN/(TN+FP)
    TNR = TN/(FP+TN)
    print("\nTrue Negative Rate: TNR = TN/(TN+FP) =",round(TNR*100, 4), "%")

    # False Positive Rate FPR = FP/(FP+TN)
    FPR = FP/(FP+TN)
    print("\nFalse Positive Rate: FPR = FP/(FP+TN) =",round(FPR*100, 4), "%")

    # False Negative Rate FNR = FN/(FN+TP)
    FNR = FN/(FN+TP)
    print("\nFalse Negative Rate: FNR = FN/(FN+TP) =",round(FNR*100, 4), "%\n")

    # Precision
    PRECISION = TP/(TP+FP)
    print("Precision Rate =",round(PRECISION*100, 4), "%")

    # Recall is the same quantity as the true positive rate
    RECALL = TPR
    print("Recall Rate =",round(RECALL*100, 4), "%\n")

    # ROC and precision-recall curves with their areas under the curve
    # NOTE(review): column 1 of predict_proba is taken to be the 'spam'
    # class -- confirm against the fitted model's classes_.
    fpr, tpr, _ = roc_curve(Y_test, best_scores[:,1], pos_label='spam')
    roc_auc = auc(fpr, tpr)
    prec, recall, _ = precision_recall_curve(Y_test, best_scores[:,1], pos_label='spam')
    pr_auc = auc(recall, prec)

    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(8, 5)

    ax1.plot(fpr, tpr, lw=1)
    # red dot: the (FPR, TPR) pair obtained from the confusion matrix above
    ax1.plot(FPR, TPR, marker="o", ms = 2, color = 'red')
    ax1.set_title('ROC\nAUC = {}'.format(round(roc_auc,4)))
    ax1.set(xlabel = 'False Positive Rate', ylabel = 'True Positive Rate')
    ax1.set_xlim([-.1, 1.1])

    ax2.plot(recall, prec, lw=1)
    # red dot: the (recall, precision) pair obtained from the confusion matrix
    ax2.plot(RECALL, PRECISION, marker="o", ms = 2, color = 'red')
    ax2.set_title('PR\nAUC = {}'.format(round(pr_auc,4)))
    ax2.set(xlabel = 'Recall', ylabel = 'Precision')
    ax2.set_xlim([-.1, 1.1])

    plt.tight_layout()
    plt.show()

  print("\n-----END-----")

In [None]:
# Plot graphs (RFC)
def plot_all_graphs_RFC():
  """Plot Random Forest accuracy against ensemble size, one line per feature configuration.

  The points are drawn at evenly spaced positions and the x ticks are
  labelled with the entries of `n_of_base_learners_RFC`.
  """
  # one line style per feature configuration, in legend order
  line_style = ['-', '--', '-.', ':']
  tick_positions = range(len(n_of_base_learners_RFC))

  for config_index in range(len(accuracies_by_num_features_RFC)):
    series = accuracies_by_num_features_RFC[config_index]
    plt.plot(tick_positions, series, line_style[config_index], marker="o")

  print('\n-------RFCs SUMMARY-------\n')

  plt.legend(legend_label_RFC, loc="best")
  plt.xticks(tick_positions, n_of_base_learners_RFC)
  plt.xlabel("Number of Base Learners")
  plt.ylabel("Accuracies")
  plt.title("Accuracies of RFCs vs Number of Base Learners")

  plt.show()


In [None]:
def best_accuracy_RFC():
  """Report the stored Random Forest with the highest accuracy.

  Scans `accuracies_by_num_features_RFC` (one list of accuracies per feature
  configuration) for the best (feature configuration, ensemble size) pair,
  then prints and plots that model's confusion matrix, rate metrics and
  ROC / precision-recall curves from the stored `cms_RFC` and `probs_RFC`.

  Uses names defined outside this function: Y_test and class_labels.
  """
  # Named locals instead of a positional list; the initial values match the
  # placeholders that were previously stored in that list.
  best_label = ''
  best_config_index = 0
  best_num_learners = 0
  best_learner_index = 0
  best_accuracy = 0

  for index, accuracies in enumerate(accuracies_by_num_features_RFC):
    max_accuracy = max(accuracies)

    # strict comparison: the earliest configuration wins ties
    if best_accuracy < max_accuracy:
      max_index = accuracies.index(max_accuracy)
      best_label = legend_label_RFC[index]
      best_config_index = index
      best_num_learners = n_of_base_learners_RFC[max_index]
      best_learner_index = max_index
      best_accuracy = max_accuracy

  print('\n-----BEST PERFORMANCE-----\n')
  print("Feature Config:", best_label, " = {} Features with accuracy ({} base learners):".format(features_per_config_RFC[best_config_index],best_num_learners), round(best_accuracy*100, 4), "%\n")

  cm = cms_RFC[best_config_index][best_learner_index]
  scores = probs_RFC[best_config_index][best_learner_index]

  TN, FP, FN, TP = cm.ravel()

  print('(TN, FP, FN, TP) =', (TN, FP, FN, TP),'\n')

  ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels).plot()
  plt.show()

  # True Positive Rate TPR = TP/(TP+FN)
  TPR = TP/(TP+FN)
  print("\nTrue Positive Rate: TPR = TP/(TP+FN) =",round(TPR*100, 4), "%")

  # True Negative Rate TNR = TN/(TN+FP)
  TNR = TN/(FP+TN)
  print("\nTrue Negative Rate: TNR = TN/(TN+FP) =",round(TNR*100, 4), "%")

  # False Positive Rate FPR = FP/(FP+TN)
  FPR = FP/(FP+TN)
  print("\nFalse Positive Rate: FPR = FP/(FP+TN) =",round(FPR*100, 4), "%")

  # False Negative Rate FNR = FN/(FN+TP)
  FNR = FN/(FN+TP)
  print("\nFalse Negative Rate: FNR = FN/(FN+TP) =",round(FNR*100, 4), "%\n")

  # Precision
  PRECISION = TP/(TP+FP)
  print("Precision Rate =",round(PRECISION*100, 4), "%")

  # Recall is the same quantity as the true positive rate
  RECALL = TPR
  print("Recall Rate =",round(RECALL*100, 4), "%\n")

  # ROC and precision-recall curves with their areas under the curve
  # NOTE(review): column 1 of the stored probabilities is taken to be the
  # 'spam' class -- confirm against the fitted model's classes_.
  fpr, tpr, _ = roc_curve(Y_test, scores[:,1], pos_label='spam')
  roc_auc = auc(fpr, tpr)
  prec, recall, _ = precision_recall_curve(Y_test, scores[:,1], pos_label='spam')
  pr_auc = auc(recall, prec)

  fig, (ax1, ax2) = plt.subplots(1, 2)
  fig.set_size_inches(8, 5)

  ax1.plot(fpr, tpr, lw=1)
  # red dot: the (FPR, TPR) pair obtained from the confusion matrix above
  ax1.plot(FPR, TPR, marker="o", ms = 2, color = 'red')
  ax1.set_title('ROC\nAUC = {}'.format(round(roc_auc,4)))
  ax1.set(xlabel = 'False Positive Rate', ylabel = 'True Positive Rate')
  ax1.set_xlim([-.1, 1.1])

  ax2.plot(recall, prec, lw=1)
  # red dot: the (recall, precision) pair obtained from the confusion matrix
  ax2.plot(RECALL, PRECISION, marker="o", ms = 2, color = 'red')
  ax2.set_title('PR\nAUC = {}'.format(round(pr_auc,4)))
  ax2.set(xlabel = 'Recall', ylabel = 'Precision')
  ax2.set_xlim([-.1, 1.1])

  plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
  plt.show()

In [None]:
def RFC_Evaluation():
  """Run the Random Forest pipeline: reset results, train/evaluate, plot, report best."""
  pipeline = (initialize_RFC, RFC_predictions, plot_all_graphs_RFC, best_accuracy_RFC)
  for stage in pipeline:
    stage()

#### (3) & (4) Boosting (Ada) Ensemble with logistic regression classifier as the base learner, AdaBoost Ensemble with Decision Tree as the base learner

In [None]:
def initialize_Boost():
  """Empty the four module-level boosting result lists in place."""
  for stored in (predictions_Boost, accuracies_by_num_base_learners_Boost, cms_Boost, probs_Boost):
    del stored[:]

In [None]:
def Boost_LR_predictions():
  """Train and evaluate AdaBoost with a Logistic Regression base learner.

  One ensemble is fit per entry of `n_of_base_learners_Boost` on
  (X_train, Y_train) and applied to X_test. Predictions, accuracy, class
  probabilities and the confusion matrix are appended to the module-level
  `*_Boost` lists, and the confusion matrix, rate metrics and ROC /
  precision-recall curves are printed and plotted.

  Uses names defined outside this function: X_train, Y_train, X_test,
  Y_test, rng_seed and class_labels.
  """

  print("------BEGIN: Boosting Ensemble with Logistic Regression------")
  # loop through each tunable model
  lr_est = LogisticRegression(max_iter=10000, random_state=rng_seed)  #logistic regression classifier as the base learner

  for index, base_learner in enumerate(tqdm(n_of_base_learners_Boost, ascii=True, unit='Base Learners Config')):

    print('')
    print("---------------------------")
    print('')
    print("Base Learner Config:", base_learner)

    # The base learner is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` and the old name was later removed, so
    # the positional form works on both sides of the rename.
    # `random_state` seeds the ensemble itself so runs are repeatable.
    bs_lr = AdaBoostClassifier(lr_est, n_estimators=base_learner, random_state=rng_seed)

    # train model using training split
    bs_lr.fit(X_train, Y_train)

    # apply model to test split
    Y_pred = bs_lr.predict(X_test)
    Y_score = bs_lr.predict_proba(X_test)
    accuracy = accuracy_score(Y_test, Y_pred)
    cm = confusion_matrix(Y_test, Y_pred, labels=class_labels)

    # store predictions, accuracy, probability scores and confusion matrix
    predictions_Boost.append(Y_pred)
    accuracies_by_num_base_learners_Boost.append(accuracy)
    probs_Boost.append(Y_score)
    cms_Boost.append(cm)

    # Report from the values just computed rather than reading the shared
    # lists back by `index`: if they were not cleared before this call,
    # position `index` holds an entry from an earlier run.
    print("\nAccuracy:", round(accuracy*100,4), "%\n")

    TN, FP, FN, TP = cm.ravel()

    print('(TN, FP, FN, TP) =', (TN, FP, FN, TP),'\n')

    ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels).plot()
    plt.show()

    # True Positive Rate TPR = TP/(TP+FN)
    TPR = TP/(TP+FN)
    print("\nTrue Positive Rate: TPR = TP/(TP+FN) =",round(TPR*100, 4), "%")

    # True Negative Rate TNR = TN/(TN+FP)
    TNR = TN/(FP+TN)
    print("\nTrue Negative Rate: TNR = TN/(TN+FP) =",round(TNR*100, 4), "%")

    # False Positive Rate FPR = FP/(FP+TN)
    FPR = FP/(FP+TN)
    print("\nFalse Positive Rate: FPR = FP/(FP+TN) =",round(FPR*100, 4), "%")

    # False Negative Rate FNR = FN/(FN+TP)
    FNR = FN/(FN+TP)
    print("\nFalse Negative Rate: FNR = FN/(FN+TP) =",round(FNR*100, 4), "%\n")

    # Precision
    PRECISION = TP/(TP+FP)
    print("Precision Rate =",round(PRECISION*100, 4), "%")

    # Recall is the same quantity as the true positive rate
    RECALL = TPR
    print("Recall Rate =",round(RECALL*100, 4), "%\n")

    # ROC and precision-recall curves with their areas under the curve
    # NOTE(review): column 1 of predict_proba is taken to be the 'spam'
    # class -- confirm against bs_lr.classes_.
    fpr, tpr, _ = roc_curve(Y_test, Y_score[:,1], pos_label='spam')
    roc_auc = auc(fpr, tpr)
    prec, recall, _ = precision_recall_curve(Y_test, Y_score[:,1], pos_label='spam')
    pr_auc = auc(recall, prec)

    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(8, 5)

    ax1.plot(fpr, tpr, lw=1)
    # red dot: the (FPR, TPR) pair obtained from the confusion matrix above
    ax1.plot(FPR, TPR, marker="o", ms = 2, color = 'red')
    ax1.set_title('ROC\nAUC = {}'.format(round(roc_auc,4)))
    ax1.set(xlabel = 'False Positive Rate', ylabel = 'True Positive Rate')
    ax1.set_xlim([-.1, 1.1])

    ax2.plot(recall, prec, lw=1)
    # red dot: the (recall, precision) pair obtained from the confusion matrix
    ax2.plot(RECALL, PRECISION, marker="o", ms = 2, color = 'red')
    ax2.set_title('PR\nAUC = {}'.format(round(pr_auc,4)))
    ax2.set(xlabel = 'Recall', ylabel = 'Precision')
    ax2.set_xlim([-.1, 1.1])

    plt.tight_layout()
    plt.show()

  print("\n-----END-----")


def Boost_DTC_predictions():
  """Train and evaluate AdaBoost with a Decision Tree base learner.

  For every `max_features` entry of `considered_features_Boost`, one
  ensemble is fit per size in `n_of_base_learners_Boost`. The per-size
  predictions, accuracies, probabilities and confusion matrices are
  collected in lists, and one such list per feature configuration is
  appended to the module-level `*_Boost` lists. For each feature
  configuration the most accurate ensemble size (first one on ties) is
  printed and plotted.

  Uses names defined outside this function: X_train, Y_train, X_test,
  Y_test, rng_seed and class_labels.
  """

  print("------BEGIN: AdaBoost Ensemble with Decision Tree------")

  # loop through each tunable model
  for index, feature_config in enumerate(tqdm(considered_features_Boost, ascii=True, unit='Feature Config')):

    print('')
    print("---------------------------")
    print('')
    print("Feature Config:", legend_label_Boost[index], "= {} Features".format(features_per_config_Boost[index]))

    # results for this feature configuration, one entry per ensemble size
    predictions = []
    accuracies = []
    cms = []
    probs = []

    # decision tree classifier as the base learner for feature config
    dtc_est = DecisionTreeClassifier(criterion="entropy", random_state=rng_seed, max_features=feature_config)

    for base_learner in tqdm(n_of_base_learners_Boost, ascii=True, unit='Base Learners Config'):

      # The base learner is passed positionally: its keyword was renamed
      # from `base_estimator` to `estimator` and the old name was later
      # removed, so the positional form works on both sides of the rename.
      # `random_state` is set on the ensemble because AdaBoost uses its own
      # seed for each boosted estimator; without it the trees that sample
      # features (max_features < N) give different results on every run.
      bs_dtc = AdaBoostClassifier(dtc_est, n_estimators=base_learner, random_state=rng_seed)

      # train model using training split
      bs_dtc.fit(X_train, Y_train)

      # apply model to test split
      Y_pred = bs_dtc.predict(X_test)
      Y_score = bs_dtc.predict_proba(X_test)

      # store predictions, accuracy, probability scores and confusion matrix
      predictions.append(Y_pred)
      accuracies.append(accuracy_score(Y_test, Y_pred))
      probs.append(Y_score)
      cms.append(confusion_matrix(Y_test, Y_pred, labels=class_labels))

    predictions_Boost.append(predictions)
    accuracies_by_num_base_learners_Boost.append(accuracies)
    cms_Boost.append(cms)
    probs_Boost.append(probs)

    # display stats of best classifier by Base Learner Config
    max_accuracy = max(accuracies)
    max_index = accuracies.index(max_accuracy)
    print("\nMax Accuracy ({} base learners):".format(n_of_base_learners_Boost[max_index]), round(max_accuracy*100,4), "%\n")

    best_cm = cms[max_index]
    best_scores = probs[max_index]

    TN, FP, FN, TP = best_cm.ravel()

    print('(TN, FP, FN, TP) =', (TN, FP, FN, TP),'\n')

    ConfusionMatrixDisplay(confusion_matrix=best_cm, display_labels=class_labels).plot()
    plt.show()

    # True Positive Rate TPR = TP/(TP+FN)
    TPR = TP/(TP+FN)
    print("\nTrue Positive Rate: TPR = TP/(TP+FN) =",round(TPR*100, 4), "%")

    # True Negative Rate TNR = TN/(TN+FP)
    TNR = TN/(FP+TN)
    print("\nTrue Negative Rate: TNR = TN/(TN+FP) =",round(TNR*100, 4), "%")

    # False Positive Rate FPR = FP/(FP+TN)
    FPR = FP/(FP+TN)
    print("\nFalse Positive Rate: FPR = FP/(FP+TN) =",round(FPR*100, 4), "%")

    # False Negative Rate FNR = FN/(FN+TP)
    FNR = FN/(FN+TP)
    print("\nFalse Negative Rate: FNR = FN/(FN+TP) =",round(FNR*100, 4), "%\n")

    # Precision
    PRECISION = TP/(TP+FP)
    print("Precision Rate =",round(PRECISION*100, 4), "%")

    # Recall is the same quantity as the true positive rate
    RECALL = TPR
    print("Recall Rate =",round(RECALL*100, 4), "%\n")

    # ROC and precision-recall curves with their areas under the curve
    # NOTE(review): column 1 of predict_proba is taken to be the 'spam'
    # class -- confirm against the fitted model's classes_.
    fpr, tpr, _ = roc_curve(Y_test, best_scores[:,1], pos_label='spam')
    roc_auc = auc(fpr, tpr)
    prec, recall, _ = precision_recall_curve(Y_test, best_scores[:,1], pos_label='spam')
    pr_auc = auc(recall, prec)

    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(8, 5)

    ax1.plot(fpr, tpr, lw=1)
    # red dot: the (FPR, TPR) pair obtained from the confusion matrix above
    ax1.plot(FPR, TPR, marker="o", ms = 2, color = 'red')
    ax1.set_title('ROC\nAUC = {}'.format(round(roc_auc,4)))
    ax1.set(xlabel = 'False Positive Rate', ylabel = 'True Positive Rate')
    ax1.set_xlim([-.1, 1.1])

    ax2.plot(recall, prec, lw=1)
    # red dot: the (recall, precision) pair obtained from the confusion matrix
    ax2.plot(RECALL, PRECISION, marker="o", ms = 2, color = 'red')
    ax2.set_title('PR\nAUC = {}'.format(round(pr_auc,4)))
    ax2.set(xlabel = 'Recall', ylabel = 'Precision')
    ax2.set_xlim([-.1, 1.1])

    plt.tight_layout()
    plt.show()

  print("\n-----END-----")

In [None]:
def plot_all_graphs_Boost_LR():
  """Plot the stored boosting accuracies against the number of base learners.

  The points are drawn at evenly spaced positions and the x ticks are
  labelled with the entries of `n_of_base_learners_Boost`.
  """
  tick_positions = range(len(n_of_base_learners_Boost))

  # accuracy of each ensemble size, in the order the models were evaluated
  plt.plot(tick_positions, accuracies_by_num_base_learners_Boost, marker="o")

  print('\n-------Boost LR SUMMARY-------\n')

  plt.xticks(tick_positions, n_of_base_learners_Boost)
  plt.xlabel("Number of Base Learners")
  plt.ylabel("Accuracies")
  plt.title("Accuracies of Boosting LR vs Number of Base Learners")

  plt.show()


def plot_all_graphs_Boost_DTC():
  """Plot boosting accuracy against ensemble size, one line per feature configuration.

  Expects `accuracies_by_num_base_learners_Boost` to hold one list of
  accuracies per feature configuration. The x ticks are labelled with the
  entries of `n_of_base_learners_Boost`.
  """
  # one line style per feature configuration, in legend order
  line_style = ['-', '--', '-.', ':']
  tick_positions = range(len(n_of_base_learners_Boost))

  for config_index in range(len(accuracies_by_num_base_learners_Boost)):
    series = accuracies_by_num_base_learners_Boost[config_index]
    plt.plot(tick_positions, series, line_style[config_index], marker="o")

  print('\n-------BOOST DTC SUMMARY-------\n')

  plt.legend(legend_label_Boost, loc="best")
  plt.xticks(tick_positions, n_of_base_learners_Boost)
  plt.xlabel("Number of Base Learners")
  plt.ylabel("Accuracies")
  plt.title("Accuracies of Boosting DTC vs Number of Base Learners")

  plt.show()

In [None]:
def best_accuracy_Boost_DTC():
  """Report the stored Decision Tree boosting ensemble with the highest accuracy.

  Scans `accuracies_by_num_base_learners_Boost` (one list of accuracies per
  feature configuration) for the best (feature configuration, ensemble size)
  pair, then prints and plots that model's confusion matrix, rate metrics
  and ROC / precision-recall curves from the stored `cms_Boost` and
  `probs_Boost`.

  Uses names defined outside this function: Y_test and class_labels.
  """
  # Named locals instead of a positional list; the initial values match the
  # placeholders that were previously stored in that list.
  best_label = ''
  best_config_index = 0
  best_num_learners = 0
  best_learner_index = 0
  best_accuracy = 0

  for index, accuracies in enumerate(accuracies_by_num_base_learners_Boost):
    max_accuracy = max(accuracies)

    # strict comparison: the earliest configuration wins ties
    if best_accuracy < max_accuracy:
      max_index = accuracies.index(max_accuracy)
      best_label = legend_label_Boost[index]
      best_config_index = index
      best_num_learners = n_of_base_learners_Boost[max_index]
      best_learner_index = max_index
      best_accuracy = max_accuracy

  print('\n-----BEST PERFORMANCE-----\n')
  print("Feature Config:", best_label, " = {} Features with accuracy ({} base learners):".format(features_per_config_Boost[best_config_index],best_num_learners), round(best_accuracy*100, 4), "%\n")

  cm = cms_Boost[best_config_index][best_learner_index]
  scores = probs_Boost[best_config_index][best_learner_index]

  TN, FP, FN, TP = cm.ravel()

  print('(TN, FP, FN, TP) =', (TN, FP, FN, TP),'\n')

  ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels).plot()
  plt.show()

  # True Positive Rate TPR = TP/(TP+FN)
  TPR = TP/(TP+FN)
  print("\nTrue Positive Rate: TPR = TP/(TP+FN) =",round(TPR*100, 4), "%")

  # True Negative Rate TNR = TN/(TN+FP)
  TNR = TN/(FP+TN)
  print("\nTrue Negative Rate: TNR = TN/(TN+FP) =",round(TNR*100, 4), "%")

  # False Positive Rate FPR = FP/(FP+TN)
  FPR = FP/(FP+TN)
  print("\nFalse Positive Rate: FPR = FP/(FP+TN) =",round(FPR*100, 4), "%")

  # False Negative Rate FNR = FN/(FN+TP)
  FNR = FN/(FN+TP)
  print("\nFalse Negative Rate: FNR = FN/(FN+TP) =",round(FNR*100, 4), "%\n")

  # Precision
  PRECISION = TP/(TP+FP)
  print("Precision Rate =",round(PRECISION*100, 4), "%")

  # Recall is the same quantity as the true positive rate
  RECALL = TPR
  print("Recall Rate =",round(RECALL*100, 4), "%\n")

  # ROC and precision-recall curves with their areas under the curve
  # NOTE(review): column 1 of the stored probabilities is taken to be the
  # 'spam' class -- confirm against the fitted model's classes_.
  fpr, tpr, _ = roc_curve(Y_test, scores[:,1], pos_label='spam')
  roc_auc = auc(fpr, tpr)
  prec, recall, _ = precision_recall_curve(Y_test, scores[:,1], pos_label='spam')
  pr_auc = auc(recall, prec)

  fig, (ax1, ax2) = plt.subplots(1, 2)
  fig.set_size_inches(8, 5)

  ax1.plot(fpr, tpr, lw=1)
  # red dot: the (FPR, TPR) pair obtained from the confusion matrix above
  ax1.plot(FPR, TPR, marker="o", ms = 2, color = 'red')
  ax1.set_title('ROC\nAUC = {}'.format(round(roc_auc,4)))
  ax1.set(xlabel = 'False Positive Rate', ylabel = 'True Positive Rate')
  ax1.set_xlim([-.1, 1.1])

  ax2.plot(recall, prec, lw=1)
  # red dot: the (recall, precision) pair obtained from the confusion matrix
  ax2.plot(RECALL, PRECISION, marker="o", ms = 2, color = 'red')
  ax2.set_title('PR\nAUC = {}'.format(round(pr_auc,4)))
  ax2.set(xlabel = 'Recall', ylabel = 'Precision')
  ax2.set_xlim([-.1, 1.1])

  plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
  plt.show()

def best_accuracy_Boost_LR():
  """Report the best-scoring configuration of the Boost_LR run.

  ("LR" is presumably the logistic-regression base learner -- confirm
  against Boost_LR_predictions.)

  Reads the module-level results accuracies_by_num_base_learners_Boost,
  n_of_base_learners_Boost, cms_Boost and probs_Boost, which are indexed
  in parallel, plus Y_test and class_labels. It picks the entry with the
  highest accuracy (the first one on ties) and, for that entry:

  * prints the configuration, its accuracy and the confusion-matrix counts,
  * displays the confusion matrix,
  * prints TPR, TNR, FPR, FNR, precision and recall,
  * plots the ROC and precision-recall curves side by side, with the
    operating point derived from the confusion matrix marked in red.

  Returns nothing; all results are printed or plotted.
  """
  # list.index returns the first position holding the maximum accuracy
  max_accuracy = max(accuracies_by_num_base_learners_Boost)
  max_index = accuracies_by_num_base_learners_Boost.index(max_accuracy)

  print('\n-----BEST PERFORMANCE-----\n')
  print("Base Learner Config:", n_of_base_learners_Boost[max_index], "Base Learners with accuracy:", round(max_accuracy*100, 4), "%\n")

  cm = cms_Boost[max_index]

  # NOTE(review): this unpacking assumes a 2x2 matrix whose row/column 0 is
  # the negative class ('ham') -- confirm where cms_Boost is built.
  TN, FP, FN, TP = cm.ravel()

  print('(TN, FP, FN, TP) =', (TN, FP, FN, TP),'\n')

  cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
  cm_display.plot()
  plt.show()

  # True Positive Rate TPR = TP/(TP+FN)
  TPR = TP/(TP+FN)
  print("\nTrue Positive Rate: TPR = TP/(TP+FN) =",round(TPR*100, 4), "%")

  # True Negative Rate TNR = TN/(TN+FP)
  TNR = TN/(FP+TN)
  print("\nTrue Negative Rate: TNR = TN/(TN+FP) =",round(TNR*100, 4), "%")

  # False Positive Rate FPR = FP/(FP+TN)
  FPR = FP/(FP+TN)
  print("\nFalse Positive Rate: FPR = FP/(FP+TN) =",round(FPR*100, 4), "%\n")

  # False Negative Rate FNR = FN/(FN+TP)
  FNR = FN/(FN+TP)
  print("\nFalse Negative Rate: FNR = FN/(FN+TP) =",round(FNR*100, 4), "%\n")

  # Precision
  PRECISION = TP/(TP+FP)
  print("Precision Rate =",round(PRECISION*100, 4), "%")

  # Recall is the same quantity as the true positive rate, TP/(TP+FN)
  RECALL = TPR
  print("Recall Rate =",round(RECALL*100, 4), "%\n")

  # Scores fed to both curves: column 1 of the stored probabilities,
  # assumed to be the 'spam' column -- TODO confirm against the classifier's classes_.
  spam_scores = probs_Boost[max_index][:,1]

  # ROC curve and the area under it (thresholds are not needed here)
  fpr, tpr, _ = roc_curve(Y_test, spam_scores, pos_label='spam')
  roc_auc = auc(fpr, tpr)

  # precision-recall curve and the area under it
  prec, recall, _ = precision_recall_curve(Y_test, spam_scores, pos_label='spam')
  pr_auc = auc(recall, prec)

  fig, (ax1, ax2) = plt.subplots(1, 2)
  fig.set_size_inches(8, 5)

  # left panel: ROC curve, red dot = (FPR, TPR) of the reported confusion matrix
  ax1.plot(fpr, tpr, lw=1)
  ax1.plot(FPR, TPR, marker="o", ms = 2, color = 'red')
  ax1.set_title('ROC\nAUC = {}'.format(round(roc_auc,4)))
  ax1.set(xlabel = 'False Positive Rate', ylabel = 'True Positive Rate')
  ax1.set_xlim([-.1, 1.1])

  # right panel: PR curve, red dot = (recall, precision) of the reported confusion matrix
  ax2.plot(recall, prec, lw=1)
  ax2.plot(RECALL, PRECISION, marker="o", ms = 2, color = 'red')
  ax2.set_title('PR\nAUC = {}'.format(round(pr_auc,4)))
  ax2.set(xlabel = 'Recall', ylabel = 'Precision')
  ax2.set_xlim([-.1, 1.1])

  plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
  plt.show()

In [None]:
def Boost_LR_Evaluation():
  """Run the Boost_LR evaluation steps in order.

  Calls, one after another: initialize_Boost, Boost_LR_predictions,
  plot_all_graphs_Boost_LR and best_accuracy_Boost_LR. The helpers
  communicate through module-level variables rather than arguments, so
  the call order matters. Takes no arguments and returns nothing.
  """
  # presumably resets the shared Boost result containers -- verify in initialize_Boost
  initialize_Boost()
  Boost_LR_predictions()
  plot_all_graphs_Boost_LR()
  # prints and plots the details of the highest-accuracy configuration
  best_accuracy_Boost_LR()

def Boost_DTC_Evaluation():
  """Run the Boost_DTC evaluation steps in order.

  Calls, one after another: initialize_Boost, Boost_DTC_predictions,
  plot_all_graphs_Boost_DTC and best_accuracy_Boost_DTC. Takes no
  arguments and returns nothing; the helpers take no arguments either,
  so they presumably share their results through module-level variables
  -- verify in their definitions.
  """
  # presumably resets the shared Boost result containers -- verify in initialize_Boost
  initialize_Boost()
  Boost_DTC_predictions()
  plot_all_graphs_Boost_DTC()
  best_accuracy_Boost_DTC()

### Results

In [None]:
# Run configuration (Colab form fields).
# Which version of the feature table to use; passed to data_selection() in the split cell.
data_options = "df_raw" #@param ["df_raw", "df_normalized", "df_standardized"]
# Fraction (not a percentage) of the records held out for TESTING: it is passed
# as test_size to train_test_split, so the training share is 1 - split_percentage.
# NOTE(review): train_test_split requires a float test_size strictly between 0 and 1,
# so the slider end points 0 and 1 are not usable values.
split_percentage = 0.7254 #@param {type:"slider", min:0, max:1, step:0.0001}
# Not referenced in the split cell; presumably seeds the models -- verify against the model cells.
rng_seed = 42 #@param {type:"number"}
# random_state used for the train/test split.
rng_seed_split = 42 #@param {type:"number"}

#### Split Data

In [None]:
# Feature table for the selected preprocessing option and the matching target column.
X_features_df = data_selection(data_options)
Y_class_labels_df = df['Class']
# Series.unique() returns labels in order of first appearance, which depends on
# how the file is ordered. Sort them so the label order matches scikit-learn's
# sorted class order (classes_, default confusion_matrix layout); class_labels
# is used positionally as the display labels of the confusion matrices.
class_labels = np.sort(Y_class_labels_df.unique())

print("Maximum Features (N) = {}".format(len(X_features_df.columns)))
print("Records = {} \n".format(len(X_features_df)))
print("The data is set to:", data_options)
#print("\n", X_features_df.describe().round(2))

# Class shares are computed the same way for all three reports: counts / number of records.
print("\nClass Distribution - Pre Split")
print(round((Y_class_labels_df.value_counts() / len(Y_class_labels_df))*100, 4))

# Stratified split: split_percentage is the TEST fraction; stratify keeps the
# class proportions of the full data in both parts.
X_train, X_test, Y_train, Y_test = train_test_split(X_features_df, Y_class_labels_df, test_size=split_percentage, random_state=rng_seed_split, stratify=Y_class_labels_df) # split data

print("\nClass Distribution - Training {}%".format(round((1-split_percentage)*100, 6)))
print(round((Y_train.value_counts() / len(Y_train))*100, 4))

print("\nClass Distribution - Testing {}%".format(round(split_percentage*100,4)))
print(round((Y_test.value_counts() / len(Y_test))*100,4))

Maximum Features (N) = 57
Records = 4601 

The data is set to: df_raw

Class Distribution - Pre Split
ham     60.5955
spam    39.4045
Name: Class, dtype: float64

Class Distribution - Training 0.05%
spam    50.0
ham     50.0
Name: Class, dtype: float64

Class Distribution - Testing 99.95%
ham     60.6001
spam    39.3999
Name: Class, dtype: float64


#### Model

In [None]:
# Uncomment one of the calls below to run the evaluation for that model
#DTC_Evaluation()
#RFC_Evaluation()
#Boost_LR_Evaluation()
#Boost_DTC_Evaluation()