# Email Classifier - CS470 Final Project
## Aiden Seay Spring 2024

### Set Up Program

#### Import Necessary Libraries

In [1]:
# IMPORT FUNCTION UTILITIES
from Utilities.KNN import KNNClass
from Utilities.LogisticRegression import LogisticRegressionClass
from Utilities.NaiveBayes import NaiveBayesClass
import Utilities.SplitDataset as GetData
import Utilities.Analysis as Analysis

# CONSTANTS
TARGET = 1
FEATURE = 0
SPAM = -1
NOT_SPAM = -2
TOTAL = -3

#### Load Email Spam Data

You can find the data set [here](https://www.kaggle.com/datasets/colormap/spambase/data). Split the dataset into training, test and evaluation sub categories. 

In [2]:
# read dataset into pandas df
df = GetData.read_dataset("./Data/spambase.csv")

# split the dataset (refer to SplitDataset for fold data structure)
(train_data, validation_data, X_test, y_test, X_train, 
                                            y_train) = GetData.split_dataset(df)

# append all results from each algorithm here
results = []

### Test Algorithms
All algorithm implementation can be found in the Utilities folder.

#### Naive Bayes

In [3]:
# initialize analysis data lists
acc_list = []
fp_list = []
tp_list = []
auc_list = []

# run naive bayes algorithm across 5 folds 
for fold in range(len(train_data)):

    # initialize the class
    naive_bayes = NaiveBayesClass(train_data[fold], validation_data[fold], 
                                                                 X_test, y_test)

    # train the model
    (spam_prop, non_spam_prop, spam_word_freq, 
                                       non_spam_word_freq) = naive_bayes.learn()

    # evaluate the model
    calc_result, true_result = naive_bayes.evaluate(spam_prop, non_spam_prop, 
                                             spam_word_freq, non_spam_word_freq)
    
    # analyze the results
    acc, fp, tp, auc = Analysis.analyze_results(calc_result, true_result)
    acc_list.append(acc)
    fp_list.append(fp)
    tp_list.append(tp)
    auc_list.append(tp)

# get the average for the final results
acc_avg, fp_avg, tp_avg, auc_avg = Analysis.average_stats(acc_list, 
                                                    fp_list, tp_list, auc_list)

results.append(("Naive Bayes Algorithm", acc_avg, fp_avg, tp_avg, auc_avg))


#### K-Nearest Neighbors (KNN)

In [4]:
# run knn algorithm across 5 folds
#for fold in range(len(train_data))
# fold = 0
# knn = KNNClass(train_data[fold], validation_data[fold], X_test, y_test)

#### Logistic Regression

In [5]:
# run logistic regression algorithm across 5 folds

### Analyze Performance

Measure performance by:
* Accuracy (ACC)
* False Positive (FP)
* True Positive (TP)
* Area Under ROC Curve (AUC)

In [6]:
print("Algorithm Performance Analysis")
print("==============================\n")

for result in results:

    print(result[0] + "\n" + "-" * len(result[0]))
    print(f"Accuracy:         {result[1]}")
    print(f"False Positive:   {result[2]}")
    print(f"True Positive:    {result[3]}")
    print(f"Area Under Curve: {result[4]}\n")

Algorithm Performance Analysis

Naive Bayes Algorithm
---------------------
Accuracy:         0.8119565217391305
False Positive:   0.4120111204008374
True Positive:    0.9723299965427898
Area Under Curve: 0.9723299965427898

