In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score
from sklearn.utils import shuffle

# Load in the preprocessed dataset (from the EDA)
df = pd.read_csv(r"C:\Users\Brian\Downloads\preprocessed_data.csv")  # Change the file path to where the file is

# shuffle the data to avoid bias from ordered spam/not spam entries
df = shuffle(df, random_state=42).reset_index(drop=True)

# Extract labels we are utilizing
X = df["Preprocessed Text"]  # Reads through the preprocessed text from the EDA
y = df["Spam or Not Spam"]  # Target labels (Spam/Not Spam)

#split into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Convert text to numerical representation using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # Can finetune max_features
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize models
knn = KNeighborsClassifier(n_neighbors=5)  # user can tune k-value later 
log_reg = LogisticRegression(max_iter=1000)
decision_tree = DecisionTreeClassifier(max_depth=10)  # user can tune max_depth later

models = {
    "K-Nearest Neighbors": knn,
    "Logistic Regression": log_reg,
    "Decision Tree": decision_tree
}

# Code to train, evaluate, and perform cross validation (ten times)
results = {}
for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    
    # cross validation ten times
    cv_scores = cross_val_score(model, X_train_tfidf, y_train, cv=10, scoring='accuracy')
    
    # Evaluate on test set
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, pos_label="Spam")  # Adjust label if needed
    
    results[name] = {
        "Test Accuracy": accuracy,
        "Precision": precision,
        "Cross-Validation Accuracy": np.mean(cv_scores)
    }

# Display results
for model, metrics in results.items():
    print(f"{model}:")
    print(f"  Test Accuracy: {metrics['Test Accuracy']:.4f}")
    print(f"  Precision: {metrics['Precision']:.4f}")
    print(f"  Cross-Validation Accuracy: {metrics['Cross-Validation Accuracy']:.4f}")
    print("-" * 40)

# Identify the best performing model
best_model = max(results, key=lambda m: results[m]["Test Accuracy"])
if results[best_model]["Test Accuracy"] >= 0.95:
    print(f"{best_model} meets/exceeds 95% accuracy.")
else:
    print(f"No model reached 95% accuracy. Further tuning is needed.")


K-Nearest Neighbors:
  Test Accuracy: 0.9764
  Precision: 0.9732
  Cross-Validation Accuracy: 0.9756
----------------------------------------
Logistic Regression:
  Test Accuracy: 0.9799
  Precision: 0.9808
  Cross-Validation Accuracy: 0.9841
----------------------------------------
Decision Tree:
  Test Accuracy: 0.9468
  Precision: 0.8447
  Cross-Validation Accuracy: 0.9472
----------------------------------------
Logistic Regression meets/exceeds 95% accuracy.
