In [2]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [3]:
# Colab-specific: mount Google Drive at /content/drive so the notebook can
# reach files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Change into the notebook's folder on the mounted drive (spam.csv is later
# read from here via a relative path) and list the files it contains.
%cd /content/drive/My Drive/Colab Notebooks/DTSC620_Fall2025/
!ls

/content/drive/My Drive/Colab Notebooks/DTSC620_Fall2025
DTSC620_Project1_AngelicaMei.ipynb  spam.csv
DTSC620_Project2_AngelicaMei.ipynb


In [5]:
SEED = 42  # one seed shared by every stochastic estimator defined below

# Dataset: the "Class" column holds the label, all other columns are features.
df = pd.read_csv("spam.csv")
y = df["Class"]
X = df.drop(columns=["Class"])
classes = sorted(y.unique())  # e.g. ['ham', 'spam']

# Base learners that the fused classifier combines.
dt = DecisionTreeClassifier(random_state=SEED)
gnb = GaussianNB()
lr = LogisticRegression(solver="liblinear", max_iter=1000, random_state=SEED)

# Fused classifier: hard (majority) vote over the three base learners.
base_learners = [("dt", dt), ("gnb", gnb), ("lr", lr)]
fused = VotingClassifier(estimators=base_learners, voting="hard")

# AdaBoost built on depth-1 decision trees.
stump = DecisionTreeClassifier(max_depth=1, random_state=SEED)
ada = AdaBoostClassifier(
    estimator=stump,
    n_estimators=400,
    learning_rate=0.5,
    random_state=SEED,
)

# Random forest with 1000 trees.
rf = RandomForestClassifier(
    n_estimators=1000,
    max_features="sqrt",
    random_state=SEED,
    n_jobs=-1,
)

# Helper
def evaluate_model(model, X_train, y_train, X_test, y_test, labels=None):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``
        Fitted in place on ``(X_train, y_train)``.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.
    labels : sequence, optional
        Class labels fixing the row/column order of the confusion matrix.
        Defaults to the module-level ``classes`` list.

    Returns
    -------
    acc : overall accuracy on the test split.
    per_class_acc : array with one entry per label, the fraction of that
        class's test samples predicted correctly. The entry is NaN for a
        label that has no samples in ``y_test``.
    cm : confusion matrix (rows = true label, columns = predicted label).
    """
    if labels is None:
        labels = classes

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred, labels=labels)

    # A label absent from y_test has a zero row total; skip the division for
    # it (leaving NaN) instead of triggering a divide-by-zero RuntimeWarning.
    row_totals = cm.sum(axis=1)
    per_class_acc = np.divide(
        cm.diagonal(),
        row_totals,
        out=np.full(row_totals.shape, np.nan),
        where=row_totals > 0,
    )

    return acc, per_class_acc, cm

def print_results(name, acc, per_class_acc, cm):
    """Print one model's report: name, overall accuracy (4 decimals),
    per-class accuracy labelled with the module-level ``classes`` list,
    and the confusion matrix."""
    report_lines = [
        f"\n{name}",
        f"  Overall accuracy: {acc:.4f}",
        f"  Per-class accuracy {classes}: {per_class_acc}",
        "  Confusion matrix:",
        str(cm),
    ]
    # A single print of the joined lines emits the same text as printing
    # each line separately.
    print("\n".join(report_lines))

# Train on the first 1000 rows (after shuffling) and test on all remaining
# rows (3601 for the 4601-row dataset this notebook was run on).
N_TRAIN = 1000

df_shuffled = df.sample(frac=1.0, random_state=42).reset_index(drop=True)
X_shuffled = df_shuffled.drop(columns=["Class"])
y_shuffled = df_shuffled["Class"]

X_train_1000 = X_shuffled.iloc[:N_TRAIN]
y_train_1000 = y_shuffled.iloc[:N_TRAIN]
X_test_3601 = X_shuffled.iloc[N_TRAIN:]
y_test_3601 = y_shuffled.iloc[N_TRAIN:]

# Report the actual split sizes rather than hard-coded numbers so the
# headers stay correct if the dataset size ever changes.
split_label = f"TRAIN: {len(X_train_1000)} / TEST: {len(X_test_3601)}"

print(f"Fused vs AdaBoost ({split_label})")

acc_fused, pca_fused, cm_fused = evaluate_model(
    fused, X_train_1000, y_train_1000, X_test_3601, y_test_3601
)
print_results("Fused Voting", acc_fused, pca_fused, cm_fused)

acc_ada, pca_ada, cm_ada = evaluate_model(
    ada, X_train_1000, y_train_1000, X_test_3601, y_test_3601
)
print_results("AdaBoost (DT)", acc_ada, pca_ada, cm_ada)

print(f"\nFused vs Random Forest ({split_label})")

acc_rf, pca_rf, cm_rf = evaluate_model(
    rf, X_train_1000, y_train_1000, X_test_3601, y_test_3601
)
print_results("Random Forest (1000 Trees)", acc_rf, pca_rf, cm_rf)


# Compare Fused vs AdaBoost on stratified train/test splits:
# 50-50, 60-40, 70-30 and 80-20.
splits = [(0.5, 0.5), (0.6, 0.4), (0.7, 0.3), (0.8, 0.2)]

print("\nFused vs AdaBoost for different train/test splits")

for train_frac, test_frac in splits:
    # The percent format rounds to the nearest whole percent; int(frac * 100)
    # truncates and can be off by one for fractions such as 0.29.
    print(f"\n--- Train {train_frac:.0%} / Test {test_frac:.0%} ---")

    # Stratify on y so each split keeps the class proportions of the full data.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        train_size=train_frac,
        test_size=test_frac,
        stratify=y,
        random_state=42,
    )

    acc_fused_s, pca_fused_s, cm_fused_s = evaluate_model(
        fused, X_train, y_train, X_test, y_test
    )
    print_results("Fused Voting", acc_fused_s, pca_fused_s, cm_fused_s)

    acc_ada_s, pca_ada_s, cm_ada_s = evaluate_model(
        ada, X_train, y_train, X_test, y_test
    )
    print_results("AdaBoost (DT)", acc_ada_s, pca_ada_s, cm_ada_s)



Fused vs AdaBoost (TRAIN: 1000 / TEST: 3601)

Fused Voting
  Overall accuracy: 0.9328
  Per-class accuracy ['ham', 'spam']: [0.94249201 0.9177305 ]
  Confusion matrix:
[[2065  126]
 [ 116 1294]]

AdaBoost (DT)
  Overall accuracy: 0.9414
  Per-class accuracy ['ham', 'spam']: [0.97170242 0.89432624]
  Confusion matrix:
[[2129   62]
 [ 149 1261]]

Fused vs Random Forest (TRAIN: 1000 / TEST: 3601)

Random Forest (1000 Trees)
  Overall accuracy: 0.9425
  Per-class accuracy ['ham', 'spam']: [0.96942036 0.90070922]
  Confusion matrix:
[[2124   67]
 [ 140 1270]]

Fused vs AdaBoost for different train/test splits

--- Train 50% / Test 50% ---

Fused Voting
  Overall accuracy: 0.9313
  Per-class accuracy ['ham', 'spam']: [0.93185079 0.93054024]
  Confusion matrix:
[[1299   95]
 [  63  844]]

AdaBoost (DT)
  Overall accuracy: 0.9418
  Per-class accuracy ['ham', 'spam']: [0.96054519 0.91289967]
  Confusion matrix:
[[1339   55]
 [  79  828]]

--- Train 60% / Test 40% ---

Fused Voting
  Overall acc