# Imports


In [1]:
import numpy as np
import pandas as pd
import scipy

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    classification_report,
)

import optuna

In [2]:
data = pd.read_csv("../data/cleaned-data.csv")

# Loading data


In [3]:
data

Unnamed: 0,text,is_reply,spam
0,naturally irresistible corporate identity lt r...,0,1
1,stock trading gunslinger fanny merrill muzo co...,0,1
2,nbelievable new homes made easy im wanting sho...,0,1
3,color printing special request additional info...,0,1
4,money get software cds software compatibility ...,0,1
...,...,...,...
5723,research development charges gpg forwarded shi...,1,0
5724,receipts visit jim thanks invitation visit lsu...,1,0
5725,enron case study update wow day super thank mu...,1,0
5726,interest david please call shirley crenshaw as...,1,0


In [4]:
vectoriser = TfidfVectorizer()
vectorised_data = vectoriser.fit_transform(data["text"])

In [5]:
vectorised_data[0, :20].todense()

matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.]])

In [6]:
X = scipy.sparse.hstack([vectorised_data, data["is_reply"].values.reshape(-1, 1)])
y = data["spam"].values

# Modelling


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)
splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [8]:
def get_results(model, splitter, X, y, X_test, y_test):
    accuracy = []
    precision = []
    recall = []

    for i, (train_idx, val_idx) in enumerate(splitter.split(X, y)):
        X_train = X[train_idx, :]
        X_val = X[val_idx, :]

        y_train = y[train_idx]
        y_val = y[val_idx]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        accuracy.append(accuracy_score(y_true=y_val, y_pred=y_pred))
        recall.append(recall_score(y_true=y_val, y_pred=y_pred))
        precision.append(precision_score(y_true=y_val, y_pred=y_pred))

    print(
        f"Average accuracy is {100*np.mean(accuracy):.4f} +- {100*np.std(accuracy):.4f} with max {100*np.max(accuracy):.4f}"
    )
    print(
        f"Average recall is {100*np.mean(recall):.4f} +- {100*np.std(recall):.4f} with max {100*np.max(recall):.4f}"
    )
    print(
        f"Average precision is {100*np.mean(precision):.4f} +- {100*np.std(precision):.4f} with max {100*np.max(precision):.4f}"
    )
    model.fit(X, y)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

In [36]:
model_1 = DecisionTreeClassifier()
get_results(model_1, splitter, X_train, y_train, X_test, y_test)

Average accuracy is 96.0708 +- 0.4134 with max 96.8023
Average recall is 93.7910 +- 0.9223 with max 94.9367
Average precision is 89.6030 +- 1.2793 with max 91.4634
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      1713
           1       0.92      0.90      0.91       579

    accuracy                           0.96      2292
   macro avg       0.94      0.94      0.94      2292
weighted avg       0.96      0.96      0.96      2292



In [38]:
model_2 = LogisticRegression()
get_results(model_2, splitter, X_train, y_train, X_test, y_test)

Average accuracy is 97.4680 +- 0.4567 with max 97.9622
Average recall is 89.4767 +- 1.6599 with max 91.1392
Average precision is 99.4343 +- 0.6932 with max 100.0000
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1713
           1       0.99      0.87      0.93       579

    accuracy                           0.97      2292
   macro avg       0.98      0.93      0.95      2292
weighted avg       0.97      0.97      0.96      2292



In [40]:
model_3 = XGBClassifier()
get_results(model_3, splitter, X_train, y_train, X_test, y_test)

Average accuracy is 97.8171 +- 0.4018 with max 98.2558
Average recall is 96.7024 +- 1.5782 with max 98.7342
Average precision is 93.9764 +- 1.0021 with max 95.5414
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1713
           1       0.95      0.96      0.95       579

    accuracy                           0.98      2292
   macro avg       0.97      0.97      0.97      2292
weighted avg       0.98      0.98      0.98      2292



We consider 3 benchmark models here:
* Decision Tree classifier
* Logistic Regression
* XGboost classifier

We consider 3 metrics for each model - accuracy, recall and precision, as the data is also imbalanced.  
Best by accuracy: XGBoost classifier  
Best by recall: XGboost classifier  
Best by precision: Logistic regression


# Hyperparameter tuning


## Tuning XGboost classifier to improve accuracy further


In [43]:
def objective(trial: optuna.Trial):
    splitter = StratifiedKFold(shuffle=True, random_state=42)

    max_depth = trial.suggest_int("max_depth", 3, 7)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.1)
    subsample = trial.suggest_float("subsample", 0.4, 0.7)

    model = XGBClassifier(
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
    )
    scores = []

    for i, (train_idx, val_idx) in enumerate(splitter.split(X_train, y_train)):
        X_train_cv = X_train[train_idx, :]
        X_val_cv = X_train[val_idx, :]

        y_train_cv = y_train[train_idx]
        y_val_cv = y_train[val_idx]
        model.fit(X_train_cv, y_train_cv)
        y_pred = model.predict(X_val_cv)

        scores.append(-accuracy_score(y_val_cv, y_pred))

    return np.mean(scores)


study = optuna.create_study()
study.optimize(objective, n_trials=30)

[I 2024-02-05 21:13:32,751] A new study created in memory with name: no-name-265b9a0b-3112-4e42-86b6-401e65d8726c
[I 2024-02-05 21:13:50,453] Trial 0 finished with value: -0.9659447885988964 and parameters: {'max_depth': 4, 'learning_rate': 0.07309276338077303, 'subsample': 0.4674374538829663}. Best is trial 0 with value: -0.9659447885988964.
[I 2024-02-05 21:14:08,156] Trial 1 finished with value: -0.9621619105649776 and parameters: {'max_depth': 4, 'learning_rate': 0.05358510882221666, 'subsample': 0.5650980682540039}. Best is trial 0 with value: -0.9659447885988964.
[I 2024-02-05 21:14:29,102] Trial 2 finished with value: -0.9639082123150876 and parameters: {'max_depth': 5, 'learning_rate': 0.04897457650450214, 'subsample': 0.47421526401415537}. Best is trial 0 with value: -0.9659447885988964.
[I 2024-02-05 21:14:45,828] Trial 3 finished with value: -0.9708925730340882 and parameters: {'max_depth': 4, 'learning_rate': 0.09499791078858676, 'subsample': 0.46481837766990847}. Best is t

KeyboardInterrupt: 

In [44]:
params = study.best_params
params

{'max_depth': 5,
 'learning_rate': 0.09806024570665134,
 'subsample': 0.5856590010393802}

In [45]:
model = XGBClassifier(**params)
get_results(model, splitter, X_train, y_train, X_test, y_test)

Average accuracy is 97.7005 +- 0.6209 with max 98.6919
Average recall is 97.8465 +- 1.1727 with max 99.3671
Average precision is 92.6046 +- 2.0437 with max 95.1515
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      1713
           1       0.94      0.96      0.95       579

    accuracy                           0.97      2292
   macro avg       0.96      0.97      0.97      2292
weighted avg       0.97      0.97      0.97      2292



Seems like the model overfit a bit as the test accuracy went down to 0.97 from 0.98


## Tuning Logistic regression model to improve precision further


In [9]:
def objective(trial: optuna.Trial):
    splitter = StratifiedKFold(shuffle=True, random_state=42)

    C = trial.suggest_float("C", 0.8, 1.1)
    l1_ratio = trial.suggest_float("l1_ratio", 0, 0.4)

    model = LogisticRegression(
        penalty="elasticnet", C=C, solver="saga", l1_ratio=l1_ratio
    )
    scores = []

    for i, (train_idx, val_idx) in enumerate(splitter.split(X_train, y_train)):
        X_train_cv = X_train[train_idx, :]
        X_val_cv = X_train[val_idx, :]

        y_train_cv = y_train[train_idx]
        y_val_cv = y_train[val_idx]
        model.fit(X_train_cv, y_train_cv)
        y_pred = model.predict(X_val_cv)

        scores.append(-precision_score(y_val_cv, y_pred))

    return np.mean(scores)


study = optuna.create_study()
study.optimize(objective, n_trials=30)

[I 2024-02-06 12:09:23,073] A new study created in memory with name: no-name-f1a49c82-652f-4fdd-92dc-af293ce28ce1
[I 2024-02-06 12:09:27,269] Trial 0 finished with value: -0.9912291933418693 and parameters: {'C': 0.8302208011207642, 'l1_ratio': 0.07097188320209749}. Best is trial 0 with value: -0.9912291933418693.
[I 2024-02-06 12:09:30,076] Trial 1 finished with value: -0.9900226529071231 and parameters: {'C': 1.035755353056482, 'l1_ratio': 0.1585212240322656}. Best is trial 0 with value: -0.9912291933418693.
[I 2024-02-06 12:09:32,087] Trial 2 finished with value: -0.9884698191740446 and parameters: {'C': 0.9812811230982771, 'l1_ratio': 0.21851124218195206}. Best is trial 0 with value: -0.9912291933418693.
[I 2024-02-06 12:09:33,549] Trial 3 finished with value: -0.9865418701827835 and parameters: {'C': 0.8690822576585383, 'l1_ratio': 0.26800411790762463}. Best is trial 0 with value: -0.9912291933418693.
[I 2024-02-06 12:09:34,783] Trial 4 finished with value: -0.9835694143878287 and

In [10]:
params = study.best_params
params

{'C': 0.9140473524721335, 'l1_ratio': 0.0028218514946617293}

In [13]:
model = LogisticRegression(penalty="elasticnet", solver="saga", **params)
get_results(model, splitter, X_train, y_train, X_test, y_test)

Average accuracy is 97.0896 +- 0.5126 with max 97.6710
Average recall is 87.8288 +- 1.8836 with max 89.8734
Average precision is 99.4220 +- 0.7083 with max 100.0000
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1713
           1       0.99      0.87      0.93       579

    accuracy                           0.96      2292
   macro avg       0.97      0.93      0.95      2292
weighted avg       0.97      0.96      0.96      2292

