In [3]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, auc, confusion_matrix, f1_score, precision_score, recall_score, roc_curve


The dataset contains two columns:
- category: the category of the email
- text: the text of the email

The dataset is available at [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection).

We will read the dataset into a pandas dataframe then specify the category column as the label and the text column as the feature.



In [4]:
df = pd.read_csv('data/spam.csv', encoding='ISO 8859-15')[['category', 'text']]
X, y = df['text'].values, df['category']


In [5]:
X.shape, y.shape


((5572,), (5572,))

We will use the TFIDF algorithm to vectorize the emails.

In [6]:
tfidf = TfidfVectorizer(analyzer='word')
X = tfidf.fit_transform(X).toarray()
X = pd.DataFrame(X, columns=tfidf.get_feature_names_out())


Now, we have a dataset with 3000 features, and we can use it to train a model.

In [7]:
X.shape


(5572, 8710)

In [8]:
pd.DataFrame(X).describe()


Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,œscool,œt,œte,œve,œwell,œï,œû,œûªm,œûªt,œûªve
count,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,...,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0
mean,0.000395,0.001143,4.1e-05,9.2e-05,4.5e-05,5.5e-05,5.2e-05,8.2e-05,9.2e-05,0.000344,...,8.8e-05,0.000984,8.1e-05,0.00017,3.7e-05,0.007441,4.5e-05,5e-05,5e-05,4.5e-05
std,0.009329,0.017861,0.003083,0.004852,0.003338,0.004082,0.003882,0.004313,0.006839,0.009076,...,0.006562,0.015873,0.006011,0.00734,0.002774,0.05146,0.003326,0.003722,0.003722,0.003358
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.240082,0.654349,0.230097,0.260668,0.249152,0.304677,0.289745,0.227659,0.510538,0.250256,...,0.489827,0.385029,0.44871,0.332466,0.207099,0.769601,0.248248,0.2778,0.2778,0.250688


We will use the Naive Bayes classifier to train the model. We will use k-fold cross validation to test the model with k=10.

In [9]:
model = MultinomialNB(force_alpha=True, alpha=1)
kf = KFold(n_splits=10, random_state=1, shuffle=True)


In [12]:
def score_test(y_pred: np.ndarray, y_test):
    matrix = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = matrix.ravel()
    return {
        'confusion_matrix': {
            'tn': int(tn),
            'fp': int(fp),
            'fn': int(fn),
            'tp': int(tp),
        },
        'accuracy': round(accuracy_score(y_test, y_pred), 3),
        'f1': round(f1_score(y_test, y_pred, pos_label='ham'), 3),
        'recall': round(recall_score(y_test, y_pred, average="binary", pos_label='ham'), 3),
        'precision': round(precision_score(y_test, y_pred, pos_label='ham'), 3),
    }

For each fold of the cross validation, we will train the model on the training set and test the model on the test set. We will calculate the accuracy and the root mean squared of the results of each fold. Finally, we will calculate the average of the root mean squared of the results of each fold.

In [14]:
scores = {'accuracy': [], 'f1': [], 'recall': [], 'precision': []}
for train_index, test_index in kf.split(X):
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[
        test_index], y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results = score_test(y_pred, y_test)
    scores['accuracy'] += [results['accuracy']]
    scores['f1'] += [results['f1']]
    scores['recall'] += [results['recall']]
    scores['precision'] += [results['precision']]
    print(results['confusion_matrix'])
print('Accuracy:', '%.3f +/- %.3f' %
      (np.mean(scores['accuracy']), np.std(scores['accuracy'])))
print('F1 Score:', '%.3f +/- %.3f' %
      (np.mean(scores['f1']), np.std(scores['f1'])))
print('Recall:', '%.3f +/- %.3f' %
      (np.mean(scores['recall']), np.std(scores['recall'])))
print('Precision:', '%.3f +/- %.3f' %
      (np.mean(scores['precision']), np.std(scores['precision'])))


{'tn': 490, 'fp': 0, 'fn': 18, 'tp': 50}
{'tn': 487, 'fp': 0, 'fn': 11, 'tp': 60}
{'tn': 478, 'fp': 0, 'fn': 25, 'tp': 54}
{'tn': 479, 'fp': 0, 'fn': 33, 'tp': 45}
{'tn': 479, 'fp': 0, 'fn': 19, 'tp': 59}
{'tn': 470, 'fp': 0, 'fn': 29, 'tp': 58}
{'tn': 479, 'fp': 0, 'fn': 12, 'tp': 66}
{'tn': 490, 'fp': 0, 'fn': 19, 'tp': 48}
{'tn': 487, 'fp': 0, 'fn': 16, 'tp': 54}
{'tn': 486, 'fp': 0, 'fn': 19, 'tp': 52}
Accuracy: 0.964 +/- 0.012
F1 Score: 0.980 +/- 0.007
Recall: 1.000 +/- 0.000
Precision: 0.960 +/- 0.013
