In [23]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, roc_curve


The dataset contains two columns:
- category: the category of the email
- text: the text of the email

The dataset is available at [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection).

We will read the dataset into a pandas dataframe then specify the category column as the label and the text column as the feature.



In [24]:
df = pd.read_csv('data/spam.csv', encoding='ISO 8859-15')[['category', 'text']]
X, y = df['text'].values, df['category']


In [25]:
df.shape


(5572, 2)

We can either count the number of word occurences per email to vectorize the emails, or we can use the TFIDF algorithm to vectorize the emails. The 3000 best features are selected using the chi-squared test.

In [26]:
K = 18000
count = CountVectorizer(analyzer='word', ngram_range=(2,2), lowercase=True)
X = count.fit_transform(X).toarray()
X = SelectKBest(chi2, k=K).fit_transform(X, y)
X = pd.DataFrame(X, columns=count.get_feature_names_out()[0:K])


Now, we have a dataset with 3000 features, and we can use it to train a model.

In [27]:
X.shape


(5572, 18000)

In [28]:
pd.DataFrame(X).describe()


Unnamed: 0,00 in,00 per,00 sub,00 subs,000 bonus,000 cash,000 homeowners,000 pounds,000 price,000 prize,...,itried2tell urmom,its about,its all,its also,its amazing,its an,its another,its answer,its at,its been
count,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,...,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0
mean,0.000179,0.000359,0.000897,0.000359,0.001256,0.001615,0.000538,0.000359,0.000179,0.000718,...,0.000179,0.004846,0.000179,0.000538,0.000179,0.009332,0.000179,0.000179,0.000179,0.000179
std,0.013397,0.018944,0.029945,0.018944,0.035425,0.040161,0.023199,0.018944,0.013397,0.026786,...,0.013397,0.079114,0.013397,0.023199,0.013397,0.132303,0.013397,0.013397,0.013397,0.013397
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,2.0,1.0,1.0,1.0,4.0,1.0,1.0,1.0,1.0


We will use the Naive Bayes classifier to train the model. We will use k-fold cross validation to test the model with k=10.

In [29]:
model = MultinomialNB(force_alpha=True, alpha=1)
kf = KFold(n_splits=10, random_state=1, shuffle=True)


In [30]:
def score_test(y_pred: np.ndarray, y_test):
    matrix = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = matrix.ravel()
    return {
        'confusion_matrix': {'tn': int(tn), 'fp': int(fp), 'fn': int(fn), 'tp': int(tp)},
        'accuracy': round(accuracy_score(y_test, y_pred), 3),
        'f1': round(f1_score(y_test, y_pred, pos_label='ham'), 3),
        'recall': round(recall_score(y_test, y_pred, average="binary", pos_label='ham'), 3),
        'precision': round(precision_score(y_test, y_pred, pos_label='ham'), 3),
    }

For each fold of the cross validation, we will train the model on the training set and test the model on the test set. We will calculate the accuracy and the root mean squared of the results of each fold. Finally, we will calculate the average of the root mean squared of the results of each fold.

In [31]:
scores = {'accuracy': [], 'f1': [], 'recall': [], 'precision': []}
for train_index, test_index in kf.split(X):
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[
        test_index], y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results = score_test(y_pred, y_test)
    scores = {k: scores[k] + [results[k]] for k in scores}
    print(results['confusion_matrix'])
print('Accuracy:', '%.3f +/- %.3f' %
      (np.mean(scores['accuracy']), np.std(scores['accuracy'])))
print('F1 Score:', '%.3f +/- %.3f' %
      (np.mean(scores['f1']), np.std(scores['f1'])))
print('Recall:', '%.3f +/- %.3f' %
      (np.mean(scores['recall']), np.std(scores['recall'])))
print('Precision:', '%.3f +/- %.3f' %
      (np.mean(scores['precision']), np.std(scores['precision'])))


{'tn': 490, 'fp': 0, 'fn': 2, 'tp': 66}
{'tn': 485, 'fp': 2, 'fn': 1, 'tp': 70}
{'tn': 476, 'fp': 2, 'fn': 5, 'tp': 74}
{'tn': 474, 'fp': 5, 'fn': 0, 'tp': 78}
{'tn': 477, 'fp': 2, 'fn': 4, 'tp': 74}
{'tn': 468, 'fp': 2, 'fn': 4, 'tp': 83}
{'tn': 473, 'fp': 6, 'fn': 1, 'tp': 77}
{'tn': 487, 'fp': 3, 'fn': 2, 'tp': 65}
{'tn': 485, 'fp': 2, 'fn': 0, 'tp': 70}
{'tn': 483, 'fp': 3, 'fn': 3, 'tp': 68}
Accuracy: 0.991 +/- 0.003
F1 Score: 0.995 +/- 0.002
Recall: 0.995 +/- 0.003
Precision: 0.996 +/- 0.003
