In [87]:
import string
import nltk
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, roc_curve

nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\belsa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

The dataset contains two columns:
- category: the category of the email
- text: the text of the email

The dataset is available at [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection).

We will read the dataset into a pandas dataframe then specify the category column as the label and the text column as the feature.



In [88]:
df = pd.read_csv('data/spam.csv', encoding='ISO 8859-15')[['category', 'text']]
X, y = df['text'], df['category']


In [96]:
def remove_punctuation(text):
    punctuationfree = "".join([i for i in text if i not in string.punctuation])
    return punctuationfree


def remove_stopwords(text):
    output = [
        i for i in text if i not in nltk.corpus.stopwords.words('english')]
    return output


X = X.apply(remove_punctuation)
X = X.apply(remove_stopwords)

X


0       [G,  ,  ,  , u,  , n,  , l,  ,  ,  , j,  , u, ...
1       [O,  , k,  ,  ,  , l,  , r,  ,  ,  , J,  , k, ...
2       [F,  , r,  , e,  , e,  ,  ,  , e,  , n,  , r, ...
3       [U,  ,  ,  , u,  , n,  ,  ,  ,  ,  ,  ,  , e, ...
4       [N,  , h,  ,  ,  , I,  ,  ,  , n,  ,  ,  , h, ...
                              ...                        
5567    [T,  , h,  ,  ,  ,  ,  , h,  , e,  ,  ,  , 2, ...
5568    [W,  , l,  , l,  ,  ,  , ï,  , ¿,  , œ,  ,  , ...
5569    [P,  ,  ,  ,  ,  , w,  ,  ,  , n,  ,  ,  ,  , ...
5570    [T,  , h,  , e,  ,  ,  , g,  , u,  ,  ,  ,  , ...
5571    [R,  , f,  , l,  ,  ,  , I,  ,  ,  , r,  , u, ...
Name: text, Length: 5572, dtype: object

We will use the TFIDF algorithm to vectorize the emails.

In [104]:
X.dropna(inplace=True)
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(2, 2), lowercase=True)
X = tfidf.fit_transform(X).toarray()
X = pd.DataFrame(X, columns=tfidf.get_feature_names_out())


AttributeError: 'list' object has no attribute 'lower'

Now, we have a dataset with 3000 features, and we can use it to train a model.

In [None]:
X.shape


(5572, 41548)

We will use the Naive Bayes classifier to train the model. We will use k-fold cross validation to test the model with k=10.

In [None]:
model = MultinomialNB(force_alpha=True, alpha=1)
kf = KFold(n_splits=10, random_state=1, shuffle=True)


In [None]:
def score_test(y_pred: np.ndarray, y_test):
    matrix = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = matrix.ravel()
    return {
        'confusion_matrix': {'tn': int(tn), 'fp': int(fp), 'fn': int(fn), 'tp': int(tp)},
        'accuracy': round(accuracy_score(y_test, y_pred), 3),
        'f1': round(f1_score(y_test, y_pred, pos_label='ham'), 3),
        'recall': round(recall_score(y_test, y_pred, average="binary", pos_label='ham'), 3),
        'precision': round(precision_score(y_test, y_pred, pos_label='ham'), 3),
    }


For each fold of the cross validation, we will train the model on the training set and test the model on the test set. We will calculate the accuracy and the root mean squared of the results of each fold. Finally, we will calculate the average of the root mean squared of the results of each fold.

In [None]:
scores = {'accuracy': [], 'f1': [], 'recall': [], 'precision': []}
for train_index, test_index in kf.split(X):
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[
        test_index], y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results = score_test(y_pred, y_test)
    scores = {k: scores[k] + [results[k]] for k in scores}
    print(results['confusion_matrix'])
print('Accuracy:', '%.3f +/- %.3f' %
      (np.mean(scores['accuracy']), np.std(scores['accuracy'])))
print('F1 Score:', '%.3f +/- %.3f' %
      (np.mean(scores['f1']), np.std(scores['f1'])))
print('Recall:', '%.3f +/- %.3f' %
      (np.mean(scores['recall']), np.std(scores['recall'])))
print('Precision:', '%.3f +/- %.3f' %
      (np.mean(scores['precision']), np.std(scores['precision'])))


{'tn': 490, 'fp': 0, 'fn': 20, 'tp': 48}
{'tn': 487, 'fp': 0, 'fn': 28, 'tp': 43}
{'tn': 478, 'fp': 0, 'fn': 44, 'tp': 35}
{'tn': 479, 'fp': 0, 'fn': 43, 'tp': 35}
{'tn': 479, 'fp': 0, 'fn': 30, 'tp': 48}


KeyboardInterrupt: 