In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold


The dataset contains two columns:
- category: the category of the email
- text: the text of the email

The dataset is available at [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection).

We will read the dataset into a pandas dataframe then specify the category column as the label and the text column as the feature.



In [2]:
df = pd.read_csv('data/spam.csv', encoding='ISO 8859-15')[['category', 'text']]
X, y = df['text'].values, df['category']


In [3]:
df.shape


(5572, 2)

We can either count the number of word occurences per email to vectorize the emails, or we can use the TFIDF algorithm to vectorize the emails. The 3000 best features are selected using the chi-squared test.

In [4]:
K = 3000
count = CountVectorizer(analyzer='word')
X = count.fit_transform(X).toarray()
X = SelectKBest(chi2, k=K).fit_transform(X, y)
X = pd.DataFrame(X, columns=count.get_feature_names_out()[0:K])


Now, we have a dataset with 3000 features, and we can use it to train a model.

In [5]:
X.shape


(5572, 3000)

In [6]:
pd.DataFrame(X).describe()


Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,ever,every,every1,everybody,everyboy,everyday,everyone,everyones,everyso,everythin
count,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,...,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0
mean,0.001795,0.005205,0.000359,0.000179,0.000179,0.000179,0.000359,0.001436,0.000538,0.000179,...,0.001256,0.024408,0.000179,0.000359,0.000179,0.001795,0.000179,0.004846,0.040201,0.000179
std,0.04233,0.076788,0.018944,0.013397,0.013397,0.013397,0.018944,0.037867,0.023199,0.013397,...,0.035425,0.188847,0.013397,0.018944,0.013397,0.046377,0.013397,0.079114,0.317331,0.013397
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,4.0,1.0,1.0,1.0,2.0,1.0,2.0,8.0,1.0


We will use the Naive Bayes classifier to train the model. We will use k-fold cross validation to test the model with k=10.

In [7]:
model = MultinomialNB(force_alpha=True, alpha=1)
kf = KFold(n_splits=10, random_state=1, shuffle=True)


For each fold of the cross validation, we will train the model on the training set and test the model on the test set. We will calculate the accuracy and the root mean squared of the results of each fold. Finally, we will calculate the average of the root mean squared of the results of each fold.

In [8]:
scores = {'rmse': [], 'accuracy': []}
for train_index, test_index in kf.split(X):
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[
        test_index], y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results = [0 if t == p else 1 for t, p in zip(y_test, y_pred)]
    scores['rmse'] += [(np.mean(results) ** 0.5)]
    scores['accuracy'] += [len([i for i in results if i == 0])/len(results)]
print('RMSE:', '%.3f +/- %.3f' %
      (np.mean(scores['rmse']), np.std(scores['rmse'])))
print('Accuracy:', '%.3f +/- %.3f' %
      (np.mean(scores['accuracy']), np.std(scores['accuracy'])))


RMSE: 0.094 +/- 0.026
Accuracy: 0.990 +/- 0.005
