In [73]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

The dataset contains two columns:
- category: the category of the email
- text: the text of the email

The dataset is available at [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection).

We will read the dataset into a pandas dataframe then specify the category column as the label and the text column as the feature.



In [74]:
df = pd.read_csv('data/spam.csv')[['category', 'text']]
X, y = df['text'].values, df['category']

In [75]:
df.shape

(5572, 2)

We will use the TFIDF algorithm to vectorize the emails. The 3000 best features are selected using the chi-squared test.

In [76]:
K = 3000
tfidf = TfidfVectorizer(analyzer='word')
X = tfidf.fit_transform(X).toarray()
X = SelectKBest(chi2, k=K).fit_transform(X,y)
X = pd.DataFrame(X, columns=tfidf.get_feature_names_out()[0:K])

Now, we have a dataset with 3000 features, and we can use it to train a model.

In [77]:
X.shape

(5572, 3000)

In [78]:
pd.DataFrame(X).describe()

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,evening,evenings,event,events,eventually,ever,every,every1,everybody,everyboy
count,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,...,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0
mean,0.000402,0.001161,9.4e-05,4.5e-05,5.5e-05,5.2e-05,8.2e-05,0.000352,0.000175,5.1e-05,...,4.5e-05,0.000994,0.000984,0.000607,0.000329,0.003232,4.9e-05,0.000215,0.000103,4.9e-05
std,0.009507,0.018105,0.004938,0.003352,0.004082,0.003882,0.004313,0.00928,0.007931,0.003838,...,0.003396,0.016265,0.016639,0.012806,0.009616,0.041859,0.00363,0.006579,0.005464,0.00367
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.240101,0.654354,0.265471,0.25018,0.304679,0.289756,0.22766,0.256908,0.473355,0.286462,...,0.253464,0.492541,0.467545,0.365817,0.414423,1.0,0.270977,0.224857,0.296786,0.273927


We will use the Naive Bayes classifier to train the model. We will use k-fold cross validation to test the model with k=10.

In [79]:
model = GaussianNB()
kf = KFold(n_splits=10, random_state=1, shuffle=True)

For each fold of the cross validation, we will train the model on the training set and test the model on the test set. We will calculate the accuracy and the root mean squared of the results of each fold. Finally, we will calculate the average of the root mean squared of the results of each fold.

In [87]:
scores = {'rmse': [], 'accuracy': []}
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results = [0 if t == p else 1 for t, p in zip(y_test, y_pred)]
    scores['rmse'] += [(np.mean(results)  ** 0.5)]
    scores['accuracy'] += [len([i for i in results if i == 0])/len(results)]
print('RMSE:', '%.3f +/- %.3f' %(np.mean(scores['rmse']), np.std(scores['rmse'])))
print('Accuracy:', '%.3f +/- %.3f' %(np.mean(scores['accuracy']), np.std(scores['accuracy'])))

RMSE: 0.157 +/- 0.031
Accuracy: 0.974 +/- 0.010
