In [52]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold

In [53]:
df = pd.read_csv('data/spam.csv')[['category', 'text']]
X, y = df['text'].values, df['category']

In [54]:
X.shape, y.shape

((5572,), (5572,))

In [55]:
tfidf = TfidfVectorizer(analyzer='word')
X = tfidf.fit_transform(X).toarray()
X = pd.DataFrame(X, columns=tfidf.get_feature_names_out())

In [56]:
X.shape

(5572, 8625)

In [57]:
pd.DataFrame(X).describe()

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,zebra,zed,zeros,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada
count,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,...,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0
mean,0.000402,0.001161,4.2e-05,9.4e-05,4.5e-05,5.5e-05,5.2e-05,8.2e-05,9.2e-05,0.000352,...,4.9e-05,0.000215,6.6e-05,4.9e-05,5.8e-05,0.000103,6.4e-05,5.2e-05,4.9e-05,2.9e-05
std,0.009507,0.018105,0.003121,0.004938,0.003352,0.004082,0.003882,0.004313,0.006839,0.00928,...,0.00363,0.006579,0.004933,0.003667,0.004302,0.005464,0.004744,0.003884,0.00367,0.002151
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.240101,0.654354,0.233004,0.265471,0.25018,0.304679,0.289756,0.22766,0.510538,0.256908,...,0.270977,0.224857,0.368206,0.273741,0.321122,0.296786,0.354112,0.289917,0.273927,0.160561


We will use the Naive Bayes classifier to train the model. We will use k-fold cross validation to test the model with k=10.

In [58]:
model = GaussianNB()
kf = KFold(n_splits=10, random_state=1, shuffle=True)

In [60]:
scores = {'rmse': [], 'accuracy': []}
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results = [0 if t == p else 1 for t, p in zip(y_test, y_pred)]
    scores['rmse'] += [(np.mean(results)  ** 0.5)]
    scores['accuracy'] += [len([i for i in results if i == 0])/len(results)]
print('RMSE:', '%.3f +/- %.3f' %(np.mean(scores['rmse']), np.std(scores['rmse'])))
print('Accuracy:', '%.3f +/- %.3f' %(np.mean(scores['accuracy']), np.std(scores['accuracy'])))

RMSE: 0.318 +/- 0.020
Accuracy: 0.899 +/- 0.013
