In [26]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile

# Naive Bayes

In [41]:
data = pd.read_csv('spam.csv',encoding='latin-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [28]:
data.shape

(5572, 5)

In [29]:
data.drop(['Unnamed: 2',
'Unnamed: 3',
'Unnamed: 4'],axis=1,inplace = True)

In [33]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [34]:
X_train,X_test,y_train,y_test = train_test_split(data.v2,data.v1,test_size=0.5,random_state=10)

In [42]:
vectorizer = TfidfVectorizer() # Term Frequency

In [44]:
X_train_transformed = vectorizer.fit_transform(X_train)
X_test_transformed = vectorizer.transform(X_test)
X_train_transformed

<2786x5990 sparse matrix of type '<class 'numpy.float64'>'
	with 36945 stored elements in Compressed Sparse Row format>

In [45]:

feature_names = vectorizer.get_feature_names()

In [38]:
feature_names # Every email contains the predictor

['00',
 '000',
 '000pes',
 '0121',
 '01223585334',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '021',
 '03',
 '04',
 '05',
 '0578',
 '06',
 '07',
 '07008009200',
 '07046744435',
 '07090201529',
 '07123456789',
 '0721072',
 '07732584351',
 '07734396839',
 '07742676969',
 '0776xxxxxxx',
 '07781482378',
 '07786200117',
 '07808',
 '07808247860',
 '07815296484',
 '07821230901',
 '07880867867',
 '0789xxxxxxx',
 '07946746291',
 '0796xxxxxx',
 '07973788240',
 '07xxxxxxxxx',
 '08',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081560665',
 '0825',
 '0844',
 '0845',
 '08452810073',
 '0870',
 '08700621170150p',
 '08701213186',
 '08701237397',
 '08701417012',
 '08701417012150p',
 '087016248',
 '08701752560',
 '087018728737',
 '0870241182716',
 '08702490080',
 '08702840625',
 '08704439680ts',
 '08706091795',
 '08707500020',
 '08707509020',
 '08

In [46]:
len(feature_names)

5990

In [47]:
selector = SelectPercentile(percentile=10)
selector.fit(X_train_transformed, y_train)
X_train_transformed = selector.transform(X_train_transformed).toarray()
X_test_transformed = selector.transform(X_test_transformed).toarray()


In [48]:
model_gaussiannb = GaussianNB()
model_gaussiannb.fit(X_train_transformed,y_train)
y_predict = model_gaussiannb.predict(X_test_transformed)
accuracy_score(y_test,y_predict)

0.9655419956927495

In [49]:
model_bernb = BernoulliNB()
model_bernb.fit(X_train_transformed,y_train)
y_predict = model_bernb.predict(X_test_transformed)
accuracy_score(y_test,y_predict)

0.9802584350323044

In [50]:
model_mulnb = MultinomialNB()
model_mulnb.fit(X_train_transformed,y_train)
y_predict = model_mulnb.predict(X_test_transformed)
accuracy_score(y_test,y_predict)

0.9292893036611629

In [51]:
pd.crosstab(y_test,y_predict)

col_0,ham,spam
v1,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,2409,0
spam,197,180


In [53]:
newEmail = pd.Series(['free voucher entry win trip'])

In [54]:
newEmail

0    free voucher entry win trip
dtype: object

In [55]:
newEmail_transformed = vectorizer.transform(newEmail)
newEmail_transformed = selector.transform(newEmail_transformed).toarray()

In [56]:
model_bernb.predict(newEmail_transformed)

array(['spam'], dtype='<U4')