# import necessarry libraries

In [113]:
from sklearn.preprocessing import LabelEncoder
import regex as re
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score

In [114]:
#import data
file = '/content/spam.csv'

In [115]:
import chardet
with open(file, 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))
result

{'encoding': 'Windows-1252', 'confidence': 0.7270322499829184, 'language': ''}

# data exploration

In [116]:
df = pd.read_csv(file,encoding='Windows-1252')

In [117]:
#rename columns
df = df[['v1','v2']]
df.rename(columns= {'v1':"class", 'v2':"Emails"}, inplace = True)

In [118]:
df.columns.name='index'

In [119]:
df.shape

(5572, 2)

In [120]:
df

index,class,Emails
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


# data pre-processing

In [121]:
def func1(line):
    line = line.lower()
    return line
def func2(line):

    line=line.strip()
    line = line.split()
    return line
def func4(last):
    lsd=[]
    for i in range(len(last)):
        x=re.sub('[^a-z]+','',last[i])
        if x.strip():
            lsd.append(x)
    return lsd
def func5(last):
    sw = ['is', 'are', 'the','i','a','we','am','its','it']
    lsd = filter(lambda x: x not in sw,last)
    return list(lsd)

In [122]:
df['Emails']=df['Emails'].apply(func1)

In [123]:
df['Emails']=df['Emails'].apply(func2)

In [124]:
df['Emails']=df['Emails'].apply(func4)

In [125]:
df['Emails']=df['Emails'].apply(func5)

In [126]:
df['Emails']

0       [go, until, jurong, point, crazy, available, o...
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, in, wkly, comp, to, win, fa, cup...
3       [u, dun, say, so, early, hor, u, c, already, t...
4       [nah, dont, think, he, goes, to, usf, he, live...
                              ...                        
5567    [this, nd, time, have, tried, contact, u, u, h...
5568            [will, b, going, to, esplanade, fr, home]
5569    [pity, was, in, mood, for, that, soany, other,...
5570    [guy, did, some, bitching, but, acted, like, i...
5571                               [rofl, true, to, name]
Name: Emails, Length: 5572, dtype: object

In [127]:
lec = LabelEncoder()
class_ = ['ham', 'spam']
lec.fit(class_)
df['class']=lec.transform(df['class'])

In [128]:
df

index,class,Emails
0,0,"[go, until, jurong, point, crazy, available, o..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, in, wkly, comp, to, win, fa, cup..."
3,0,"[u, dun, say, so, early, hor, u, c, already, t..."
4,0,"[nah, dont, think, he, goes, to, usf, he, live..."
...,...,...
5567,1,"[this, nd, time, have, tried, contact, u, u, h..."
5568,0,"[will, b, going, to, esplanade, fr, home]"
5569,0,"[pity, was, in, mood, for, that, soany, other,..."
5570,0,"[guy, did, some, bitching, but, acted, like, i..."


# Now to create our vocabulary

In [129]:
vocab = []
for emails in df['Emails']:
    for words in emails:
        vocab.append(words)
vocab = list(set(vocab))

In [130]:
len(vocab)

8529

In [131]:
wpe = {unq : [0] * len(df['Emails']) for unq in vocab}

# This code puts values to the coresponding word in Emails

In [132]:
for index, eml in enumerate(df['Emails']):
   for word in eml:
      wpe[word][index] += 1

# Now creating the data frame

In [133]:
wpe = pd.DataFrame(wpe)

# Combining the two DFs

In [134]:
df=pd.concat([df,wpe],axis = 1)

In [135]:
df

Unnamed: 0,class,Emails,tariffs,quarter,yoville,perf,gastroenteritis,chikkuali,texas,haunt,...,romantic,strokes,busty,empty,hole,madthen,shahjahans,statements,pain,invnted
0,0,"[go, until, jurong, point, crazy, available, o...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,"[ok, lar, joking, wif, u, oni]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,"[free, entry, in, wkly, comp, to, win, fa, cup...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,"[u, dun, say, so, early, hor, u, c, already, t...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,"[nah, dont, think, he, goes, to, usf, he, live...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,1,"[this, nd, time, have, tried, contact, u, u, h...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,"[will, b, going, to, esplanade, fr, home]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,"[pity, was, in, mood, for, that, soany, other,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,"[guy, did, some, bitching, but, acted, like, i...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Train and test

In [136]:
x=df.iloc[:,2:]
y=df.iloc[:,:1]

In [137]:
x

Unnamed: 0,tariffs,quarter,yoville,perf,gastroenteritis,chikkuali,texas,haunt,hence,boyy,...,romantic,strokes,busty,empty,hole,madthen,shahjahans,statements,pain,invnted
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [138]:
y.shape

(5572, 1)

In [139]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.3,train_size=0.7,random_state=101)

In [140]:
bnb = BernoulliNB()
ytrain.shape

(3900, 1)

In [141]:
bnb.fit(xtrain,ytrain)

In [142]:
ypdct = bnb.predict(xtest)

In [143]:
acry = round(accuracy_score(ytest, ypdct), 2)

In [144]:
acry

0.97

In [145]:
test = pd.concat([ytest,xtest],axis = 1)

In [146]:
itv = lec.inverse_transform(ypdct)
print(test.assign(predicted_gender = itv))

      class  tariffs  quarter  yoville  perf  gastroenteritis  chikkuali  \
2373      1        0        0        0     0                0          0   
1942      0        0        0        0     0                0          0   
54        1        0        0        0     0                0          0   
1900      0        0        0        0     0                0          0   
1564      0        0        0        0     0                0          0   
...     ...      ...      ...      ...   ...              ...        ...   
790       0        0        0        0     0                0          0   
3982      0        0        0        0     0                0          0   
823       1        0        0        0     0                0          0   
1911      0        0        0        0     0                0          0   
1237      0        0        0        0     0                0          0   

      texas  haunt  hence  ...  strokes  busty  empty  hole  madthen  \
2373      0    

# for unseen data \ user input

In [179]:
def classify(email):
    lsd=[]
    email = email.lower().strip().split()
    for i in email:
        x=re.sub('[^a-z]+','',i)
        if x.strip():
            lsd.append(x)
    sw = ['is', 'are', 'the','i','a','we','am','its','it']
    lsd = filter(lambda x: x not in sw, lsd)
    lsd = list(lsd)
    for i in lsd:
      if i not in vocab:
             lsd.remove(i)
    wp = {u : [0]*1 for u in vocab}
    wp = pd.DataFrame(wp)
    for eml in lsd:
          wp[eml][0]+=1
    prdct = bnb.predict(wp)
    if prdct == [0]:
      print('email is not spam')
    elif prdct == [1]:
      print('email is spam')



In [181]:
ema = '''Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's	'''
classify(ema)


email is spam
