# import necessarry libraries

In [93]:
from sklearn.preprocessing import LabelEncoder
import regex as re
import numpy as np 
import pandas as pd
import warnings 
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score

In [94]:
#import data
file = 'spam.csv'

In [95]:
import chardet
with open(file, 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))
result

{'encoding': 'Windows-1252', 'confidence': 0.7270322499829184, 'language': ''}

# data exploration

In [96]:
df = pd.read_csv(file,encoding='Windows-1252')

In [97]:
#rename columns
df = df[['v1','v2']]
df.rename(columns= {'v1':"Class", 'v2':"Emails"}, inplace = True)

In [98]:
df.columns.name='index'

In [99]:
df.shape

(5572, 2)

In [100]:
df

index,Class,Emails
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [101]:
Train, Test = train_test_split(df,test_size=0.3,train_size=0.7,random_state=101)

In [102]:
Test

index,Class,Emails
2373,spam,XMAS Prize draws! We are trying to contact U. ...
1942,ham,K...k:)why cant you come here and search job:)
54,spam,SMS. ac Sptv: The New Jersey Devils and the De...
1900,ham,And miss vday the parachute and double coins??...
1564,ham,Tmrw. Im finishing 9 doors
...,...,...
790,ham,This is hoping you enjoyed your game yesterday...
3982,ham,Nationwide auto centre (or something like that...
823,spam,25p 4 alfie Moon's Children in need song on ur...
1911,ham,Lol grr my mom is taking forever with my presc...


# data pre-processing 

In [103]:
def func1(line):
    line = line.lower()
    return line
def func2(line):
    
    line=line.strip()
    line = line.split()
    return line   
def func4(last):
    lsd=[]
    for i in range(len(last)):
        x=re.sub('[^a-z]+','',last[i])
        if x.strip():
            lsd.append(x)
    return lsd
def func5(last):
    sw = ['is', 'are', 'the','i','a','we','am','its','it']
    lsd = filter(lambda x: x not in sw,last)
    return list(lsd)

In [104]:
df['Emails']=df['Emails'].apply(func1)

In [105]:
df['Emails']=df['Emails'].apply(func2)

In [106]:
df['Emails']=df['Emails'].apply(func4)

In [107]:
df['Emails']=df['Emails'].apply(func5)

In [108]:
df['Emails']

0       [go, until, jurong, point, crazy, available, o...
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, in, wkly, comp, to, win, fa, cup...
3       [u, dun, say, so, early, hor, u, c, already, t...
4       [nah, dont, think, he, goes, to, usf, he, live...
                              ...                        
5567    [this, nd, time, have, tried, contact, u, u, h...
5568            [will, b, going, to, esplanade, fr, home]
5569    [pity, was, in, mood, for, that, soany, other,...
5570    [guy, did, some, bitching, but, acted, like, i...
5571                               [rofl, true, to, name]
Name: Emails, Length: 5572, dtype: object

In [109]:
lec = LabelEncoder()
class_ = ['ham', 'spam']
lec.fit(class_)
df['Class']=lec.transform(df['Class'])

In [110]:
df

index,Class,Emails
0,0,"[go, until, jurong, point, crazy, available, o..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, in, wkly, comp, to, win, fa, cup..."
3,0,"[u, dun, say, so, early, hor, u, c, already, t..."
4,0,"[nah, dont, think, he, goes, to, usf, he, live..."
...,...,...
5567,1,"[this, nd, time, have, tried, contact, u, u, h..."
5568,0,"[will, b, going, to, esplanade, fr, home]"
5569,0,"[pity, was, in, mood, for, that, soany, other,..."
5570,0,"[guy, did, some, bitching, but, acted, like, i..."


# Now to create our vocabulary

In [111]:
vocab = []
for emails in df['Emails']:
    for words in emails:
        vocab.append(words)
vocab = list(set(vocab))

In [112]:
len(vocab)

8529

In [113]:
wpe = {unq : [0] * len(df['Emails']) for unq in vocab}

# This code puts values to the coresponding word in Emails

In [114]:
for index, eml in enumerate(df['Emails']):
   for word in eml:
      wpe[word][index] += 1

# Now creating the data frame 

In [115]:
wpe = pd.DataFrame(wpe)

# Combining the two DFs

In [116]:
df=pd.concat([df,wpe],axis = 1)

In [117]:
df

Unnamed: 0,Class,Emails,multiply,breaker,surprise,poo,source,well,bowa,minecraft,...,module,mind,paypal,rs,homewot,access,dusk,december,versus,since
0,0,"[go, until, jurong, point, crazy, available, o...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,"[ok, lar, joking, wif, u, oni]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,"[free, entry, in, wkly, comp, to, win, fa, cup...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,"[u, dun, say, so, early, hor, u, c, already, t...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,"[nah, dont, think, he, goes, to, usf, he, live...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,1,"[this, nd, time, have, tried, contact, u, u, h...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,"[will, b, going, to, esplanade, fr, home]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,"[pity, was, in, mood, for, that, soany, other,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,"[guy, did, some, bitching, but, acted, like, i...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Train and test 

In [118]:
train, test = train_test_split(df,test_size=0.3,train_size=0.7,random_state=101)

In [119]:
train

Unnamed: 0,Class,Emails,multiply,breaker,surprise,poo,source,well,bowa,minecraft,...,module,mind,paypal,rs,homewot,access,dusk,december,versus,since
4028,0,"[anyway, many, good, evenings, to, u, s]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1310,0,"[ill, always, be, there, even, if, just, in, s...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5469,0,"[ok, lor]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5375,0,"[cant, pick, phone, right, now, pls, send, mes...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3814,0,"[not, yet, chikkuk, then, wat, abt, tht, guy, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4171,0,"[sorry, ill, call, later]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
599,0,"[im, going, out, to, buy, mums, present, ar]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1361,0,"[shuhui, say, change, suntec, steamboat, u, no...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1547,0,"[bishan, lar, nearer, no, need, buy, so, early...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [120]:
test

Unnamed: 0,Class,Emails,multiply,breaker,surprise,poo,source,well,bowa,minecraft,...,module,mind,paypal,rs,homewot,access,dusk,december,versus,since
2373,1,"[xmas, prize, draws, trying, to, contact, u, t...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1942,0,"[kkwhy, cant, you, come, here, and, search, job]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
54,1,"[sms, ac, sptv, new, jersey, devils, and, detr...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1900,0,"[and, miss, vday, parachute, and, double, coin...",0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1564,0,"[tmrw, im, finishing, doors]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
790,0,"[this, hoping, you, enjoyed, your, game, yeste...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3982,0,"[nationwide, auto, centre, or, something, like...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
823,1,"[p, alfie, moons, children, in, need, song, on...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1911,0,"[lol, grr, my, mom, taking, forever, with, my,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Making my own Naive bayes model

In [121]:
# Isolating spam and ham messages first
spam_messages = train[train['Class'] == 1]
ham_messages = train[train['Class'] == 0]

# P(Spam) and P(Ham)
p_spam = len(spam_messages) / len(train)
p_ham = len(ham_messages) / len(train)

# N_Spam
n_words_per_spam_message = spam_messages['Emails'].apply(len)
n_spam = n_words_per_spam_message.sum()

# N_Ham
n_words_per_ham_message = ham_messages['Emails'].apply(len)
n_ham = n_words_per_ham_message.sum()

# N_Vocabulary
n_vocabulary = len(vocab)

# Laplace smoothing
alpha = 1

In [122]:
# Initiate parameters
parameters_spam = {unique_word:0 for unique_word in vocab}
parameters_ham = {unique_word:0 for unique_word in vocab}

# Calculate parameters
for word in vocab:
   n_word_given_spam = spam_messages[word].sum() # spam_messages already defined
   p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_vocabulary)
   parameters_spam[word] = p_word_given_spam

   n_word_given_ham = ham_messages[word].sum() # ham_messages already defined
   p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary)
   parameters_ham[word] = p_word_given_ham

# Function to predict class of new message

In [123]:
def classify(message):
   '''
   message: a string
   '''

   message = re.sub('\W', ' ', message)
   message = message.lower().split()

   p_spam_given_message = p_spam
   p_ham_given_message = p_ham

   for word in message:
      if word in parameters_spam:
         p_spam_given_message *= parameters_spam[word]

      if word in parameters_ham: 
         p_ham_given_message *= parameters_ham[word]

   print('P(Spam|message):', p_spam_given_message)
   print('P(Ham|message):', p_ham_given_message)

   if p_ham_given_message > p_spam_given_message:
      print('Label: Ham')
   elif p_ham_given_message < p_spam_given_message:
      print('Label: Spam')
   else:
      print('Equal proabilities, have a human classify this!')

# Predictions

In [124]:
classify('WINNER!! This is the secret code to unlock the money: C3421.')

P(Spam|message): 5.857777252282906e-19
P(Ham|message): 1.875636876994362e-21
Label: Spam


In [125]:
classify("Sounds good, Tom, then see u there")

P(Spam|message): 5.51613658577096e-25
P(Ham|message): 6.310128667161422e-21
Label: Ham


# Time to check accuracy of our model

In [126]:
def classify_test_set(message):
   '''
   message: a string
   '''

   message = re.sub('\W', ' ', message)
   message = message.lower().split()

   p_spam_given_message = p_spam
   p_ham_given_message = p_ham

   for word in message:
      if word in parameters_spam:
         p_spam_given_message *= parameters_spam[word]

      if word in parameters_ham:
         p_ham_given_message *= parameters_ham[word]

   if p_ham_given_message > p_spam_given_message:
      return 'ham'
   elif p_spam_given_message > p_ham_given_message:
      return 'spam'
   else:
      return 'needs human classification'

In [127]:
Test['predicted'] = Test['Emails'].apply(classify_test_set)
Test.head()

index,Class,Emails,predicted
2373,spam,XMAS Prize draws! We are trying to contact U. ...,spam
1942,ham,K...k:)why cant you come here and search job:),ham
54,spam,SMS. ac Sptv: The New Jersey Devils and the De...,spam
1900,ham,And miss vday the parachute and double coins??...,ham
1564,ham,Tmrw. Im finishing 9 doors,ham


In [128]:
correct = 0
total = Test.shape[0]

for row in Test.iterrows():
   row = row[1]
   if row['Class'] == row['predicted']:
      correct += 1

print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total)

Correct: 1622
Incorrect: 50
Accuracy: 0.9700956937799043


# Built-in Sklearn model

In [129]:
x=df.iloc[:,2:]
y=df.iloc[:,:1]

In [130]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.3,train_size=0.7,random_state=101)

In [131]:
bnb = BernoulliNB(alpha=1)
ytrain.shape

(3900, 1)

In [132]:
bnb.fit(xtrain,ytrain)

In [133]:
ypdct = bnb.predict(xtest)

In [134]:
acry = round(accuracy_score(ytest, ypdct), 2)

In [135]:
acry

0.97

In [136]:
test = pd.concat([ytest,xtest],axis = 1)

In [137]:
itv = lec.inverse_transform(ypdct)
print(test.assign(predicted_gender = itv))

      Class  multiply  breaker  surprise  poo  source  well  bowa  minecraft  \
2373      1         0        0         0    0       0     0     0          0   
1942      0         0        0         0    0       0     0     0          0   
54        1         0        0         0    0       0     0     0          0   
1900      0         0        0         0    0       0     1     0          0   
1564      0         0        0         0    0       0     0     0          0   
...     ...       ...      ...       ...  ...     ...   ...   ...        ...   
790       0         0        0         0    0       0     0     0          0   
3982      0         0        0         0    0       0     0     0          0   
823       1         0        0         0    0       0     0     0          0   
1911      0         0        0         0    0       0     0     0          0   
1237      0         0        0         0    0       0     0     0          0   

      james  ...  mind  paypal  rs  hom

# for unseen data \ user input

In [138]:
def classify(email):
    lsd=[]
    email = email.lower().strip().split()
    for i in email:
        x=re.sub('[^a-z]+','',i)
        if x.strip():
            lsd.append(x)
    sw = ['is', 'are', 'the','i','a','we','am','its','it']
    lsd = filter(lambda x: x not in sw, lsd)
    lsd = list(lsd)
    for i in lsd:
        if i not in vocab:
             lsd.remove(i)
    wp = {u : [0]*1 for u in vocab}
    wp = pd.DataFrame(wp)
    for e in wp.columns:
        if e in lsd:
            wp[e]+=1
    wp = pd.DataFrame(wp)
    print(bnb.predict(wp))

In [139]:
emal = "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv"
classify(emal)

[1]
