In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import accuracy_score


In [2]:
df=pd.read_csv("spam.csv",encoding="latin-1")
df=df.iloc[:,0:2]
df.head()
df.tail()

Unnamed: 0,v1,v2
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [3]:
df['LABEL']=df.v1.map({'ham':0,'spam':1})
df.tail()

Unnamed: 0,v1,v2,LABEL
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will Ì_ b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0
5571,ham,Rofl. Its true to its name,0


In [4]:
def preprocess_data(text):
    txt=str(text)
    txt=" ".join([i.lower() for i in txt.split()])
    
    stops=set(stopwords.words("english"))
    txt=" ".join([w for w in txt.split() if w not in stops])
  
    stemmer=PorterStemmer()
    txt=" ".join([stemmer.stem(w) for w in txt.split()])
    
    return txt

df['v2']=df['v2'].map(lambda x:preprocess_data(x))
print(df['v2'])

0       go jurong point, crazy.. avail bugi n great wo...
1                             ok lar... joke wif u oni...
2       free entri 2 wkli comp win fa cup final tkt 21...
3               u dun say earli hor... u c alreadi say...
4                   nah think goe usf, live around though
5       freemsg hey darl 3 week' word back! i'd like f...
6       even brother like speak me. treat like aid pat...
7       per request 'mell mell (oru minnaminungint nur...
8       winner!! valu network custom select receivea å...
9       mobil 11 month more? u r entitl updat latest c...
10      i'm gonna home soon want talk stuff anymor ton...
11      six chanc win cash! 100 20,000 pound txt> csh1...
12      urgent! 1 week free membership å£100,000 prize...
13      i'v search right word thank breather. promis w...
14                                     date sunday will!!
15      xxxmobilemovieclub: use credit, click wap link...
16                                oh k...i'm watch here:)
17      eh u r

In [5]:
##collecting info from data
x_train,x_test,y_train,y_test=train_test_split(df.v2,df.LABEL,random_state=50,test_size=0.3)
label,freq=np.unique(y_train,return_counts=True)
print(label,freq)
prob_spam=freq[1]/(freq[0]+freq[1])
prob_ham=freq[0]/(freq[0]+freq[1])
print(prob_spam)
print(prob_spam)

[0 1] [3399  501]
0.12846153846153846
0.12846153846153846


In [6]:
#creating dictionary for ham and spam containing frequencies for words
dic_ham={}
dic_spam={}
for i in range(0,len(x_train)):
    if(np.array(y_train)[i]==0):
        for w in np.array(x_train)[i].split():
            if w not in dic_ham:
                dic_ham[w]=1
            else:
                dic_ham[w]=dic_ham[w]+1
    else:
        for w in np.array(x_train)[i].split():
            if w not in dic_spam:
                dic_spam[w]=1
            else:
                dic_spam[w]=dic_spam[w]+1

In [7]:
total_unique_words=len(dic_spam)+len(dic_ham)
total_spam_words=0
total_ham_words=0
for i in dic_spam.values():
    total_spam_words=total_spam_words+i
for i in dic_ham.values():
    total_ham_words=total_ham_words+i

In [8]:
count=0
for i in range(0,len(x_test)):
    sum1=np.log(prob_ham)
    sum2=np.log(prob_spam)
    for w in np.array(x_test)[i].split():
        if w in dic_ham:
            temp=np.log((dic_ham[w]+1)/(total_ham_words+total_unique_words))
        else:
            temp=np.log(1/(total_ham_words+total_unique_words))
        sum1=sum1+temp
        if w in dic_spam:
            temp=np.log((dic_spam[w]+1)/(total_spam_words+total_unique_words))
        else:
            temp=np.log(1/(total_spam_words+total_unique_words))
        sum2=sum2+temp
    if(sum1 > sum2):
        result=0
    else:
        result=1
    if(np.array(y_test)[i]==result):
        count=count+1

In [12]:
print("Correct predictions out of",len(y_test),"is :",count)
print("Accuracy acheived :",count/len(y_test)*100)

Correct predictions out of 1672 is : 1614
Accuracy acheived : 96.5311004784689
