# NaiveBayes

In [133]:
import pandas as pd
import numpy as np
import string
from collections import Counter

In [135]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

#### Reading data  https://archive.ics.uci.edu/ml/machine-learning-databases/00228/

In [136]:
# Tab-separated SMS Spam Collection file: label in the first field, message
# text in the second, and no header row.
# NOTE: this is a machine-specific absolute path -- point it at your local copy.
DATA_PATH = '/Users/bakula/Documents/bapi/projects/my-travel-plans/smsspamcollection/SMSSpamCollection'

# read_csv with an explicit tab separator is the equivalent of read_table and
# avoids the deprecation warning read_table emits on some pandas releases.
# NOTE(review): default quote handling is kept as-is; if the row count does not
# match the dataset's documentation, try quoting=csv.QUOTE_NONE -- TODO confirm.
df = pd.read_csv(DATA_PATH, sep='\t', header=None)

  """Entry point for launching an IPython kernel.


In [137]:
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [138]:
# The file was read without a header row, so name the two positional columns explicitly.
df.columns = ['label','SMS']

#### Convert labels from strings to integers to make computing the evaluation scores easier

In [139]:

# Encode the class labels as integers (ham -> 0, spam -> 1).
# Guarded so that re-running this cell is a no-op: mapping an already-encoded
# column a second time would turn every label into NaN.
label_codes = {'ham': 0, 'spam': 1}
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].map(label_codes)

In [140]:
df.head()

Unnamed: 0,label,SMS
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


### BoW (Bag of Words) - take a piece of text and count the frequency of each word in it. Word order doesn't matter.
sklearn provides this functionality through its `CountVectorizer` class.

We can also implement the same logic (BoW) from scratch.


#### Implementation from Scratch

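1. Convert each message to lower case
2. Remove any punctuation characters from the text
3. Stop words - ignore the most commonly used English words such as "is", "am", "the" (not handled in the cells below)
4. Tokenize each message into words
5. Count the frequency of each word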


In [45]:
# Pull the raw message texts out of the frame as a plain Python list.
df_SMS_list = df['SMS'].tolist()

In [46]:
len(df_SMS_list)

5572

In [58]:
# Step 1: lower-case every message so that word counts are case-insensitive.
lower_SMS = [message.lower() for message in df_SMS_list]


    
    

In [73]:
lower_SMS

['go until jurong point, crazy.. available only in bugis n great world la e buffet... cine there got amore wat...',
 'ok lar... joking wif u oni...',
 "free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005. text fa to 87121 to receive entry question(std txt rate)t&c's apply 08452810075over18's",
 'u dun say so early hor... u c already then say...',
 "nah i don't think he goes to usf, he lives around here though",
 "freemsg hey there darling it's been 3 week's now and no word back! i'd like some fun you up for it still? tb ok! xxx std chgs to send, £1.50 to rcv",
 'even my brother is not like to speak with me. they treat me like aids patent.',
 "as per your request 'melle melle (oru minnaminunginte nurungu vettam)' has been set as your callertune for all callers. press *9 to copy your friends callertune",
 'winner!! as a valued network customer you have been selected to receivea £900 prize reward! to claim call 09061701461. claim code kl341. valid 12 hours only.',
 'had you

In [59]:
# Step 2: strip punctuation. The translation table deletes every character in
# string.punctuation; nothing is put in its place, so words separated only by
# punctuation end up joined together.
table = str.maketrans('', '', string.punctuation)
punct_SMS = [message.translate(table) for message in lower_SMS]

In [60]:
punct_SMS  # display the messages after punctuation removal (e.g. the '...' runs are gone)

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat',
 'ok lar joking wif u oni',
 'free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry questionstd txt ratetcs apply 08452810075over18s',
 'u dun say so early hor u c already then say',
 'nah i dont think he goes to usf he lives around here though',
 'freemsg hey there darling its been 3 weeks now and no word back id like some fun you up for it still tb ok xxx std chgs to send £150 to rcv',
 'even my brother is not like to speak with me they treat me like aids patent',
 'as per your request melle melle oru minnaminunginte nurungu vettam has been set as your callertune for all callers press 9 to copy your friends callertune',
 'winner as a valued network customer you have been selected to receivea £900 prize reward to claim call 09061701461 claim code kl341 valid 12 hours only',
 'had your mobile 11 months or more u r entitled to update t

In [72]:
# Step 4: tokenize -- split each cleaned message on whitespace.
# Stop-word removal (step 3) is not handled here.
processed_SMS = [message.split() for message in punct_SMS]

In [69]:
from collections import Counter

In [65]:
?Counter

In [70]:
# Step 5: build one Counter per message, mapping each token to how many times
# it occurs in that message.
freq_list = [Counter(tokens) for tokens in processed_SMS]

In [71]:
freq_list

[Counter({'go': 1,
          'until': 1,
          'jurong': 1,
          'point': 1,
          'crazy': 1,
          'available': 1,
          'only': 1,
          'in': 1,
          'bugis': 1,
          'n': 1,
          'great': 1,
          'world': 1,
          'la': 1,
          'e': 1,
          'buffet': 1,
          'cine': 1,
          'there': 1,
          'got': 1,
          'amore': 1,
          'wat': 1}),
 Counter({'ok': 1, 'lar': 1, 'joking': 1, 'wif': 1, 'u': 1, 'oni': 1}),
 Counter({'free': 1,
          'entry': 2,
          'in': 1,
          '2': 1,
          'a': 1,
          'wkly': 1,
          'comp': 1,
          'to': 3,
          'win': 1,
          'fa': 2,
          'cup': 1,
          'final': 1,
          'tkts': 1,
          '21st': 1,
          'may': 1,
          '2005': 1,
          'text': 1,
          '87121': 1,
          'receive': 1,
          'questionstd': 1,
          'txt': 1,
          'ratetcs': 1,
          'apply': 1,
          '08452810

In [130]:
df.head()

Unnamed: 0,label,SMS
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


### BoW (using sklearn)

In [141]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [142]:
# Bag-of-words vectorizer; it is fitted on the training messages in a later cell.
cont_vec = CountVectorizer()

# Split messages and labels into train and test sets; random_state pins the
# shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['SMS'],
    df['label'],
    random_state=1,
)

In [83]:
?train_test_split

In [143]:
X_train.head()

710     4mths half price Orange line rental & latest c...
3740                           Did you stitch his trouser
2711    Hope you enjoyed your new content. text stop t...
3155    Not heard from U4 a while. Call 4 rude chat pr...
3748    Ü neva tell me how i noe... I'm not at home in...
Name: SMS, dtype: object

In [144]:
df.head()

Unnamed: 0,label,SMS
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


### naiveBayes Model training and test

In [89]:
# Learn the vocabulary from the training messages and return their
# document-term count matrix.
training_data = cont_vec.fit_transform(X_train)

# Transform (do NOT re-fit) the test messages using the vocabulary learned from
# the training messages, so both matrices share the same columns.
testing_data = cont_vec.transform(X_test)

In [110]:
# Expand the vectorizer's count matrix into a dense NumPy array for inspection.
training_array = training_data.toarray()

In [112]:
# Wrap the dense array in a DataFrame for tabular display.
training_df = pd.DataFrame(training_array)

In [113]:
training_df
# Each row is one training message; each column holds the count of one vocabulary word.

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7446,7447,7448,7449,7450,7451,7452,7453,7454,7455
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [94]:
# Train a multinomial Naive Bayes classifier on the training counts (fit returns
# the fitted estimator itself), then predict labels for the held-out test messages.
naiveBayes = MultinomialNB().fit(training_data, y_train)
prediction = naiveBayes.predict(testing_data)

In [93]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [96]:
# Call the metric functions through the module so this cell stays re-runnable.
# The F1 result is stored under the name `f1_score`, which replaces the imported
# function of the same name in the notebook namespace; calling the bare name on
# a second run would fail because it would then be a float, not a function.
from sklearn import metrics

a_score = metrics.accuracy_score(y_test, prediction)     # accuracy on the test set
r_score = metrics.recall_score(y_test, prediction)       # recall for the positive (spam = 1) class
pre_score = metrics.precision_score(y_test, prediction)  # precision for the positive (spam = 1) class
f1_score = metrics.f1_score(y_test, prediction)          # F1 score; name kept because a later cell displays it

In [97]:
f1_score

0.9560439560439562

In [98]:
pre_score

0.9720670391061452

In [99]:
r_score

0.9405405405405406

In [100]:
a_score

0.9885139985642498