# Dependencies

In [34]:
import pandas as pd
import sklearn
import string

# Fetching dataset

In [35]:
df = pd.read_table("SMSSpamCollection",sep="\t",names=['label','sms_message'])

In [36]:
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [37]:
df['label'] = df.label.map({'ham':0,'spam':1}) 

In [38]:
print(df)

      label                                        sms_message
0         0  Go until jurong point, crazy.. Available only ...
1         0                      Ok lar... Joking wif u oni...
2         1  Free entry in 2 a wkly comp to win FA Cup fina...
3         0  U dun say so early hor... U c already then say...
4         0  Nah I don't think he goes to usf, he lives aro...
5         1  FreeMsg Hey there darling it's been 3 week's n...
6         0  Even my brother is not like to speak with me. ...
7         0  As per your request 'Melle Melle (Oru Minnamin...
8         1  WINNER!! As a valued network customer you have...
9         1  Had your mobile 11 months or more? U R entitle...
10        0  I'm gonna be home soon and i don't want to tal...
11        1  SIX chances to win CASH! From 100 to 20,000 po...
12        1  URGENT! You have won a 1 week FREE membership ...
13        0  I've been searching for the right words to tha...
14        0                I HAVE A DATE ON SUNDAY WITH

In [39]:
df.head()

Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


# Bag Of Words from scratch

In [40]:
documents = ['Hello, how are you!',
'Win money, win from home.',
'Call me now.',
'Hello, Call hello you tomorrow?']

lower_case_documents = []
for i in documents:
    lower_case_documents.append(i.lower())
print(lower_case_documents)

['hello, how are you!', 'win money, win from home.', 'call me now.', 'hello, call hello you tomorrow?']


In [41]:
 
sans_punctuation_documents = []
for i in lower_case_documents:
    tab = str.maketrans("","",string.punctuation)
    i = i.translate(tab)
    sans_punctuation_documents.append(i)
print(sans_punctuation_documents)

['hello how are you', 'win money win from home', 'call me now', 'hello call hello you tomorrow']


In [42]:
preprocessed_documents = []
for i in sans_punctuation_documents:
    preprocessed_documents.append(i.split())
print(preprocessed_documents)


[['hello', 'how', 'are', 'you'], ['win', 'money', 'win', 'from', 'home'], ['call', 'me', 'now'], ['hello', 'call', 'hello', 'you', 'tomorrow']]


In [43]:
frequency_list = []
import pprint
from collections import Counter
for i in preprocessed_documents:
    frequency_list.append(Counter(i))
pprint.pprint(frequency_list)

[Counter({'hello': 1, 'how': 1, 'are': 1, 'you': 1}),
 Counter({'win': 2, 'money': 1, 'from': 1, 'home': 1}),
 Counter({'call': 1, 'me': 1, 'now': 1}),
 Counter({'hello': 2, 'call': 1, 'you': 1, 'tomorrow': 1})]


# Bag of words in sklearn

In [44]:
documents = ['Hello, how are you!',
'Win money, win from home.',
'Call me now.',
'Hello, Call hello you tomorrow?']
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer(stop_words=None,token_pattern = '(?u)\\b\\w\\w+\\b',lowercase = True)
print(count_vector)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [45]:
count_vector.fit(documents)
count_vector.get_feature_names()

['are',
 'call',
 'from',
 'hello',
 'home',
 'how',
 'me',
 'money',
 'now',
 'tomorrow',
 'win',
 'you']

In [46]:
doc_array = count_vector.transform(documents).toarray()
doc_array

array([[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 2, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1]], dtype=int64)

In [47]:
frequency_matrix = pd.DataFrame(data=doc_array,columns=[count_vector.get_feature_names()])
frequency_matrix

Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1


In [48]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df['sms_message'],df['label'],random_state=1)

print('Number of rows in the total set: {}'.format(df.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


In [57]:
# Instantiate the CountVectorizer method
count_vector = CountVectorizer()
# Fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train)
# Transform testing data and return the matrix. Note we are not fitting the testing data 
testing_data = count_vector.transform(X_test)

# Bayes-theorem from scratch

In [64]:
# P(D)
p_diabetes = 0.01
# P(~D)
p_no_diabetes = 0.99
# Sensitivity or P(Pos|D)
p_pos_diabetes = 0.9
# Specificity or P(Neg|~D)
p_neg_no_diabetes = 0.9
# P(Pos)
p_pos = (p_diabetes * p_pos_diabetes) + (p_no_diabetes * (1-(p_neg_no_diabetes)))
print('The probability of getting a positive test result P(Pos) is: ',p_pos)

The probability of getting a positive test result P(Pos) is:  0.10799999999999998


In [65]:
# P(D|Pos)
p_diabetes_pos = p_diabetes* p_pos_diabetes / p_pos
print('Probability of an individual having diabetes, given that that individual got a positive ',format(p_diabetes_pos))

Probability of an individual having diabetes, given that that individual got a positive  0.08333333333333336


In [69]:
p_pos_no_diabetes = 1- p_neg_no_diabetes
p_no_diabetes_pos = p_no_diabetes * p_pos_no_diabetes / p_pos
print('Probability of an individual not having diabetes even after a positive test result is: ',format(p_no_diabetes_pos))

Probability of an individual not having diabetes even after a positive test result is:  0.9166666666666666


In [70]:
# P(J)
p_j = 0.5
# P(F/J)
p_j_f = 0.1
# P(I/J)
p_j_i = 0.1
p_j_text = p_j*p_j_f * p_j_i 
print(p_j_text)

0.005000000000000001


In [None]:
# P(G)
p_g = 0.5
# P(F/G)
p_g_f = 0.7
# P(I/G)
p_g_i = 0.2
p_g_text =  p_g * p_g_f * p_g_i
print(p_g_text)

In [73]:
p_f_i = p_j_text + p_g_text
print('Probability of words freedom and immigration being said are: ', format(p_f_i))

Probability of words freedom and immigration being said are:  0.075


In [74]:
p_j_fi = p_j_text / p_f_i
print('The probability of Jill Stein saying the words Freedom and Immigration: ', format(p_j_fi))

The probability of Jill Stein saying the words Freedom and Immigration:  0.06666666666666668


In [75]:
p_g_fi = p_g_text / p_f_i
print('The probability of Gary Johnson saying the words Freedom and Immigration: ', format(p_g_fi))

The probability of Gary Johnson saying the words Freedom and Immigration:  0.9333333333333332


# Naive bayes in sklearn

In [80]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [82]:
predictions = naive_bayes.predict(testing_data)

In [84]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: ', format(accuracy_score(y_test,predictions)))
print('Precision score: ', format(precision_score(y_test,predictions)))
print('Recall score: ', format(recall_score(y_test,predictions)))
print('F1 score: ', format(f1_score(y_test,predictions)))


Accuracy score:  0.9885139985642498
Precision score:  0.9720670391061452
Recall score:  0.9405405405405406
F1 score:  0.9560439560439562
