In [1]:
# check files in your directory
ls

 Volume in drive C is Windows
 Volume Serial Number is 4C2C-E801

 Directory of C:\Users\ayush.saxena\Desktop\Practice\Spam Classification

31-05-2020  17:49    <DIR>          .
31-05-2020  17:49    <DIR>          ..
31-05-2020  11:27    <DIR>          .ipynb_checkpoints
31-05-2020  11:27    <DIR>          images
31-05-2020  11:27    <DIR>          smsspamcollection
31-05-2020  17:49               570 Spam Classification - NB.ipynb
               1 File(s)            570 bytes
               5 Dir(s)  438,030,106,624 bytes free


In [2]:
#check data file
ls smsspamcollection

 Volume in drive C is Windows
 Volume Serial Number is 4C2C-E801

 Directory of C:\Users\ayush.saxena\Desktop\Practice\Spam Classification\smsspamcollection

31-05-2020  11:27    <DIR>          .
31-05-2020  11:27    <DIR>          ..
29-08-2017  01:53             5,868 readme
29-08-2017  01:53           477,907 SMSSpamCollection
               2 File(s)        483,775 bytes
               2 Dir(s)  438,033,600,512 bytes free


In [4]:
# import data in dataframe
import pandas as pd 
import numpy as np

df = pd.read_table('smsspamcollection/SMSSpamCollection', sep = '\t', names = ['Label', 'Message'])
df.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Data preporcessings
df['Label'] = df['Label'].map({'ham':0, 'spam':1})
df.head()

Unnamed: 0,Label,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Implementing BoW manually
documents = ['Hello, how are you!', 'Win money, win from home.', 'Call me now.', 'Hello, Call hello you tomorrow?']
print(documents)

['Hello, how are you!', 'Win money, win from home.', 'Call me now.', 'Hello, Call hello you tomorrow?']


In [10]:
# Make all strings in small letters
lower_doc = []
for i in documents:
    lower_doc.append(i.lower())
    
print(lower_doc)

['hello, how are you!', 'win money, win from home.', 'call me now.', 'hello, call hello you tomorrow?']


In [12]:
# Remove punctuation marks
import string

remove_pan_doc = []
for i in lower_doc:
    remove_pan_doc.append(i.translate(str.maketrans('','',string.punctuation)))
    
print(remove_pan_doc)

['hello how are you', 'win money win from home', 'call me now', 'hello call hello you tomorrow']


In [20]:
# Tokenize the words
from collections import Counter
import pprint

word_in_doc = []
for i in remove_pan_doc:
    word_in_doc.append(i.split(' '))
    
print(word_in_doc)

print('\n*****************************************************\n*****************************************************\n')

# Use counter to calculate frequency of each word
processed_data = []
for i in word_in_doc:
    processed_data.append(Counter(i))
    
print(processed_data)
print('\n*****************************************************\n*****************************************************\n')
pprint.pprint(processed_data)

[['hello', 'how', 'are', 'you'], ['win', 'money', 'win', 'from', 'home'], ['call', 'me', 'now'], ['hello', 'call', 'hello', 'you', 'tomorrow']]

*****************************************************
*****************************************************

[Counter({'hello': 1, 'how': 1, 'are': 1, 'you': 1}), Counter({'win': 2, 'money': 1, 'from': 1, 'home': 1}), Counter({'call': 1, 'me': 1, 'now': 1}), Counter({'hello': 2, 'call': 1, 'you': 1, 'tomorrow': 1})]

*****************************************************
*****************************************************

[Counter({'hello': 1, 'how': 1, 'are': 1, 'you': 1}),
 Counter({'win': 2, 'money': 1, 'from': 1, 'home': 1}),
 Counter({'call': 1, 'me': 1, 'now': 1}),
 Counter({'hello': 2, 'call': 1, 'you': 1, 'tomorrow': 1})]


In [24]:
# Implementing the same with scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()
# default setting of CountVectorizer
print(count_vector)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [29]:
# fit your document
count_vector.fit(documents)
count_vector.get_feature_names()

['are',
 'call',
 'from',
 'hello',
 'home',
 'how',
 'me',
 'money',
 'now',
 'tomorrow',
 'win',
 'you']

In [34]:
# create feature matrix
d_array = count_vector.transform(documents).toarray()
print(d_array)

[[1 0 0 1 0 1 0 0 0 0 0 1]
 [0 0 1 0 1 0 0 1 0 0 2 0]
 [0 1 0 0 0 0 1 0 1 0 0 0]
 [0 1 0 2 0 0 0 0 0 1 0 1]]


In [36]:
# Create a frquency matrix which is more readable
freq_mat =pd.DataFrame(d_array, columns = count_vector.get_feature_names())
print(freq_mat)

   are  call  from  hello  home  how  me  money  now  tomorrow  win  you
0    1     0     0      1     0    1   0      0    0         0    0    1
1    0     0     1      0     1    0   0      1    0         0    2    0
2    0     1     0      0     0    0   1      0    1         0    0    0
3    0     1     0      2     0    0   0      0    0         1    0    1


In [40]:
# now split our data in training and testing set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['Message'], df['Label'], random_state = 42)



print('Number of rows in dataset {}'.format(df.shape[0]))
print('Number of rows in training {}'.format(X_train.shape[0]))
print('Number of rows in test {}'.format(X_test.shape[0]))

Number of rows in dataset 5572
Number of rows in training 4179
Number of rows in test 1393


In [52]:
# Now create matrix for both train and test data
training_data = count_vector.fit_transform(X_train)
# Only tarnsform of testing data no fitting.
testing_data = count_vector.transform(X_test)

In [54]:
# Call the Learning Algorithm: Multinomial as it is a discrete count problem
from sklearn.naive_bayes import MultinomialNB
naive_b = MultinomialNB()
naive_b.fit(training_data, y_train)
y_pred = naive_b.predict(testing_data)

In [56]:
# Calculate the evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('accuracy_score: {0} \nprecision: {1} \nrecall: {2}\nf1_score: {3}'.format(accuracy_score(y_test, y_pred), precision_score(y_test, y_pred), recall_score(y_test, y_pred), f1_score(y_test, y_pred)))

accuracy_score: 0.9885139985642498 
precision: 0.9775280898876404 
recall: 0.9354838709677419
f1_score: 0.956043956043956
