# NAIVE BYES ALGORITHM

# Loading the Data

In [1]:
import pandas as pd

In [3]:
dataset = pd.read_csv('datasets/sms.tsv',sep='\t',names=['Label','Messages'])

In [4]:
dataset.head()

Unnamed: 0,Label,Messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
#checking the shape of the dataset
dataset.shape

(5572, 2)

In [7]:
# classifying the labels into 0s and 1s

In [8]:
dataset['Label'] = dataset['Label'].map({'ham':0,'spam':1})

In [9]:
dataset.head()

Unnamed: 0,Label,Messages
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
#Now we will clean the data

In [11]:
import re

In [12]:
def clean(x):
    s = re.sub('[^A-Za-z]',' ',x)
    s = re.sub('\s+',' ',s)
    s = s.strip()
    return s.lower()
    

In [15]:
dataset['Messages'] = dataset.Messages.apply(clean)

In [16]:
x = dataset.Messages.values
y = dataset.Label.values

In [17]:
# now splitting the dataset into train test split

from sklearn.model_selection import train_test_split

In [19]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size = 0.3,random_state = 13)

In [20]:
# using stop words
from nltk.corpus import stopwords

stopwords = set(stopwords.words('english'))

In [21]:
stopwords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [22]:
# removing the not from the stopwords list


In [23]:
if 'not' in stopwords:
    stopwords.remove('not')

In [24]:
# re-confirming
if 'not' in stopwords:
    print('found')
else:
    print('Not Found')


Not Found


In [25]:
# now transforming the data into numeric

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

In [28]:
cv1 = CountVectorizer(stop_words = stopwords)


In [29]:
# model created

In [31]:
# now transforming the input and the output data
cv_train = cv1.fit_transform(xtrain).toarray()
cv_test = cv1.transform(xtest).toarray()

In [32]:
cv_train.shape

(3900, 6241)

In [33]:
cv_test.shape

(1672, 6241)

In [35]:
cv1.get_feature_names

<bound method CountVectorizer.get_feature_names of CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None,
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)>

In [36]:
# training the model
from sklearn.naive_bayes import MultinomialNB

In [37]:
nb = MultinomialNB()
nb.fit(cv_train,ytrain)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [38]:
test_score = nb.score(cv_test,ytest)

In [39]:
test_score

0.9814593301435407

In [40]:
# Evaluate the model on the test data

In [41]:
test = ['get free tickets..!Win cash','hi john i will call u @ 7']

In [42]:
cleaned_Data = []
for i in test:
    t = clean(i)
    cleaned_Data.append(t)

In [43]:
cleaned_Data

['get free tickets win cash', 'hi john i will call u']

In [44]:
test1 = cv1.transform(cleaned_Data)

In [45]:
nb.predict(test1)

array([1, 0], dtype=int64)