# **Naive Bayes** - [Classification Model] supervised machine learning algorithm

~ Yogesh More

In [None]:
# It is based on Bayes' theorem, i.e.,
#  P(A|B) = [P(B|A) * P(A)] / P(B)

In [None]:
# Objective:  build a binary text classifier.

In [None]:
# We cannot feed raw text directly to an ML model;
# we first need to convert the text into numerical features.

In [None]:
# Step A: text cleaning (removing unwanted characters like , . # @ etc.)
#         0. Tokenization - convert each sentence into words (similar to the split method, but not exactly the same).
#         1. convert all text to lower case
#         2. remove non-alpha features
#         3. remove stopwords (the, how, where, when, etc.)

# Step 1: extract all the unique words from the dataset.
# Step 2: create an embedding vector, i.e., convert the text into numerical features.
#             Caveat: with a large vocabulary the model can end up with too many features and may perform poorly.

In [None]:
# nltk is the library used to do this stuff.

In [None]:
# Mathematical intuition of Naive Bayes

'''
    It assumes that all words (features) are independent of each other, given the class.

'''

# It can also be used for numerical data, but it works particularly well on text data.

# ------------------SO WE PREFER THIS FOR TEXTUAL DATA-------------------------

In [None]:
# Limitations:
'''

1. It doesn't understand the meaning of the text.
2. Word order doesn't matter, since all words are treated as independent.
3. If a word never appeared with a class during training, its probability is 0,
   which makes the whole product 0. So either remove the word or set its probability to 1.
   We can also introduce smoothing (Laplace smoothing);
   alpha is the hyperparameter that controls the smoothing, and we can tune it.

'''

## **Code**

In [None]:
# importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [None]:
# Load the labelled messages into a DataFrame; the file is decoded as latin-1.
# NOTE(review): this is a hard-coded absolute path (looks like a Colab upload
# location) — adjust it when running the notebook elsewhere.
df = pd.read_csv('/content/spam_clean (1).csv', encoding = 'latin-1')
df.head()

Unnamed: 0,type,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# ham --> not spam
# spam --> malicious

In [None]:
# Count how many messages fall into each class of the 'type' column.
df['type'].value_counts()

ham     4825
spam     747
Name: type, dtype: int64

### **Code from Scratch**

In [None]:
# Importing Libraries
import nltk, re
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import word_tokenize, sent_tokenize

# Let's create a function to perform text cleaning
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
  """Return NLTK's English stopwords as a frozenset, building it only once."""
  global _ENGLISH_STOPWORDS
  if _ENGLISH_STOPWORDS is None:
    _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
  return _ENGLISH_STOPWORDS


def clean_text(data, stop_words=None, tokenizer=None):
  """Perform basic text cleaning on a single string.

  Steps: tokenize, lower-case each token, strip every character that is
  neither a word character nor whitespace, then drop empty tokens and
  stopwords.

  Args:
    data: the raw text to clean (one string, e.g. one message).
    stop_words: optional collection of lower-case words to drop.
      Defaults to NLTK's English stopword list.
    tokenizer: optional callable mapping a string to an iterable of tokens.
      Defaults to nltk.word_tokenize.

  Returns:
    The surviving tokens joined by single spaces, without leading or
    trailing whitespace ("" when nothing survives).
  """
  # Resolve the defaults once per call. Previously the stopword list was
  # rebuilt and scanned linearly for every single token.
  if stop_words is None:
    stop_words = _english_stopwords()
  if tokenizer is None:
    tokenizer = nltk.word_tokenize

  kept_words = []   # cleaned tokens, in their original order

  # STEP 1: Tokenization
  for word in tokenizer(data):
    # STEP 2: converting into lower case
    lower_word = word.lower()

    # STEP 3: removing punctuations using regex
    filtered_word = re.sub(r'[^\w\s]', '', lower_word)

    # STEP 4: removing stopwords (and tokens that were pure punctuation)
    if filtered_word != '' and filtered_word not in stop_words:
      kept_words.append(filtered_word)

  # returning final cleaned data by removing extra spaces from start and end.
  return " ".join(kept_words).strip()


"""
As this function is made to pre-process the textual data, but it will only clean a single
string at a time. If we want to use this function on the whole text column the we can use '.apply'
method in pandas. Let's see how.
"""

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


" \nAs this function is made to pre-process the textual data, but it will only clean a single\nstring at a time. If we want to use this function on the whole text column the we can use '.apply'\nmethod in pandas. Let's see how.\n"

In [None]:
df

Unnamed: 0,type,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
# Pre-process the 'message' column of df.
# The cleaned text goes into a new column, 'cleaned_message', so the original
# messages stay available for comparison.

df['cleaned_message'] = df['message'].apply(clean_text)
# .apply calls clean_text on each message of the column, one string at a time.

df.head(10)

Unnamed: 0,type,message,cleaned_message
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives around though
5,spam,FreeMsg Hey there darling it's been 3 week's n...,freemsg hey darling 3 week word back like fun ...
6,ham,Even my brother is not like to speak with me. ...,even brother like speak treat like aids patent
7,ham,As per your request 'Melle Melle (Oru Minnamin...,per request melle melle oru minnaminunginte nu...
8,spam,WINNER!! As a valued network customer you have...,winner valued network customer selected receiv...
9,spam,Had your mobile 11 months or more? U R entitle...,mobile 11 months u r entitled update latest co...


In [None]:
# Now, our target is to classify whether the new message will be spam or ham.
# For that let's create a function which will assign number to our target values

def target_convert(x):
  """Translate a class label into its numeric code.

  'ham' becomes 0 and 'spam' becomes 1; any other label raises KeyError.
  """
  label_codes = dict(ham=0, spam=1)
  code = label_codes[x]
  return code


# Updating our 'type' column by numbers like 1 for spam and 0 for ham.
# The guard makes this cell safe to re-run: once the column holds only 0/1,
# calling target_convert again would raise KeyError, because 0 and 1 are not
# keys of its mapping.
if not df['type'].isin([0, 1]).all():
  df['type'] = df['type'].apply(target_convert)

df

Unnamed: 0,type,message,cleaned_message
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives around though
...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,2nd time tried 2 contact u u å750 pound prize ...
5568,0,Will Ì_ b going to esplanade fr home?,ì_ b going esplanade fr home
5569,0,"Pity, * was in mood for that. So...any other s...",pity mood suggestions
5570,0,The guy did some bitching but I acted like i'd...,guy bitching acted like interested buying some...


In [None]:
# Finally, our data is ready, let's create a model.

In [None]:
# importing libraries
from sklearn.model_selection import train_test_split

# Split the cleaned messages and their labels into training and testing sets:
# 25% of the rows are held out for testing, and random_state is fixed so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_message'], df['type'],
                                                    test_size = 0.25,
                                                    random_state = 42)

# importing libraries to perform count vectorization.
from sklearn import feature_extraction, naive_bayes, metrics

# creating a CountVectorizer object that turns text into word-count features
f = feature_extraction.text.CountVectorizer()

# Fit the vectorizer on the training messages only, then reuse the fitted
# vectorizer to transform the test messages.
# Naming: X_train / X_test hold the text, x_train / x_test hold the
# vectorized features that the models are trained and evaluated on.
x_train = f.fit_transform(X_train)
x_test = f.transform(X_test)

In [None]:
# importing libraries to use naive bayes.
from sklearn.naive_bayes import MultinomialNB, BernoulliNB


def fit_and_score(model):
  """Fit `model` on the training features and evaluate it on the test features.

  Uses x_train / y_train for fitting and x_test / y_test for evaluation.

  Returns:
    A tuple (predictions, f1 score, accuracy score) for the test set.
  """
  model.fit(x_train, y_train)
  predictions = model.predict(x_test)
  f1 = metrics.f1_score(y_test, predictions)
  accuracy = metrics.accuracy_score(y_test, predictions)
  return predictions, f1, accuracy


# Bernoulli naive bayes: train, predict on the test set, compute metrics
nb_b = BernoulliNB()
y_pred_b, f1_score_bernoulli, accuracy_score_bernoulli = fit_and_score(nb_b)

# Multinomial naive bayes: same procedure
nb_m = MultinomialNB()
y_pred_m, f1_score_multinomial, accuracy_score_multinomial = fit_and_score(nb_m)

In [None]:
# Report the test-set F1 score and accuracy of both models,
# with a line of asterisks separating the two.
print("Bernoulli's f1_score: ", f1_score_bernoulli)
print("Bernoulli's accuracy score: ", accuracy_score_bernoulli)

print('*'*50)

print("Multinomial's f1_score: ", f1_score_multinomial)
print("Multinomial's accuracy score: ", accuracy_score_multinomial)

Bernoulli's f1_score:  0.8837209302325582
Bernoulli's accuracy score:  0.9712849964106246
**************************************************
Multinomial's f1_score:  0.9214092140921409
Multinomial's accuracy score:  0.9791816223977028


In [None]:
# We can say that, Multinomial performs slightly better than Bernoulli on this data.



---



**THANK YOU**



---

