In [1]:
# importing the libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('wordnet')
import string
from nltk.stem import WordNetLemmatizer
import collections
from collections import Counter
from sklearn.model_selection import train_test_split as tts

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
# Toy training corpus: one (mail text, label) record per row.
labelled_mails = [
    ('congrats you have achieved certificate', 'non-spam'),
    ('send us your google password', 'spam'),
    ('review your google password', 'non-spam'),
    ('send us your review', 'non-spam'),
    ('congrats you won lottery', 'spam'),
    ('review our website', 'spam'),
    ('please update your profile', 'non-spam'),
]
train_data = pd.DataFrame(labelled_mails, columns=['mail', 'label'])

# Unlabelled mails the trained classifier is asked to label.
test_data = [
    'review our changes send us your certificate',
    'congrats your profile achieved our website',
]

train_data.head()

Unnamed: 0,mail,label
0,congrats you have achieved certificate,non-spam
1,send us your google password,spam
2,review your google password,non-spam
3,send us your review,non-spam
4,congrats you won lottery,spam


In [3]:
######## Your Code Here ########
def preprocessing(data):
    """Turn one raw mail string into a list of cleaned tokens.

    Steps, in order: lowercase, replace ``<...>`` tags with a space, drop
    http/https URLs, drop standalone digit runs, strip punctuation, split on
    whitespace, lemmatize each token, and finally discard English stopwords.

    Parameters
    ----------
    data : str
        Raw text of a single mail.

    Returns
    -------
    list of str
        Lemmatized tokens with stopwords removed, in their original order.
    """
    lemmatizer = WordNetLemmatizer()

    # Build the stopword set once per call: evaluating stopwords.words()
    # inside the filter below would rebuild the list for every single token,
    # and a set gives constant-time membership tests.
    stop_words = set(stopwords.words('english'))

    # Lowercase
    data = data.lower()

    # Removing HTML tags (replaced by a space so adjacent words stay separate)
    data = re.sub(r"<.*?>", " ", data)

    # Removing hyperlinks
    data = re.sub(r"https?://\S+", "", data)

    # Removing standalone numbers together with the whitespace that follows
    data = re.sub(r"\b[0-9]+\b\s*", "", data)

    # Removing punctuation
    data = re.sub(f"[{re.escape(string.punctuation)}]", "", data)

    # Splitting the sentence into words
    words = data.split()

    # Applying word lemmatization
    words = [lemmatizer.lemmatize(word) for word in words]

    # Removing stopwords; tokens containing '_' are dropped as well
    # (presumably a guard against multi-word lemmas -- confirm).
    return [word for word in words if word not in stop_words and '_' not in word]


In [4]:
# Tokenize every mail. NOTE: this overwrites the raw strings, so running the
# cell a second time would pass token lists (not strings) to preprocessing.
train_data['mail'] = train_data['mail'].apply(preprocessing)
test_data = list(map(preprocessing, test_data))

In [5]:
######## Your Code Here ########
def naive_bayes_training(data, classes):
    """Fit a multinomial Naive Bayes model with add-1 (Laplace) smoothing.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a 'mail' column holding a list of tokens per row and a 'label'
        column holding the class name of each row.
    classes : iterable of str
        Class labels to model. Any label set is accepted.

    Returns
    -------
    probability : dict
        Per class: ``probability[c]['count']`` is the number of mails of that
        class, ``probability[c]['probability']`` is the class prior, and
        ``probability[c][word]`` is the smoothed P(word | c) for every word
        seen in that class.
    count : dict
        Per class: ``count[c]['token_count']`` is a Counter of word
        occurrences and ``count[c]['len']`` is the total number of tokens.
    v : int
        Number of distinct tokens across all given classes.

    Raises
    ------
    ValueError
        If ``data`` has no rows (the class priors would be undefined).
    """
    if len(data) == 0:
        raise ValueError("cannot train naive Bayes on an empty dataset")

    classes = list(classes)  # iterated twice below; tolerate one-shot iterables
    count = dict()
    mails_per_class = dict()
    vocabulary = set()

    for c in classes:
        # select the mails of this class once and flatten them into tokens
        mails = data.loc[data['label'] == c, 'mail'].to_list()
        mails_per_class[c] = len(mails)
        tokens = [token for mail in mails for token in mail]

        # number of occurrences of each word in the class
        count[c] = dict()
        count[c]['token_count'] = collections.Counter(tokens)
        count[c]['len'] = len(tokens)

        vocabulary.update(count[c]['token_count'])

    # number of unique tokens over every requested class (not only the
    # 'spam' / 'non-spam' pair)
    v = len(vocabulary)

    probability = dict()
    for class_name in classes:
        # add-1 smoothing: P(word|c) = (count + 1) / (|V| + tokens in c)
        denominator = v + count[class_name]['len']
        probability[class_name] = {
            word: (occurrences + 1) / denominator
            for word, occurrences in count[class_name]['token_count'].items()
        }

        # The metadata keys share the dict with the word likelihoods. Write
        # them last so a token literally named 'count' or 'probability' can
        # never clobber the class prior.
        probability[class_name]['count'] = mails_per_class[class_name]
        probability[class_name]['probability'] = mails_per_class[class_name] / len(data)

    return probability, count, v

In [6]:
# Labels to model, then fit the classifier on the tokenized training mails.
classes = ['spam', 'non-spam']
model, count, v = naive_bayes_training(data=train_data, classes=classes)

In [7]:
######## Your Code Here ########
def naive_bayes_testing(model, count, v, X_test, classes):
    """Predict the most likely class of each tokenized sample.

    For every class the score is log P(c) plus the sum of log P(word | c)
    over the sample's tokens, using add-1 smoothing; the class with the
    highest score wins (ties go to the class listed first).

    Parameters
    ----------
    model : dict
        ``model[c]['probability']`` must hold the prior of class ``c``.
    count : dict
        ``count[c]['token_count']`` maps word -> occurrences in class ``c``
        and ``count[c]['len']`` is the total number of tokens in that class.
    v : int
        Vocabulary size used for smoothing.
    X_test : iterable of list of str
        Tokenized samples to classify.
    classes : iterable of str
        Candidate class labels.

    Returns
    -------
    list of str
        Predicted label for each sample, in input order.
    """
    classes = list(classes)  # reused for every sample
    y_pred = list()
    for sample in X_test:
        log_posterior = dict()
        for class_name in classes:
            token_count = count[class_name]['token_count']
            denominator = v + count[class_name]['len']

            score = np.log(model[class_name]['probability'])
            for word in sample:
                # Derive the smoothed likelihood from the raw counts instead
                # of looking the word up in `model`: `model[c]` also carries
                # the metadata keys 'count' and 'probability', so a token with
                # one of those names would otherwise be scored with the
                # document count or the prior. Unseen words get 1/denominator.
                score += np.log((token_count.get(word, 0) + 1) / denominator)
            log_posterior[class_name] = score

        y_pred.append(max(log_posterior, key=log_posterior.get))
    return y_pred

In [8]:
# Classify the tokenized test mails with the fitted model and show the labels.
y_pred = naive_bayes_testing(model, count, v, X_test=test_data, classes=classes)
y_pred

['non-spam', 'non-spam']