# Spam Text Classification
***Implemented with a Bernoulli Naive Bayes model***

In [291]:
import pandas as pd
import string

## Data Import And Cleaning

In [194]:
# Load the dataset from 'spam.csv' (path is relative to the working directory).
df = pd.read_csv('spam.csv')
# Keep a copy of the frame exactly as loaded, taken before df is modified below.
df_copy = df.copy()

In [195]:
# Print column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [196]:
# The three 'Unnamed' columns are mostly empty. Fill the missing cells with an
# empty string BEFORE casting to str: a plain astype(str) would turn every
# missing value into the literal text 'nan', which would later be split into a
# bogus word and counted in the word distributions.
for col in ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']:
    df[col] = df[col].fillna('').astype(str)

In [197]:
# Lower-case every text column so that word counting is case-insensitive.
for text_column in ('v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'):
    df[text_column] = df[text_column].str.lower()

In [198]:
# Blank out cells whose ENTIRE value is the string 'nan' (what a missing value
# looks like after a plain cast to str). Series.replace matches whole cell
# values, so genuine text that merely contains the letters "nan" is left
# intact, and all three overflow columns are cleaned the same way.
for col in ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']:
    df[col] = df[col].replace('nan', ' ')

In [199]:
# Replace every ASCII punctuation character with a space in each text column,
# so that punctuation acts as a word separator when the text is split later.
punc = list(string.punctuation)
for text_column in ('v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'):
    cleaned = df[text_column]
    for symbol in punc:
        # regex=False: characters such as '.' or '*' are taken literally.
        cleaned = cleaned.str.replace(symbol, ' ', regex=False)
    df[text_column] = cleaned

## Partitioning Data

In [200]:
# Spam partition: every row whose label in 'v1' is not 'ham'.
spam = df.loc[df['v1'].ne('ham')]

In [201]:
# Non-spam partition: every row whose label in 'v1' is not 'spam'.
non_spam = df.loc[df['v1'].ne('spam')]

In [202]:
# Renumber the rows from 0; drop=True discards the old index instead of keeping it as a column.
non_spam = non_spam.reset_index(drop=True)

In [203]:
# Renumber the rows from 0; drop=True discards the old index instead of keeping it as a column.
spam = spam.reset_index(drop=True)

In [266]:
# Display the spam partition as the cell output.
spam

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,spam,free entry in 2 a wkly comp to win fa cup fina...,,,
1,spam,freemsg hey there darling it s been 3 week s n...,,,
2,spam,winner as a valued network customer you have...,,,
3,spam,had your mobile 11 months or more u r entitle...,,,
4,spam,six chances to win cash from 100 to 20 000 po...,,,
...,...,...,...,...,...
742,spam,want explicit sex in 30 secs ring 02073162414...,,,
743,spam,asked 3mobile if 0870 chatlines inclu in free ...,,,
744,spam,had your contract mobile 11 mnths latest moto...,,,
745,spam,reminder from o2 to get 2 50 pounds free call...,,,


# Model Building

In [204]:
# Maps a word to the number of spam text cells that contain it (filled in by the next cell).
spam_distribution = {}

In [205]:
# Count, for every word, the number of spam text cells that contain it.
# The dict is rebuilt from scratch here so that re-running this cell does not
# add a second round of counts on top of the first.
spam_distribution = {}
# Walk the same four text columns as the non-spam and total counters do, so
# that all three dictionaries are tallied over the same cells.
for k in ['v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']:
    for messages in spam[k].values:
        # set(): a word repeated inside one cell is counted once for that cell.
        for word in set(messages.split()):
            spam_distribution[word] = spam_distribution.get(word, 0) + 1

In [207]:
# Maps a word to the number of non-spam text cells that contain it (filled in by the next cell).
non_spam_distribution = {}

In [208]:
# Tally, per word, how many non-spam text cells contain it, across all four
# text columns.
for column in ('v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'):
    for text in non_spam[column].values:
        # Distinct words only: repeats inside one cell count once.
        for token in set(text.split()):
            non_spam_distribution[token] = non_spam_distribution.get(token, 0) + 1

In [209]:
# Number of rows in the spam partition.
n_spam = len(spam)

In [210]:
# Number of rows in the non-spam partition.
n_non_spam = len(non_spam)

In [211]:
# Share of spam rows among all partitioned rows.
# NOTE(review): phi is not read by prob_spam, prob_non__spam or is_spam as written.
phi = (n_spam)/(n_spam+n_non_spam)

In [260]:
# Tally, per word, how many text cells of the whole frame contain it,
# across all four text columns and regardless of label.
total_distribution = {}
for column in ('v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'):
    for text in df[column].values:
        # Distinct words only: repeats inside one cell count once.
        for token in set(text.split()):
            total_distribution[token] = total_distribution.get(token, 0) + 1

# Predicting
*Using Bayes' rule*

In [286]:
def prob_spam(message:str):
    """Score how strongly the words of ``message`` point to spam.

    The message is lower-cased and every ``string.punctuation`` character is
    replaced by a space before splitting, mirroring the cleaning applied to
    the text the distributions were counted from. Each whitespace-separated
    token then contributes one factor (a repeated token contributes once per
    occurrence):

    * token present in ``total_distribution``:
      ``(spam_count + 1) / (total_count + 2)``, with ``spam_count`` taken as 0
      when the token is absent from ``spam_distribution``;
    * token absent from ``total_distribution``: ``0.5``.

    NOTE(review): each factor is the smoothed share of a word's occurrences
    that are spam, and no class prior is applied here - confirm this is the
    intended scoring. The product of many factors below 1 can also underflow
    to 0.0 for very long messages.

    Args:
        message: Raw text to score.

    Returns:
        The product of the per-token factors as a float; ``1.0`` when the
        message contains no tokens.
    """
    # str.lower() returns a new string, so its result must be kept; the
    # translation table maps each punctuation character to a space.
    normalized = message.lower().translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    )
    prob = 1.0
    for word in normalized.split():
        total_count = total_distribution.get(word)
        if total_count is None:
            # Never-seen word: neutral factor.
            marg = 0.5
        else:
            marg = (spam_distribution.get(word, 0) + 1) / (total_count + 2)
        prob *= marg
    return prob

In [287]:
def prob_non__spam(message:str):
    """Score how strongly the words of ``message`` point to non-spam.

    The message is lower-cased and every ``string.punctuation`` character is
    replaced by a space before splitting, mirroring the cleaning applied to
    the text the distributions were counted from. Each whitespace-separated
    token then contributes one factor (a repeated token contributes once per
    occurrence):

    * token present in ``total_distribution``:
      ``(non_spam_count + 1) / (total_count + 2)``, with ``non_spam_count``
      taken as 0 when the token is absent from ``non_spam_distribution``;
    * token absent from ``total_distribution``: ``0.5``.

    NOTE(review): each factor is the smoothed share of a word's occurrences
    that are non-spam, and no class prior is applied here - confirm this is
    the intended scoring. The product of many factors below 1 can also
    underflow to 0.0 for very long messages.

    Args:
        message: Raw text to score.

    Returns:
        The product of the per-token factors as a float; ``1.0`` when the
        message contains no tokens.
    """
    # str.lower() returns a new string, so its result must be kept; the
    # translation table maps each punctuation character to a space.
    normalized = message.lower().translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    )
    prob = 1.0
    for word in normalized.split():
        total_count = total_distribution.get(word)
        if total_count is None:
            # Never-seen word: neutral factor.
            marg = 0.5
        else:
            marg = (non_spam_distribution.get(word, 0) + 1) / (total_count + 2)
        prob *= marg
    return prob

In [288]:
def is_spam(message:str):
    """Classify ``message`` by comparing its two scores.

    Args:
        message: Text to classify.

    Returns:
        True when ``prob_spam(message)`` is strictly greater than
        ``prob_non__spam(message)``; False otherwise, including on a tie.
    """
    spam_score = prob_spam(message)
    ham_score = prob_non__spam(message)
    return spam_score > ham_score

In [289]:
# Manual spot check of the classifier on a short message.
is_spam('hey its me anju')

False