# Filtering spam messages with Naive Bayes theorem

#### The goal of this project was to train a model based on Bayes' probability theorem to detect whether an email is spam or not.
- Preparing the data
- Extracting the features

## 1 - Load the messages and the labels from the txt file

In [4]:
import pandas as pd
import csv

# Function which returns a dataframe from our text file
def create_dataframe_from_file(filename):
    """Build a DataFrame of labelled messages from a tab-separated text file.

    Every non-empty line is expected to hold a status label, a tab, and the
    message text. Only the first tab separates the two fields, so any further
    tab stays part of the message. Blank lines are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A DataFrame with the columns 'Status' and 'Message', holding one row
        per non-empty line of the file, in file order.

    Raises:
        ValueError: If a non-empty line contains no tab separator.
    """
    rows = []
    # Open the file that was actually requested; UTF-8 is explicit so that
    # accented characters decode the same way on every platform.
    with open(filename, encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            # The last line of a file may legitimately lack a trailing newline
            line = line.rstrip("\n")
            if not line:
                continue
            status, separator, message = line.partition("\t")
            if not separator:
                raise ValueError(
                    "line {} of {!r} has no tab separator".format(line_number, filename)
                )
            rows.append({'Status': status, 'Message': message})
    # Build the frame once: appending row by row copies the whole frame each
    # time, and DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(rows, columns=['Status', 'Message'])

def create_dataframe_from_file(filename):
    """Load the tab-separated message file into a DataFrame.

    Args:
        filename: Path of the file to read; it is passed straight to
            ``pd.read_csv``.

    Returns:
        A DataFrame whose two columns are named 'Status' and 'Message'.
    """
    read_options = {
        'sep': '\t',
        # The file has no header row, so the column names are supplied here
        'names': ['Status', 'Message'],
        'encoding': 'utf-8',
        # Quote characters inside a message are ordinary text, not delimiters
        'quoting': csv.QUOTE_NONE,
    }
    return pd.read_csv(filename, **read_options)

# Load the labelled messages from the tab-separated file, then preview the first 10 rows
messages_df = create_dataframe_from_file("messages.txt")
messages_df.head(10)

Unnamed: 0,Status,Message
0,ham,Yup i've finished c ü there...
1,ham,Remember to ask alex about his pizza
2,ham,No da..today also i forgot..
3,ham,Ola would get back to you maybe not today but ...
4,ham,Fwiw the reason I'm only around when it's time...
5,ham,"Hello, my boytoy! I made it home and my consta..."
6,ham,Congrats kano..whr s the treat maga?
7,ham,Who u talking about?
8,ham,Yup...
9,ham,Ok...


## 2 - Clean the dataset and get info on it 

In [5]:
# We replace our "ham" and "spam" labels by 0s and 1s.
# The nested dict limits the replacement to the 'Status' column.
cleaned_messages_df = messages_df.replace({'Status': {'ham': 0, 'spam': 1}})

# We get the number of text messages and the spam/ham count in our df
df_size = cleaned_messages_df.shape[0]  # number of rows, i.e. messages
num_spams = (cleaned_messages_df.Status == 1).sum()  # rows labelled 1 (spam)
num_hams = (cleaned_messages_df.Status == 0).sum()  # rows labelled 0 (ham)
print("Total # of messages :",df_size)
print("# of spams :",num_spams)
print("# of nnon spams :",num_hams)
# Preview the first rows to check the label encoding
print(cleaned_messages_df.head())

Total # of messages : 5000
# of spams : 672
# of nnon spams : 4328
   Status                                            Message
0       0                     Yup i've finished c ü there...
1       0               Remember to ask alex about his pizza
2       0                       No da..today also i forgot..
3       0  Ola would get back to you maybe not today but ...
4       0  Fwiw the reason I'm only around when it's time...


## 3 - Split the data in training and testing examples
- The Spam values are replaced by 1 and Ham values by 0
- Then we arbitrarily choose to split our data on a 70-30 basis for our training purposes

In [6]:
# We choose to use 70% of the messages for training; the remainder is kept for testing
train_size = round(df_size*0.70)
test_size = df_size - train_size

# We create the train and test datasets with a sequential split (no shuffling):
# the first train_size rows are used for training, all following rows for testing
df_train = cleaned_messages_df[:train_size]
df_test = cleaned_messages_df[train_size:]

print("There are {} training examples.".format(train_size))
print("There are {} testing examples.".format(test_size))

There are 3500 training examples.
There are 1500 testing examples.


## 4 - Create a dictionary from the words in the training dataset

In [7]:
from collections import Counter

# Create dictionary
def make_Dictionary(dataset, max_words=3000):
    """Build the vocabulary of the most frequent words found in the messages.

    Every message is split on whitespace. A token is counted only when it is
    purely alphabetic and longer than one character, so numbers, tokens with
    punctuation attached and single letters never enter the vocabulary.

    Args:
        dataset: DataFrame with a 'Message' column holding the message text.
        max_words: Maximum number of vocabulary entries to keep. Defaults to
            3000.

    Returns:
        A list of (word, count) tuples ordered from the most to the least
        frequent word, containing at most ``max_words`` entries.
    """
    word_counts = Counter()
    for message in dataset['Message']:
        # Filter while counting instead of counting everything and deleting
        # the unwanted entries afterwards.
        word_counts.update(
            word for word in message.split()
            if len(word) > 1 and word.isalpha()
        )
    # Keep only the most common words
    return word_counts.most_common(max_words)

# Build the vocabulary from the training messages only and report its size
training_dict = make_Dictionary(df_train)
print("# of words : ", len(training_dict))

# of words :  3000


## 5 - Extract features from both the training data and test data.

For each message, this means checking, for every word, whether it appears in the dictionary. Each message is represented by an array of length 3000.

In [8]:
import numpy as np

def extract_features(dataset, dictionary, num_features=3000):
    """Encode every message as a vector of vocabulary word counts.

    Args:
        dataset: DataFrame with a 'Message' column holding the message text.
        dictionary: Iterable of (word, count) pairs; the position of a pair
            in the iterable is the feature column of its word. Words are
            expected to be unique.
        num_features: Number of feature columns in the result. Defaults to
            3000. Dictionary entries beyond this position are ignored.

    Returns:
        A numpy array of shape (len(dataset), num_features). Entry [i, j] is
        the number of times the j-th dictionary word occurs in the i-th
        message (rows follow the order of ``dataset``); all other entries
        are 0.
    """
    # Resolve each word to its column once, rather than scanning the whole
    # dictionary again for every token of every message.
    column_of = {
        entry[0]: column
        for column, entry in zip(range(num_features), dictionary)
    }
    features_matrix = np.zeros((len(dataset), num_features))
    # Rows are numbered by position, so a dataset slice whose index does not
    # start at 0 still fills the matrix from row 0.
    for row, message in enumerate(dataset['Message']):
        # Count each distinct word of the message a single time
        for word, occurrences in Counter(message.split()).items():
            column = column_of.get(word)
            if column is not None:
                features_matrix[row, column] = occurrences
    return features_matrix

# Both sets are encoded with the training vocabulary, so their feature columns line up
extract_feats_train = extract_features(df_train,training_dict)
extract_feats_test = extract_features(df_test,training_dict)
print("The shape of the feature dataframe for training is",extract_feats_train.shape)
print("The shape of the feature dataframe for testing is",extract_feats_test.shape)

The shape of the feature dataframe for training is (3500, 3000)
The shape of the feature dataframe for testing is (1500, 3000)


## 6 - Use the model to make predictions for the test data.

## 7 - Measure the spam-filtering performance through the confusion matrix from the Sklearn library.

In [15]:
# Done with sklearn for comparison
import os
import numpy as np
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix

from sklearn import metrics


# Training the Naive Bayes classifier; MultinomialNB is kept below as a commented-out alternative
# model2 = MultinomialNB()
# NOTE(review): BernoulliNB binarizes its input by default, so the word counts are
# presumably reduced to present/absent flags - confirm against the sklearn documentation
model2 = BernoulliNB()
# Fit on the training feature matrix and the matching 0/1 labels
model2.fit(extract_feats_train, df_train['Status'])
# Test the unseen messages for spam

results2 = model2.predict(extract_feats_test)
# Compare the true test labels (first argument) with the predictions (second argument)
print(confusion_matrix(df_test['Status'], results2))
print (metrics.classification_report(df_test['Status'], results2))

[[1293    1]
 [  39  167]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1294
           1       0.99      0.81      0.89       206

    accuracy                           0.97      1500
   macro avg       0.98      0.90      0.94      1500
weighted avg       0.97      0.97      0.97      1500

