In [1]:
import pandas as pd
import tqdm
from datetime import datetime

## Loading

This dataset can be found on
https://www.kaggle.com/uciml/sms-spam-collection-dataset/downloads/sms-spam-collection-dataset.zip/1

In [2]:
raw_data = pd.read_csv('data.csv', encoding='latin-1')

raw_data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


## Preprocessing

In [3]:
# Creating the DataFrame
data = raw_data.copy()

# Selecting only the columns that will be used
data = data[['v1', 'v2']]

# Renaming the columns
data.columns = ['Class', 'Body']

# Lowering chars in every row of the Body column
data['Body'] = data['Body'].str.lower()

# Replacing latin-1 special characters with regular characters
data['Body'] = data['Body'].replace(regex={
    r'[áàâäã]': 'a', r'[éèêë]': 'e',
    r'[íìîï]': 'i', r'[óòôöõ]': 'o',
    r'[úùûü]': 'u', 'ç': 'c'
})

# Replacing special characters with a whitespace
# Note that this can cause a repetiton of whitespaces, that is going to be handled below
data['Body'] = data['Body'].replace(to_replace='[\n|\t|\r]', value=' ', regex=True)

# Replacing anything that is not a letter, a whitespace, a digit or any scape character
data['Body'] = data['Body'].replace(to_replace='[^a-z 0-9]', value='', regex=True)

# Handling multiple whitespaces problem
data['Body'] = data['Body'].replace(to_replace='( )+', value=' ', regex=True)

# Trimming whitespaces
data['Body'] = data['Body'].str.strip()

# Showing its shape to see the number of rows and first 5 rows
print(data.shape)
data.head()

(5572, 2)


Unnamed: 0,Class,Body
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...


## Creating the Naïve-Bayes model

In [4]:
class NaiveBayes:
    """ This is a class that defines the Naive-Bayes model.
  
    Attributes: 
        labels -- list that stores the possible labels for the model to predict
        labels_prob -- pandas DataFrame that will be used to store the overall probability of each label to happen
        words_labels_prob -- pandas DataFrame that contains the probability of a label happening given that a word is present
        vocabulary -- a set containing all of the known distinct words
    """
    
    def __init__(self):
        """ Constructor for the NaiveBayes Class.
        
        All of the attributes start empty so that they are populated upon calling the fit function.
        """
        
        self.labels = []
        
        self.labels_prob = pd.DataFrame(columns=['Label', 'Probability'])
        
        self.words_labels_prob = pd.DataFrame(columns=['Label', 'Word', 'Probability'])
        
        self.vocabulary = set()

    def fit(self, train_set, labels=['ham', 'spam']):
        """ Function that fit the model to given data. 

        Keyword arguments: 
            train_set -- data to be fitted on
            labels -- labels that can be found on the training set (default ['ham', 'spam']) 
        """
        # Overwriting the labels variable in case the user has changed from default
        self.labels = labels

        # Cleaning the class variables so that they will be clean to learn
        # If the instance has previously called this function they will have some data
        self.labels_prob = pd.DataFrame(columns=['Label', 'Probability'])
        self.words_labels_prob = pd.DataFrame(columns=['Label', 'Word', 'Probability'])

        # Cleaning and filling the vocabulary, list of all distinct words in the training set
        self.vocabulary = set()
        self.vocabulary = ' '.join(train_set['Body'].to_list())
        self.vocabulary = set(self.vocabulary.split())

        # Iterating through each label
        for label in labels:
            # Creating a list with all the emails wich are classified as the current label
            docs = train_set[train_set['Class'] == label].copy()
            docs = docs['Body'].to_list()

            # Calculating the probability of the answer being the current label
            # number of documents labeled as such / total number of documents
            prob_label = len(docs) / train_set.shape[0]
            
            # Appending it into the labels_prob DataFrame
            self.labels_prob = self.labels_prob.append({'Label': label, 'Probability': prob_label}, ignore_index=True)

            # Creating a list with all the words in the emails in docs(repetitions included)
            text = ' '.join(docs)
            text = text.split()

            # Storing the number of words in the text list
            words_in_text = len(text)

            # Calculating for each word in the vocabulary the probability of being the current label
            # (using vocabulary because it uses only distinct words, so it will run each word only once)
            for word in self.vocabulary:
                # Getting the number of times the current word shows up in the text
                word_occurrences = text.count(word)

                # Calculating the probability with the formula
                # (number of occurences of the word + 1) / (total number of words + total number of distinct words)
                # Standard Naive-Bayes formula using Laplacian smoothing 
                prob_word_label = (word_occurrences + 1) / (words_in_text + len(self.vocabulary))
                
                # Appending it into the words_labels_prob DataFrame
                self.words_labels_prob = self.words_labels_prob.append({
                    'Label': label, 'Word': word, 'Probability': prob_word_label
                }, ignore_index=True)
    
    def predict(self, test):
        """ Function that predicts a label for a given test subject. 

        Doesn't return the probability of the label (prediction[1]) prediction by choice.

        Keyword arguments: 
            test -- subject to be tested on.  

        Returns: 
            prediction[0] -- the prediction.
        """
        
        # Creating a list with all words in the test document
        # and filters it for known words
        words = test.split()
        words = [word for word in words if word in self.vocabulary]
        
        # Creating prediction variable, which will have the label with the highest probability
        # and the probability of the prediction
        prediction = ['Label', -1]
        
        # Calculating probability for each label
        for label in self.labels:
            # Fetching the probability for the current label in the DataFrame
            prob_label = self.labels_prob[self.labels_prob['Label'] == label].reset_index(drop=True)
            prob_label = prob_label['Probability'][0]
            
            # Fetching the probability of being the label for each word in the respective DataFrame
            for word in words:
                prob_word_label = self.words_labels_prob.query('Label == @label & Word == @word').reset_index(drop=True)
                prob_word_label = prob_word_label['Probability'][0]
                
                # Multiplyting the current probability by the current word_label probability
                # The final probability is the multiplication of all the probabilities of label given word and label overall
                prob_label *= prob_word_label
                
            # Replacing the label in the answer variable in case the probability of the current label is higher
            if prob_label > prediction[1]:
                prediction[0] = label
                prediction[1] = prob_label
                
        return prediction[0]

## Testing

In [5]:
# Joining (merging) both DataFrames (full and stop-word filtered) on theirs indexes
train_test_data = data.copy()

# Showing its shape and first rows to see if Body and Body_Filtered look the same
print(train_test_data.shape)
train_test_data.head()

(5572, 2)


Unnamed: 0,Class,Body
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...


In [6]:
# Storing the number of rows in the dataset
data_len = train_test_data.shape[0]

# Creating a train set, it has all of the rows in the dataset
# The sample method gets a number of rows randomly sorted (sample method), in this case, all rows
train = train_test_data.sample(data_len).reset_index(drop=True)

# Setting the size of the train set
train_size = 2/3

test = train[train.index >= data_len * train_size].reset_index(drop=True)
train = train[train.index < data_len * train_size]

# Printing the start time
print(str(datetime.now()) + ': Starting.')

# Creating the model instance
naivebayes = NaiveBayes()

# Fitting the model to the training data
naivebayes.fit(train)

# Printing the time at the end of the training
print(str(datetime.now()) + ': End of Training.')

# Creating the list that will store the predictions
predictions = []

# Predicting and storing the predictions for each subject of the test set
for subject in tqdm.tqdm(test['Body']):
    predictions.append(naivebayes.predict(subject))

# Printing the end time
print(str(datetime.now()) + ': Finished.')

# Creating a compare DataFrame as a copy of the test one so it will remain intact
compare = test.copy()

# Adding predictions columns to the compare DataFrame
compare['Prediction'] = predictions

# Printing the number of correct answers and the number of tests
print('Correct:', str(compare[compare['Class'] == compare['Prediction']].shape[0]), 'out of', str(test.shape[0]))

2019-11-07 21:11:45.164377: Starting.
2019-11-07 21:12:18.932912: End of Training.


100%|██████████████████████████████████████████████████████████████████████████████| 1857/1857 [04:05<00:00,  7.94it/s]


2019-11-07 21:16:24.626165: Finished.
Correct: 1827 out of 1857
