# Spam Filter

In this project will be build a spam filter using a dataset with 5572, data analisys and machine learning.

In [1]:
# imports
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# reading dataset
SMSSpam = pd.read_csv('SMSSpamCollection', sep = '\t', header = None, names = ['Label', 'SMS'])

In [3]:
SMSSpam.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
SMSSpam.Label.value_counts(normalize = True)

ham     0.865937
spam    0.134063
Name: Label, dtype: float64

In [5]:
train, test = train_test_split(SMSSpam.copy(), test_size = 0.2, random_state = 1)

In [6]:
train.Label.value_counts(normalize = True)

ham     0.86538
spam    0.13462
Name: Label, dtype: float64

In [7]:
len(train)

4457

percentage of ham and spam is equal to the complete dataset on the dataset to test

# Cleaning Dataset

In [8]:
train['SMS'] = train.SMS.str.replace('\W', ' ').str.lower().str.split()

In [9]:
# creating vocabulary list
vocabulary = list(set(train.SMS.sum()))

In [10]:
word_counts_per_sms = {unique_word: [0]*len(train.SMS) for unique_word in vocabulary}

for index, sms in enumerate(train.SMS):
    for word in sms:
        word_counts_per_sms[word][index] += 1

In [11]:
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()

Unnamed: 0,0,00,000,008704050406,0121,01223585236,01223585334,0125698789,02,0207,...,zindgi,zoe,zoom,zouk,zyada,èn,é,ü,〨ud,鈥
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
train_clean = pd.concat([train.reset_index(drop = True), word_counts], axis = 1)
train_clean.head()

Unnamed: 0,Label,SMS,0,00,000,008704050406,0121,01223585236,01223585334,0125698789,...,zindgi,zoe,zoom,zouk,zyada,èn,é,ü,〨ud,鈥
0,ham,"[hi, where, are, you, we, re, at, and, they, r...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[if, you, r, home, then, come, down, within, 5...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[when, re, you, guys, getting, back, g, said, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,"[tell, my, bad, character, which, u, dnt, lik,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, m, leaving, my, house, now]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Calculing constants

In [13]:
# probability of spam or ham
p_spam = train_clean.Label.value_counts(normalize = True).spam
p_ham = 1 - p_spam

# n spam or ham
n_spam = train_clean.Label.value_counts().spam
n_ham = train_clean.Label.value_counts().ham

# vocabulary
n_vocabulary = len(vocabulary)

alpha = 1

# Probabilities of the word be a spam or a ham

In [14]:
p_wham = dict()
p_wspam = dict()

for word in vocabulary:
    p_wham[word] = (train_clean.loc[train_clean.Label == 'ham', word].sum() + alpha)/(n_ham + alpha*n_vocabulary)
    p_wspam[word] = (train_clean.loc[train_clean.Label == 'spam', word].sum() + alpha)/(n_spam + alpha*n_vocabulary)    

# probability of the message be a spam

In [28]:
import re

def classify(message):

    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()

    p_spam_given_message = p_spam
    p_ham_given_message =  p_ham
    
    for word in message:
        if word in vocabulary:
            p_spam_given_message *= p_wspam[word]
            p_ham_given_message *= p_wham[word]

    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_ham_given_message < p_spam_given_message:
        return 'spam'
    else:
        return 'Equal proabilities, have a human classify this!'

In [29]:
test['predicted'] = test.SMS.apply(classify)

In [38]:
from sklearn.metrics import classification_report, confusion_matrix


print(classification_report(test.Label, test.predicted))

             precision    recall  f1-score   support

        ham       0.97      1.00      0.98       968
       spam       1.00      0.76      0.86       147

avg / total       0.97      0.97      0.97      1115



In [39]:
print(confusion_matrix(test.Label, test.predicted))

[[968   0]
 [ 35 112]]


All the mistakes was classifying a spam as a ham. That is good because if this filter be applied the probability that the message be filtered considering spam is very low. But 24% of the spam wasn't classified as spam