In [1]:
# Description: This program detects if an email is spam (1) or not (0)

In [2]:
#Import libraries
import numpy as np 
import pandas as pd 
# import nltk
# from nltk.corpus import stopwords
# import string

# sklearn train test split
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

### Dataset from https://archive.ics.uci.edu/ml/datasets/sms+spam+collection

In [3]:
# This line reads dataset in its raw format and creates rows and columns from it.
df = pd.read_csv('spamcollection/SMSSpamCollection', header=None, sep='\t', names=['Label', 'SMS'])
print(df)

     Label                                                SMS
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham               Will ü b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   5572 non-null   object
 1   SMS     5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
df.groupby('Label').count()

Unnamed: 0_level_0,SMS
Label,Unnamed: 1_level_1
ham,4825
spam,747


## Data PreProcessing

In [6]:
df_clean = df.copy()

In [7]:
df_clean['SMS'] = df_clean['SMS'].str.replace('\W+', ' ').str.replace('\s+', ' ').str.strip()
df_clean['SMS'] = df_clean['SMS'].str.lower()
df_clean['SMS'] = df_clean['SMS'].str.split()
df_clean['SMS'].head()

  df_clean['SMS'] = df_clean['SMS'].str.replace('\W+', ' ').str.replace('\s+', ' ').str.strip()


0    [go, until, jurong, point, crazy, available, o...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, in, 2, a, wkly, comp, to, win, f...
3    [u, dun, say, so, early, hor, u, c, already, t...
4    [nah, i, don, t, think, he, goes, to, usf, he,...
Name: SMS, dtype: object

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(df_clean['SMS'], df_clean['Label'], test_size=0.3, random_state=5)
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(3900,)
(3900,)
(1672,)
(1672,)


## Prepare Volacbulary - List of all the words from the dataset

In [9]:
vocabulary = list(set(X_train.sum()))
vocabulary[11:20]

['anal',
 'logoff',
 'period',
 'lost',
 'bold',
 'epsilon',
 'build',
 'help',
 'aboutas']

In [10]:
len(vocabulary)

7235

## Calculate frequencies of the words for each message

In [11]:
word_count_per_sms = pd.DataFrame([
    [row.count(word) for word in vocabulary]
    for row in X_train], columns=vocabulary)

In [12]:
X_train = pd.concat([X_train.reset_index(), word_count_per_sms], axis=1).iloc[:,1:]

In [13]:
X_train.head()

Unnamed: 0,SMS,phone,advice,sale,spose,icicibank,finding,reason,urmom,sleepin,...,theirs,contract,forward,amount,faith,across,yards,summer,praps,ability
0,[nite],0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"[good, morning, plz, call, me, sir]",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"[i, am, late, i, will, be, there, at]",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"[your, opinion, about, me, 1, over, 2, jada, 3...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"[s, from, the, training, manual, it, show, the...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Calculate values for the Bayes Formulae

In [14]:
Y_train_df = pd.DataFrame(data=Y_train.values, columns=['Label'])

In [15]:
alpha = 1
Nvoc = len(X_train.columns)-2
Pspam = Y_train_df.value_counts()['spam'] / Y_train_df.shape[0]
Pham = Y_train_df.value_counts()['ham'] / Y_train_df.shape[0]
Nspam = X_train.loc[Y_train_df['Label'] == 'spam', 'SMS'].apply(len).sum()
Nham = X_train.loc[Y_train_df['Label'] == 'ham', 'SMS'].apply(len).sum()
print(Pspam, Pham, Nspam, Nham)

0.13435897435897437 0.8656410256410256 13346 49434


In [16]:
def p_w_spam(word):
    if word in X_train:
        return (X_train.loc[Y_train_df['Label']=='spam', word].sum() + alpha)/ (Nspam + alpha*Nvoc)
    else:
        return 1

In [17]:
def p_w_ham(word):
    if word in X_train:
        return (X_train.loc[Y_train_df['Label']=='ham', word].sum() + alpha)/ (Nham + alpha*Nvoc)
    else:
        return 1

## Preparing Classifier

In [18]:
def classify(message):
    p_spam_given_message = Pspam
    p_ham_given_message = Pham
    for word in message:
        p_spam_given_message *= p_w_spam(word)
        p_ham_given_message *= p_w_ham(word)
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_ham_given_message < p_spam_given_message:
        return 'spam'
    else:
        return 'needs human classification'

In [19]:
classify('secret')

'ham'

In [20]:
classify(['secret', 'source', 'of', 'infinite', 'power'])

'ham'

## Using test data

In [21]:
Y_test_df = pd.DataFrame(data=Y_test.values, columns=['Label'])
X_test_df = pd.DataFrame(data=X_test.values, columns=['SMS'])

X_test_df['predicted'] = X_test_df['SMS'].apply(classify)

In [22]:
correct = (X_test_df['predicted'] == Y_test_df['Label']).sum() / X_test_df.shape[0] * 100

In [23]:
print(f'Accuracy of the model is : {correct}%')

Accuracy of the model is : 99.04306220095694%


In [24]:
confusion_matrix(Y_test_df['Label'], X_test_df['predicted'])

array([[1445,    2,    2],
       [   0,    0,    0],
       [  12,    0,  211]])