# Filter SMS spam with Naive Bayes

# Import dataset

In [68]:
import pandas as pd
# The file is read as tab-separated text; because it is loaded with explicit
# column names, the first column is called 'Label' and the second 'SMS'.
sms = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=['Label', 'SMS'],
)

# preview the first rows
sms.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Randomize dataset

In [69]:
# Shuffle the rows: drawing as many rows as the frame holds (without
# replacement) yields a permutation of the whole dataset. The fixed
# random_state makes that permutation reproducible.
# NOTE: this rebinds `sms`, so re-running the cell shuffles the already
# shuffled frame again; re-run the loading cell first for a clean state.
sms = sms.sample(n=len(sms), random_state=1)

# Split the dataset into train and test sets

In [95]:
# Fraction of the rows assigned to the train set; the rest becomes the test set.
TRAIN_FRACTION = 0.8

# Compute the boundary once so both slices are guaranteed to meet at the same row.
split_at = round(sms.shape[0] * TRAIN_FRACTION)

sms_train = sms.iloc[:split_at].copy()  # first 80% of the rows -> train set
sms_test = sms.iloc[split_at:].copy()   # last 20% of the rows -> test set

# Compute the share of spam in the train, test and full datasets

In [96]:
def _spam_rows(messages):
    """Return a copy of the rows of `messages` whose 'Label' equals 'spam'."""
    return messages[messages['Label'] == 'spam'].copy()


# The printed values are fractions in [0, 1] (spam rows / all rows), not
# values scaled to 100.

# share of spam in the train dataset
sms_train_spam = _spam_rows(sms_train)
sms_train_percent = sms_train_spam.shape[0] / sms_train.shape[0]
print(sms_train_percent)

# share of spam in the test dataset
sms_test_spam = _spam_rows(sms_test)
sms_test_percent = sms_test_spam.shape[0] / sms_test.shape[0]
print(sms_test_percent)

# share of spam in the full dataset
sms_spam = _spam_rows(sms)
sms_percent = sms_spam.shape[0] / sms.shape[0]
print(sms_percent)

0.13458950201884254
0.1319569120287253
0.13406317300789664


As we can see, the shares of spam in the train, test and full datasets are almost the same.

# Clean the dataset

In [129]:
def _clean_sms(messages):
    """Return a cleaned copy of `messages`; the input frame is left untouched.

    In the 'SMS' column every character matched by the regular expression
    ``\\W`` (anything that is not a letter, a digit or an underscore) is
    replaced by a single space, and the text is then lower-cased.
    """
    cleaned = messages.copy()
    # regex=True is passed explicitly: without it, pandas versions whose
    # default is regex=False would look for the literal two-character text
    # "\W" and leave the messages unchanged. The raw string avoids the
    # invalid-escape warning that '\W' triggers in a normal string literal.
    cleaned['SMS'] = cleaned['SMS'].str.replace(r'\W', ' ', regex=True).str.lower()
    return cleaned


# apply the same cleaning to the full, train and test datasets
sms_clean = _clean_sms(sms)
sms_train_clean = _clean_sms(sms_train)
sms_test_clean = _clean_sms(sms_test)

# Build the vocabulary

In [135]:
# Split every message of the full dataset into a list of tokens on runs of
# whitespace (raw string: '\s' is not a valid escape in a normal literal).
sms_col_list = sms_clean['SMS'].str.split(r'\s+')

# Collect each distinct token once.
unique_words = {word for row in sms_col_list for word in row}

# Splitting can produce empty tokens (e.g. when a message starts or ends with
# whitespace); discard() drops the empty string if present and, unlike
# list.remove(), does not raise when there is none.
unique_words.discard('')

# Sorting gives the vocabulary a stable order across kernel restarts; the
# iteration order of a set of strings is not guaranteed to be the same from
# one run to the next.
vocabulary = sorted(unique_words)

In [137]:
# number of entries in the vocabulary list
len(vocabulary)

8753