In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB  
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix  
import matplotlib.pyplot as plt
import regex as re

In [2]:
data = pd.read_csv("sms_spam.csv")
data

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5569,spam,This is the 2nd time we have tried 2 contact u...
5570,ham,Will ü b going to esplanade fr home?
5571,ham,"Pity, * was in mood for that. So...any other s..."
5572,ham,The guy did some bitching but I acted like i'd...


In [3]:
data['type'].value_counts(normalize=True)

type
ham     0.865985
spam    0.134015
Name: proportion, dtype: float64

In [4]:
data_randomized = data.sample(random_state=1,frac=1)
data_randomized

Unnamed: 0,type,text
1447,ham,Looks like u wil b getting a headstart im leav...
2032,ham,"I noe la... U wana pei bf oso rite... K lor, o..."
4432,ham,2mro i am not coming to gym machan. Goodnight.
4888,spam,Todays Vodafone numbers ending with 4882 are s...
5276,ham,"Hi. Hope ur day * good! Back from walk, table ..."
...,...,...
905,ham,"We're all getting worried over here, derek and..."
5192,spam,Our records indicate u maybe entitled to 5000 ...
3980,ham,CERI U REBEL! SWEET DREAMZ ME LITTLE BUDDY!! C...
235,spam,Text & meet someone sexy today. U can find a d...


In [5]:
'''
Here we can't use train_test_split as the data is not processed. There are white spaces, punctuation
marks and so on. We need a bag of words to work on.
'''
training_test_index = round(len(data_randomized)*0.8)
training_set = data_randomized[:training_test_index].reset_index(drop=True)
test_set = data_randomized[training_test_index:].reset_index(drop=True)
training_set.shape, test_set.shape

((4459, 2), (1115, 2))

In [6]:
training_set.head()

Unnamed: 0,type,text
0,ham,Looks like u wil b getting a headstart im leav...
1,ham,"I noe la... U wana pei bf oso rite... K lor, o..."
2,ham,2mro i am not coming to gym machan. Goodnight.
3,spam,Todays Vodafone numbers ending with 4882 are s...
4,ham,"Hi. Hope ur day * good! Back from walk, table ..."


In [7]:
# df["new_column"] = df['review'].str.replace('[^\w\s]','')
# x = training_set['text'][1447]
# with open('a.txt', 'w') as f:
#     print(x, file=f)

In [12]:
training_set['text'] = training_set['text'].str.lower()
training_set['text'] = training_set['text'].str.replace('\W', '')# Removes punctuation
training_set['text'] = training_set['text'].str.replace(r'[^\w\s]+', '')
training_set['text'][1]

'i noe la... u wana pei bf oso rite... k lor, other days den...'

In [9]:
vocabulary = set()
for text in training_set['text']:
   for word in text:
      vocabulary.add(word)
vocabulary = list(vocabulary)
len(vocabulary)

85

In [10]:
word_counts_per_text = {'secret': [2,1,1],
                       'prize': [2,0,1],
                       'claim': [1,0,1],
                       'now': [1,0,1],
                       'coming': [0,1,0],
                       'to': [0,1,0],
                       'my': [0,1,0],
                       'party': [0,1,0],
                       'winner': [0,0,1]
                      }
word_counts = pd.DataFrame(word_counts_per_text)
word_counts.head()

Unnamed: 0,secret,prize,claim,now,coming,to,my,party,winner
0,2,2,1,1,0,0,0,0,0
1,1,0,0,0,1,1,1,1,0
2,1,1,1,1,0,0,0,0,1


In [11]:
word_counts_per_sms = {unique_word: [0] * len(training_set['text']) for unique_word in vocabulary}

for index, sms in enumerate(training_set['text']):
   for word in sms:
      word_counts_per_sms[word][index] += 1
    
word_counts_per_sms.keys()

dict_keys(['\x93', 'é', 'y', '#', '鈥', '?', '<', 'x', ';', '*', 'u', '\x96', '=', '_', ' ', 'q', '7', 'ü', 'i', 'p', '…', ')', '‘', ',', '3', '\x94', 's', '’', 'h', 'j', '5', '“', 'v', '(', 'w', '9', 'f', 'n', '~', '1', ':', '4', 'l', '"', '!', ']', 'ú', 'c', '$', 'a', '¡', 'g', '£', '-', '—', 'b', '[', '2', '\\', 'r', '%', '┾', '|', '+', '^', '/', '@', 'e', '8', '〨', 't', '0', '>', 'd', '.', 'k', 'z', 'ì', 'o', '»', '6', '–', "'", 'm', '&'])

1. Remove punctuations
2. Create vocabulary
3. Do train test split
4. Feed to NB classifier
5. Share the accuracy