In [76]:
import pandas as pd

sms_spam = pd.read_csv('d:/sms_spam.csv')

print(sms_spam.shape)
sms_spam.head()

(5574, 2)


Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [77]:
sms_spam['type'].value_counts(normalize=True)

ham     0.865985
spam    0.134015
Name: type, dtype: float64

In [78]:
# Randomize the dataset
data_randomized = sms_spam.sample(frac=1, random_state=1)


In [79]:
# Calculate index for split
training_test_index = round(len(data_randomized) * 0.8)

In [80]:
# Split into training and test sets
training_set = data_randomized[:training_test_index].reset_index(drop=True)
test_set = data_randomized[training_test_index:].reset_index(drop=True)

In [81]:
print(training_set.shape)
print(test_set.shape)

(4459, 2)
(1115, 2)


In [82]:
training_set['type'].value_counts(normalize=True)

ham     0.865216
spam    0.134784
Name: type, dtype: float64

In [83]:
test_set['type'].value_counts(normalize=True)

ham     0.869058
spam    0.130942
Name: type, dtype: float64

In [84]:
# Before cleaning
training_set.head(3)

Unnamed: 0,type,text
0,ham,Looks like u wil b getting a headstart im leav...
1,ham,"I noe la... U wana pei bf oso rite... K lor, o..."
2,ham,2mro i am not coming to gym machan. Goodnight.


In [85]:
##Data clensing
training_set['text'] = training_set['text'].str.replace(r'\W', ' ') # Removes punctuation
training_set['text'] = training_set['text'].str.lower()
training_set.head(3)

  training_set['text'] = training_set['text'].str.replace(r'\W', ' ') # Removes punctuation


Unnamed: 0,type,text
0,ham,looks like u wil b getting a headstart im leav...
1,ham,i noe la u wana pei bf oso rite k lor o...
2,ham,2mro i am not coming to gym machan goodnight


In [86]:
'''
create the vocabulary
We transform each message in thetext column into a list by splitting the string at the space character — we're using the Series.str.split() method.
We initiate an empty list named vocabulary.
We iterate over the transformed text column.
Using a nested loop, we iterate over each message in the text column and append each string (word) to the vocabulary list.
We transform the vocabulary list into a set using the set() function. This will remove the duplicates from the vocabulary list.
We transform the vocabulary set back into a list using the list() function.
'''

"\ncreate the vocabulary\nWe transform each message in thetext column into a list by splitting the string at the space character — we're using the Series.str.split() method.\nWe initiate an empty list named vocabulary.\nWe iterate over the transformed text column.\nUsing a nested loop, we iterate over each message in the text column and append each string (word) to the vocabulary list.\nWe transform the vocabulary list into a set using the set() function. This will remove the duplicates from the vocabulary list.\nWe transform the vocabulary set back into a list using the list() function.\n"

In [87]:

training_set['text'] = training_set['text'].str.split()

vocabulary = []
for text in training_set['text']:
   for word in text:
      vocabulary.append(word)

vocabulary = list(set(vocabulary))

In [88]:
len(vocabulary)

7802

In [89]:
vocabulary

['gower',
 'eldest',
 'luck',
 'watching',
 '083',
 'meetins',
 'took',
 'football',
 'iraq',
 'diesel',
 'peaceful',
 'frontierville',
 'gym',
 'contract',
 'nok',
 'b4utele',
 'swatch',
 'long',
 '47',
 'receipts',
 'sky',
 'more',
 'tears',
 'txting',
 'infections',
 'fold',
 'waqt',
 'gamb',
 'temp',
 '89555',
 'srt',
 'crab',
 'gentle',
 'shitin',
 'theacusations',
 'l8rs',
 'smiley',
 'chief',
 'keyword',
 'tones',
 'doors',
 'predictive',
 'press',
 'burger',
 '12',
 'backdoor',
 'wonder',
 'funny',
 'id',
 '523',
 'arrival',
 'nat',
 'tootsie',
 'banter',
 'cum',
 'boo',
 'partner',
 'base',
 'nr31',
 'april',
 'uawake',
 'map',
 'billed',
 'maturity',
 'difference',
 'advice',
 'comedy',
 'wins',
 'hard',
 'haiz',
 'swt',
 'theirs',
 'tenants',
 'matter',
 'darlings',
 'application',
 'caller',
 'oz',
 'muah',
 'shola',
 'big',
 '08706091795',
 'cashto',
 'timi',
 'delhi',
 'chez',
 'answered',
 'leona',
 'roads',
 'needle',
 'wrc',
 'entry',
 'child',
 '62468',
 'business',
 

In [90]:
word_counts_per_text = {'secret': [2,1,1],
                       'prize': [2,0,1],
                       'claim': [1,0,1],
                       'now': [1,0,1],
                       'coming': [0,1,0],
                       'to': [0,1,0],
                       'my': [0,1,0],
                       'party': [0,1,0],
                       'winner': [0,0,1]
                      }


In [91]:
word_counts_per_text

{'secret': [2, 1, 1],
 'prize': [2, 0, 1],
 'claim': [1, 0, 1],
 'now': [1, 0, 1],
 'coming': [0, 1, 0],
 'to': [0, 1, 0],
 'my': [0, 1, 0],
 'party': [0, 1, 0],
 'winner': [0, 0, 1]}

In [92]:
word_counts = pd.DataFrame(word_counts_per_text)
word_counts.head()

Unnamed: 0,secret,prize,claim,now,coming,to,my,party,winner
0,2,2,1,1,0,0,0,0,0
1,1,0,0,0,1,1,1,1,0
2,1,1,1,1,0,0,0,0,1


In [93]:
'''
To create the dictionary we need for our training set

We start by initializing a dictionary named word_counts_per_text, where each key is a
unique word (a string) from the vocabulary, and each value is a list of the length of the training set, 
where each element in that list is a 0.
The code [0] * 5 outputs [0, 0, 0, 0, 0]. So the code [0] * len(training_set['text']) outputs a list of the 
length of training_set['text'].
We loop over training_set['text'] using the enumerate() function to get both the index and the text message (index and text).
Using a nested loop, we loop over text (where text is a list of strings, where each string represents a word in a message).
We increment word_counts_per_text[word][index] by 1.

word_counts_per_text = {unique_word: [0] * len(training_set['text']) for unique_word in vocabulary}
'''

"\nTo create the dictionary we need for our training set\n\nWe start by initializing a dictionary named word_counts_per_text, where each key is a\nunique word (a string) from the vocabulary, and each value is a list of the length of the training set, \nwhere each element in that list is a 0.\nThe code [0] * 5 outputs [0, 0, 0, 0, 0]. So the code [0] * len(training_set['text']) outputs a list of the \nlength of training_set['text'].\nWe loop over training_set['text'] using the enumerate() function to get both the index and the text message (index and text).\nUsing a nested loop, we loop over text (where text is a list of strings, where each string represents a word in a message).\nWe increment word_counts_per_text[word][index] by 1.\n\nword_counts_per_text = {unique_word: [0] * len(training_set['text']) for unique_word in vocabulary}\n"

In [94]:
word_counts_per_sms = {unique_word: [0] * len(training_set['text']) for unique_word in vocabulary}

for index, sms in enumerate(training_set['text']):
   for word in sms:
      word_counts_per_sms[word][index] += 1

In [95]:
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()

Unnamed: 0,gower,eldest,luck,watching,083,meetins,took,football,iraq,diesel,...,cann,lotsof,creep,devouring,verify,others,facebook,workage,pool,unjalur
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [96]:
training_set_clean = pd.concat([training_set, word_counts], axis=1)
training_set_clean.head()

Unnamed: 0,type,text,gower,eldest,luck,watching,083,meetins,took,football,...,cann,lotsof,creep,devouring,verify,others,facebook,workage,pool,unjalur
0,ham,"[looks, like, u, wil, b, getting, a, headstart...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[i, noe, la, u, wana, pei, bf, oso, rite, k, l...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[2mro, i, am, not, coming, to, gym, machan, go...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,spam,"[todays, vodafone, numbers, ending, with, 4882...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[hi, hope, ur, day, good, back, from, walk, ta...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [100]:
# Isolating spam and ham messages first
spam_messages = training_set_clean[training_set_clean['type'] == 'spam']
ham_messages = training_set_clean[training_set_clean['type'] == 'ham']

  spam_messages = training_set_clean[training_set_clean['type'] == 'spam']


ValueError: cannot reindex on an axis with duplicate labels

In [99]:
sms_spam.describe

<bound method NDFrame.describe of       type                                               text
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5569  spam  This is the 2nd time we have tried 2 contact u...
5570   ham               Will ü b going to esplanade fr home?
5571   ham  Pity, * was in mood for that. So...any other s...
5572   ham  The guy did some bitching but I acted like i'd...
5573   ham                         Rofl. Its true to its name

[5574 rows x 2 columns]>