In [1]:
import nltk
import pandas as pd
import numpy as np


In [2]:
fullCorpus =pd.read_csv('SMSSpamCollection.tsv',sep='\t',header=None)

In [3]:
fullCorpus.columns = ['label','body_text']

In [4]:
fullCorpus

Unnamed: 0,label,body_text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
# What is the shape of the dataset?
print ('Input file has {} rows and {} columns'.format(len(fullCorpus) , len(fullCorpus.columns)))

Input file has 5572 rows and 2 columns


In [6]:
# How many spam and ham are there?
print('Out of {} rows, there are {} spams and {} hams'.format( len(fullCorpus),
                                                            len(fullCorpus[fullCorpus['label']=='spam']),
                                                            len(fullCorpus[fullCorpus['label']=='ham'])))

Out of 5572 rows, there are 747 spams and 4825 hams


In [7]:
# how much missing data?
print('Number of missing rows in labels {}'.format (fullCorpus['label'].isnull().sum()))
print('Number of missing rows in texts {}'.format (fullCorpus['body_text'].isnull().sum()))

Number of missing rows in labels 0
Number of missing rows in texts 0


## Remove Punctuation

In [8]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
def remove_punct(text):
    text_nonpunct= "".join ([ char for char in text if char not in string.punctuation ])
    return text_nonpunct

fullCorpus['body_text_nonpunct']= fullCorpus['body_text'].apply(lambda x: remove_punct(x))

In [10]:
fullCorpus.head()

Unnamed: 0,label,body_text,body_text_nonpunct
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...


## Remove Tokenization

In [11]:
import re

def tokenize(text):
    tokens= re.split('\W+',text)
    return tokens

fullCorpus['body_text_notokens']= fullCorpus['body_text_nonpunct'].apply(lambda x: tokenize(x.lower()))

In [12]:
fullCorpus.tail()

Unnamed: 0,label,body_text,body_text_nonpunct,body_text_notokens
5567,spam,This is the 2nd time we have tried 2 contact u...,This is the 2nd time we have tried 2 contact u...,"[this, is, the, 2nd, time, we, have, tried, 2,..."
5568,ham,Will ü b going to esplanade fr home?,Will ü b going to esplanade fr home,"[will, ü, b, going, to, esplanade, fr, home]"
5569,ham,"Pity, * was in mood for that. So...any other s...",Pity was in mood for that Soany other suggest...,"[pity, was, in, mood, for, that, soany, other,..."
5570,ham,The guy did some bitching but I acted like i'd...,The guy did some bitching but I acted like id ...,"[the, guy, did, some, bitching, but, i, acted,..."
5571,ham,Rofl. Its true to its name,Rofl Its true to its name,"[rofl, its, true, to, its, name]"


## Remove Stopwords

In [13]:
stopword= nltk.corpus.stopwords.words('english')

In [14]:
def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopword]
    return text

fullCorpus['body_text_nostop']= fullCorpus['body_text_notokens'].apply(lambda x: remove_stopwords(x))

In [15]:
fullCorpus.head()

Unnamed: 0,label,body_text,body_text_nonpunct,body_text_notokens,body_text_nostop
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, think, goes, usf, lives, around, t..."


## Cleaning the text

In [16]:
ps= nltk.PorterStemmer()

In [17]:
def clean_text(text):
    text= "".join ([ char for char in text if char not in string.punctuation ])
    tokens= re.split('\W+',text)
    text = [ps.stem(word.lower()) for word in tokens if word not in stopword]
    return text

fullCorpus['Clean_text']=fullCorpus['body_text'].apply(lambda x: clean_text(x) )

In [18]:
fullCorpus.head()

Unnamed: 0,label,body_text,body_text_nonpunct,body_text_notokens,body_text_nostop,Clean_text
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazi, avail, bugi, n, gre..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, think, goes, usf, lives, around, t...","[nah, i, dont, think, goe, usf, live, around, ..."


In [21]:
fullCorpus= fullCorpus.drop(['body_text_nonpunct','body_text_notokens','body_text_nostop'],axis=1)

In [22]:
fullCorpus.head()

Unnamed: 0,label,body_text,Clean_text
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazi, avail, bugi, n, gre..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entri, 2, wkli, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, i, dont, think, goe, usf, live, around, ..."
