In [42]:
import csv
import string
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import WordNetLemmatizer

# Let DataFrame previews show up to 250 characters per cell before truncating.
pd.set_option("display.max_colwidth", 250)

# The file is read as tab-separated with no header row, so the two columns
# are named right after loading.
dataset = pd.read_csv("SMSSpamCollection", sep="\t", header=None)
dataset.columns = ["labels", "sms_msgs"]

# Per-label row counts. groupby sorts its keys by default, so positions 0 and 1
# follow sorted label order (presumably "ham" then "spam" — confirm against the data).
col1 = dataset.groupby("labels")
label_counts = col1.count().values
# Each row of label_counts holds one count per non-key column, so `ratio`
# is an array rather than a bare scalar.
ratio = label_counts[0] / label_counts[1]

def remove_punctuation(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed.

    Nothing is inserted where a character is dropped, so words separated only
    by punctuation end up fused together (``"a,b"`` becomes ``"ab"``).
    """
    kept_chars = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept_chars.append(ch)
    return "".join(kept_chars)

In [43]:
# Strip punctuation from the raw text first, then lower-case and tokenize
# the cleaned text.
dataset["clean_msg"] = dataset["sms_msgs"].apply(remove_punctuation)
dataset["tokenized_msg"] = dataset["clean_msg"].apply(
    lambda cleaned: word_tokenize(cleaned.lower())
)

# removing stopwords
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # stopwords.words() re-reads the corpus on each call, so keep the result.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def remove_sw(msg, stop_words=None):
    """Return the tokens of ``msg`` that are not stop words, in their original order.

    Parameters
    ----------
    msg : iterable of str
        Tokens of one message.
    stop_words : iterable of str, optional
        Words to drop. When omitted, NLTK's English stop-word list is used
        (loaded once and reused across calls).

    Returns
    -------
    list of str
        A new list; ``msg`` itself is not modified.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    else:
        stop_words = frozenset(stop_words)
    # Set membership replaces a fresh corpus load plus list scan per token.
    return [m for m in msg if m not in stop_words]
# Drop stop words from each tokenized message.
dataset["without_sw"] = dataset["tokenized_msg"].apply(remove_sw)

In [46]:
# Single Porter stemmer instance, used by stem_msg below.
ps = PorterStemmer()

def stem_msg(msg):
    """Return a new list holding ``ps.stem(token)`` for each token in ``msg``.

    Order and length match the input; ``msg`` itself is left untouched.
    """
    return [ps.stem(token) for token in msg]

# Stem the stop-word-free tokens of every message.
dataset["stemmed_msg"] = dataset["without_sw"].apply(stem_msg)

In [47]:
# Single WordNet lemmatizer instance, used by lemmatize_msg below.
lemmatizer = WordNetLemmatizer()
def lemmatize_msg(sent=None):
    """Return a new list with ``lemmatizer.lemmatize`` applied to each token.

    ``sent`` is an iterable of tokens; ``None`` (the default) is treated as an
    empty message and yields ``[]``.

    NOTE(review): ``lemmatize`` is called without a ``pos`` argument, so NLTK's
    default part of speech applies (noun, per the NLTK docs). That is consistent
    with the preview below, where "lives" becomes "life" — confirm this is intended.
    """
    tokens = [] if sent is None else sent
    return [lemmatizer.lemmatize(tok) for tok in tokens]

# Lemmatize the stop-word-free tokens (the same input the stemmer receives),
# then preview the first rows with every derived column.
dataset["lemmatized_msg"] = dataset["without_sw"].apply(lemmatize_msg)
dataset.head()

Unnamed: 0,labels,sms_msgs,clean_msg,tokenized_msg,without_sw,stemmed_msg,lemmatized_msg
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amore wat,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, cine, there, got, amore, wat]","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]","[go, jurong, point, crazi, avail, bugi, n, great, world, la, e, buffet, cine, got, amor, wat]","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to, 87121, to, receive, entry, questionstd, txt, ratetcs, apply, 08452810075over18s]","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receive, entry, questionstd, txt, ratetcs, apply, 08452810075over18s]","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv, entri, questionstd, txt, ratetc, appli, 08452810075over18]","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receive, entry, questionstd, txt, ratetcs, apply, 08452810075over18s]"
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]","[nah, dont, think, go, usf, life, around, though]"
