## Importing csv and Reading Raw Data

In [8]:
import pandas as pd
import re
import string
import nltk

pd.set_option('display.max_colwidth', 100)

stopword = nltk.corpus.stopwords.words('english')

df = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
df.columns = ['label', 'email']

df.head()

Unnamed: 0,label,email
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
1,ham,"Nah I don't think he goes to usf, he lives around here though"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...


## Cleaning Dataframe E-mail(text)

In [9]:
def clean_text(text):
    ## Removing punctuation like ' !"#$%&\'()*+, '
    text = "".join([word for word in text if word not in string.punctuation])
    
    ## Tokenizing the email text
    tokens = re.split('\W+', text)
    
    ## Removing Stopwords like 'he,him,is,a,for,the,as,but,do etc'
    text = [word for word in tokens if word not in stopword]
    return text

## Transforming the dataframe using Lambda fucntion
df['email_clean'] = df['email'].apply(lambda x: clean_text(x.lower()))

df.head()

Unnamed: 0,label,email,email_clean
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
1,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aids, patent]"
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]"
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,"[per, request, melle, melle, oru, minnaminunginte, nurungu, vettam, set, callertune, callers, pr..."
