# Text Processing Using NLTK Libraries

In [1]:
import pandas as pd

In [2]:
# Read only the two columns this notebook works with: the raw email text
# and its spam label.
data = pd.read_csv(
    'Resources/emails.csv',
    usecols=['text', 'spam'],
)
data.head()

Unnamed: 0,text,spam
0,Subject: naturally it's your irresistible your...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


## Rename Column


In [3]:
# Map the original CSV column names to the descriptive names used from here on.
maps = {'text': "Email Content", 'spam': "Spam Messages"}
# Rebind the renamed frame instead of using inplace=True: rename() returns a
# new DataFrame, and keeping the assignment explicit makes the cell's effect
# on `data` visible to the reader.
data = data.rename(columns=maps)
data.head()

Unnamed: 0,Email Content,Spam Messages
0,Subject: naturally it's your irresistible your...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


## Expand Contractions


In [4]:
import contractions

In [5]:
# Demonstrate contraction expansion on the first email.
text = data['Email Content'][0]
print("Original Text: ", text)
# contractions.fix() takes a whole string, so expand the email in a single
# call rather than token by token; the output keeps the original spacing and
# ends with a normal newline instead of a trailing space.
print("Expanded Text: ", contractions.fix(text))
# NOTE(review): the expanded text is only printed here; `data` itself is not
# modified by this cell.

Original Text:  Subject: naturally it's your irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we don't promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you'll see logo drafts within three business days . aff

## Lower Case


In [6]:
# Store a lower-cased version of each email in a separate `text` column;
# the `Email Content` column is only read, never overwritten.
data['text'] = data.loc[:, 'Email Content'].str.lower()
# Show the first email (row label 0) as a sanity check.
data.loc[0, 'text']

"subject: naturally it's your irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we don't promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you'll see logo drafts within three business days . affordability : yo

## Remove punctuation


In [7]:
import re

In [8]:
# Delete every character that is neither a word character nor whitespace.
# `\w` also matches the underscore, so "_" is listed explicitly; with the
# character class alone, underscores would pass through this step untouched.
punctuation_pattern = re.compile(r'[^\w\s]|_')
data['text'] = data['text'].apply(lambda x: punctuation_pattern.sub('', x))
# Inspect the email at row label 3 to check the result.
data['text'][3]

'subject 4 color printing special  request additional information now  click here  click here for a printable version of our order form  pdf format   phone   626  338  8090 fax   626  338  8102 e  mail  ramsey  goldengraphix  com  request additional information now  click here  click here for a printable version of our order form  pdf format   golden graphix  printing 5110 azusa canyon rd  irwindale  ca 91706 this e  mail message is an advertisement and  or solicitation of games54 and game5s4  '

## Remove digits and words containing digits


In [9]:
# A token is removed when it contains at least one digit: the pattern matches
# the digit together with any word characters attached on either side.
digit_token_pattern = re.compile(r'\w*\d\w*')
data['text'] = data['text'].apply(
    lambda email: digit_token_pattern.sub('', email)
)
# Inspect the email at row label 3 to check the result.
data.loc[3, 'text']

'subject  color printing special  request additional information now  click here  click here for a printable version of our order form  pdf format   phone        fax        e  mail  ramsey  goldengraphix  com  request additional information now  click here  click here for a printable version of our order form  pdf format   golden graphix  printing  azusa canyon rd  irwindale  ca  this e  mail message is an advertisement and  or solicitation of  and   '

## Remove stop words and specified words

In [10]:
import nltk
from nltk.corpus import stopwords

# Fetch the stop-word corpus that `stopwords.words(...)` reads; nltk reports
# the package as up-to-date when it is already installed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/anant/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# NLTK's English stop words plus "subject", the token that opens each email
# in the previews above. Kept as a set for fast membership checks.
stop_words = {*stopwords.words('english'), 'subject'}
def remove_stopwords(x):
    """Return `x` with every token found in `stop_words` dropped.

    The input is converted with ``str()``, split on whitespace, filtered
    against the module-level ``stop_words`` collection, and the surviving
    tokens are joined back together with single spaces.
    """
    kept_tokens = (token for token in str(x).split() if token not in stop_words)
    return " ".join(kept_tokens)

# Filter stop words out of every email; the function is passed directly
# because it already takes a single value.
data['text'] = data['text'].apply(remove_stopwords)
# Inspect the email at row label 3 to check the result.
data.loc[3, 'text']


'color printing special request additional information click click printable version order form pdf format phone fax e mail ramsey goldengraphix com request additional information click click printable version order form pdf format golden graphix printing azusa canyon rd irwindale ca e mail message advertisement solicitation'