In [1]:
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup

In [2]:
# nltk.download('punkt')
# nltk.download('stopwords')

In [3]:
data = pd.read_json('cleaned_data.json')
data.head()

Unnamed: 0,Message,Category,file_name
0,"Date: Wed, 21 Aug 2002 10:54:46 -05...",0,00001.7c53336b37003a9286aba55d2945844c
1,"Martin A posted:\n\nTassos Papadopoulos, the G...",0,00002.9c4069e25e1ef370c078db7ee85ff9ac
2,Man Threatens Explosion In Moscow \n\n\n\nThur...,0,00003.860e3c3cee1b42ead714c5c874fe25f7
3,Klez: The Virus That Won't Die\n\n \n\nAlready...,0,00004.864220c5b6930b209cc287c361c99af1
4,"> in adding cream to spaghetti carbonara, whi...",0,00005.bf27cdeaf0b8c4647ecd61b1d09da613


In [4]:
stop_words_set = set(stopwords.words('english'))

In [5]:
# function for removing stop words, non-alpha and stemming
def stem_and_clean(word_list,
                   stemmer = PorterStemmer(),
                   stop_words_set = set(stopwords.words('english'))
                  ):
    
    cleaned_words = []
    for word in word_list:
        if word not in stop_words_set and word.isalpha():
            stemmed_word = stemmer.stem(word)
            cleaned_words.append(stemmed_word)
    return cleaned_words

# function for removing html
def parse_html(content):
    soup = BeautifulSoup(content, 'html.parser')
    return soup.get_text()

message = ['nobody', 'expects', 'the', 'spanish', 'inquisition', '!', '<html>', '</body']

stem_and_clean(message, stop_words_set)

['nobodi', 'expect', 'spanish', 'inquisit']

In [8]:
# use this function to create multiple columns with different steps
def apply_nlp(data, stop_words_set):

    # first we parse the content to remove html
    data['Parsed_Message'] = data['Message'].apply(parse_html)

    # second we tokenize and lowercase
    data['tokenized'] = data['Parsed_Message'].apply(lambda row : word_tokenize(row.lower()))

    # third we remove stop words and non-alpha content and stem
    data['tokenized_cleaned'] = data['tokenized'].apply(lambda row: stem_and_clean(row, stop_words_set))
    
    return data

In [13]:
def clean_message(message, stop_words_set):
    
    # remove html
    sans_html = parse_html(message)
    
    # tokenize and lower
    tokenized = word_tokenize(sans_html.lower())
    
    # stem, remove stop words and non-alpha
    cleaned_list = stem_and_clean(tokenized, stop_words_set)
    
    return message

data['cleaned_message'] = data['Message'].apply(lambda row : clean_message(row, stop_words_set))

  soup = BeautifulSoup(content, 'html.parser')
  soup = BeautifulSoup(content, 'html.parser')


In [14]:
data.cleaned_message

0           Date:        Wed, 21 Aug 2002 10:54:46 -05...
1       Martin A posted:\n\nTassos Papadopoulos, the G...
2       Man Threatens Explosion In Moscow \n\n\n\nThur...
3       Klez: The Virus That Won't Die\n\n \n\nAlready...
4       >  in adding cream to spaghetti carbonara, whi...
                              ...                        
5791    <html>\n\n<head>\n\n<meta http-equiv="content-...
5792    This is a multi-part message in MIME format.\n...
5793    Dear Subscriber,\n\n\n\nIf I could show you a ...
5794    ****Mid-Summer Customer Appreciation SALE!****...
5795    ATTN:SIR/MADAN      \n\n\n\n                  ...
Name: cleaned_message, Length: 5796, dtype: object

In [10]:
# data = apply_nlp(data, stop_words_set)
# data.head()

  soup = BeautifulSoup(content, 'html.parser')
  soup = BeautifulSoup(content, 'html.parser')


Unnamed: 0,Message,Category,file_name,Parsed_Message,tokenized,tokenized_cleaned
0,"Date: Wed, 21 Aug 2002 10:54:46 -05...",0,00001.7c53336b37003a9286aba55d2945844c,"Date: Wed, 21 Aug 2002 10:54:46 -05...","[date, :, wed, ,, 21, aug, 2002, 10:54:46, -05...","[date, wed, aug, chri, garrigu, ca, reproduc, ..."
1,"Martin A posted:\n\nTassos Papadopoulos, the G...",0,00002.9c4069e25e1ef370c078db7ee85ff9ac,"Martin A posted:\n\nTassos Papadopoulos, the G...","[martin, a, posted, :, tassos, papadopoulos, ,...","[martin, post, tasso, papadopoulo, greek, scul..."
2,Man Threatens Explosion In Moscow \n\n\n\nThur...,0,00003.860e3c3cee1b42ead714c5c874fe25f7,Man Threatens Explosion In Moscow \n\n\n\nThur...,"[man, threatens, explosion, in, moscow, thursd...","[man, threaten, explos, moscow, thursday, augu..."
3,Klez: The Virus That Won't Die\n\n \n\nAlready...,0,00004.864220c5b6930b209cc287c361c99af1,Klez: The Virus That Won't Die\n\n \n\nAlready...,"[klez, :, the, virus, that, wo, n't, die, alre...","[klez, viru, wo, die, alreadi, prolif, viru, e..."
4,"> in adding cream to spaghetti carbonara, whi...",0,00005.bf27cdeaf0b8c4647ecd61b1d09da613,"> in adding cream to spaghetti carbonara, whi...","[>, in, adding, cream, to, spaghetti, carbonar...","[ad, cream, spaghetti, carbonara, effect, past..."
