In [24]:
import pandas as pd
import nltk
import string

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize

In [25]:
# Load the raw dataset from the CSV that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer='uw.csv')

In [26]:
# Read the whole word-list file as one UTF-8 string.
# The context manager closes the handle as soon as the text is in memory;
# a bare open() would leave it open for the life of the kernel.
with open('english3.txt', 'r', encoding="utf8") as fileref:
    all_words = fileref.read()

In [27]:
# Turn the raw word-list text into a list of tokens.
# The cell rebinds `all_words` from str to list, so it is guarded: re-running
# it after the first pass would otherwise hand a list to word_tokenize.
if isinstance(all_words, str):
    all_words = word_tokenize(all_words)

In [5]:
def remove_html(text):
    """Parse `text` with BeautifulSoup's 'lxml' parser and return its text content."""
    return BeautifulSoup(text, 'lxml').get_text()

In [6]:
def remove_punctuation(text):
    """Return `text` without the characters in string.punctuation or string.digits."""
    kept = []
    for ch in text:
        # Drop the character if either constant lists it.
        if ch in string.punctuation or ch in string.digits:
            continue
        kept.append(ch)
    return "".join(kept)

In [7]:
def remove_stopwords(text):
    """Return the tokens of `text` that are not NLTK English stopwords.

    Args:
        text: iterable of word tokens (hashable, i.e. strings).

    Returns:
        list: the tokens not present in stopwords.words('english'),
        in their original order.
    """
    # Build the lookup once per call. Evaluating stopwords.words('english')
    # inside the comprehension condition repeated the call for every token,
    # followed by a linear scan of the resulting list.
    english_stopwords = frozenset(stopwords.words('english'))
    return [w for w in text if w not in english_stopwords]

In [8]:
# Snapshot of the tokenized word list as a frozenset, built once when this cell
# runs, so each membership test is a hash lookup instead of a scan of the
# `all_words` list. Re-run this cell if `all_words` is rebuilt.
_english_vocabulary = frozenset(all_words)


def remove_not_english(text):
    """Return the tokens of `text` that appear in the loaded word list.

    Args:
        text: iterable of word tokens (hashable, i.e. strings).

    Returns:
        list: the tokens found in `all_words`, in their original order.
    """
    eng_only = [w for w in text if w in _english_vocabulary]
    return eng_only

In [9]:
lemmatizer = WordNetLemmatizer()


def word_lemmatizer(text):
    """Return a list with lemmatizer.lemmatize applied to each token of `text`."""
    return list(map(lemmatizer.lemmatize, text))

In [10]:
stemmer = PorterStemmer()


def word_stemmer(text):
    """Stem each token of `text` and return the stems joined by single spaces."""
    stems = (stemmer.stem(token) for token in text)
    return " ".join(stems)

In [11]:
# Strip HTML first, then punctuation and digits.
# remove_punctuation deletes '<', '>', '&' and ';' (all in string.punctuation),
# so running it on the raw text would destroy the markup before remove_html
# could parse it, leaving tag and entity names behind as ordinary words.
df['clean'] = df['Input'].apply(lambda x: remove_punctuation(remove_html(x)))
df['clean'].head()

0                      screams in  different languages
1    Families to sue over Legionnaires More than  f...
2    Pandemonium In Aba As Woman Delivers Baby With...
3    My emotions are a train wreck My body is a tra...
4    Alton brown just did a livestream and he burne...
Name: clean, dtype: object

In [12]:
# Pass every entry through remove_html and keep only the parsed text content.
df['clean'] = df['clean'].apply(remove_html)
#df['Input'].head()

In [13]:
# Tokenizer whose tokens are the matches of the regex \w+.
tokenizer = RegexpTokenizer(pattern=r'\w+')

In [14]:
# Lower-case each entry, then split it into tokens with `tokenizer`.
lowercase_tokens = df['clean'].apply(
    lambda entry: tokenizer.tokenize(entry.lower())
)
df['clean'] = lowercase_tokens
df['clean']

0                     [screams, in, different, languages]
1       [families, to, sue, over, legionnaires, more, ...
2       [pandemonium, in, aba, as, woman, delivers, ba...
3       [my, emotions, are, a, train, wreck, my, body,...
4       [alton, brown, just, did, a, livestream, and, ...
                              ...                        
1859    [trollkrattos, juan, carlos, salvador, the, se...
1860    [devonbreneman, hopefully, it, doesnt, electro...
1861    [businesses, are, deluged, with, invokces, mak...
1862    [breaking, police, officers, arrested, for, ab...
1863    [news, refugio, oil, spill, may, have, been, c...
Name: clean, Length: 1864, dtype: object

In [15]:
# Filter each token list through remove_stopwords.
df['clean'] = df['clean'].apply(remove_stopwords)
df['clean']

0                         [screams, different, languages]
1       [families, sue, legionnaires, families, affect...
2       [pandemonium, aba, woman, delivers, baby, with...
3       [emotions, train, wreck, body, train, wreck, i...
4       [alton, brown, livestream, burned, butter, tou...
                              ...                        
1859    [trollkrattos, juan, carlos, salvador, secret,...
1860    [devonbreneman, hopefully, doesnt, electrocute...
1861    [businesses, deluged, invokces, make, stand, c...
1862    [breaking, police, officers, arrested, abusing...
1863    [news, refugio, oil, spill, may, costlier, big...
Name: clean, Length: 1864, dtype: object

In [16]:
# Keep only the tokens that remove_not_english finds in the loaded word list.
df['clean'] = df['clean'].apply(remove_not_english)
df['clean']

0                         [screams, different, languages]
1       [families, sue, legionnaires, families, affect...
2       [pandemonium, aba, woman, delivers, baby, with...
3       [emotions, train, wreck, body, train, wreck, w...
4       [alton, brown, burned, butter, touched, hot, p...
                              ...                        
1859    [juan, carlos, salvador, secret, tips, get, ri...
1860            [hopefully, electrocute, heated, blanket]
1861    [businesses, deluged, make, stand, colour, lik...
1862    [breaking, police, officers, arrested, abusing...
1863    [news, oil, spill, may, costlier, bigger, proj...
Name: clean, Length: 1864, dtype: object

In [17]:
# Stemmed variant: word_stemmer returns one space-joined string per row.
df['stemmed'] = df['clean'].apply(word_stemmer)
df['stemmed']

0                                   scream differ languag
1       famili sue legionnair famili affect fatal outb...
2       pandemonium aba woman deliv babi without face ...
3                 emot train wreck bodi train wreck wreck
4       alton brown burn butter touch hot plate soon m...
                              ...                        
1859    juan carlo salvador secret tip get riot point ...
1860                         hope electrocut heat blanket
1861    busi delug make stand colour like rise top pay...
1862    break polic offic arrest abus children boot ca...
1863           news oil spill may costlier bigger project
Name: stemmed, Length: 1864, dtype: object

In [18]:
# Lemmatized variant: word_lemmatizer returns one list of tokens per row.
df['lemmazed'] = df['clean'].apply(word_lemmatizer)
df['lemmazed']

0                           [scream, different, language]
1       [family, sue, legionnaire, family, affected, f...
2       [pandemonium, aba, woman, delivers, baby, with...
3       [emotion, train, wreck, body, train, wreck, wr...
4       [alton, brown, burned, butter, touched, hot, p...
                              ...                        
1859    [juan, carlos, salvador, secret, tip, get, rio...
1860            [hopefully, electrocute, heated, blanket]
1861    [business, deluged, make, stand, colour, likel...
1862    [breaking, police, officer, arrested, abusing,...
1863    [news, oil, spill, may, costlier, bigger, proj...
Name: lemmazed, Length: 1864, dtype: object

In [21]:
#df.drop(['Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)


In [23]:
# Write the frame, including the derived columns, to disk.
df.to_csv(path_or_buf='words_clean.csv')