In [4]:
import re, string, unicodedata
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

# Noise Removal

In [5]:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text

def replace_contractions(text):
    """Replace contractions in string of text"""
    return contractions.fix(text)

# Tokenization

In [25]:
def tokenize(text):
    return nltk.word_tokenize(text)

# Normalization

In [28]:
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    return [unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore') for word in words]

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    return [word.lower() for word in words]

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    return [p.number_to_words(word) if word.isdigit() else word for word in words]

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    return [word for word in words if word not in stopwords.words('english')]

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    return [stemmer.stem(word) for word in words]

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word, pos='v') for word in words]

In [29]:
def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    words = remove_stopwords(words)
    return words

def stem_and_lemmatize(words):
    stems = stem_words(words)
    lemmas = lemmatize_verbs(words)
    return stems, lemmas

In [23]:
sample = """<h1>Title Goes Here</h1>
<b>Bolded Text</b>
<i>Italicized Text</i>
<img src="this should all be gone"/>
<a href="this will be gone, too">But this will still be here!</a>
I run. He ran. She is running. Will they stop running?
I talked. She was talking. They talked to them about running. Who ran to the talking runner?
[Some text we don't want to keep is in here]
¡Sebastián, Nicolás, Alejandro and Jéronimo are going to the store tomorrow morning!
something... is! wrong() with.,; this :: sentence.
I can't do this anymore. I didn't know them. Why couldn't you have dinner at the restaurant?
My favorite movie franchises, in order: Indiana Jones; Marvel Cinematic Universe; Star Wars; Back to the Future; Harry Potter.
Don't do it.... Just don't. Billy! I know what you're doing. This is a great little house you've got here.
[This is some other unwanted text]
John: "Well, well, well."
James: "There, there. There, there."
&nbsp;&nbsp;
There are a lot of reasons not to do this. There are 101 reasons not to do it. 1000000 reasons, actually.
I have to go get 2 tutus from 2 different stores, too.
22    45   1067   445
{{Here is some stuff inside of double curly braces.}}
{Here is more stuff in single curly braces.}
[DELETE]
</body>
</html>"""

In [27]:
sample = replace_contractions(denoise_text(sample))
words = tokenize(sample)
normalize(words)

['title',
 'goes',
 'bolded',
 'text',
 'italicized',
 'text',
 'still',
 'run',
 'ran',
 'running',
 'stop',
 'running',
 'talked',
 'talking',
 'talked',
 'running',
 'ran',
 'talking',
 'runner',
 'sebastian',
 'nicolas',
 'alejandro',
 'jeronimo',
 'going',
 'store',
 'tomorrow',
 'morning',
 'something',
 'wrong',
 'sentence',
 'anymore',
 'know',
 'could',
 'dinner',
 'restaurant',
 'favorite',
 'movie',
 'franchises',
 'order',
 'indiana',
 'jones',
 'marvel',
 'cinematic',
 'universe',
 'star',
 'wars',
 'back',
 'future',
 'harry',
 'potter',
 'billy',
 'know',
 'great',
 'little',
 'house',
 'got',
 'john',
 'well',
 'well',
 'well',
 'james',
 'lot',
 'reasons',
 'one hundred and one',
 'reasons',
 'one million',
 'reasons',
 'actually',
 'go',
 'get',
 'two',
 'tutus',
 'two',
 'different',
 'stores',
 'twenty-two',
 'forty-five',
 'one thousand and sixty-seven',
 'four hundred and forty-five',
 'stuff',
 'inside',
 'double',
 'curly',
 'braces',
 'stuff',
 'single',
 'curl

In [32]:
stems, lemmas = stem_and_lemmatize(words)
print('Stemmed:\n', stems)
print('\nLemmatized:\n', lemmas)

Stemmed:
 ['titl', 'goe', 'her', 'bold', 'text', 'it', 'text', 'but', 'thi', 'wil', 'stil', 'be', 'her', '!', 'i', 'run', '.', 'he', 'ran', '.', 'she', 'is', 'run', '.', 'wil', 'they', 'stop', 'run', '?', 'i', 'talk', '.', 'she', 'was', 'talk', '.', 'they', 'talk', 'to', 'them', 'about', 'run', '.', 'who', 'ran', 'to', 'the', 'talk', 'run', '?', '¡sebastián', ',', 'nicolá', ',', 'alejandro', 'and', 'jéronimo', 'ar', 'going', 'to', 'the', 'stor', 'tomorrow', 'morn', '!', 'someth', '...', 'is', '!', 'wrong', '(', ')', 'with.', ',', ';', 'thi', ':', ':', 'sent', '.', 'i', 'can', 'not', 'do', 'thi', 'anym', '.', 'i', 'did', 'not', 'know', 'them', '.', 'why', 'could', 'not', 'you', 'hav', 'din', 'at', 'the', 'resta', '?', 'my', 'favorit', 'movy', 'franch', ',', 'in', 'ord', ':', 'indian', 'jon', ';', 'marvel', 'cinem', 'univers', ';', 'star', 'war', ';', 'back', 'to', 'the', 'fut', ';', 'harry', 'pot', '.', 'do', 'not', 'do', 'it', '...', '.', 'just', 'do', 'not', '.', 'bil', '!', 'i', 'kno