In [2]:
import pandas as pd
import numpy as np
import nltk

In [3]:
import requests
# Download the Project Gutenberg HTML edition of Anna Karenina.
# The timeout stops the cell from hanging forever on a stalled connection and
# raise_for_status() turns an HTTP error into an exception instead of letting
# an error page flow into the later cells.
data = requests.get('http://www.gutenberg.org/files/1399/1399-h/1399-h.htm',
                    timeout=30)
data.raise_for_status()
content = data.content  # raw response body as bytes
print(content[1930:2300])

b'Gutenberg License included\r\nwith this eBook or online at www.gutenberg.org\r\n\r\n\r\nTitle: Anna Karenina\r\n\r\nAuthor: Leo Tolstoy\r\n\r\nRelease Date: July 01, 1998 [EBook #1399]\r\nLast Updated: January 11, 2020\r\n\r\nLanguage: English\r\n\r\nCharacter set encoding: UTF-8\r\n\r\n*** START OF GUTENBERG EBOOK ANNA KARENINA ***\r\n\r\n\r\n\r\n\r\nProduced by David Brannan, Andrew Sly and David Widger.\r'


In [4]:
import re
from bs4 import BeautifulSoup

def strip_html_tags(text):
    """Return the text content of an HTML document.

    ``<iframe>`` and ``<script>`` elements are removed together with their
    contents, the remaining markup is discarded, and every run of carriage
    returns / line feeds is collapsed into a single newline.

    Args:
        text: HTML markup to parse with BeautifulSoup's ``html.parser``.

    Returns:
        str: the extracted text.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Drop elements whose contents should not end up in the extracted text.
    for element in soup(['iframe', 'script']):
        element.extract()
    stripped_text = soup.get_text()
    # Only \r and \n belong in the character class: a '|' inside [...] is a
    # literal pipe, which would be rewritten to a newline as well.
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    return stripped_text

clean_content = strip_html_tags(content)

In [5]:
sample_text = clean_content[1932:2721]
sample_text

'Happy families are all alike; every unhappy family is unhappy in its own way.\nEverything was in confusion in the Oblonskys’ house. The wife had\ndiscovered that the husband was carrying on an intrigue with a French girl, who\nhad been a governess in their family, and she had announced to her husband that\nshe could not go on living in the same house with him. This position of affairs\nhad now lasted three days, and not only the husband and wife themselves, but\nall the members of their family and household, were painfully conscious of it.\nEvery person in the house felt that there was no sense in their living\ntogether, and that the stray people brought together by chance in any inn had\nmore in common with one another than they, the members of the family and\nhousehold of the Oblonskys.'

# Sentence tokenization

### use nltk sentence tokenizer

In [6]:
default_st = nltk.sent_tokenize

sample_sentences = default_st(text=sample_text)
print('Total sentences in sample_text:', len(sample_sentences), '\n')
print('Sample text sentences : \n', sample_sentences)


Total sentences in sample_text: 5 

Sample text sentences : 
 ['Happy families are all alike; every unhappy family is unhappy in its own way.', 'Everything was in confusion in the Oblonskys’ house.', 'The wife had\ndiscovered that the husband was carrying on an intrigue with a French girl, who\nhad been a governess in their family, and she had announced to her husband that\nshe could not go on living in the same house with him.', 'This position of affairs\nhad now lasted three days, and not only the husband and wife themselves, but\nall the members of their family and household, were painfully conscious of it.', 'Every person in the house felt that there was no sense in their living\ntogether, and that the stray people brought together by chance in any inn had\nmore in common with one another than they, the members of the family and\nhousehold of the Oblonskys.']


In [7]:
### Now, as you can see, the tokenizer is quite intelligent. It doesn’t just use periods to delimit sentences,
### but also considers other punctuation and capitalization of words. We can also tokenize text of other 
### languages using some pretrained models present in NLTK.

### or we can use the punkt sentence tokenizer from nltk

In [8]:
punkt_st = nltk.tokenize.PunktSentenceTokenizer()
sample_sentences = punkt_st.tokenize(sample_text)
print('Total sentences in sample_text:', len(sample_sentences), '\n')
print('Sample text sentences : \n', np.array(sample_sentences))

Total sentences in sample_text: 5 

Sample text sentences : 
 ['Happy families are all alike; every unhappy family is unhappy in its own way.'
 'Everything was in confusion in the Oblonskys’ house.'
 'The wife had\ndiscovered that the husband was carrying on an intrigue with a French girl, who\nhad been a governess in their family, and she had announced to her husband that\nshe could not go on living in the same house with him.'
 'This position of affairs\nhad now lasted three days, and not only the husband and wife themselves, but\nall the members of their family and household, were painfully conscious of it.'
 'Every person in the house felt that there was no sense in their living\ntogether, and that the stray people brought together by chance in any inn had\nmore in common with one another than they, the members of the family and\nhousehold of the Oblonskys.']


### regextokenizer

The last tokenizer we cover in sentence tokenization is using an instance of the RegexpTokenizer class to tokenize text into sentences, where we will use specific regular expression-based patterns to segment sentences. The following snippet shows how to use a regex pattern to tokenize sentences.

In [9]:
### remove \n from text
sample_text2 = sample_text.replace("\n", " ")

SENTENCE_TOKENS_PATTERN = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\.|\?|\!)\s'
regex_st = nltk.tokenize.RegexpTokenizer(
            pattern=SENTENCE_TOKENS_PATTERN,
            gaps=True)
sample_sentences = regex_st.tokenize(sample_text2)
print('Total sentences in sample_text:', len(sample_sentences), '\n')
print('Sample text sentences : \n', np.array(sample_sentences))

Total sentences in sample_text: 5 

Sample text sentences : 
 ['Happy families are all alike; every unhappy family is unhappy in its own way.'
 'Everything was in confusion in the Oblonskys’ house.'
 'The wife had discovered that the husband was carrying on an intrigue with a French girl, who had been a governess in their family, and she had announced to her husband that she could not go on living in the same house with him.'
 'This position of affairs had now lasted three days, and not only the husband and wife themselves, but all the members of their family and household, were painfully conscious of it.'
 'Every person in the house felt that there was no sense in their living together, and that the stray people brought together by chance in any inn had more in common with one another than they, the members of the family and household of the Oblonskys.']


# Word Tokenization 

### default nltk word tokenizer

In [10]:
default_wt = nltk.word_tokenize
words = default_wt(sample_text)
np.array(words)

array(['Happy', 'families', 'are', 'all', 'alike', ';', 'every',
       'unhappy', 'family', 'is', 'unhappy', 'in', 'its', 'own', 'way',
       '.', 'Everything', 'was', 'in', 'confusion', 'in', 'the',
       'Oblonskys', '’', 'house', '.', 'The', 'wife', 'had', 'discovered',
       'that', 'the', 'husband', 'was', 'carrying', 'on', 'an',
       'intrigue', 'with', 'a', 'French', 'girl', ',', 'who', 'had',
       'been', 'a', 'governess', 'in', 'their', 'family', ',', 'and',
       'she', 'had', 'announced', 'to', 'her', 'husband', 'that', 'she',
       'could', 'not', 'go', 'on', 'living', 'in', 'the', 'same', 'house',
       'with', 'him', '.', 'This', 'position', 'of', 'affairs', 'had',
       'now', 'lasted', 'three', 'days', ',', 'and', 'not', 'only', 'the',
       'husband', 'and', 'wife', 'themselves', ',', 'but', 'all', 'the',
       'members', 'of', 'their', 'family', 'and', 'household', ',',
       'were', 'painfully', 'conscious', 'of', 'it', '.', 'Every',
       'person', '

### toktoktokenizer

In [11]:
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer()
words = tokenizer.tokenize(sample_text)
np.array(words)

array(['Happy', 'families', 'are', 'all', 'alike', ';', 'every',
       'unhappy', 'family', 'is', 'unhappy', 'in', 'its', 'own', 'way.',
       'Everything', 'was', 'in', 'confusion', 'in', 'the', 'Oblonskys',
       '’', 'house.', 'The', 'wife', 'had', 'discovered', 'that', 'the',
       'husband', 'was', 'carrying', 'on', 'an', 'intrigue', 'with', 'a',
       'French', 'girl', ',', 'who', 'had', 'been', 'a', 'governess',
       'in', 'their', 'family', ',', 'and', 'she', 'had', 'announced',
       'to', 'her', 'husband', 'that', 'she', 'could', 'not', 'go', 'on',
       'living', 'in', 'the', 'same', 'house', 'with', 'him.', 'This',
       'position', 'of', 'affairs', 'had', 'now', 'lasted', 'three',
       'days', ',', 'and', 'not', 'only', 'the', 'husband', 'and', 'wife',
       'themselves', ',', 'but', 'all', 'the', 'members', 'of', 'their',
       'family', 'and', 'household', ',', 'were', 'painfully',
       'conscious', 'of', 'it.', 'Every', 'person', 'in', 'the', 'house',
  

### regextokenizer

We now look at how to use regular expressions and the RegexpTokenizer class to tokenize sentences into words. Remember that there are two main parameters that are useful in tokenization—the regex pattern for building the tokenizer and the gaps parameter, which, if set to true, is used to find the gaps between the tokens. Otherwise, it is used to find the tokens themselves.

In [12]:
TOKEN_PATTERN = r'\w+'
regex_wt = nltk.RegexpTokenizer(pattern=TOKEN_PATTERN,
                                gaps=False)
words = regex_wt.tokenize(sample_text)
np.array(words)

array(['Happy', 'families', 'are', 'all', 'alike', 'every', 'unhappy',
       'family', 'is', 'unhappy', 'in', 'its', 'own', 'way', 'Everything',
       'was', 'in', 'confusion', 'in', 'the', 'Oblonskys', 'house', 'The',
       'wife', 'had', 'discovered', 'that', 'the', 'husband', 'was',
       'carrying', 'on', 'an', 'intrigue', 'with', 'a', 'French', 'girl',
       'who', 'had', 'been', 'a', 'governess', 'in', 'their', 'family',
       'and', 'she', 'had', 'announced', 'to', 'her', 'husband', 'that',
       'she', 'could', 'not', 'go', 'on', 'living', 'in', 'the', 'same',
       'house', 'with', 'him', 'This', 'position', 'of', 'affairs', 'had',
       'now', 'lasted', 'three', 'days', 'and', 'not', 'only', 'the',
       'husband', 'and', 'wife', 'themselves', 'but', 'all', 'the',
       'members', 'of', 'their', 'family', 'and', 'household', 'were',
       'painfully', 'conscious', 'of', 'it', 'Every', 'person', 'in',
       'the', 'house', 'felt', 'that', 'there', 'was', 'no', 'se

In [13]:
# pattern to identify tokens by using gaps between tokens
# (with gaps=True the regex describes the separators, so every
# whitespace-delimited chunk, attached punctuation included, is one token)
GAP_PATTERN = r'\s+'
regex_wt = nltk.RegexpTokenizer(pattern=GAP_PATTERN,
                                gaps=True)
words = regex_wt.tokenize(sample_text)
# NOTE(review): np.array(set(...)) does not build an array of tokens; the
# recorded output shows a single object array wrapping the whole set. Use
# np.array(sorted(set(words))) if a 1-d array of unique tokens is wanted.
np.array(set(words))

array({'intrigue', 'affairs', 'families', 'announced', 'governess', 'with', 'stray', 'another', 'every', 'is', 'Oblonskys.', 'This', 'family,', 'girl,', 'had', 'position', 'they,', 'Oblonskys’', 'more', 'common', 'now', 'discovered', 'no', 'and', 'members', 'there', 'way.', 'together', 'who', 'were', 'could', 'been', 'person', 'alike;', 'house.', 'painfully', 'not', 'house', 'an', 'inn', 'together,', 'same', 'own', 'all', 'household,', 'a', 'felt', 'chance', 'themselves,', 'to', 'Every', 'one', 'carrying', 'that', 'any', 'husband', 'than', 'of', 'go', 'household', 'was', 'brought', 'Happy', 'people', 'but', 'three', 'their', 'French', 'she', 'by', 'unhappy', 'living', 'Everything', 'wife', 'on', 'the', 'in', 'lasted', 'him.', 'sense', 'family', 'days,', 'conscious', 'its', 'confusion', 'only', 'her', 'it.', 'The', 'are'},
      dtype=object)

### word punc tokenizer

Besides the base RegexpTokenizer class, there are several derived classes that perform different types of word tokenization. The WordPunctTokenizer uses the pattern r'\w+|[^\w\s]+' to tokenize sentences into independent alphabetic and non-alphabetic tokens.


In [14]:
# WordPunctTokenizer emits runs of word characters and runs of punctuation
# as separate tokens.
wordpunkt_wt = nltk.WordPunctTokenizer()
words = wordpunkt_wt.tokenize(sample_text)
# NOTE(review): np.array(set(...)) wraps the whole set in one object array
# (see the recorded output); use np.array(sorted(set(words))) to get a 1-d
# array of the unique tokens instead.
np.array(set(words))

array({'intrigue', 'affairs', 'families', 'announced', 'governess', 'with', 'stray', 'another', 'every', 'is', 'it', 'This', 'had', 'position', 'alike', 'more', '’', 'common', 'now', 'discovered', 'no', 'and', 'members', 'there', 'together', 'who', 'were', 'could', 'been', 'person', 'him', 'painfully', 'not', 'house', 'an', 'inn', 'same', 'own', 'all', 'a', 'felt', 'chance', 'they', 'to', 'Every', 'one', 'carrying', 'that', 'any', 'way', 'husband', 'themselves', 'than', 'of', 'go', 'household', 'was', 'Happy', 'brought', 'people', 'but', 'three', 'their', 'girl', 'days', 'French', 'she', 'by', 'unhappy', ';', 'living', 'Everything', 'wife', 'on', 'the', 'in', 'lasted', ',', 'sense', 'family', 'conscious', 'its', 'confusion', 'only', 'her', '.', 'Oblonskys', 'The', 'are'},
      dtype=object)

### whitespace tokenizer

The WhitespaceTokenizer tokenizes sentences into words based on whitespace, like tabs, newlines, and spaces. The following snippet shows demonstrations of these tokenizers.


In [15]:
whitespace_wt = nltk.WhitespaceTokenizer()
words = whitespace_wt.tokenize(sample_text)
np.array(words)

array(['Happy', 'families', 'are', 'all', 'alike;', 'every', 'unhappy',
       'family', 'is', 'unhappy', 'in', 'its', 'own', 'way.',
       'Everything', 'was', 'in', 'confusion', 'in', 'the', 'Oblonskys’',
       'house.', 'The', 'wife', 'had', 'discovered', 'that', 'the',
       'husband', 'was', 'carrying', 'on', 'an', 'intrigue', 'with', 'a',
       'French', 'girl,', 'who', 'had', 'been', 'a', 'governess', 'in',
       'their', 'family,', 'and', 'she', 'had', 'announced', 'to', 'her',
       'husband', 'that', 'she', 'could', 'not', 'go', 'on', 'living',
       'in', 'the', 'same', 'house', 'with', 'him.', 'This', 'position',
       'of', 'affairs', 'had', 'now', 'lasted', 'three', 'days,', 'and',
       'not', 'only', 'the', 'husband', 'and', 'wife', 'themselves,',
       'but', 'all', 'the', 'members', 'of', 'their', 'family', 'and',
       'household,', 'were', 'painfully', 'conscious', 'of', 'it.',
       'Every', 'person', 'in', 'the', 'house', 'felt', 'that', 'there',
     

### Building Robust Tokenizers with NLTK and spaCy
For a typical NLP pipeline, I recommend leveraging state-of-the-art libraries like NLTK and spaCy and using some of their robust utilities to build a custom function to perform both sentence- and word-level tokenization. A simple example is depicted in the following snippets. We start with looking at how we can leverage NLTK.


In [16]:
def tokenize_text(text):
    """Tokenize ``text`` into sentences and each sentence into words.

    Args:
        text: the document to tokenize.

    Returns:
        list: one list of word tokens per sentence, in document order.
    """
    tokens_per_sentence = []
    for sentence in nltk.sent_tokenize(text):
        tokens_per_sentence.append(nltk.word_tokenize(sentence))
    return tokens_per_sentence

sents = tokenize_text(sample_text)
np.array(sents)

array([list(['Happy', 'families', 'are', 'all', 'alike', ';', 'every', 'unhappy', 'family', 'is', 'unhappy', 'in', 'its', 'own', 'way', '.']),
       list(['Everything', 'was', 'in', 'confusion', 'in', 'the', 'Oblonskys', '’', 'house', '.']),
       list(['The', 'wife', 'had', 'discovered', 'that', 'the', 'husband', 'was', 'carrying', 'on', 'an', 'intrigue', 'with', 'a', 'French', 'girl', ',', 'who', 'had', 'been', 'a', 'governess', 'in', 'their', 'family', ',', 'and', 'she', 'had', 'announced', 'to', 'her', 'husband', 'that', 'she', 'could', 'not', 'go', 'on', 'living', 'in', 'the', 'same', 'house', 'with', 'him', '.']),
       list(['This', 'position', 'of', 'affairs', 'had', 'now', 'lasted', 'three', 'days', ',', 'and', 'not', 'only', 'the', 'husband', 'and', 'wife', 'themselves', ',', 'but', 'all', 'the', 'members', 'of', 'their', 'family', 'and', 'household', ',', 'were', 'painfully', 'conscious', 'of', 'it', '.']),
       list(['Every', 'person', 'in', 'the', 'house', 'felt', 'th

We can also get to the level of word-level tokenization by leveraging list comprehensions, as depicted in the following code.

In [17]:
words = [word for sentence in sents for word in sentence]
np.array(words)

array(['Happy', 'families', 'are', 'all', 'alike', ';', 'every',
       'unhappy', 'family', 'is', 'unhappy', 'in', 'its', 'own', 'way',
       '.', 'Everything', 'was', 'in', 'confusion', 'in', 'the',
       'Oblonskys', '’', 'house', '.', 'The', 'wife', 'had', 'discovered',
       'that', 'the', 'husband', 'was', 'carrying', 'on', 'an',
       'intrigue', 'with', 'a', 'French', 'girl', ',', 'who', 'had',
       'been', 'a', 'governess', 'in', 'their', 'family', ',', 'and',
       'she', 'had', 'announced', 'to', 'her', 'husband', 'that', 'she',
       'could', 'not', 'go', 'on', 'living', 'in', 'the', 'same', 'house',
       'with', 'him', '.', 'This', 'position', 'of', 'affairs', 'had',
       'now', 'lasted', 'three', 'days', ',', 'and', 'not', 'only', 'the',
       'husband', 'and', 'wife', 'themselves', ',', 'but', 'all', 'the',
       'members', 'of', 'their', 'family', 'and', 'household', ',',
       'were', 'painfully', 'conscious', 'of', 'it', '.', 'Every',
       'person', '

In a similar way, we can leverage spaCy to perform sentence- and word-level tokenizations really quickly, as depicted in the following snippets.

In [18]:
import spacy
nlp = spacy.load('en', parse = True, tag=True, entity=True)
text_spacy = nlp(sample_text)
sents = np.array(list(text_spacy.sents))
sents

array([Happy families are all alike; every unhappy family is unhappy in its own way.
,
       Everything was in confusion in the Oblonskys’ house.,
       The wife had
discovered that the husband was carrying on an intrigue with a French girl, who
had been a governess in their family, and she had announced to her husband that
she could not go on living in the same house with him.,
       This position of affairs
had now lasted three days, and not only the husband and wife themselves, but
all the members of their family and household, were painfully conscious of it.
,
       Every person in the house felt that there was no sense in their living
together, and that the stray people brought together by chance in any inn had
,
       more in common with one another than they, the members of the family and
household of the Oblonskys.], dtype=object)

### REMOVING ACCENTED CHARACTERS
Usually in any text corpus, you might be dealing with accented characters/letters, especially if you only want to analyze the English language. Hence, we need to make sure that these characters are converted and standardized into ASCII characters. This shows a simple example — converting é to e. The following function is a simple way of tackling this task.


In [19]:
import unicodedata
def remove_accented_chars(text):
    """Strip accents from ``text`` and drop any remaining non-ASCII character.

    Args:
        text: the string to convert.

    Returns:
        str: ``text`` reduced to ASCII characters.
    """
    # NFKD splits an accented letter into its base letter plus combining marks.
    decomposed = unicodedata.normalize('NFKD', text)
    # Encoding to ASCII with 'ignore' discards the marks and anything else
    # that has no ASCII representation.
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')
remove_accented_chars('Sómě Áccěntěd těxt')
'Some Accented text'

'Some Accented text'

### EXPANDING CONTRACTIONS

Contractions are shortened versions of words or syllables. These exist in written and spoken forms. Shortened versions of existing words are created by removing specific letters and sounds. In the case of English contractions, they are often created by removing one of the vowels from the word. Examples include “is not” to “isn’t” and “will not” to “won’t”, where you can notice the apostrophe being used to denote the contraction and some of the vowels and other letters being removed.

By nature, contractions pose a problem for NLP and text analytics because, to start with, we have a special apostrophe character in the word. Besides this, we also have two or more words represented by a contraction and this opens a whole new can of worms when we try to tokenize them or standardize the words. Hence, there should be some definite process for dealing with contractions when processing text.





In [20]:
from contractions import CONTRACTION_MAP
import re
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Replace contractions in ``text`` with their expanded forms.

    Matching is case-insensitive. When the contraction starts with a letter,
    that letter's original case is kept in the expansion (``"Can't"`` becomes
    ``"Cannot"``). Any apostrophe still present after expansion is removed.

    Args:
        text: the string to process.
        contraction_mapping: dict mapping a contraction to its expansion.

    Returns:
        str: ``text`` with known contractions expanded and apostrophes removed.
    """
    # Longest keys first: regex alternation stops at the first alternative
    # that matches, so "he'd" must not be tried before "he'd've".
    keys = sorted((key for key in contraction_mapping if key),
                  key=len, reverse=True)
    expanded_text = text
    if keys:
        # Lower-cased lookup so that any casing of a key found by the
        # case-insensitive pattern resolves to an expansion.
        lowered = {key.lower(): value
                   for key, value in contraction_mapping.items()}
        # Keys are escaped so they are matched literally.
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in keys)),
            flags=re.IGNORECASE | re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            expanded = contraction_mapping.get(match) or lowered.get(match.lower())
            if not expanded:
                # Nothing usable to substitute: leave the text untouched.
                return match
            first_char = match[0]
            # Carry the original first letter over to keep its case, but only
            # when it is a letter; for keys such as "'cause" the leading
            # apostrophe must not replace the first letter of the expansion.
            if first_char.isalpha():
                return first_char + expanded[1:]
            return expanded

        expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

In [39]:
expand_contractions("Y'all can't expand contractions I'd think")

'You all cannot expand contractions I would think'

### REMOVING SPECIAL CHARACTERS
Special characters and symbols are usually non-alphanumeric characters or even occasionally numeric characters (depending on the problem), which add to the extra noise in unstructured text. Usually, simple regular expressions (regexes) can be used to remove them. The following code helps us remove special characters.


In [21]:
def remove_special_characters(text, remove_digits=False):
    """Remove every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: the string to clean.
        remove_digits: when True, digits are removed as well.

    Returns:
        str: ``text`` without the unwanted characters.
    """
    # The upper-case range must be A-Z: the range A-z also spans the ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), which would then be kept.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

remove_special_characters("Well this was fun! What do you think? 123#@!",
                          remove_digits=True)

'Well this was fun What do you think '

### case conversions

In [22]:
# lowercase
text = 'The quick brown fox jumped over The Big Dog'
print(text.lower())

# uppercase
print(text.upper())

# title case
print(text.title())


the quick brown fox jumped over the big dog
THE QUICK BROWN FOX JUMPED OVER THE BIG DOG
The Quick Brown Fox Jumped Over The Big Dog


### correcting repeating texts

We will now utilize the WordNet corpus to check for valid words at each stage and terminate the loop once it is obtained. This introduces the semantic correction needed for our algorithm, as illustrated in the following snippet.


In [23]:
from nltk.corpus import wordnet
old_word = 'finalllyyy'
repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
match_substitution = r'\1\2\3'
step = 1
# keep shortening until WordNet recognises the word
while not wordnet.synsets(old_word):
    # drop one character from a doubled pair
    new_word = repeat_pattern.sub(match_substitution,
                                  old_word)
    if new_word == old_word:
        # nothing left to remove, give up with the current spelling
        print("Final word:", new_word)
        break
    print('Step: {} Word: {}'.format(step, new_word))
    step += 1
    # carry the shortened word into the next iteration
    old_word = new_word
else:
    # the loop ended because WordNet knows the word
    print("Final correct word:", old_word)

Step: 1 Word: finalllyy
Step: 2 Word: finallly
Step: 3 Word: finally
Final correct word: finally


We can build a better version of this code by writing the logic in a function, as depicted here, to make it more generic to deal with incorrect tokens from a list of tokens.


In [24]:
from nltk.corpus import wordnet
def remove_repeated_characters(tokens):
    """Shorten tokens with repeated characters until WordNet knows them.

    Each token loses one character of a doubled pair at a time. Shortening
    stops as soon as ``wordnet.synsets`` returns a result for the token or no
    doubled character is left.

    Args:
        tokens: iterable of word strings.

    Returns:
        list: the corrected tokens, in the same order.
    """
    repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
    match_substitution = r'\1\2\3'

    def replace(old_word):
        current = old_word
        while not wordnet.synsets(current):
            shortened = repeat_pattern.sub(match_substitution, current)
            if shortened == current:
                # No doubled character remains.
                break
            current = shortened
        return current

    return [replace(token) for token in tokens]

In [25]:
sample_sentence = 'My schooool is realllllyyy amaaazingggg'
correct_tokens = remove_repeated_characters(nltk.word_tokenize(sample_sentence))
' '.join(correct_tokens)

'My school is really amazing'

## stemming

Word stems are also often known as the base form of a word and we can create new words by attaching affixes to them. This process is known as inflection. The reverse of this is obtaining the base form of a word from its inflected form and this is known as stemming. Consider the word “JUMP”, you can add affixes to it and form several new words like “JUMPS”, “JUMPED”, and “JUMPING”. In this case, the base word is “JUMP” and this is the word stem. If we were to carry out stemming on any of its three inflected forms, we would get the base form.

Stemming helps us standardize words to their base stem irrespective of their inflections, which helps many applications like classifying or clustering text or even in information retrieval.

In [26]:
# porter stemmer

from nltk.stem import PorterStemmer
ps = PorterStemmer()
ps.stem('jumping'), ps.stem('jumps'), ps.stem('jumped'), ps.stem('lying'), ps.stem('strange')

('jump', 'jump', 'jump', 'lie', 'strang')

In [27]:
# lancaster stemmer

from nltk.stem import LancasterStemmer
ls = LancasterStemmer()
ls.stem('jumping'), ls.stem('jumps'), ls.stem('jumped'), ls.stem('lying'), ls.stem('strange')

('jump', 'jump', 'jump', 'lying', 'strange')

The RegexpStemmer uses regular expressions to identify the morphological affixes in words, and any part of the string matching them is removed. You can see how the stemming results differ from those of the previous stemmers, because they are based completely on our custom-defined regular expression rules.


In [28]:
from nltk.stem import RegexpStemmer
rs = RegexpStemmer('ing$|s$|ed$', min=4)
rs.stem('jumping'), rs.stem('jumps'), rs.stem('jumped'), rs.stem('lying'), rs.stem('strange')

('jump', 'jump', 'jump', 'ly', 'strange')

The Porter stemmer is used most frequently, but you should choose your stemmer based on your problem and after trial and error. The following is a basic function that can be used for stemming text.

In [29]:
def simple_stemmer(text):
    """Stem every whitespace-separated word of ``text`` with the Porter stemmer.

    Args:
        text: the string to stem.

    Returns:
        str: the stems joined by single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stems = [stemmer.stem(token) for token in text.split()]
    return ' '.join(stems)

simple_stemmer("My system keeps crashing his crashed yesterday, ours crashes daily")

'My system keep crash hi crash yesterday, our crash daili'

 If needed, you can build your own stemmer with your own defined rules!

## LEMMATIZATION

The process of lemmatization is very similar to stemming, where we remove word affixes to get to a base form of the word. However in this case, this base form is also known as the root word but not the root stem. The difference between the two is that the root stem may not always be a lexicographically correct word, i.e., it may not be present in the dictionary but the root word, also known as the lemma, will always be present in the dictionary.

In [30]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

# lemmatize nouns
print(wnl.lemmatize('cars', 'n'))
print(wnl.lemmatize('men', 'n'))

# lemmatize verbs
print(wnl.lemmatize('running', 'v'))
print(wnl.lemmatize('ate', 'v'))

# lemmatize adjectives
print(wnl.lemmatize('saddest', 'a'))
print(wnl.lemmatize('fancier', 'a'))

car
men
run
eat
sad
fancy


This function basically finds the base form or lemma for a given word using the word and its part of speech by checking the WordNet corpus and uses a recursive technique for removing affixes from the word until a match is found in WordNet. If no match is found, the input word is returned unchanged. The part of speech is extremely important because if that is wrong, the lemmatization will not be effective, as you can see in the following snippet.


In [31]:
# ineffective lemmatization
print(wnl.lemmatize('ate', 'n'))
print(wnl.lemmatize('fancier', 'v'))

ate
fancier


#### SpaCy makes things a lot easier since it performs parts of speech tagging and effective lemmatization for each token in a text document without you worrying about if you are using lemmatization effectively. The following function can be leveraged for performing effective lemmatization, thanks to spaCy!


In [32]:
import spacy
nlp = spacy.load('en', parse=True, tag=True, entity=True)
text = 'My system keeps crashing his crashed yesterday, ours crashes daily'
def lemmatize_text(text):
    """Replace each token of ``text`` by its lemma from the ``nlp`` pipeline.

    Args:
        text: the string to lemmatize.

    Returns:
        str: lemmas (or original tokens, see below) joined by single spaces.
    """
    pieces = []
    for token in nlp(text):
        # The pipeline reports '-PRON-' as the lemma of pronouns; keep the
        # original word in that case instead of the placeholder.
        if token.lemma_ == '-PRON-':
            pieces.append(token.text)
        else:
            pieces.append(token.lemma_)
    return ' '.join(pieces)
lemmatize_text("My system keeps crashing! his crashed yesterday, ours crashes daily")

'My system keep crash ! his crash yesterday , ours crash daily'

In [33]:
test = nlp(text)
test[0].lemma_, test[3].lemma_

('-PRON-', 'crash')

Let's try this with our sample sentences from Anna Karenina.

In [34]:
sample_sentences[4]

'Every person in the house felt that there was no sense in their living together, and that the stray people brought together by chance in any inn had more in common with one another than they, the members of the family and household of the Oblonskys.'

In [35]:
lemmatize_text(sample_sentences[4])

'every person in the house feel that there be no sense in their living together , and that the stray people bring together by chance in any inn have more in common with one another than they , the member of the family and household of the Oblonskys .'

### Removing Stopwords

Stopwords are words that have little or no significance and are usually removed from text when processing it so as to retain words having maximum significance and context. Stopwords usually occur most frequently if you aggregate a corpus of text based on singular tokens and check their frequencies. Words like “a,” “the,” “and,” and so on are stopwords. There is no universal or exhaustive list of stopwords and often each domain or language has its own set of stopwords.

In [36]:
from nltk.tokenize.toktok import ToktokTokenizer

tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')

def remove_stopwords(text, is_lower_case=False):
    """Remove the tokens of ``text`` that appear in ``stopword_list``.

    Args:
        text: the string to filter; it is split with the module-level
            ``tokenizer``.
        is_lower_case: True when ``text`` is already lower-cased, so tokens
            are compared as they are. Otherwise each token is lower-cased for
            the comparison only.

    Returns:
        str: the surviving tokens joined by single spaces.
    """
    kept = []
    for raw_token in tokenizer.tokenize(text):
        # Trim surrounding whitespace from the token.
        token = raw_token.strip()
        candidate = token if is_lower_case else token.lower()
        if candidate not in stopword_list:
            kept.append(token)
    return ' '.join(kept)
remove_stopwords("The, and, if are stopwords, computer is not")


', , stopwords , computer'

In [37]:
remove_stopwords(sample_sentences[4])

'Every person house felt sense living together , stray people brought together chance inn common one another , members family household Oblonskys .'

There is no universal stopword list, but we use a standard English language stopwords list from NLTK. You can also add your own domain-specific stopwords as needed. In the previous function, we leverage the use of NLTK, which has a list of stopwords for English, and use it to filter out all tokens that correspond to stopwords. This output shows us a reduced number of tokens compared to what we had earlier and you can compare and check the tokens that were removed as stopwords. 

### BRINGING IT ALL TOGETHER — BUILDING A TEXT NORMALIZER
Let’s now bring everything we learned together and chain these operations to build a text normalizer to preprocess text data. We focus on including the major components often used for text wrangling in our custom function.


In [40]:
import re
from bs4 import BeautifulSoup
def strip_html_tags(text):
    """Return the text content of an HTML document.

    ``<iframe>`` and ``<script>`` elements are removed together with their
    contents, the remaining markup is discarded, and every run of carriage
    returns / line feeds is collapsed into a single newline.

    Args:
        text: HTML markup to parse with BeautifulSoup's ``html.parser``.

    Returns:
        str: the extracted text.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Drop elements whose contents should not end up in the extracted text.
    for element in soup(['iframe', 'script']):
        element.extract()
    stripped_text = soup.get_text()
    # Only \r and \n belong in the character class: a '|' inside [...] is a
    # literal pipe, which would be rewritten to a newline as well.
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    return stripped_text

import unicodedata
def remove_accented_chars(text):
    """Strip accents from ``text`` and drop any remaining non-ASCII character.

    Args:
        text: the string to convert.

    Returns:
        str: ``text`` reduced to ASCII characters.
    """
    # NFKD splits an accented letter into its base letter plus combining marks.
    decomposed = unicodedata.normalize('NFKD', text)
    # Encoding to ASCII with 'ignore' discards the marks and anything else
    # that has no ASCII representation.
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

from contractions import CONTRACTION_MAP
import re
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Replace contractions in ``text`` with their expanded forms.

    Matching is case-insensitive. When the contraction starts with a letter,
    that letter's original case is kept in the expansion (``"Can't"`` becomes
    ``"Cannot"``). Any apostrophe still present after expansion is removed.

    Args:
        text: the string to process.
        contraction_mapping: dict mapping a contraction to its expansion.

    Returns:
        str: ``text`` with known contractions expanded and apostrophes removed.
    """
    # Longest keys first: regex alternation stops at the first alternative
    # that matches, so "he'd" must not be tried before "he'd've".
    keys = sorted((key for key in contraction_mapping if key),
                  key=len, reverse=True)
    expanded_text = text
    if keys:
        # Lower-cased lookup so that any casing of a key found by the
        # case-insensitive pattern resolves to an expansion.
        lowered = {key.lower(): value
                   for key, value in contraction_mapping.items()}
        # Keys are escaped so they are matched literally.
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in keys)),
            flags=re.IGNORECASE | re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            expanded = contraction_mapping.get(match) or lowered.get(match.lower())
            if not expanded:
                # Nothing usable to substitute: leave the text untouched.
                return match
            first_char = match[0]
            # Carry the original first letter over to keep its case, but only
            # when it is a letter; for keys such as "'cause" the leading
            # apostrophe must not replace the first letter of the expansion.
            if first_char.isalpha():
                return first_char + expanded[1:]
            return expanded

        expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

import spacy
nlp = spacy.load('en', parse=True, tag=True, entity=True)
text = 'My system keeps crashing his crashed yesterday, ours crashes daily'
def lemmatize_text(text):
    """Replace each token of ``text`` by its lemma from the ``nlp`` pipeline.

    Args:
        text: the string to lemmatize.

    Returns:
        str: lemmas (or original tokens, see below) joined by single spaces.
    """
    pieces = []
    for token in nlp(text):
        # The pipeline reports '-PRON-' as the lemma of pronouns; keep the
        # original word in that case instead of the placeholder.
        if token.lemma_ == '-PRON-':
            pieces.append(token.text)
        else:
            pieces.append(token.lemma_)
    return ' '.join(pieces)

from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
def remove_stopwords(text, is_lower_case=False):
    """Remove the tokens of ``text`` that appear in ``stopword_list``.

    Args:
        text: the string to filter; it is split with the module-level
            ``tokenizer``.
        is_lower_case: True when ``text`` is already lower-cased, so tokens
            are compared as they are. Otherwise each token is lower-cased for
            the comparison only.

    Returns:
        str: the surviving tokens joined by single spaces.
    """
    kept = []
    for raw_token in tokenizer.tokenize(text):
        # Trim surrounding whitespace from the token.
        token = raw_token.strip()
        candidate = token if is_lower_case else token.lower()
        if candidate not in stopword_list:
            kept.append(token)
    return ' '.join(kept)

def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True,
                     text_lemmatization=True, special_char_removal=True,
                     stopword_removal=True, remove_digits=True):
    """Normalize every document of ``corpus`` with the selected steps.

    The steps run in this fixed order: HTML stripping, accent removal,
    contraction expansion, lower-casing, newline collapsing, lemmatization,
    special-character removal, whitespace collapsing, stopword removal.
    Newline and whitespace collapsing always run; every other step is
    controlled by its flag.

    Args:
        corpus: iterable of document strings.
        html_stripping: apply ``strip_html_tags``.
        contraction_expansion: apply ``expand_contractions``.
        accented_char_removal: apply ``remove_accented_chars``.
        text_lower_case: lower-case the text.
        text_lemmatization: apply ``lemmatize_text``.
        special_char_removal: apply ``remove_special_characters``.
        stopword_removal: apply ``remove_stopwords``.
        remove_digits: forwarded to ``remove_special_characters``.

    Returns:
        list: the normalized documents, in input order.
    """
    # Compiled once rather than once per document. The hyphen is escaped:
    # written as "(-)" inside a character class it would be the range from
    # '(' to ')' and hyphens would never be isolated, fusing hyphenated words
    # once the special characters are removed.
    special_char_pattern = re.compile(r'([{.(\-)!}])')
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines; only \r and \n are listed because a '|'
        # inside the character class would match literal pipes too
        doc = re.sub(r'[\r\n]+', ' ', doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and/or digits
        if special_char_removal:
            # insert spaces between special characters to isolate them
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
        normalized_corpus.append(doc)
    return normalized_corpus