## Installing Dependencies

In [1]:
import nltk
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

## Case Conversion

In [2]:
text = "The quick brown for jumped over The Big Dog"
text

'The quick brown for jumped over The Big Dog'

In [3]:
text.lower()

'the quick brown for jumped over the big dog'

In [4]:
text.upper()

'THE QUICK BROWN FOR JUMPED OVER THE BIG DOG'

In [5]:
text.title()

'The Quick Brown For Jumped Over The Big Dog'

## Tokenization

In [6]:
sample_text = ("US unveils world's most powerful supercomputer, beats China. " 
               "The US has unveiled the world's most powerful supercomputer called 'Summit', " 
               "beating the previous record-holder China's Sunway TaihuLight. With a peak performance "
               "of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, "
               "which is capable of 93,000 trillion calculations per second. Summit has 4,608 servers, "
               "which reportedly take up the size of two tennis courts.")
sample_text

"US unveils world's most powerful supercomputer, beats China. The US has unveiled the world's most powerful supercomputer called 'Summit', beating the previous record-holder China's Sunway TaihuLight. With a peak performance of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, which is capable of 93,000 trillion calculations per second. Summit has 4,608 servers, which reportedly take up the size of two tennis courts."

In [7]:
import nltk

nltk.sent_tokenize(sample_text)

["US unveils world's most powerful supercomputer, beats China.",
 "The US has unveiled the world's most powerful supercomputer called 'Summit', beating the previous record-holder China's Sunway TaihuLight.",
 'With a peak performance of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, which is capable of 93,000 trillion calculations per second.',
 'Summit has 4,608 servers, which reportedly take up the size of two tennis courts.']

In [8]:
print(nltk.word_tokenize(sample_text))

['US', 'unveils', 'world', "'s", 'most', 'powerful', 'supercomputer', ',', 'beats', 'China', '.', 'The', 'US', 'has', 'unveiled', 'the', 'world', "'s", 'most', 'powerful', 'supercomputer', 'called', "'Summit", "'", ',', 'beating', 'the', 'previous', 'record-holder', 'China', "'s", 'Sunway', 'TaihuLight', '.', 'With', 'a', 'peak', 'performance', 'of', '200,000', 'trillion', 'calculations', 'per', 'second', ',', 'it', 'is', 'over', 'twice', 'as', 'fast', 'as', 'Sunway', 'TaihuLight', ',', 'which', 'is', 'capable', 'of', '93,000', 'trillion', 'calculations', 'per', 'second', '.', 'Summit', 'has', '4,608', 'servers', ',', 'which', 'reportedly', 'take', 'up', 'the', 'size', 'of', 'two', 'tennis', 'courts', '.']


In [9]:
import spacy
nlp = spacy.load("en")

text_spacy = nlp(sample_text)
print(text_spacy)

US unveils world's most powerful supercomputer, beats China. The US has unveiled the world's most powerful supercomputer called 'Summit', beating the previous record-holder China's Sunway TaihuLight. With a peak performance of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, which is capable of 93,000 trillion calculations per second. Summit has 4,608 servers, which reportedly take up the size of two tennis courts.


In [10]:
[obj.text for obj in text_spacy.sents]

["US unveils world's most powerful supercomputer, beats China.",
 "The US has unveiled the world's most powerful supercomputer called 'Summit', beating the previous record-holder China's Sunway TaihuLight.",
 'With a peak performance of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, which is capable of 93,000 trillion calculations per second.',
 'Summit has 4,608 servers, which reportedly take up the size of two tennis courts.']

In [11]:
print([obj.text for obj in text_spacy])

['US', 'unveils', 'world', "'s", 'most', 'powerful', 'supercomputer', ',', 'beats', 'China', '.', 'The', 'US', 'has', 'unveiled', 'the', 'world', "'s", 'most', 'powerful', 'supercomputer', 'called', "'", 'Summit', "'", ',', 'beating', 'the', 'previous', 'record', '-', 'holder', 'China', "'s", 'Sunway', 'TaihuLight', '.', 'With', 'a', 'peak', 'performance', 'of', '200,000', 'trillion', 'calculations', 'per', 'second', ',', 'it', 'is', 'over', 'twice', 'as', 'fast', 'as', 'Sunway', 'TaihuLight', ',', 'which', 'is', 'capable', 'of', '93,000', 'trillion', 'calculations', 'per', 'second', '.', 'Summit', 'has', '4,608', 'servers', ',', 'which', 'reportedly', 'take', 'up', 'the', 'size', 'of', 'two', 'tennis', 'courts', '.']


## Removing HTML tags & noise

In [12]:
import requests

data = requests.get('http://www.gutenberg.org/cache/epub/8001/pg8001.html')
content = data.text
print(content[2745:3948])

************************* */
hr {
    width: 45%;
    /* adjust to ape original work */
    margin-top: 1em;
    /* space above & below */
    margin-bottom: 1em;
    margin-left: auto;
    /* these two ensure a.. */
    margin-right: auto;
    /* ..centered rule */
    clear: both;
    /* don't let sidebars & floats overlap rule */
    }
/* ************************************************************************
 * Images and captions
 * ********************************************************************** */
img {
    /* the default inline image has */
    border: 1px solid black;
    /* a thin black line border.. */
    padding: 6px;
    /* ..spaced a bit out from the graphic */
    }</style>
<link rel="schema.dc" href="http://purl.org/dc/elements/1.1/">
<link rel="schema.dcterms" href="http://purl.org/dc/terms/">
<meta name="dc.title" content="The Bible, King James version, Book 1: Genesis">
<meta name="dc.language" content="en">
<meta name="dcterms.sou

In [13]:
import re 
from bs4 import BeautifulSoup

def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    [s.extract() for s in soup(["iframe", "script"])]
    stripped_text = soup.get_text()
    stripped_text = re.sub(r"[\r|\n|\r\n]+", "\n", stripped_text)
    return stripped_text

clean_content = strip_html_tags(content)
print(clean_content[1163:1957])



    /* or left?? */
    text-indent: 1em;
    /* first-line indent */
    }
/* suppress indentation on paragraphs following heads  */
h2 + p, h3 + p, h4 + p {
    text-indent: 0
    }
/* tighter spacing for list item paragraphs */
dd, li {
    margin-top: 0.25em;
    margin-bottom: 0;
    line-height: 1.2em;
    /* a bit closer than p's */
    }
/* ************************************************************************
 * Head 2 is for chapter heads. 
 * ********************************************************************** */
h2 {
    /* text-align:center;  left-aligned by default. */
    margin-top: 3em;
    /* extra space above.. */
    margin-bottom: 2em;
    /* ..and below */
    clear: both;
    /* don't let sidebars overlap */
    }
/* ***************************************


## Removing Accented Characters

In [14]:
import unicodedata

def remove_accented_chars(text):
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8", "ignore")
    return text

In [15]:
s = 'Sómě Áccěntěd těxt'
s

'Sómě Áccěntěd těxt'

In [16]:
remove_accented_chars(s)

'Some Accented text'

## Removing Special Characters, Numbers and Symbols

In [21]:
import re

def remove_special_characters(text, remove_digits=False):
    pattern = r"[^a-ZA-Z0-9\s]" if not remove_digits else r"[^a-zA-Z\s]"
    text = re.sub(pattern, "", text)
    return text

In [22]:
s = "Well this was fun! See you at 7:30, What do you think!!? #$@@9318@ 🙂🙂🙂"
s

'Well this was fun! See you at 7:30, What do you think!!? #$@@9318@ 🙂🙂🙂'

In [23]:
remove_special_characters(s, remove_digits=True)

'Well this was fun See you at  What do you think  '

## Expanding Contractions

In [25]:
!pip install contractions
!pip install textsearch

Collecting contractions
  Downloading contractions-0.1.72-py2.py3-none-any.whl (8.3 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
[K     |████████████████████████████████| 287 kB 12.8 MB/s 
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[K     |████████████████████████████████| 106 kB 92.3 MB/s 
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.1 contractions-0.1.72 pyahocorasick-1.4.4 textsearch-0.0.21


In [26]:
s = "Y'all can't expand contractions I'd think! You wouldn't be able to. How'd you do it?"
s

"Y'all can't expand contractions I'd think! You wouldn't be able to. How'd you do it?"

In [27]:
import contractions

list(contractions.contractions_dict.items())[:10]

[("I'm", 'I am'),
 ("I'm'a", 'I am about to'),
 ("I'm'o", 'I am going to'),
 ("I've", 'I have'),
 ("I'll", 'I will'),
 ("I'll've", 'I will have'),
 ("I'd", 'I would'),
 ("I'd've", 'I would have'),
 ('Whatcha', 'What are you'),
 ("amn't", 'am not')]

In [29]:
contractions.fix(s)

'You all cannot expand contractions I would think! You would not be able to. How did you do it?'

## Stemming

In [30]:
# Porter Stemmer

from nltk.stem import PorterStemmer
ps = PorterStemmer()

ps.stem("jumping"), ps.stem("jumps"), ps.stem("jumped")

('jump', 'jump', 'jump')

In [32]:
ps.stem("lying")

'lie'

In [33]:
ps.stem("strange")

'strang'

## Lemmatization

In [34]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [35]:
help(wnl.lemmatize)

Help on method lemmatize in module nltk.stem.wordnet:

lemmatize(word, pos='n') method of nltk.stem.wordnet.WordNetLemmatizer instance



In [36]:
# Lemmatize nouns

print(wnl.lemmatize("cars", "n"))
print(wnl.lemmatize("boxes", "n"))

car
box


In [37]:
# Lemmatize verbs

print(wnl.lemmatize("running", "v"))
print(wnl.lemmatize("ate", "v"))

run
eat


In [38]:
# Lemmatize adjectives

print(wnl.lemmatize("saddest", "a"))
print(wnl.lemmatize("fancier", "a"))

sad
fancy


In [39]:
# Ineffective Lemmatization

print(wnl.lemmatize("ate", "n"))
print(wnl.lemmatize("fancier", "v"))
print(wnl.lemmatize('fancier'))

ate
fancier
fancier


In [42]:
s = 'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'

## Tokenize

In [43]:
tokens = nltk.word_tokenize(s)
print(tokens)

['The', 'brown', 'foxes', 'are', 'quick', 'and', 'they', 'are', 'jumping', 'over', 'the', 'sleeping', 'lazy', 'dogs', '!']


In [44]:
lemmatized_text = " ".join(wnl.lemmatize(token) for token in tokens)
lemmatized_text

'The brown fox are quick and they are jumping over the sleeping lazy dog !'

## POS Tagging

In [45]:
tagged_tokens = nltk.pos_tag(tokens)
print(tagged_tokens)

[('The', 'DT'), ('brown', 'JJ'), ('foxes', 'NNS'), ('are', 'VBP'), ('quick', 'JJ'), ('and', 'CC'), ('they', 'PRP'), ('are', 'VBP'), ('jumping', 'VBG'), ('over', 'IN'), ('the', 'DT'), ('sleeping', 'VBG'), ('lazy', 'JJ'), ('dogs', 'NNS'), ('!', '.')]


## Tag conversion to WordNet Tags

In [55]:
from nltk.corpus import wordnet

def pos_tag_wordnet(tagged_tokens):
    tag_map = {'j': wordnet.ADJ, 'v': wordnet.VERB, 'n': wordnet.NOUN, 'r': wordnet.ADV}
    new_tagged_tokens = [(word, tag_map.get(tag[0].lower(), wordnet.NOUN))
                            for word, tag in tagged_tokens]
    return new_tagged_tokens


In [56]:
wordnet_tokens = pos_tag_wordnet(tagged_tokens)
print(wordnet_tokens)

[('The', 'n'), ('brown', 'a'), ('foxes', 'n'), ('are', 'v'), ('quick', 'a'), ('and', 'n'), ('they', 'n'), ('are', 'v'), ('jumping', 'v'), ('over', 'n'), ('the', 'n'), ('sleeping', 'v'), ('lazy', 'a'), ('dogs', 'n'), ('!', 'n')]


## Effective Lemmatization

In [57]:
lemmatized_text = " ".join(wnl.lemmatize(word, tag) for word, tag in wordnet_tokens)
lemmatized_text

'The brown fox be quick and they be jump over the sleep lazy dog !'

## Your turn: Define a function such that you put all the above steps together so that it does the following
Function name is wordnet_lemmatize_text(...)
Input is a variable text which should take in a document (bunch of words)
Call the earlier defined functions and utilize them
Return lemmatized text as the output (as a string)


In [58]:
wnl = WordNetLemmatizer()

def wordnet_lemmatize_text(text):
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    wordnet_tokens = pos_tag_wordnet(tagged_tokens)
    lemmatized_text = " ".join(wnl.lemmatize(word, tag) for word, tag in wordnet_tokens)
    return lemmatized_text

## Your Turn: Now call the function on the below sentence and test it


In [59]:
s

'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'

In [60]:
wordnet_lemmatize_text(s)

'The brown fox be quick and they be jump over the sleep lazy dog !'

## Lemmatization with Spacy

In [62]:
import spacy
nlp = spacy.load('en', parse=False, tag=False, entity=False)

def spacy_lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [63]:
s

'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'

In [64]:
spacy_lemmatize_text(s)

'the brown fox be quick and they be jump over the sleep lazy dog !'

## Stopword Removal

In [65]:
def remove_stopwords(text, is_lower_case=False, stopwords=None):
    if not stopwords:
        stopwords = nltk.corpus.stopwords.words('english')
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
    
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text


In [66]:
stop_words = nltk.corpus.stopwords.words("english")
print(stop_words[:10])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [67]:
s

'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'

In [68]:
remove_stopwords(s, is_lower_case=False)

'brown foxes quick jumping sleeping lazy dogs !'

## Your turn: Remove the words 'the' and 'brown' from the stop_words list and call the function with this new list


In [69]:
stop_words.remove("the")
stop_words.append("brown")

In [70]:
remove_stopwords(s, is_lower_case=False, stopwords=stop_words)

'The foxes quick jumping the sleeping lazy dogs !'