# Text Normalization


In [None]:
import spacy
import unicodedata
import re
import nltk
import collections
from nltk.tokenize.toktok import ToktokTokenizer

## HTML Tags

In [None]:
import requests
from bs4 import BeautifulSoup

# Download the raw HTML of the Wikipedia article on Lucerne.
# Note: you've just learnt another way to pull data from Wikipedia
# A timeout keeps the cell from hanging forever if the server never answers.
data = requests.get('https://en.wikipedia.org/wiki/Lucerne', timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the article.
data.raise_for_status()
content = data.content  # response body as bytes
print(content[:500])

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Lucerne - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"afd1fa9c-3543-429f-9b7c-581e88879806","wgCSPN'


In [None]:
def strip_html_tags(text):
    """Return the plain text of the ``mw-content-text`` div(s) of an HTML page.

    Args:
        text: HTML markup to parse (passed straight to ``BeautifulSoup``).

    Returns:
        The text of every ``<div id="mw-content-text">`` element, joined by
        single spaces, with each run of carriage returns / line feeds
        collapsed into one newline. An empty string if no such div exists.
    """
    soup = BeautifulSoup(text, 'html.parser')
    ## You can include more HTML preprocessing here to better the output.
    # `find_all` is the current Beautiful Soup name; `findAll` is a legacy alias.
    content_divs = soup.find_all(name='div', attrs={'id': 'mw-content-text'})
    stripped_text = ' '.join(div.get_text() for div in content_divs)
    # Inside a character class `|` is a literal pipe, not alternation, so the
    # class must list only the line-break characters; otherwise every '|' in
    # the page text would be replaced by a newline as well.
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    return stripped_text

# Extract the text of the downloaded page and preview its first 5000 characters.
clean_content = strip_html_tags(content)
print(clean_content[:5000])

City in Switzerland
.mw-parser-output .hatnote{font-style:italic}.mw-parser-output div.hatnote{padding-left:1.6em;margin-bottom:0.5em}.mw-parser-output .hatnote i{font-style:normal}.mw-parser-output .hatnote+link+.hatnote{margin-top:-0.5em}For other uses, see Lucerne (disambiguation).
Municipality in Switzerland.mw-parser-output .infobox-subbox{padding:0;border:none;margin:-3px;width:auto;min-width:100%;font-size:100%;clear:none;float:none;background-color:transparent}.mw-parser-output .infobox-3cols-child{margin:auto}.mw-parser-output .ib-settlement{width:23em;border-collapse:collapse;line-height:1.2em}.mw-parser-output .ib-settlement td,.mw-parser-output .ib-settlement th{border-top:1px solid #a2a9b1;padding:0.4em 0.6em 0.4em 0.6em}.mw-parser-output .ib-settlement .mergedtoprow .infobox-full-data,.mw-parser-output .ib-settlement .mergedtoprow .infobox-header,.mw-parser-output .ib-settlement .mergedtoprow .infobox-data,.mw-parser-output .ib-settlement .mergedtoprow .infobox-label,.mw-

💬 Discuss what else you would need to remove to clean up the Wikipedia page.

## Stemming

- Stemming is the process where we standardize word forms into their base stem irrespective of their inflections.
- `nltk` provides several popular stemmers for English:
    - `nltk.stem.PorterStemmer`
    - `nltk.stem.LancasterStemmer`
    - `nltk.stem.RegexpStemmer`
    - `nltk.stem.SnowballStemmer`

- We can compare the results of different stemmers.

In [None]:
from nltk.stem import PorterStemmer, LancasterStemmer, RegexpStemmer, SnowballStemmer

# Inflected forms of "jump" used to compare the stemmers.
words = ['jumping', 'jumps', 'jumped', 'jumpy']
# One instance of each stemmer under comparison.
ps = PorterStemmer()
ls = LancasterStemmer()
ss = SnowballStemmer('english')
# The pattern lists the suffixes to strip: a trailing "ing", "s", "ed" or "y".
rs = RegexpStemmer('ing$|s$|ed$|y$', min=4) # min: minimum length a string must have to be stemmed


In [None]:
# Stem every sample word with the Porter stemmer.
[ps.stem(w) for w in words]

['jump', 'jump', 'jump', 'jumpi']

In [None]:
# Stem every sample word with the Lancaster stemmer.
[ls.stem(w) for w in words]

['jump', 'jump', 'jump', 'jumpy']

In [None]:
# Stem every sample word with the English Snowball stemmer.
[ss.stem(w) for w in words]

['jump', 'jump', 'jump', 'jumpi']

In [None]:
# Stem every sample word with the regular-expression stemmer.
[rs.stem(w) for w in words]

['jump', 'jump', 'jump', 'jump']

💬  Discuss what differences you can observe between the different stemmers.

## Lemmatization


- Lemmatization is similar to Stemming.
- It is also a process where we remove word affixes, but the result is the **root word** (the lemma) rather than the **root stem**.
- These root words, i.e., lemmas, are lexicographically correct words and always present in the dictionary. We often refer to them as dictionary words.

In [None]:
# Load the small English pipeline without the components this notebook does not use.
# spaCy's pipeline components are named 'parser' and 'ner'; 'parse' and 'entity'
# do not match any component name, so they would not disable anything.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [None]:
# Example sentence; note that the word "like" occurs twice.
text = 'The next class will be, like today, online. Let me know if you would not like that.'
# Run the spaCy pipeline on the sentence to obtain its annotated tokens.
text_tagged = nlp(text)

In [None]:
# Discuss the output with your group and try to identify the POS tags. Are they all correct?
# Each token is printed on its own line as text/lemma/POS.
for t in text_tagged:
    print('/'.join([t.text, t.lemma_, t.pos_]))

The/the/DET
next/next/ADJ
class/class/NOUN
will/will/VERB
be/be/AUX
,/,/PUNCT
like/like/SCONJ
today/today/NOUN
,/,/PUNCT
online/online/ADV
././PUNCT
Let/let/VERB
me/-PRON-/PRON
know/know/VERB
if/if/SCONJ
you/-PRON-/PRON
would/would/VERB
not/not/PART
like/like/VERB
that/that/DET
././PUNCT


In [None]:
def lemmatize_text(text):
    """Return `text` with every token replaced by its lemma.

    The string is processed with the module-level `nlp` pipeline and the
    resulting lemmas are joined with single spaces. Tokens whose lemma is
    the placeholder '-PRON-' keep their original surface form instead.
    """
    doc = nlp(text)
    pieces = []
    for token in doc:
        if token.lemma_ == '-PRON-':
            # Keep the written pronoun rather than the generic placeholder.
            pieces.append(token.text)
        else:
            pieces.append(token.lemma_)
    return ' '.join(pieces)

# Lemmatize the example sentence; the notebook displays the returned string.
lemmatize_text('The next class will be, like today, online. Let me know if you would not like that.')


'the next class will be , like today , online . let me know if you would not like that .'

In [None]:
# Now get the stem and the lemma of the following word: 'refrigerate'.
# 💬 Discuss the differences.


## Redundant Whitespaces

- Very often we would see redundant duplicate whitespaces in texts. 
- Sometimes, when we remove special characters (punctuations, digits etc.), we may replace those characters with whitespaces (not empty string), which may lead to duplicate whitespaces in texts.

In [None]:
def remove_redundant_whitespaces(text):
    """Collapse every run of whitespace in `text` into one space and trim both ends."""
    whitespace_run = re.compile(r'\s+')
    single_spaced = whitespace_run.sub(" ", text)
    return single_spaced.strip()

In [None]:
# Print the sample string before and after whitespace normalization.
s = "We are humans     and we   often add          an initial space or  two.  "
print(s)
print (remove_redundant_whitespaces(s))

We are humans     and we   often add          an initial space or  two.  
We are humans and we often add an initial space or two.
