# NLP - Text Normalization
### With NLTK and spaCy

Created by Andres Segura-Tinoco  
Created on May 15, 2021

In [None]:
# Import libraries
import spacy
from spacy.lang.en import English

In [None]:
# Verify installed spacy version
spacy.__version__

'2.2.4'

## <span>1. Create NLP model</span>

In [None]:
# Document composed of a paragraph from the book The Adventures of Sherlock Holmes, by Arthur Conan Doyle
book_en = """
          To Sherlock Holmes she is always _the_ woman. I have seldom heard him
          mention her under any other name. In his eyes she eclipses and
          predominates the whole of her sex. It was not that he felt any emotion
          akin to love for Irene Adler. All emotions, and that one particularly,
          were abhorrent to his cold, precise but admirably balanced mind. He
          was, I take it, the most perfect reasoning and observing machine that
          the world has seen, but as a lover he would have placed himself in a
          false position. He never spoke of the softer passions, save with a gibe
          and a sneer. They were admirable things for the observer—excellent for
          drawing the veil from men’s motives and actions. But for the trained
          reasoner to admit such intrusions into his own delicate and finely
          adjusted temperament was to introduce a distracting factor which might
          throw a doubt upon all his mental results. Grit in a sensitive
          instrument, or a crack in one of his own high-power lenses, would not
          be more disturbing than a strong emotion in a nature such as his. And
          yet there was but one woman to him, and that woman was the late Irene
          Adler, of dubious and questionable memory.
          """

# Data quality
book_en = book_en.replace("\n          ", "").lower()
book_en

'to sherlock holmes she is always _the_ woman. i have seldom heard himmention her under any other name. in his eyes she eclipses andpredominates the whole of her sex. it was not that he felt any emotionakin to love for irene adler. all emotions, and that one particularly,were abhorrent to his cold, precise but admirably balanced mind. hewas, i take it, the most perfect reasoning and observing machine thatthe world has seen, but as a lover he would have placed himself in afalse position. he never spoke of the softer passions, save with a gibeand a sneer. they were admirable things for the observer—excellent fordrawing the veil from men’s motives and actions. but for the trainedreasoner to admit such intrusions into his own delicate and finelyadjusted temperament was to introduce a distracting factor which mightthrow a doubt upon all his mental results. grit in a sensitiveinstrument, or a crack in one of his own high-power lenses, would notbe more disturbing than a strong emotion in a na

In [None]:
# Create NLP model for English language
nlp = spacy.load('en_core_web_sm')
doc_en = nlp(book_en)

## <span>2. Remove Stopwords</span>

In [None]:
# Bag of words
doc_sw = []
doc_words = []

for token in doc_en:
  word = str(token).replace(',', '').strip()

  if not token.is_stop and len(word) > 2:
    if word not in doc_words:
      doc_words.append(word)
  else:
    if word not in doc_sw:
      doc_sw.append(word)

print('Count of Stopwords:', len(doc_sw))
print('Count of words:', len(doc_words))

Count of Stopwords: 54
Count of words: 80


## <span>3. Stemming</span>

Process in which terms are transformed to their root in order to reduce the size of the vocabulary. It is carried by applying word reduction rules.

Two of the most common stemming algorithms are:
- Porter
- Snowball

In [None]:
from nltk.stem.porter import *

### 3.1. Porter Stemmer

In [None]:
stemmer = PorterStemmer()

for word in doc_words:
  root_word = stemmer.stem(word)
  if word != root_word:
    print(word + ' --> ' + root_word)

holmes --> holm
himmention --> himment
eyes --> eye
eclipses --> eclips
andpredominates --> andpredomin
irene --> iren
emotions --> emot
particularly --> particularli
abhorrent --> abhorr
precise --> precis
admirably --> admir
balanced --> balanc
hewas --> hewa
reasoning --> reason
observing --> observ
machine --> machin
thatthe --> thatth
placed --> place
afalse --> afals
position --> posit
passions --> passion
admirable --> admir
things --> thing
observer --> observ
excellent --> excel
fordrawing --> fordraw
motives --> motiv
actions --> action
trainedreasoner --> trainedreason
intrusions --> intrus
delicate --> delic
finelyadjusted --> finelyadjust
temperament --> tempera
introduce --> introduc
distracting --> distract
results --> result
sensitiveinstrument --> sensitiveinstru
lenses --> lens
notbe --> notb
disturbing --> disturb
emotion --> emot
nature --> natur
ireneadler --> ireneadl
dubious --> dubiou
questionable --> question
memory --> memori


### 3.2. Snowball Stemmer

In [None]:
from nltk.stem.snowball import SnowballStemmer

In [None]:
stemmer = SnowballStemmer(language='english')

for word in doc_words:
  root_word = stemmer.stem(word)
  if word != root_word:
    print(word + ' --> ' + root_word)

holmes --> holm
himmention --> himment
eyes --> eye
eclipses --> eclips
andpredominates --> andpredomin
irene --> iren
emotions --> emot
particularly --> particular
abhorrent --> abhorr
precise --> precis
admirably --> admir
balanced --> balanc
hewas --> hewa
reasoning --> reason
observing --> observ
machine --> machin
thatthe --> thatth
placed --> place
afalse --> afals
position --> posit
passions --> passion
admirable --> admir
things --> thing
observer --> observ
excellent --> excel
fordrawing --> fordraw
motives --> motiv
actions --> action
trainedreasoner --> trainedreason
intrusions --> intrus
delicate --> delic
finelyadjusted --> finelyadjust
temperament --> tempera
introduce --> introduc
distracting --> distract
results --> result
sensitiveinstrument --> sensitiveinstru
lenses --> lens
notbe --> notb
disturbing --> disturb
emotion --> emot
nature --> natur
ireneadler --> ireneadl
questionable --> question
memory --> memori


## <span>4. Lemmatization</span>

It performs a morphological analysis using reference dictionaries to create equivalence classes between words.

For example, for the token “eclipses”, a Stmm rule would return the term “eclips“, while through Lmmt we would get the term “eclipse“.

In [None]:
for token in doc_en:
  word = str(token).replace(',', '').strip()

  if not token.is_stop and len(word) > 2:
    root_word = token.lemma_
    if word != root_word:
      print(word + ' --> ' + root_word)

heard --> hear
eyes --> eye
eclipses --> eclipse
andpredominates --> andpredominate
felt --> feel
emotions --> emotion
observing --> observe
seen --> see
placed --> place
spoke --> speak
softer --> soft
passions --> passion
things --> thing
men --> man
motives --> motive
actions --> action
intrusions --> intrusion
results --> result
lenses --> lense


<hr>
You can contact me on <a href="https://twitter.com/SeguraAndres7" target="_blank">Twitter</a> | <a href="https://github.com/ansegura7/" target="_blank">GitHub</a> | <a href="https://www.linkedin.com/in/andres-segura-tinoco/" target="_blank">LinkedIn</a>