# Natural Language Processing
## Part 3: Stemming vs Lemmatization

This Jupyter notebook demonstrates how to process text data before running machine learning and rule-based algorithms, comparing stemming with lemmatization.

In [1]:
# Import NLTK and fetch the WordNet corpus
import nltk
nltk.download('wordnet')

# Import the stem package (the Porter stemmer is reached through it below)
from nltk import stem
from nltk.corpus import wordnet

# Import the WordNet lemmatizer class
from nltk.stem import WordNetLemmatizer 

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/avielstern/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Stemmer: Porter algorithm
porter = stem.PorterStemmer()

# Lemmatizer: WordNet-backed
lem = WordNetLemmatizer()

In [3]:
word_list1 = ['play', 'playing', 'played']
word_list2 = ['feet', 'foot', 'foots', 'footing']
word_list3 = ['organize', 'organizing', 'organization']
word_list4 = ['benefactor', 'benevolent', 'beneficial']
word_list5 = ['universe', 'university']

# Each hand-aligned label is paired with the word group it describes;
# the Porter stem of every word in the group is printed next to the label.
stem_demo = [
    ("['play', 'playing', 'played'] -------------------->", word_list1),
    ("['feet', 'foot', 'foots', 'footing'] -------------> ", word_list2),
    ("['organize', 'organizing', 'organization'] -------> ", word_list3),
    ("['benefactor', 'benevolent', 'beneficial'] -------> ", word_list4),
    ("['universe', 'university'] -------> ", word_list5),
]
for label, group in stem_demo:
    print(label, list(map(porter.stem, group)))

['play', 'playing', 'played'] --------------------> ['play', 'play', 'play']
['feet', 'foot', 'foots', 'footing'] ------------->  ['feet', 'foot', 'foot', 'foot']
['organize', 'organizing', 'organization'] ------->  ['organ', 'organ', 'organ']
['benefactor', 'benevolent', 'beneficial'] ------->  ['benefactor', 'benevol', 'benefici']
['universe', 'university'] ------->  ['univers', 'univers']


In [4]:
# Stem two sample words with a freshly constructed Porter stemmer
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
word1, word2 = 'cars', 'revolution'
print(*(stemmer.stem(sample) for sample in (word1, word2)))

car revolut


Excerpt From: Sowmya Vajjala, Bodhisattwa Majumder, Anuj Gupta & Harshit Surana. “Practical Natural Language Processing.” Apple Books. https://books.apple.com/us/book/practical-natural-language-processing/id1519103544

In [5]:
# Same word groups as the stemming demo, now passed through the WordNet
# lemmatizer with its default part of speech.
lemma_demo = [
    ("['play', 'playing', 'played'] -------------------->", word_list1),
    ("['feet', 'foot', 'foots', 'footing'] -------------> ", word_list2),
    ("['organize', 'organizing', 'organization'] -------> ", word_list3),
    ("['benefactor', 'benevolent', 'beneficial'] -------> ", word_list4),
    ("['universe', 'university'] -------> ", word_list5),
]
for label, group in lemma_demo:
    print(label, list(map(lem.lemmatize, group)))

['play', 'playing', 'played'] --------------------> ['play', 'playing', 'played']
['feet', 'foot', 'foots', 'footing'] ------------->  ['foot', 'foot', 'foot', 'footing']
['organize', 'organizing', 'organization'] ------->  ['organize', 'organizing', 'organization']
['benefactor', 'benevolent', 'beneficial'] ------->  ['benefactor', 'benevolent', 'beneficial']
['universe', 'university'] ------->  ['universe', 'university']


In [7]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# pos="a" asks for the adjective reading of the word
better_as_adjective = lemmatizer.lemmatize("better", pos="a")
print(better_as_adjective)

good


In [9]:
# "a" is the adjective part-of-speech code, passed positionally here
print(lemmatizer.lemmatize("better", "a"))

good


In [10]:
# "n" is the noun part-of-speech code (not adjective), passed positionally here
print(lemmatizer.lemmatize("better", "n"))

better


In [11]:
# No part of speech supplied, so the lemmatizer's default is used
better_default = lemmatizer.lemmatize("better")
print(better_default)

better


In [12]:
# Plural nouns lemmatized with the default part of speech, one result per line
for plural in ("ponies", "caresses", "cats"):
    print(lemmatizer.lemmatize(plural))

pony
caress
cat


**Conjugation of 'To Be'**

    Present Tense:
        I am 
        You are	
        He/She/It is
        
        We are
        You are
        They are
        
    Past Tense:
        I was	
        You were	
        He/She/It was
        
        We were
        You were
        They were


In [13]:
# Lemmatize conjugated forms of "to be"; pos="v" marks each form as a verb
to_be_examples = [
    ('I am --->', "am"),
    ('You are -->', "are"),
    ('He is -->', "is"),
    ('They were -->', "were"),
]
for phrase, verb_form in to_be_examples:
    print(phrase, 'To', lemmatizer.lemmatize(verb_form, pos="v"))

I am ---> To be
You are --> To be
He is --> To be
They were --> To be


Different conjugated forms of "run":

    I run
    She runs
    We are running
    They ran


In [14]:
# Forms of "run" lemmatized with pos="v" (verb)
for verb_form in ("run", "runs", "ran", "running"):
    print(lemmatizer.lemmatize(verb_form, pos="v"))

run
run
run
run


In [16]:
# The same forms of "run", this time without a part-of-speech hint
for verb_form in ("run", "runs", "ran", "running"):
    print(lemmatizer.lemmatize(verb_form))

run
run
ran
running


In [17]:
# Stem the present-tense forms of "to be" with the Porter stemmer.
# The label is printed from the list itself so it cannot drift from the input
# (the previous hard-coded label described a different word list).
word_list = ['am', 'are', 'is']
print(word_list, '-------------------->', [porter.stem(word) for word in word_list])

['play', 'playing', 'played'] --------------------> ['am', 'are', 'is']


Excerpt From: Sowmya Vajjala, Bodhisattwa Majumder, Anuj Gupta & Harshit Surana. “Practical Natural Language Processing.” Apple Books. https://books.apple.com/us/book/practical-natural-language-processing/id1519103544

More complicated, unusual verb conjugations

In [18]:
# Irregular past forms lemmatized as verbs (pos='v').
# The printed label is the exact word passed to the lemmatizer, which fixes the
# earlier 'witheld' label that did not match the 'withheld' input.
for past_form in ('beheld', 'withheld', 'flung'):
    print(past_form, lem.lemmatize(past_form, pos = 'v'))

beheld behold
witheld withhold
flung fling


Adjectives

In [52]:
# Words ending in "-ly": 'timely' is looked up with pos='r' and 'actively' with pos='a'
# NOTE(review): 'timely' is usually an adjective and 'actively' an adverb, so the
# two pos codes look swapped — confirm the intent.
for ly_word, wn_pos in (('timely', 'r'), ('actively', 'a')):
    print(ly_word, lem.lemmatize(ly_word, pos = wn_pos))

timely timely
actively actively


In [110]:
# Comparative / superlative adjective pairs, each lemmatized with pos='a':
#   closer / closest, smaller / smallest, drier / driest
adjective_forms = (
    'closer', 'closest',
    'smaller', 'smallest',
    'drier', 'driest',
)
for adjective in adjective_forms:
    print(adjective, lem.lemmatize(adjective, pos = 'a'))

closer close
closest close
smaller small
smallest small
drier dry
driest dry


Adverbs

In [109]:
# Adverb forms lemmatized with pos='r':
#   farther / farthest (comparative / superlative), loudly / loudest
for adverb in ('farther', 'farthest', 'loudly', 'loudest'):
    print(adverb, lem.lemmatize(adverb, pos = 'r'))

farther far
farthest farthest
loudly loudly
loudest loudest


Unusual verb conjugation: https://www.theenglishspace.com/grammar/glossary/irregular-verbs/unusual-irregular-verbs.html


When part of speech is important:

In [19]:
# Look up 'flung' first as a noun, then as a verb:
#   noun usage - "One final fling until I go back to school"
#   verb usage - "Fling a rubber band"
for wn_pos in ('n', 'v'):
    print('flung', lem.lemmatize('flung', pos = wn_pos))

flung flung
flung fling


"Fling" as a noun means something completely different from "fling" as a verb; however, both are grouped together as the same word, which can cause errors when running a model.

In [1]:
import spacy

# Load the small English pipeline
sp = spacy.load("en_core_web_sm")

words = ['better','ran', 'are', 'running', 'were', 'shared', 'organize', 'university', 
         'awoken', 'arose', 'beheld', 'sped', 'withhold', 'flung', 'cats', 'timely', 'actively', 'tighter', 'smaller', 'farther', 'driest', 'farthest', 'loudly']

# Run each word through the pipeline on its own and collect the lemma of every
# token it yields, in order.
spacy_words = [piece.lemma_ for text in words for piece in sp(text)]
    

OSError: [E050] Can't find model 'en'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [343]:
# Lemmatize `words` with NLTK, using the part of speech predicted by nltk.pos_tag.
#
# Produces:
#   tags       - (word, Penn tag) pairs returned by nltk.pos_tag
#   tag        - one WordNet POS code per entry of `words`
#   word_list  - [word, WordNet POS code] pairs
#   nltk_words - one lemma per entry of `words`

# Penn Treebank tag -> WordNet POS code accepted by the lemmatizer
PENN_TO_WORDNET = {
    'JJ': 'a', 'JJR': 'a', 'JJS': 'a',
    'VBP': 'v', 'VBG': 'v', 'VBD': 'v', 'VBN': 'v',
    'NN': 'n', 'NNS': 'n',
    'RB': 'r',
}

tags = nltk.pos_tag(words)

# Read the tags positionally from the (word, tag) pairs. Going through
# dict(tags) would collapse repeated words and leave fewer tags than words,
# silently misaligning every later pairing.
# Tags without a mapping fall back to 'n' instead of being handed to the
# lemmatizer as raw Penn tags, which are not WordNet POS codes.
tag = [PENN_TO_WORDNET.get(penn_tag, 'n') for _, penn_tag in tags]

word_list = [[text, wn_pos] for text, wn_pos in zip(words, tag)]

nltk_words = []

for text, wn_pos in word_list:
    lemma = lem.lemmatize(text, pos = wn_pos)
    print(text, '-->', lemma)
    nltk_words.append(lemma)

better --> good
ran --> ran
are --> be
running --> run
were --> be
shared --> share
organize --> organize
university --> university
awoken --> awake
arose --> arose
beheld --> beheld
sped --> speed
withhold --> withhold
flung --> flung
cats --> cat
timely --> timely
actively --> actively
tighter --> tight
smaller --> small
farther --> farther
driest --> driest
farthest --> farthest
loudly --> loudly


In [352]:
# Print every position where the NLTK and spaCy lemmas disagree.
# The two result lists are iterated in lockstep; the previous loop bound used
# `lem_words`, a name that is never defined in this notebook.
for nltk_lemma, spacy_lemma in zip(nltk_words, spacy_words):
    if nltk_lemma != spacy_lemma:
        print('NLTK:', nltk_lemma, ', Spacy:', spacy_lemma)

NLTK: good , Spacy: well
NLTK: ran , Spacy: run
NLTK: arose , Spacy: arise
NLTK: farther , Spacy: far
NLTK: driest , Spacy: dry


Excerpt From: Sowmya Vajjala, Bodhisattwa Majumder, Anuj Gupta & Harshit Surana. “Practical Natural Language Processing.” Apple Books. https://books.apple.com/us/book/practical-natural-language-processing/id1519103544