# Lemmatization

Examples of tags:
<br/>"The" is tagged as determiner (DT)
<br/>"quick" is tagged as adjective (JJ)
<br/>"brown" is tagged as adjective (JJ)
<br/>"fox" is tagged as noun (NN)
<br/>"jumps" is tagged as verb (VBZ)
<br/>"over" is tagged as preposition (IN)
<br/>"the" is tagged as determiner (DT)
<br/>"lazy" is tagged as adjective (JJ)
<br/>"dog" is tagged as noun (NN)

import nltk
# Download each NLTK data package by name; nltk.download() is called once
# per entry, in the order listed.
for resource in (
    'punkt_tab',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

In [38]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
# Sample text containing irregular plurals, several verb forms and comparatives.
sentence = "The children were running faster while the geese flew higher over the fallen leaves."

# Single lemmatizer instance shared by the lemmatize() calls further down.
lemmatizer = WordNetLemmatizer()

# Split the sentence into individual tokens.
tokens = word_tokenize(sentence)
print('tokens', tokens)

# Pair every token with its part-of-speech tag.
tagged_tokens = pos_tag(tokens)
print('tagged_token', tagged_tokens)

def get_wordnet_pos(tag):
    """Convert a POS tag to the single-letter WordNet POS code.

    The decision is made on the tag's leading letter:
    'J' -> 'a', 'V' -> 'v', 'N' -> 'n', 'R' -> 'r'.
    Any other tag (including an empty string) falls back to 'n'.
    """
    prefix_to_wordnet = (
        ('J', 'a'),
        ('V', 'v'),
        ('N', 'n'),
        ('R', 'r'),
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return 'n'

# Words that are copied through unchanged instead of being lemmatized.
# Built once as a set so the membership test inside the loop is O(1).
words_kept_as_is = {'is', 'am', 'are'}

# Two parallel results: one lemmatized with the lemmatizer's default POS,
# one lemmatized with the POS derived from each token's tag.
lemmatized_sentence_without_pos = []
lemmatized_sentence_with_pos = []
for word, tag in tagged_tokens:
    if word.lower() in words_kept_as_is:
        # Keep the original word in BOTH lists so each output stays aligned
        # with the input tokens. (The previous code appended to an undefined
        # name here and raised NameError whenever this branch was reached.)
        lemmatized_sentence_without_pos.append(word)
        lemmatized_sentence_with_pos.append(word)
    else:
        lemmatized_sentence_without_pos.append(
            lemmatizer.lemmatize(word))
        lemmatized_sentence_with_pos.append(
            lemmatizer.lemmatize(word, get_wordnet_pos(tag)))

# Show the original sentence followed by both lemmatized variants.
print()
print("Original Sentence: ", sentence)
print()
print("Lemmatized Sentence Without POS: ", ' '.join(lemmatized_sentence_without_pos))
print("Lemmatized Sentence With POS: ", ' '.join(lemmatized_sentence_with_pos))

tokens ['The', 'children', 'were', 'running', 'faster', 'while', 'the', 'geese', 'flew', 'higher', 'over', 'the', 'fallen', 'leaves', '.']
tagged_token [('The', 'DT'), ('children', 'NNS'), ('were', 'VBD'), ('running', 'VBG'), ('faster', 'RB'), ('while', 'IN'), ('the', 'DT'), ('geese', 'JJ'), ('flew', 'VBD'), ('higher', 'JJR'), ('over', 'IN'), ('the', 'DT'), ('fallen', 'VBN'), ('leaves', 'NNS'), ('.', '.')]

Original Sentence:  The children were running faster while the geese flew higher over the fallen leaves.

Lemmatized Sentence Without POS:  The child were running faster while the goose flew higher over the fallen leaf .
Lemmatized Sentence With POS:  The child be run faster while the geese fly high over the fall leaf .


## Tag makes a difference!

The same word can convey different meanings as different parts of speech. Passing the POS tag to the lemmatizer lets it pick the right base form, which retains the essence of each word better than stemming.

In [39]:
word = "leaves"
# Lemmatize the same surface form once per part of speech to compare results.
for label, pos in (('as noun:', 'n'), ('as verb:', 'v')):
    print(label, lemmatizer.lemmatize(word, pos))

as noun: leaf
as verb: leave


In [40]:
word = "left"
# Lemmatize the same word as a noun and as a verb to show how the POS
# argument changes the result.
print('as noun:', lemmatizer.lemmatize(word, 'n'))
print('as verb:', lemmatizer.lemmatize(word, 'v'))

as nound: left
as verb: leave
