# Lemmatization 

Lemmatization returns the canonical (dictionary) form of a word, known as its lemma.

In [10]:
import nltk
from nltk.stem import WordNetLemmatizer

In [2]:
# A single lemmatizer instance, reused by the cells further down.
lemmatizer = WordNetLemmatizer()
words = 'duck geese cats books'.split()
# Only the word is passed, so lemmatize() uses its default part of speech.
lemmatized_words = list(map(lemmatizer.lemmatize, words))
lemmatized_words

['duck', 'goose', 'cat', 'book']

In [20]:
# lemmatize() takes a part-of-speech argument that defaults to noun. To
# lemmatize a verb or an adjective, you have to set it explicitly.
print(lemmatizer.lemmatize('loved', pos='v'))  # verb
print(lemmatizer.lemmatize('loved'))  # default part of speech (noun)
print(lemmatizer.lemmatize('worse', pos='a'))  # adjective

love
loved
bad


## Combining part of speech with lemmatization

In [5]:
# Tags produced by the POS tagger, grouped by the one-letter part-of-speech
# code that is handed to the lemmatizer. Tags not listed here are left as-is
# by map_pos().
pos_mapping = {
    **dict.fromkeys(('JJ', 'JJR', 'JJS'), 'a'),                  # adjectives
    **dict.fromkeys(('NN', 'NNS'), 'n'),                         # nouns
    **dict.fromkeys(('VBD', 'VBG', 'VBN', 'VBP', 'VBZ'), 'v'),   # verbs
}
# One-letter codes that are passed to the lemmatizer unchanged.
accepted_pos = {'n', 'v', 'a'}

In [30]:
def map_pos(word):
    """Translate a POS tag into its one-letter code from ``pos_mapping``.

    Despite its name, ``word`` is the tag to look up (the second element of
    a tagged token), not the token text. Tags without an entry in
    ``pos_mapping`` are returned unchanged.
    """
    return pos_mapping.get(word, word)

def lemmatize_long_text(text):
    """Tokenize, POS-tag and lemmatize ``text``.

    Each token's tag is passed through ``map_pos``. The token is then
    lemmatized with the module-level ``lemmatizer``, using the mapped tag
    when it is one of ``accepted_pos`` and ``'n'`` otherwise.

    Returns a list of ``(lemma, mapped_tag)`` tuples in token order.
    """
    tagged_tokens = nltk.pos_tag(nltk.tokenize.word_tokenize(text))
    lemmas = []
    for token, raw_tag in tagged_tokens:
        tag = map_pos(raw_tag)
        # Any tag outside accepted_pos is lemmatized as a noun.
        lemma_pos = tag if tag in accepted_pos else 'n'
        lemmas.append((lemmatizer.lemmatize(token, lemma_pos), tag))
    return lemmas

In [31]:
filename = './data/sherlock_holmes_1.txt'
# Read inside a context manager so the file handle is always closed,
# even if read() raises.
with open(filename, 'r', encoding='utf-8') as file:
    text = file.read()
print(lemmatize_long_text(text))

[('To', 'TO'), ('Sherlock', 'NNP'), ('Holmes', 'NNP'), ('she', 'PRP'), ('be', 'v'), ('always', 'RB'), ('_the_', 'a'), ('woman', 'n'), ('.', '.'), ('I', 'PRP'), ('have', 'v'), ('seldom', 'v'), ('heard', 'RB'), ('him', 'PRP'), ('mention', 'VB'), ('her', 'PRP'), ('under', 'IN'), ('any', 'DT'), ('other', 'a'), ('name', 'n'), ('.', '.'), ('In', 'IN'), ('his', 'PRP$'), ('eye', 'n'), ('she', 'PRP'), ('eclipse', 'v'), ('and', 'CC'), ('predominate', 'v'), ('the', 'DT'), ('whole', 'n'), ('of', 'IN'), ('her', 'PRP$'), ('sex', 'n'), ('.', '.'), ('It', 'PRP'), ('be', 'v'), ('not', 'RB'), ('that', 'IN'), ('he', 'PRP'), ('felt', 'v'), ('any', 'DT'), ('emotion', 'n'), ('akin', 'n'), ('to', 'TO'), ('love', 'VB'), ('for', 'IN'), ('Irene', 'NNP'), ('Adler', 'NNP'), ('.', '.'), ('All', 'DT'), ('emotion', 'n'), (',', ','), ('and', 'CC'), ('that', 'IN'), ('one', 'CD'), ('particularly', 'RB'), (',', ','), ('be', 'v'), ('abhorrent', 'a'), ('to', 'TO'), ('his', 'PRP$'), ('cold', 'n'), (',', ','), ('precise', '

In [32]:
# Quick check on a short sentence: the result is a list of (lemma, tag) pairs.
# Tags with no entry in pos_mapping are reported unchanged.
lemmatize_long_text('She will be loved')

[('She', 'PRP'), ('will', 'MD'), ('be', 'VB'), ('love', 'v')]