# Basics Again

In [1]:
import pandas as pd
import numpy as np
import nltk
import seaborn as sns

In [5]:
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords

In [56]:
text = "I’m Gonna (going to) Make Him $5.5 an offers he can’t refuse easily\n\n. I don't like              violence. I'm a businessman. Blood is a bigger expense. Knowing is not Doing"

In [57]:
# word tokenize uses TreebankWordTokenizer and PunktSentenceTokenizer together. It produces dots for sentence ends only. A dot does not necessarily mean the end of a sentence.
words = word_tokenize(text)
words

['I',
 '’',
 'm',
 'Gon',
 'na',
 '(',
 'going',
 'to',
 ')',
 'Make',
 'Him',
 '$',
 '5.5',
 'an',
 'offers',
 'he',
 'can',
 '’',
 't',
 'refuse',
 'easily',
 '.',
 'I',
 'do',
 "n't",
 'like',
 'violence',
 '.',
 'I',
 "'m",
 'a',
 'businessman',
 '.',
 'Blood',
 'is',
 'a',
 'bigger',
 'expense',
 '.',
 'Knowing',
 'is',
 'not',
 'Doing']

In [53]:
sent_tokenize(text)

['I’m Gonna (going to) Make Him $5.5 an offers he can’t refuse easily.',
 "I don't like violence.",
 "I'm a businessman.",
 'Blood is a bigger expense.',
 'Knowing is not Doing']

In [54]:
# POS 
wordsAndPos = nltk.pos_tag(words) # creates TreeBank POS
wordsAndPos

[('I', 'PRP'),
 ('’', 'VBP'),
 ('m', 'JJ'),
 ('Gon', 'NNP'),
 ('na', 'NN'),
 ('(', '('),
 ('going', 'VBG'),
 ('to', 'TO'),
 (')', ')'),
 ('Make', 'NNP'),
 ('Him', 'NNP'),
 ('$', '$'),
 ('5.5', 'CD'),
 ('an', 'DT'),
 ('offers', 'NNS'),
 ('he', 'PRP'),
 ('can', 'MD'),
 ('’', 'VB'),
 ('t', 'JJ'),
 ('refuse', 'NN'),
 ('easily', 'RB'),
 ('.', '.'),
 ('I', 'PRP'),
 ('do', 'VBP'),
 ("n't", 'RB'),
 ('like', 'VB'),
 ('violence', 'NN'),
 ('.', '.'),
 ('I', 'PRP'),
 ("'m", 'VBP'),
 ('a', 'DT'),
 ('businessman', 'NN'),
 ('.', '.'),
 ('Blood', 'NNP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('bigger', 'JJR'),
 ('expense', 'NN'),
 ('.', '.'),
 ('Knowing', 'NNP'),
 ('is', 'VBZ'),
 ('not', 'RB'),
 ('Doing', 'VBG')]

# Stemmer and Lemmatizers
There are few things to note:
1. Sometimes a word can be a verb or a NP. So, lemmatizer cannot take decision whether to stem it or not. So, in such cases, we need to supply the POS tags
2. It cannot lemmatize unknown words
3. Even when POS is given, if the word is capitalized, lemmatizer may ignore the POS completely!

In [58]:
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("dogs"))
print(lemmatizer.lemmatize("unknowns"))
print(lemmatizer.lemmatize("newers")) # cannot lemmatize it as it's not in the vocabulary


print(lemmatizer.lemmatize("newers", wordnet.VERB))  # cannot lemmatize it as it's not in the vocabulary


print(lemmatizer.lemmatize("knowing"))
print(lemmatizer.lemmatize("knowing", wordnet.VERB))  # converted to know

print(lemmatizer.lemmatize("Doing"))
print(lemmatizer.lemmatize("Doing", wordnet.VERB))  # converted to know

print(lemmatizer.lemmatize("doing"))
print(lemmatizer.lemmatize("doing", wordnet.VERB))  # converted to know
print(lemmatizer.lemmatize("DOING"))
print(lemmatizer.lemmatize("DOING", wordnet.VERB))  # converted to know

print(lemmatizer.lemmatize(text)) # NO POS

dog
unknown
newers
newers
knowing
know
Doing
Doing
doing
do
DOING
DOING
I’m Gonna (going to) Make Him $5.5 an offers he can’t refuse easily

. I don't like              violence. I'm a businessman. Blood is a bigger expense. Knowing is not Doing


In [40]:

def treebankToWordnetPOS(treebankTag: str) -> str:
    """
    Wordnet Lemmatizer does not understand treebank POS. So, we need to convert
    """
    if treebankTag.startswith("J"):
        return wordnet.ADJ
    if treebankTag.startswith("V"):
        return wordnet.VERB
    if treebankTag.startswith("N"):
        return wordnet.NOUN
    if treebankTag.startswith("R"):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [43]:

# for word in words:
#     print(word, '=>', lemmatizer.lemmatize(word))
    
    
for word, pos in wordsAndPos:
    wPos = treebankToWordnetPOS(pos)
    print(wPos.name)
    print(word, '=>', lemmatizer.lemmatize(word, wPos))

AttributeError: 'str' object has no attribute 'name'