# ***Advanced NLP Tasks***
-----------------------------

## ***Parts of Speech (PoS) Tagging***

In [3]:
import nltk

In [2]:
# There are different classes of words.
# Conjunctions, Cardinals, Determiners, Prepositions, Adjectives, Modals, Nouns, Possessives, Pronouns, Adverbs, Symbols,
# Verbs... etc

In [4]:
# Modal verbs.
# Print the Penn Treebank tagset entry for "MD" (modal auxiliary),
# including example words that carry this tag.

nltk.help.upenn_tagset("MD")

MD: modal auxiliary
    can cannot could couldn't dare may might must need ought shall should
    shouldn't will would


In [7]:
# To do POS tagging, first split the sentence into tokens.
# NOTE: "alchoholic" is a misspelling of "alcoholic"; the string is left
# unchanged so that the recorded tokenizer/tagger outputs stay in sync.

sentence = "Children below 18 years old shouldn't drink alchoholic beverages before bed."

In [10]:
# Tokenize the sentence; note that the contraction "shouldn't" is split
# into two tokens, "should" and "n't".
tokens = nltk.word_tokenize(sentence)
print(", ".join(tokens))

Children, below, 18, years, old, should, n't, drink, alchoholic, beverages, before, bed, .


In [11]:
# Tag every token with its part of speech (Penn Treebank tags such as NNS, MD, VB).
nltk.pos_tag(tokens)

[('Children', 'NNP'),
 ('below', 'IN'),
 ('18', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 ('should', 'MD'),
 ("n't", 'RB'),
 ('drink', 'VB'),
 ('alchoholic', 'JJ'),
 ('beverages', 'NNS'),
 ('before', 'IN'),
 ('bed', 'NN'),
 ('.', '.')]

In [12]:
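# Children - plural noun (NLTK tags it NNP, a singular proper noun, here)
# below - preposition (IN)
# 18 - cardinal number (CD)
# years - plural noun (NNS)
# old - adjective (JJ)
# should - modal verb (MD)
# n't - adverb (RB)
# drink - verb, base form (VB)
# alchoholic - adjective (JJ)
# beverages - plural noun (NNS)
# before - preposition (IN)
# bed - singular noun (NN)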

In [13]:
# Ambiguity in POS tagging.

# Visiting aunts can be a nuisance.
# How to interpret this sentence?

# Paying visits to aunts can be a nuisance.
# or
# Aunts who visit you could be a nuisance.

sentence = "Visiting aunts can be a nuisance."

In [14]:
# In the first interpretation, visiting is a verb 
# In the second interpretation, visiting is an adjective

In [15]:
# Tokenize and tag in one step to see which reading the tagger picks.
nltk.pos_tag(nltk.word_tokenize(sentence))

[('Visiting', 'VBG'),
 ('aunts', 'NNS'),
 ('can', 'MD'),
 ('be', 'VB'),
 ('a', 'DT'),
 ('nuisance', 'NN'),
 ('.', '.')]

In [16]:
# NLTK tags "Visiting" as VBG (verb, gerund or present participle).
# So you are visiting the aunts.
# aunts - plural noun (NNS)
# can - modal verb (MD)

# NLTK considers the first interpretation.

## ***Parsing the sentence structure***
-------------------------------

In [29]:
text = "Romeo loves Juliet"

In [30]:
# Noun - Romeo, Juliet
# Verb - loves

# Subject - Romeo
# Verb - loves
# Object - Juliet

In [31]:
# Split the sentence into word tokens for tagging and parsing.
tokens = nltk.word_tokenize(text)

In [32]:
# Tag the tokens: both names come out as NNP (proper noun) and "loves" as VBZ.
nltk.pos_tag(tokens)

[('Romeo', 'NNP'), ('loves', 'VBZ'), ('Juliet', 'NNP')]

In [33]:
# If we break down the sentence structure of Romeo loves Juliet

# The sentence consists of two parts:
# A noun phrase followed by a verb phrase

# Noun phrase - Romeo
# Verb phrase - loves Juliet
# This verb phrase is made of a verb and a noun phrase
# verb - loves
# noun phrase - Juliet

In [34]:
# Toy context-free grammar for the sentence above:
#   S  -> NP VP   a sentence is a noun phrase followed by a verb phrase
#   VP -> V NP    a verb phrase is a verb followed by a noun phrase
#   NP and V expand directly to the quoted words (terminals).
grammar = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP
NP -> "Romeo" | "Juliet"
V -> "loves"
""")

In [35]:
# Build a chart parser that parses token lists according to this grammar.
parser = nltk.ChartParser(grammar)

In [36]:
# Return all parse trees as a list; this sentence has exactly one parse.
parser.parse_all(tokens)

[Tree('S', [Tree('NP', ['Romeo']), Tree('VP', [Tree('V', ['loves']), Tree('NP', ['Juliet'])])])]

# ***Ambiguity in Parsing***
-----------------------------------

In [37]:
# I saw the man with a telescope.
# How to interpret this sentence?

# I saw the man carrying a telescope.
# or
# I saw the man through a telescope.

In [38]:
# Ambiguity may exist even if the sentence is grammatically correct.

In [6]:
# Break down of the sentence structure

# The sentence could be broken down into a noun phrase and a verb phrase
# Noun phrase - I
# verb phrase - saw the man with a telescope.

# This verb phrase can be further broken down into
# a verb - saw
# noun phrase - the man with a telescope.
# (This structure assumes that I saw the man who was carrying a telescope.)

# But here's the ambiguity.
# There are two ways to break down the verb phrase.
# Instead of breaking it down as a verb followed by a noun phrase,
# we can break it down as a verb phrase followed by a prepositional phrase.

# verb phrase - saw the man
# prepositional phrase - with a telescope.
# (This structure assumes that I saw the man through a telescope.)

In [2]:
sentence = "I saw the man with a telescope"

In [5]:
# Tokenize the sentence and display the tokens (a bare name on the last
# line of a cell is echoed by the notebook).
tokens = nltk.word_tokenize(sentence)
tokens

['I', 'saw', 'the', 'man', 'with', 'a', 'telescope']

In [8]:
# Load the context-free grammar from grammar.cfg (relative path; the file is
# not part of this notebook). Judging by the parse trees printed further down,
# it defines S, NP, VP, PP, DT, N, V and P — confirm against the file.
grammar = nltk.data.load(r"./grammar.cfg")

In [10]:
# Chart parser for the loaded grammar.
parser = nltk.ChartParser(grammar)

In [12]:
# Collect every parse; the ambiguous sentence yields two trees
# (PP attached to the verb phrase vs. PP attached to the noun phrase).
parser.parse_all(tokens)

[Tree('S', [Tree('NP', ['I']), Tree('VP', [Tree('VP', [Tree('V', ['saw']), Tree('NP', [Tree('DT', ['the']), Tree('N', ['man'])])]), Tree('PP', [Tree('P', ['with']), Tree('NP', [Tree('DT', ['a']), Tree('N', ['telescope'])])])])]),
 Tree('S', [Tree('NP', ['I']), Tree('VP', [Tree('V', ['saw']), Tree('NP', [Tree('DT', ['the']), Tree('N', ['man']), Tree('PP', [Tree('P', ['with']), Tree('NP', [Tree('DT', ['a']), Tree('N', ['telescope'])])])])])])]

In [13]:
# Print each parse in bracketed, indented form, which is easier to read
# than the nested Tree(...) repr.
for tree in parser.parse_all(tokens):
    print(tree)

(S
  (NP I)
  (VP
    (VP (V saw) (NP (DT the) (N man)))
    (PP (P with) (NP (DT a) (N telescope)))))
(S
  (NP I)
  (VP
    (V saw)
    (NP (DT the) (N man) (PP (P with) (NP (DT a) (N telescope))))))


In [14]:
# NLTK bundles a set of parsed tree structures of some corpora

from nltk.corpus import treebank

In [20]:
# Parsed sentences of the treebank file "wsj_0001.mrg", as Tree objects.
treebank.parsed_sents("wsj_0001.mrg")

[Tree('S', [Tree('NP-SBJ', [Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]), Tree(',', [',']), Tree('ADJP', [Tree('NP', [Tree('CD', ['61']), Tree('NNS', ['years'])]), Tree('JJ', ['old'])]), Tree(',', [','])]), Tree('VP', [Tree('MD', ['will']), Tree('VP', [Tree('VB', ['join']), Tree('NP', [Tree('DT', ['the']), Tree('NN', ['board'])]), Tree('PP-CLR', [Tree('IN', ['as']), Tree('NP', [Tree('DT', ['a']), Tree('JJ', ['nonexecutive']), Tree('NN', ['director'])])]), Tree('NP-TMP', [Tree('NNP', ['Nov.']), Tree('CD', ['29'])])])]), Tree('.', ['.'])]), Tree('S', [Tree('NP-SBJ', [Tree('NNP', ['Mr.']), Tree('NNP', ['Vinken'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP-PRD', [Tree('NP', [Tree('NN', ['chairman'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('NP', [Tree('NNP', ['Elsevier']), Tree('NNP', ['N.V.'])]), Tree(',', [',']), Tree('NP', [Tree('DT', ['the']), Tree('NNP', ['Dutch']), Tree('VBG', ['publishing']), Tree('NN', ['group'])])])])])]), Tree('.', ['.'])])]

In [21]:
# Print the parsed sentences of the same file in bracketed, indented form.
for tree in treebank.parsed_sents("wsj_0001.mrg"):
    print(tree)

(S
  (NP-SBJ
    (NP (NNP Pierre) (NNP Vinken))
    (, ,)
    (ADJP (NP (CD 61) (NNS years)) (JJ old))
    (, ,))
  (VP
    (MD will)
    (VP
      (VB join)
      (NP (DT the) (NN board))
      (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
      (NP-TMP (NNP Nov.) (CD 29))))
  (. .))
(S
  (NP-SBJ (NNP Mr.) (NNP Vinken))
  (VP
    (VBZ is)
    (NP-PRD
      (NP (NN chairman))
      (PP
        (IN of)
        (NP
          (NP (NNP Elsevier) (NNP N.V.))
          (, ,)
          (NP (DT the) (NNP Dutch) (VBG publishing) (NN group))))))
  (. .))


In [22]:
# Uncommon usages of words
# "The old man the boat" means that old people operate (man) the boat:
# "old" acts as a noun and "man" as a verb.

sentence = "The old man the boat"
tokens = nltk.word_tokenize(sentence)
nltk.pos_tag(tokens)

[('The', 'DT'), ('old', 'JJ'), ('man', 'NN'), ('the', 'DT'), ('boat', 'NN')]

In [23]:
# NLTK assumes "man" as a noun
# But it truly functions as a verb in this sentence.

In [24]:
# Here the sentence is grammatically (syntactically) correct
# but has no meaning.
# NLTK tags "Colourless" as a proper noun (NNP) and "green" as an adjective (JJ)
# instead of tagging both of them as adjectives.

sentence = "Colourless green ideas sleep furiously"
tokens = nltk.word_tokenize(sentence)
nltk.pos_tag(tokens)

[('Colourless', 'NNP'),
 ('green', 'JJ'),
 ('ideas', 'NNS'),
 ('sleep', 'VBP'),
 ('furiously', 'RB')]