In [1]:
import nltk
from nltk.tokenize import word_tokenize

import matplotlib

In [2]:
#nltk.download()

In [3]:
sentences = [
    "Go north",
    "Go into the cave",
    "Pick up the map",
    "Open the green door",
    "Unlock the wooden door with the large rusty key",
    "Put the map on the table",
    "Go to sleep",
    "Lie down on the bed and go to sleep"
]

In [4]:
sent = sentences[0]
sent

'Go north'

In [5]:
def get_pos_tags(sentence):
    words = word_tokenize(sentence)
    pos_tags = nltk.pos_tag(words)
    return pos_tags

tags = get_pos_tags("The quick brown fox jumped over the lazy dog.")
assert tags == [
    ('The', 'DT'),
    ('quick', 'JJ'),
    ('brown', 'NN'),
    ('fox', 'NN'),
    ('jumped', 'VBD'),
    ('over', 'IN'),
    ('the', 'DT'),
    ('lazy', 'JJ'),
    ('dog', 'NN'),
    ('.', '.')
]

In [6]:
for sent in sentences:
    print(get_pos_tags(sent))

[('Go', 'NNP'), ('north', 'RB')]
[('Go', 'VB'), ('into', 'IN'), ('the', 'DT'), ('cave', 'NN')]
[('Pick', 'NNP'), ('up', 'RP'), ('the', 'DT'), ('map', 'NN')]
[('Open', 'VB'), ('the', 'DT'), ('green', 'JJ'), ('door', 'NN')]
[('Unlock', 'IN'), ('the', 'DT'), ('wooden', 'JJ'), ('door', 'NN'), ('with', 'IN'), ('the', 'DT'), ('large', 'JJ'), ('rusty', 'NN'), ('key', 'NN')]
[('Put', 'VB'), ('the', 'DT'), ('map', 'NN'), ('on', 'IN'), ('the', 'DT'), ('table', 'NN')]
[('Go', 'VB'), ('to', 'TO'), ('sleep', 'VB')]
[('Lie', 'NNP'), ('down', 'RB'), ('on', 'IN'), ('the', 'DT'), ('bed', 'NN'), ('and', 'CC'), ('go', 'VB'), ('to', 'TO'), ('sleep', 'VB')]


Problem:
 - [NLTK Thinks that Imperatives are Nouns](https://stackoverflow.com/q/9406093/1609514)

In [7]:
%%time

def get_pos_tags(sentence):
    words = ['You'] + word_tokenize(sentence)
    pos_tags = nltk.pos_tag(words)
    return pos_tags[1:]

for sent in sentences:
    print(get_pos_tags(sent))

[('Go', 'VBP'), ('north', 'JJ')]
[('Go', 'VBP'), ('into', 'IN'), ('the', 'DT'), ('cave', 'NN')]
[('Pick', 'VBP'), ('up', 'RP'), ('the', 'DT'), ('map', 'NN')]
[('Open', 'VBP'), ('the', 'DT'), ('green', 'JJ'), ('door', 'NN')]
[('Unlock', 'VBP'), ('the', 'DT'), ('wooden', 'JJ'), ('door', 'NN'), ('with', 'IN'), ('the', 'DT'), ('large', 'JJ'), ('rusty', 'NN'), ('key', 'NN')]
[('Put', 'VBP'), ('the', 'DT'), ('map', 'NN'), ('on', 'IN'), ('the', 'DT'), ('table', 'NN')]
[('Go', 'VBP'), ('to', 'TO'), ('sleep', 'VB')]
[('Lie', 'VBP'), ('down', 'RP'), ('on', 'IN'), ('the', 'DT'), ('bed', 'NN'), ('and', 'CC'), ('go', 'VB'), ('to', 'TO'), ('sleep', 'VB')]
CPU times: user 6.94 ms, sys: 2.65 ms, total: 9.59 ms
Wall time: 7.89 ms


In [8]:
import os
os.listdir('../stanford-tagger')

['stanford-postagger-gui.sh',
 'sample-output.txt',
 'stanford-postagger.jar',
 '.DS_Store',
 'build.xml',
 'stanford-postagger-gui.bat',
 'TaggerDemo.java',
 'models',
 'stanford-postagger.bat',
 'TaggerDemo2.java',
 'sample-input.txt',
 'stanford-postagger-4.2.0-sources.jar',
 'stanford-postagger.sh',
 'stanford-postagger-4.2.0-javadoc.jar',
 'README.txt',
 'stanford-postagger-4.2.0.jar',
 'LICENSE.txt',
 'data']

In [9]:
from nltk import StanfordTagger
from nltk.tag import StanfordPOSTagger

st = StanfordPOSTagger('../stanford-tagger/models/english-bidirectional-distsim.tagger', path_to_jar='../stanford-tagger/stanford-postagger.jar') 

In [10]:
%%time

def get_pos_tags(sentence):
    words = word_tokenize(sentence)
    return st.tag(words)

for sent in sentences:
    print(get_pos_tags(sent))

[('Go', 'VB'), ('north', 'RB')]
[('Go', 'VB'), ('into', 'IN'), ('the', 'DT'), ('cave', 'NN')]
[('Pick', 'VB'), ('up', 'RP'), ('the', 'DT'), ('map', 'NN')]
[('Open', 'VB'), ('the', 'DT'), ('green', 'JJ'), ('door', 'NN')]
[('Unlock', 'VB'), ('the', 'DT'), ('wooden', 'JJ'), ('door', 'NN'), ('with', 'IN'), ('the', 'DT'), ('large', 'JJ'), ('rusty', 'JJ'), ('key', 'NN')]
[('Put', 'VB'), ('the', 'DT'), ('map', 'NN'), ('on', 'IN'), ('the', 'DT'), ('table', 'NN')]
[('Go', 'VB'), ('to', 'TO'), ('sleep', 'VB')]
[('Lie', 'NN'), ('down', 'RP'), ('on', 'IN'), ('the', 'DT'), ('bed', 'NN'), ('and', 'CC'), ('go', 'VB'), ('to', 'IN'), ('sleep', 'NN')]
CPU times: user 42.9 ms, sys: 112 ms, total: 155 ms
Wall time: 13.4 s


In [11]:
# Times slower
14.1 * 1000 / 9.5

1484.2105263157894

In [12]:
grammar = "NP: {<DT>?<JJ>*<NN>}"

In [13]:
def get_pos_tags(sentence):
    words = ['You'] + word_tokenize(sentence)
    pos_tags = nltk.pos_tag(words)
    return pos_tags

for sent in sentences:
    print(get_pos_tags(sent))

[('You', 'PRP'), ('Go', 'VBP'), ('north', 'JJ')]
[('You', 'PRP'), ('Go', 'VBP'), ('into', 'IN'), ('the', 'DT'), ('cave', 'NN')]
[('You', 'PRP'), ('Pick', 'VBP'), ('up', 'RP'), ('the', 'DT'), ('map', 'NN')]
[('You', 'PRP'), ('Open', 'VBP'), ('the', 'DT'), ('green', 'JJ'), ('door', 'NN')]
[('You', 'PRP'), ('Unlock', 'VBP'), ('the', 'DT'), ('wooden', 'JJ'), ('door', 'NN'), ('with', 'IN'), ('the', 'DT'), ('large', 'JJ'), ('rusty', 'NN'), ('key', 'NN')]
[('You', 'PRP'), ('Put', 'VBP'), ('the', 'DT'), ('map', 'NN'), ('on', 'IN'), ('the', 'DT'), ('table', 'NN')]
[('You', 'PRP'), ('Go', 'VBP'), ('to', 'TO'), ('sleep', 'VB')]
[('You', 'PRP'), ('Lie', 'VBP'), ('down', 'RP'), ('on', 'IN'), ('the', 'DT'), ('bed', 'NN'), ('and', 'CC'), ('go', 'VB'), ('to', 'TO'), ('sleep', 'VB')]


In [14]:
def get_pos_tags(sentence):
    words = word_tokenize(sentence)
    pos_tags = nltk.pos_tag(words)
    return pos_tags

def parse_chunk(tags, grammar):
    chunk_parser = nltk.RegexpParser(grammar)
    tree = chunk_parser.parse(tags)
    return tree

sentences = [
    "door",
    "the door",
    "green door",
    "that green door",
    "a bright green door",
    "large wooden door",
    "wooden leg",
    "my leg",
    "wooden leg"
]
grammar = r"NP: {<DT|PP\$>?<JJ>*<NN>}"
for sentence in sentences:
    tree = parse_chunk(get_pos_tags(sentence), grammar)
    print(tree)

(S (NP door/NN))
(S (NP the/DT door/NN))
(S (NP green/JJ door/NN))
(S (NP that/DT green/JJ door/NN))
(S (NP a/DT bright/JJ green/JJ door/NN))
(S (NP large/JJ wooden/JJ door/NN))
(S (NP wooden/JJ leg/NN))
(S my/PRP$ (NP leg/NN))
(S (NP wooden/JJ leg/NN))


In [15]:
np1 = nltk.Tree('NP', ['the', 'bright', 'green', 'envelope'])
print(np1)

(NP the bright green envelope)


In [16]:
vp1 = nltk.Tree('VP', ['pick', 'up'])
print(vp1)

(VP pick up)


In [17]:
p1 = nltk.Tree('PRP$', ['You'])
print(p1)

(PRP$ You)


In [18]:
tree = nltk.Tree('S', [p1, vp1, np1])
print(tree)

(S (PRP$ You) (VP pick up) (NP the bright green envelope))


In [19]:
def traverse(t):
    try:
        t.label()
    except AttributeError:
        print(t, end=" ")
    else:
        # Now we know that t.node is defined
        print('(', t.label(), end=" ")
        for child in t:
            traverse(child)
        print(')', end=" ")

traverse(tree)

( S ( PRP$ You ) ( VP pick up ) ( NP the bright green envelope ) ) 

In [20]:
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/billtubbs/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [23]:
nltk.download('words')

[nltk_data] Downloading package words to /Users/billtubbs/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [27]:
sentence = "you pick up the bright green envelope"
words = word_tokenize(sentence)
tags = nltk.pos_tag(words)
tree = nltk.ne_chunk(tags, binary=True)
print(tree)

(S you/PRP pick/VBP up/RP the/DT bright/JJ green/JJ envelope/NN)


In [28]:
traverse(tree)

( S ('you', 'PRP') ('pick', 'VBP') ('up', 'RP') ('the', 'DT') ('bright', 'JJ') ('green', 'JJ') ('envelope', 'NN') ) 

In [33]:
sentence = "the bright green envelope"
words = word_tokenize(sentence)
parser = nltk.ChartParser(grammar, nltk.parse.BU_STRATEGY)
trees = parser.nbest_parse(sent, trace=2)

TypeError: 'module' object is not iterable

In [31]:
help(nltk.parse)

Help on package nltk.parse in nltk:

NAME
    nltk.parse - NLTK Parsers

DESCRIPTION
    Classes and interfaces for producing tree structures that represent
    the internal organization of a text.  This task is known as "parsing"
    the text, and the resulting tree structures are called the text's
    "parses".  Typically, the text is a single sentence, and the tree
    structure represents the syntactic structure of the sentence.
    However, parsers can also be used in other domains.  For example,
    parsers can be used to derive the morphological structure of the
    morphemes that make up a word, or to derive the discourse structure
    for a set of utterances.
    
    Sometimes, a single piece of text can be represented by more than one
    tree structure.  Texts represented by more than one tree structure are
    called "ambiguous" texts.  Note that there are actually two ways in
    which a text can be ambiguous:
    
        - The text has multiple correct parses.
        -

In [34]:
treebank_string = """(S (NP-SBJ (NP (QP (IN at) (JJS least) (CD nine) (NNS tenths)) ) (PP (IN of) (NP (DT the) (NNS students) ))) (VP (VBD passed)))"""

In [35]:
treebank_string

'(S (NP-SBJ (NP (QP (IN at) (JJS least) (CD nine) (NNS tenths)) ) (PP (IN of) (NP (DT the) (NNS students) ))) (VP (VBD passed)))'

In [38]:
t = nltk.Tree.fromstring(treebank_string)
print(t)

(S
  (NP-SBJ
    (NP (QP (IN at) (JJS least) (CD nine) (NNS tenths)))
    (PP (IN of) (NP (DT the) (NNS students))))
  (VP (VBD passed)))


In [39]:
groucho_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")

sent = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']

parser = nltk.ChartParser(groucho_grammar)

for tree in parser.parse(sent):
    print(tree)

(S
  (NP I)
  (VP
    (VP (V shot) (NP (Det an) (N elephant)))
    (PP (P in) (NP (Det my) (N pajamas)))))
(S
  (NP I)
  (VP
    (V shot)
    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))


In [146]:
grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det AP | N PP | AP PP | NP PP | 'You'
VP -> V NP | V PP | V NP PP
AP -> J N | J AP
Det -> 'the' | 'a' | 'an' | 'my'
N -> 'bed' | 'door' | 'key' | 'handle'
V -> 'sit' | 'open' | 'close' | 'unlock' | 'go'
P -> 'on' | 'in' | 'with' | 'to'
J -> 'big' | 'wooden'
""")

sent = ['You', 'sit', 'on', 'the', 'big', 'wooden', 'bed']
#sent = ['You', 'open', 'the', 'wooden', 'door', 'with', 'the', 'handle']

parser = nltk.ChartParser(grammar)

for tree in parser.parse(sent):
    print(tree)

(S
  (NP You)
  (VP
    (V sit)
    (PP (P on) (NP (Det the) (AP (J big) (AP (J wooden) (N bed)))))))
