# A Command Parser For Interactive Text Games using NLTK

In [1]:
import nltk
from nltk.tokenize import word_tokenize
import matplotlib

nltk.__version__

'3.8.1'

In [2]:
#nltk.download()

In [2]:
sentences = [
    "Go north",
    "Go into the cave",
    "Pick up the map",
    "Open the green door",
    "Unlock the wooden door with the large rusty key",
    "Put the map on the table",
    "Go to sleep",
    "Lie down on the bed and go to sleep"
]

In [3]:
for sentence in sentences:
    words = word_tokenize(sentence)
    print(words)

['Go', 'north']
['Go', 'into', 'the', 'cave']
['Pick', 'up', 'the', 'map']
['Open', 'the', 'green', 'door']
['Unlock', 'the', 'wooden', 'door', 'with', 'the', 'large', 'rusty', 'key']
['Put', 'the', 'map', 'on', 'the', 'table']
['Go', 'to', 'sleep']
['Lie', 'down', 'on', 'the', 'bed', 'and', 'go', 'to', 'sleep']


## Tagging words using a corpus and chunk parser

In [281]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [23]:
# Problem: imperative sentences are not handled well
sentence = "Eat the breadroll"
words = word_tokenize(sentence)
pos_tags = nltk.pos_tag(words)
print(pos_tags)

[('Eat', 'NNP'), ('the', 'DT'), ('breadroll', 'NN')]


In [24]:
sentence = "eat the breadroll"
words = word_tokenize(sentence)
pos_tags = nltk.pos_tag(words)
print(pos_tags)

[('eat', 'VB'), ('the', 'DT'), ('breadroll', 'NN')]


In [28]:
# Problem: imperative sentences are not handled well
sentence = "You eat the breadroll"
words = word_tokenize(sentence)
pos_tags = nltk.pos_tag(words)
print(pos_tags)

[('You', 'PRP'), ('eat', 'VBP'), ('the', 'DT'), ('breadroll', 'NN')]


In [29]:
# Problem: imperative sentences are not handled well
sentence = "You eat the breadroll!"
words = word_tokenize(sentence)
pos_tags = nltk.pos_tag(words)
print(pos_tags)

[('You', 'PRP'), ('eat', 'VBP'), ('the', 'DT'), ('breadroll', 'NN'), ('!', '.')]


In [77]:
def get_pos_tags(sentence):
    words = word_tokenize(sentence)
    mod_words = ['You', words[0].lower()] + words[1:]
    pos_tags = nltk.pos_tag(mod_words)
    return pos_tags[1:]

grammar = "NP: {<DT>?<JJ>*<NN>}"
grammar = r"NP: {<DT|PP\$>?<JJ>*<NN>}"
chunk_parser = nltk.RegexpParser(grammar)
for sentence in sentences:
    tree = chunk_parser.parse(get_pos_tags(sentence))
    print(tree)

(S go/VBP north/JJ)
(S go/VBP into/IN (NP the/DT cave/NN))
(S pick/VBP up/RP (NP the/DT map/NN))
(S open/VBP (NP the/DT green/JJ door/NN))
(S
  unlock/VBP
  (NP the/DT wooden/JJ door/NN)
  with/IN
  (NP the/DT large/JJ rusty/NN)
  (NP key/NN))
(S put/VBP (NP the/DT map/NN) on/IN (NP the/DT table/NN))
(S go/VBP to/TO sleep/VB)
(S
  lie/VBP
  down/RP
  on/IN
  (NP the/DT bed/NN)
  and/CC
  go/VB
  to/TO
  sleep/VB)


## Constructing a tree

In [37]:
np1 = nltk.Tree('NP', ['the', 'bright', 'green', 'envelope'])
print(np1)

(NP the bright green envelope)


In [38]:
vp1 = nltk.Tree('VP', ['pick', 'up'])
print(vp1)

(VP pick up)


In [39]:
p1 = nltk.Tree('PRP', ['You'])
print(p1)

(PRP You)


In [45]:
# Combining them
tree = nltk.Tree('S', [p1, vp1, np1])
print(tree)

(S (PRP You) (VP pick up) (NP the bright green envelope))


In [46]:
def traverse(t):
    try:
        t.label()
    except AttributeError:
        print(t, end=" ")
    else:
        # Now we know that t.node is defined
        print('(', t.label(), end=" ")
        for child in t:
            traverse(child)
        print(')', end=" ")

traverse(tree)

( S ( PRP You ) ( VP pick up ) ( NP the bright green envelope ) ) 

In [20]:
#nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/billtubbs/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [23]:
#nltk.download('words')

[nltk_data] Downloading package words to /Users/billtubbs/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [47]:
sentence = "you pick up the bright green envelope"
words = word_tokenize(sentence)
tags = nltk.pos_tag(words)
tree = nltk.ne_chunk(tags, binary=True)
print(tree)

(S you/PRP pick/VBP up/RP the/DT bright/JJ green/JJ envelope/NN)


In [43]:
traverse(tree)

( S ('you', 'PRP') ('pick', 'VBP') ('up', 'RP') ('the', 'DT') ('bright', 'JJ') ('green', 'JJ') ('envelope', 'NN') ) 

In [31]:
help(nltk.parse)

Help on package nltk.parse in nltk:

NAME
    nltk.parse - NLTK Parsers

DESCRIPTION
    Classes and interfaces for producing tree structures that represent
    the internal organization of a text.  This task is known as "parsing"
    the text, and the resulting tree structures are called the text's
    "parses".  Typically, the text is a single sentence, and the tree
    structure represents the syntactic structure of the sentence.
    However, parsers can also be used in other domains.  For example,
    parsers can be used to derive the morphological structure of the
    morphemes that make up a word, or to derive the discourse structure
    for a set of utterances.
    
    Sometimes, a single piece of text can be represented by more than one
    tree structure.  Texts represented by more than one tree structure are
    called "ambiguous" texts.  Note that there are actually two ways in
    which a text can be ambiguous:
    
        - The text has multiple correct parses.
        -

In [34]:
treebank_string = """(S (NP-SBJ (NP (QP (IN at) (JJS least) (CD nine) (NNS tenths)) ) (PP (IN of) (NP (DT the) (NNS students) ))) (VP (VBD passed)))"""

In [35]:
treebank_string

'(S (NP-SBJ (NP (QP (IN at) (JJS least) (CD nine) (NNS tenths)) ) (PP (IN of) (NP (DT the) (NNS students) ))) (VP (VBD passed)))'

In [38]:
t = nltk.Tree.fromstring(treebank_string)
print(t)

(S
  (NP-SBJ
    (NP (QP (IN at) (JJS least) (CD nine) (NNS tenths)))
    (PP (IN of) (NP (DT the) (NNS students))))
  (VP (VBD passed)))


## Parsing using a grammar definition

In [342]:
groucho_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")

sent = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']

parser = nltk.ChartParser(groucho_grammar)

for tree in parser.parse(sent):
    print(tree)

(S
  (NP I)
  (VP
    (VP (V shot) (NP (Det an) (N elephant)))
    (PP (P in) (NP (Det my) (N pajamas)))))
(S
  (NP I)
  (VP
    (V shot)
    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))


In [343]:
sentence = 'the little yellow dog barked at the black cat'
words = word_tokenize(sentence)
pos_tags = nltk.pos_tag(words)
assert pos_tags == [
    ("the", "DT"), ("little", "JJ"), ("yellow", "JJ"),
    ("dog", "NN"), ("barked", "VBD"), ("at", "IN"),  
    ("the", "DT"), ("black", "JJ"), ("cat", "NN")
]

In [344]:
# Example from docs
#  - https://www.nltk.org/book_1ed/ch07.html
grammar = "NP: {<DT>?<JJ>*<NN>}"
chunk_parser = nltk.RegexpParser(grammar)
tree = chunk_parser.parse(pos_tags)
print(tree)

(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT black/JJ cat/NN))


In [345]:
def get_pos_tags(sentence):
    words = word_tokenize(sentence)
    pos_tags = nltk.pos_tag(words)
    return pos_tags

sentences = [
    "door",
    "wooden door",
    "the door",
    "the wooden door",
    "the heavy wooden door"
]

grammar = "NP: {<DT>?<JJ>*<NN>}"
chunk_parser = nltk.RegexpParser(grammar)

for sentence in sentences:
    tree = chunk_parser.parse(get_pos_tags(sentence))
    print(tree)

(S (NP door/NN))
(S (NP wooden/JJ door/NN))
(S (NP the/DT door/NN))
(S (NP the/DT wooden/JJ door/NN))
(S (NP the/DT heavy/JJ wooden/JJ door/NN))


In [346]:
sentences = [
    "open door",
    "you open door",
    "you open the door",
    "close door",
    "you close door",
    "you close the door",
    "you open wooden door",
    "you open the wooden door",
    "you open the heavy wooden door",
    "you close the open door"
]

grammar = """
S: {<VBP><NP>}
NP: {<DT>?<JJ>*<NP>}
"""
# PP: {<IN>+<NP>}

chunk_parser = nltk.RegexpParser(grammar)

for sentence in sentences:
    words = word_tokenize(sentence)
    tree = chunk_parser.parse(get_pos_tags(sentence))
    print(tree)

(S open/JJ door/NN)
(S you/PRP open/VBP door/VB)
(S you/PRP open/VBP the/DT door/NN)
(S close/JJ door/NN)
(S you/PRP close/VBP door/JJ)
(S you/PRP close/VBP the/DT door/NN)
(S you/PRP open/VBP wooden/JJ door/NN)
(S you/PRP open/VBP the/DT wooden/JJ door/NN)
(S you/PRP open/VBP the/DT heavy/JJ wooden/JJ door/NN)
(S you/PRP close/VBP the/DT open/JJ door/NN)


In [347]:
#  Syntactic Categories
#  
#  Symbol  Meaning               Example
#
#  S       sentence              the man walked
#  NP      noun phrase           a dog
#  VP      verb phrase           saw a park
#  PP      prepositional phrase  with a telescope
#  Det     determiner            the
#  N       noun                  dog
#  V       verb                  walked
#  P       preposition           in

#  noun phrases (NP)
#  verb phrases (VP)
#  prepositional phrases (PP)
#  nominals (Nom)

grammar = nltk.CFG.fromstring("""
VP -> V NP | V PP | V NP PP
NP -> N | Nom | Det N | Det Nom | NP PP
Nom -> J N | J Nom
PP -> P NP
Det -> 'the' | 'a' | 'an'
N -> 'bed' | 'door' | 'key' | 'handle' | 'box' | 'window'
V -> 'sit' | 'open' | 'close' | 'unlock' | 'go'
P -> 'on' | 'in' | 'with' | 'to'
J -> 'small' | 'big' | 'wooden' | 'heavy' | 'open' | 'closed' | 'round'
""")

sentences = [
    "open door",
    "open the door",
    "close door",
    "close the door",
    "open wooden door",
    "open the wooden door",
    "open the heavy wooden door",
    "close the open door",
    "open the heavy wooden door",
    "unlock the door",
    "unlock the door with the key",
    "unlock the wooden door with the big key",
    "unlock the wooden door with the round window"
]

parser = nltk.ChartParser(grammar)

for sentence in sentences:
    print(f'"{sentence}":')
    words = word_tokenize(sentence)
    #words = ['sit', 'on', 'the', 'big', 'wooden', 'bed']
    for tree in parser.parse(words):
        print(tree)
    print()

"open door":
(VP (V open) (NP (N door)))

"open the door":
(VP (V open) (NP (Det the) (N door)))

"close door":
(VP (V close) (NP (N door)))

"close the door":
(VP (V close) (NP (Det the) (N door)))

"open wooden door":
(VP (V open) (NP (Nom (J wooden) (N door))))

"open the wooden door":
(VP (V open) (NP (Det the) (Nom (J wooden) (N door))))

"open the heavy wooden door":
(VP
  (V open)
  (NP (Det the) (Nom (J heavy) (Nom (J wooden) (N door)))))

"close the open door":
(VP (V close) (NP (Det the) (Nom (J open) (N door))))

"open the heavy wooden door":
(VP
  (V open)
  (NP (Det the) (Nom (J heavy) (Nom (J wooden) (N door)))))

"unlock the door":
(VP (V unlock) (NP (Det the) (N door)))

"unlock the door with the key":
(VP
  (V unlock)
  (NP (Det the) (N door))
  (PP (P with) (NP (Det the) (N key))))
(VP
  (V unlock)
  (NP (NP (Det the) (N door)) (PP (P with) (NP (Det the) (N key)))))

"unlock the wooden door with the big key":
(VP
  (V unlock)
  (NP (Det the) (Nom (J wooden) (N door)))

In [357]:
nouns = ['a', 'b', 'c']


"N -> 'a' | 'b' | 'c'"

In [348]:
for sentence in sentences:
    words = word_tokenize(sentence)
    for tree in parser.parse(words):
        tree.pretty_print()

      VP     
  ____|___    
 |        NP 
 |        |   
 V        N  
 |        |   
open     door

      VP         
  ____|___        
 |        NP     
 |     ___|___    
 V   Det      N  
 |    |       |   
open the     door

       VP     
   ____|___    
  |        NP 
  |        |   
  V        N  
  |        |   
close     door

       VP         
   ____|___        
  |        NP     
  |     ___|___    
  V   Det      N  
  |    |       |   
close the     door

       VP           
  _____|_____        
 |           NP     
 |           |       
 |          Nom     
 |      _____|___    
 V     J         N  
 |     |         |   
open wooden     door

          VP                
  ________|____              
 |             NP           
 |     ________|_____        
 |    |             Nom     
 |    |         _____|___    
 V   Det       J         N  
 |    |        |         |   
open the     wooden     door

           VP                     
  _________|____           

In [202]:
grammar = """
S: {<>}
NP: {<DT>?<JJ>*<NN>}
PP: {<IN>+<NP>}
"""
chunk_parser = nltk.RegexpParser(grammar)
tree = chunk_parser.parse(pos_tags)

sentences = [
    #"open the heavy wooden door with key"
    "open the heavy wooden door with the key"
    #"open the heavy wooden door with the big key"
]
for sentence in sentences:
    tree = chunk_parser.parse(get_pos_tags(sentence))
    #print(tree)
    tree.pretty_print()

                             S                                      
    _________________________|_______________________                
   |               |                                 PP             
   |               |                           ______|_____          
   |               NP                         |            NP       
   |        _______|__________________        |       _____|____     
open/VBP the/DT heavy/JJ wooden/JJ door/NN with/IN the/DT     key/NN



In [168]:
tree.pretty_print()

           S                              
    _______|_______                        
   |               NP                     
   |        _______|__________________     
open/VBP the/DT heavy/JJ wooden/JJ door/NN



## Traversing the resulting tree

In [181]:
def traverse(t, indent=""):
    try:
        t.label()
    except AttributeError:
        print(indent + str(t))
    else:
        # Now we know that t.label is defined
        print(indent + '(', t.label())
        for child in t:
            traverse(child, indent=indent + '  ')
        print(indent + ')')

traverse(tree)

( S
  ('open', 'VBP')
  ( NP
    ('the', 'DT')
    ('heavy', 'JJ')
    ('wooden', 'JJ')
    ('door', 'NN')
  )
)


In [182]:
tree[0]

('open', 'VBP')

In [304]:
grammar = nltk.CFG.fromstring("""
S -> VP
VP -> V NP | V PP | V NP PP
PP -> P NP
NP -> Det N | Det AP | N PP | AP PP | NP PP
AP -> J N | J AP
Det -> 'the' | 'a' | 'an' | 'my'
N -> 'bed' | 'door' | 'key' | 'handle'
V -> 'sit' | 'open' | 'close' | 'unlock' | 'go'
P -> 'on' | 'in' | 'with' | 'to'
J -> 'big' | 'wooden'
""")

sent = ['sit', 'on', 'the', 'big', 'wooden', 'bed']
#sent = ['open', 'the', 'wooden', 'door', 'with', 'the', 'handle']

parser = nltk.ChartParser(grammar)

for tree in parser.parse(sent):
    print(tree)

(S
  (VP
    (V sit)
    (PP (P on) (NP (Det the) (AP (J big) (AP (J wooden) (N bed)))))))


In [186]:
from nltk import CFG

grammar = CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | NP PP
VP -> V NP | VP PP
Det -> 'a' | 'the'
N -> 'dog' | 'cat'
V -> 'chased' | 'sat'
P -> 'on' | 'in'
""")

grammar

<Grammar with 14 productions>

In [360]:
from parser import Parser

NameError: name 'self' is not defined