-
Notifications
You must be signed in to change notification settings - Fork 2
/
nlparser.py
444 lines (380 loc) · 18.5 KB
/
nlparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
# Simple semantic parser that converts sentences into first-order logic.
# @author Percy Liang
import sys, collections, re
from logic import *
############################################################
# NLP (Tokenization, Part-of-speech tagging, Lemmatization)
class NaturalLanguageProcessor:
def __init__(self, sentence):
self.sentence = sentence
self.process()
# Ensure everything is lowercased:
self.tokens = [x.lower() for x in self.tokens]
def process(self):
"""Create 2 lists of the same length:
- self.tokens: raw text tokens (lemmatized)
- self.pos_tags: part-of-speech tags or word classes
"""
raise NotImplementedError('preprocess() not implemented.')
# Use NLTK to analyze the sentence
class NLTKProcessor(NaturalLanguageProcessor):
def process(self):
try:
self.processWithNLTK()
except Exception as e:
# Fall back to SimpleProcessor
print('WARNING: Falling back to SimpleProcessor: ', e, file=sys.stderr)
fallbackProcessor = SimpleProcessor(self.sentence)
self.tokens = fallbackProcessor.tokens
self.pos_tags = fallbackProcessor.pos_tags
def processWithNLTK(self):
import nltk
### Perform tokenization.
self.tokens = nltk.word_tokenize(self.sentence)
### Part-of-speech tagging
self.pos_tags = [x[1] for x in nltk.pos_tag(self.tokens)]
# Hack arround errors in crappy POS tagger
# Mistakingly tags VBZ as NNS after a NNP (e.g., John lives in a house)
for i in range(len(self.pos_tags)-1):
if self.pos_tags[i] == 'NNP' and self.pos_tags[i+1] == 'NNS':
self.pos_tags[i+1] = 'VBZ'
for i in range(len(self.pos_tags)):
if self.tokens[i] == 'Does': self.pos_tags[i] = 'VBZ' # Mistagged as NNP
if self.tokens[i] == 'Did': self.pos_tags[i] = 'VBZ' # Mistagged as NNP
if self.tokens[i] == 'red': self.pos_tags[i] = 'JJ' # Mistagged as VBN
if self.tokens[i] == 'lives': self.pos_tags[i] = 'VBZ' # Mistagged as NNS
if self.tokens[i] == 'likes': self.pos_tags[i] = 'VBZ' # Mistagged as IN
if self.tokens[i] == 'like': self.pos_tags[i] = 'VB' # Mistagged as IN
# Lemmatize
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
def lemmatize(token, tag):
# Only lemmatize with the proper POS
if token.lower() == 'people': return 'person'
if tag[0] in 'VN':
return lemmatizer.lemmatize(token.lower(), tag[0].lower())
else:
return token
self.tokens = [lemmatize(token, tag) for token, tag in zip(self.tokens, self.pos_tags)]
# A fallback NLP system
class SimpleProcessor(NaturalLanguageProcessor):
def split(self, x):
# Add spaces around punctuations
x = re.sub(r'([.,?!:;])', r' \1 ', x)
return x.strip().split()
def tag(self, w):
if w[0].isupper(): return 'NNP'
if w.endswith('ing'): return 'VBG'
if w.endswith('ed'): return 'VBD'
if w in ('.', ',', '?', '!', ':', ';'): return w
return 'NN'
def process(self):
self.tokens = self.split(self.sentence)
self.pos_tags = [self.tag(token) for token in self.tokens]
# Use tags from the specified categories
def getCategoryProcessor(categories):
"""Return a new NaturalLanguageProcessor class where the pos_tags are
assigned according to the specified categories.
|categories| is a dictionay from a category (e.g., "Noun", "PersonName", "NN")
to a list of words in that category.
Each word cannot belong to multiple categories. Unrecognized words will be
tagged as 'Other'
Example: If |categories| is
{
'Noun': ['cats', 'dogs', 'animals'],
'Verb': ['run', 'walk'],
}
Then the pos_tags for 'Cats run fast' will be ['Noun', 'Verb', 'Other'].
"""
wordToCategory = {}
for cat in categories:
for word in categories[cat]:
assert word not in wordToCategory, \
"%s is in both %s and %s" % (word, wordToCategory[word], cat)
capital = word.capitalize()
assert capital not in wordToCategory, \
"%s is in both %s and %s" % (capital, wordToCategory[capital], cat)
wordToCategory[word] = cat
wordToCategory[capital] = cat
class CategoryProcessor(SimpleProcessor):
def tag(self, w):
return wordToCategory.get(w, 'Other')
return CategoryProcessor
############################################################
# Representation of a natural language utterance.
class Utterance:
def __init__(self, sentence, processorClass=NLTKProcessor):
# Preprocess
sentence = re.sub(r"\ban\b", 'a', sentence)
sentence = re.sub(r"\bdon't\b", 'do not', sentence)
sentence = re.sub(r"\bdoesn't\b", 'does not', sentence)
sentence = re.sub(r"\bit's\b", 'it is', sentence)
sentence = re.sub(r"\bIt's\b", 'It is', sentence)
self.sentence = sentence
# Process the sentence
nlProcessor = processorClass(self.sentence)
self.tokens = nlProcessor.tokens
self.pos_tags = nlProcessor.pos_tags
def __str__(self):
return ' '.join('%s/%s' % x for x in zip(self.tokens, self.pos_tags))
############################################################
# A Grammar maps a sequence of natural language tokens to a set of ParserDerivations.
# Try to look inside a lambda (not perfect)
def lambda_rstr(x):
if isinstance(x, tuple): return str(tuple(map(lambda_rstr, x)))
if callable(x):
v = Constant('v')
try:
return "(%s => %s)" % (v, lambda_rstr(x(v)))
except:
try:
return "(%s => %s)" % (v, lambda_rstr(x(lambda a : Constant('v('+str(a)+')'))))
except:
pass
return rstr(x)
#print lambda_rstr(lambda x : lambda y : Atom('A', x, y))
# A grammar rule parses the right-hand side |rhs| into a |lhs|
class GrammarRule:
def __init__(self, lhs, rhs, sem, score = 0):
self.lhs = lhs
self.rhs = rhs
self.sem = sem
self.score = score
def __str__(self):
return "{%s -> %s, score=%s}" % (self.lhs, self.rhs, self.score)
############################################################
BASE_OBJECTS = ['Garfield', 'Pluto', 'Jon']
def createBaseLanguageProcessor():
# Defines a mapping from each in-domain word to its word class.
# This automatically creates rules such as
# $Noun -> cat
# with the string "cat" as the denotation
categories = {
'Noun': ['cat', 'tabby', 'mammal', 'person'],
'Verb': ['likes', 'feeds'],
'Name': BASE_OBJECTS,
}
return getCategoryProcessor(categories)
def createBaseEnglishGrammar():
# Create a basic grammar consisting a set of rules.
# Please see nlparser.py for more information on the GrammarRule class.
rules = []
# Parse if it's a question or statement.
rules.append(GrammarRule('$ROOT', ['$Statement'], lambda args: ('tell', args[0])))
rules.append(GrammarRule('$ROOT', ['$Question'], lambda args: ('ask', args[0])))
rules.append(GrammarRule('$Statement', ['$Clause', '.'], lambda args: args[0]))
rules.append(GrammarRule('$Question', ['$Clause', '?'], lambda args: args[0]))
# Add rule for 'every $Noun is a $Noun'
rules.append(GrammarRule('$Clause', ['every', '$Noun', 'is', 'a', '$Noun'],
lambda args: Forall('$x', Implies(Atom(args[0].title(), '$x'), Atom(args[1].title(), '$x')))))
# Add rule for '$Name is a $Noun'
rules.append(GrammarRule('$Clause', ['$Name', 'is', 'a', '$Noun'],
lambda args: Atom(args[1].title(), args[0].lower())))
# Add rule for 'is $Name a $Noun?'
rules.append(GrammarRule('$Question', ['is', '$Clause-be', '?'],
lambda args: args[0]))
rules.append(GrammarRule('$Clause-be', ['$Name', 'a', '$Noun'],
lambda args: Atom(args[1].title(), args[0].lower())))
# Add rule for '$Name $Verb $Name'
rules.append(GrammarRule('$Clause', ['$Name', '$Verb', '$Name'],
lambda args: Atom(args[1].title(), args[0].lower(), args[2].lower())))
# Add rule for '$Name $Verb some $Noun'
rules.append(GrammarRule('$Clause', ['$Name', '$Verb', 'some', '$Noun'],
lambda args: Exists('$y', And(Atom(args[2].title(), '$y'), Atom(args[1].title(), args[0].lower(), '$y')))))
return rules
def createToyGrammar():
rules = []
rules.append(GrammarRule('$S', ['the', '$B'], lambda args : ('ask', args[0])))
rules.append(GrammarRule('$B', ['rain'], lambda args : Atom('Rain')))
return rules
############################################################
# Parser takes an utterance, a grammar and returns a set of ParserDerivations,
# each of which contains a logical form.
# A Derivation includes a logical form, a rule, children derivations (if any) and a score.
class ParserDerivation():
def __init__(self, form, rule, children, score):
self.form = form
self.rule = rule
self.children = children
self.score = score
def dump(self, indent=""):
print("%s%s: score=%s, rule: %s" % (indent, lambda_rstr(self.form), self.score, self.rule))
for child in self.children:
child.dump(indent + " ")
# Move unary rules (RHS has 1 argument) to the bottom of the list in topological order
# Complain if cycles occur (e.g. "A -> B" and "B -> A" cannot appear simultaneously.)
def sortRules(rules):
unaries, others = [], []
colors, children = {}, collections.defaultdict(set)
for rule in rules:
if len(rule.rhs) == 1:
unaries.append(rule)
a, b = rule.lhs, rule.rhs[0]
colors[a] = colors[b] = 'white'
children[a].add(b)
else:
others.append(rule)
# Do topological sort on the symbols in unary rules.
sortedSymbols = []
def visit(symbol):
if colors[symbol] == 'gray':
raise Exception('Cyclic unary rules detected!')
if colors[symbol] == 'white':
colors[symbol] = 'gray'
for child in children[symbol]:
visit(child)
colors[symbol] = 'black'
sortedSymbols.append(symbol)
while 'white' in list(colors.values()):
# Find a symbol with white color and visit it.
whiteSymbols = [x for x in colors if colors[x] == 'white']
visit(whiteSymbols[0])
# Sort the rules
def key(rule):
a, b = rule.lhs, rule.rhs[0]
return (sortedSymbols.index(a), sortedSymbols.index(b))
unaries.sort(key=key)
return others + unaries
# Return a sorted list of ParserDerivations.
# Standard bottom-up CKY parsing
def parseUtterance(utterance, rules, verbose=0):
def isCat(x): return x.startswith('$')
tokens = utterance.tokens
n = len(tokens)
beamSize = 5
rules = sortRules(rules)
# 0 start mid -> split end n
def applyRule(start, end, mid, rule, rhsIndex, children, score):
#print "applyRule: start=%s end=%s mid=%s, rule=%s[%s], children=%s" % (start, end, mid, rule, rhsIndex, children)
# Need to arrive at end of tokens exactly when run out of rhs
if (mid == end) != (rhsIndex == len(rule.rhs)):
return
if rhsIndex == len(rule.rhs):
deriv = ParserDerivation(rule.sem([child.form for child in children]), rule, children, score + rule.score)
if verbose >= 3:
print("applyRule: %s:%s%s %s += %s" % (start, end, tokens[start:end], rule.lhs, rstr(deriv.form)))
chart[start][end][rule.lhs].append(deriv)
return
a = rule.rhs[rhsIndex]
if isCat(a): # Category
for split in range(mid+1, end+1):
#print "split", mid, split
for child in chart[mid][split].get(a, {}):
applyRule(start, end, split, rule, rhsIndex+1, children + [child], score + child.score)
else: # Token
if tokens[mid] == a:
applyRule(start, end, mid+1, rule, rhsIndex+1, children, score)
# Initialize
chart = [None] * n # start => end => category => top K derivations (logical form, score)
for start in range(0, n):
chart[start] = [None] * (n+1)
for end in range(start+1, n+1):
chart[start][end] = collections.defaultdict(list)
# Initialize with POS tags
for start in range(0, n):
# Don't tag these words, because coupla and negation items need to be treated specially
if utterance.tokens[start] in ('be', 'not'): continue
chart[start][start+1]['$'+utterance.pos_tags[start]].append(ParserDerivation(utterance.tokens[start], None, [], 0))
# Parse
for length in range(1, n+1): # For each span length...
for start in range(n - length + 1): # For each starting position
end = start + length
for rule in rules:
applyRule(start, end, start, rule, 0, [], 0)
# Prune derivations
cell = chart[start][end]
for cat in list(cell.keys()):
cell[cat] = sorted(cell[cat], key = lambda deriv : -deriv.score)[0:beamSize]
derivations = chart[0][n]['$ROOT']
if verbose >= 1:
print("parseUtterance: %d derivations" % len(derivations))
if verbose >= 2:
for deriv in derivations:
if verbose >= 3:
deriv.dump(" ")
else:
print(" %s: score=%s" % (rstr(deriv.form), deriv.score))
return derivations
############################################################
# Train the grammar rule scores to get the training examples correct.
# Also acts as a unit test (we should get 100% accuracy).
def trainGrammar(rules):
# Training examples
examples = []
# Zeroary
examples.append(('It is raining.', ('tell', Atom('Rain'))))
examples.append(('It is summer.', ('tell', Atom('Summer'))))
examples.append(('It is wet.', ('tell', Atom('Wet'))))
examples.append(('It is not summer.', ('tell', Not(Atom('Summer')))))
# Simple sentences
examples.append(('John is happy.', ('tell', Atom('Happy', 'john'))))
examples.append(('John is not happy.', ('tell', Not(Atom('Happy', 'john')))))
examples.append(('John is a cat.', ('tell', Atom('Cat', 'john'))))
examples.append(('John is not a dog.', ('tell', Not(Atom('Dog', 'john')))))
examples.append(('John was born in Seattle.', ('tell', Atom('Bear_in', 'john', 'seattle'))))
examples.append(('John lives in Seattle.', ('tell', Atom('Live_in', 'john', 'seattle'))))
examples.append(('John lives in New York.', ('tell', Atom('Live_in', 'john', 'new_york'))))
examples.append(('John does not live in New York.', ('tell', Not(Atom('Live_in', 'john', 'new_york')))))
# Miscellaneous
examples.append(('New York is a big city.', ('tell', And(Atom('Big', 'new_york'), Atom('City', 'new_york')))))
examples.append(('If it is raining, it is wet.', ('tell', Implies(Atom('Rain'), Atom('Wet')))))
# Coordination
examples.append(('John and Bill are cats.', ('tell', And(Atom('Cat', 'john'), Atom('Cat', 'bill')))))
examples.append(('John is either happy or sad.', ('tell', Xor(Atom('Happy', 'john'), Atom('Sad', 'john')))))
examples.append(('John lives in Seattle or Portland.', ('tell', Or(Atom('Live_in', 'john', 'seattle'), Atom('Live_in', 'john', 'portland')))))
examples.append(('Either it is raining or it is snowing.', ('tell', And(Or(Atom('Rain'), Atom('Snow')), Not(And(Atom('Rain'), Atom('Snow')))))))
# Quantification
examples.append(('Cats are animals.', ('tell', Forall('$x1', Implies(Atom('Cat', '$x1'), Atom('Animal', '$x1'))))))
examples.append(('A cat is an animal.', ('tell', Forall('$x1', Implies(Atom('Cat', '$x1'), Atom('Animal', '$x1'))))))
examples.append(('If a person lives in California, he is happy.', ('tell', Forall('$x1', Implies(And(Atom('Person', '$x1'), Atom('Live_in', '$x1', 'california')), Atom('Happy', '$x1'))))))
examples.append(('John visited every city.', ('tell', Forall('$x1', Implies(Atom('City', '$x1'), Atom('Visit', 'john', '$x1'))))))
examples.append(('Every city was visited by John.', ('tell', Forall('$x1', Implies(Atom('City', '$x1'), Atom('Visit_by', '$x1', 'john'))))))
examples.append(('Every person likes some cat.', ('tell', Forall('$x1', Implies(Atom('Person', '$x1'), Exists('$x2', And(Atom('Cat', '$x2'), Atom('Like', '$x1', '$x2'))))))))
examples.append(('No city is perfect.', ('tell', Not(Exists('$x1', And(Atom('City', '$x1'), Atom('Perfect', '$x1')))))))
examples.append(('Does John live in Sweden?', ('ask', Atom('Live_in', 'john', 'sweden'))))
### Train the model using Perceptron
print("============================================================")
print("Training the grammar on %d examples" % len(examples))
numUpdates = 0
def updateWeights(deriv, incr):
if deriv.rule:
deriv.rule.score += incr
for child in deriv.children:
updateWeights(child, incr)
# target, pred are both (mode, form)
# Need to use unify because the variables could be different
def isCompatible(target, pred):
return target == pred
#return target[0] == pred[0] and unify(target[1], pred[1], {})
for iteration in range(0, 10):
print('-- Iteration %d' % iteration)
numMistakes = 0
for x, y in examples:
# Predict on example
utterance = Utterance(x)
derivations = parseUtterance(utterance, rules)
targetDeriv = None
for deriv in derivations:
if isCompatible(y, deriv.form):
targetDeriv = deriv
break
if targetDeriv == None:
print("Impossible to get correct: %s => %s" % (x, rstr(y)))
print(" Utterance:", utterance)
print(" Derivations:")
for deriv in derivations:
print(' ', rstr(deriv.form))
continue
predDeriv = derivations[0]
if predDeriv != targetDeriv:
print("Mistake: %s => %s, predicted %s" % (x, rstr(y), rstr(predDeriv.form)))
numMistakes += 1
# Update weights
numUpdates += 1
stepSize = 1.0 # / math.sqrt(numUpdates)
updateWeights(targetDeriv, +stepSize)
updateWeights(predDeriv, -stepSize)
if numMistakes == 0: break
print('Rules with non-zero weights:')
for rule in rules:
if rule.score != 0:
print(' ', rule)