Tokens now carry their position in text too.
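
In short: every token yielded by the scanner is now a (name, value, position) triple instead of a (name, value) pair, and the new token_line_col helper turns the absolute position into a line/column pair for error messages. A minimal sketch of the new token shape (the token name and value below are made up for illustration):

    # Before this commit a token was a 2-tuple:
    #     ('word', 'foo')
    # Now it also carries the start offset of the lexeme in the input text:
    tok = ('word', 'foo', 12)
    tokname, tokvalue, tokpos = tok   # tokpos feeds the new error reporting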

1 parent 608d42b commit 5c0bd6e66fea26a5f0929d0fcf2a1b05b57a2b3a @bl0b committed Dec 15, 2011
Showing with 52 additions and 10 deletions.
  1. +33 −4 jupyLR/automaton.py
  2. +3 −3 jupyLR/parser.py
  3. +2 −0 jupyLR/stack.py
  4. +13 −2 jupyLR/tokenizer.py
  5. +1 −1 sandbox.py
jupyLR/automaton.py
@@ -3,12 +3,16 @@
from parser import parser
from itertools import ifilter, chain
from stack import stack
+#from lr import kernel
+from tokenizer import token_line_col
+
+INITIAL_TOKEN = ('#', '#', 0)
class Automaton(parser):
    def __init__(self, start_sym, grammar, scanner):
-        parser.__init__(self, start_sym, grammar)
+        parser.__init__(self, start_sym, grammar, scanner.tokens.keys())
        self.scanner = scanner

    def validate_ast(self, ast):
@@ -17,14 +21,35 @@ def validate_ast(self, ast):
"""
return True
+ def error_detected(self, cur_tok, last_states):
+ """Overload this method in subclasses to implement error recovery
+ or notification.
+ """
+ line, column = token_line_col(self.text, cur_tok)
+ print "Error detected at line %i, column %i:" % (line, column)
+ #for st in last_states:
+ # print self.itemsetstr(kernel(self.LR0[st.data]))
+ # print "Expected", ', '.join(kw
+ # for kw in self.kw_set
+ # if len(self.ACTION[st.data][kw]) > 0)
+ print "Expected", ', '.join(kw for st in last_states
+ for kw in self.kw_set
+ if len(self.ACTION[st.data][kw]) > 0)
+ return False
+
def recognize(self, token_stream):
S = stack(self)
#toki = iter(token_stream)
S.shift(None, None, 0)
S.count_active = 1
+ prev_tok = INITIAL_TOKEN
for cur_tok in token_stream:
if len(S.active) == 0:
- break
+ if not self.error_detected(prev_tok, S.previously_active):
+ break
+ else:
+ continue
+ prev_tok = cur_tok
# Reduce phase
for i, node in S.enumerate_active(): # S.active may grow
state = node.data
@@ -43,11 +68,15 @@ def recognize(self, token_stream):
            # Merge states
            S.merge()
            # Check if there are accepting states, and return their outputs
-            if cur_tok == ('$', '$'):
+            if cur_tok[0] == '$':
                acc = S.accepts()
                if acc:
                    return acc
+                else:
+                    self.error_detected(cur_tok, S.active)
        return None

    def __call__(self, text):
-        return self.recognize(chain(self.scanner(text), [('$', '$')]))
+        self.text = text  # for the sake of the error detection/recovery
+        return self.recognize(chain(self.scanner(text),
+                                    [('$', '$', len(self.text))]))
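
The new error_detected hook is meant to be overridden by subclasses for custom recovery or notification. A hypothetical sketch of such an override (the QuietAutomaton name and its errors list are assumptions for illustration, not part of this commit):

    class QuietAutomaton(Automaton):
        "Hypothetical subclass: records errors instead of printing them."

        def error_detected(self, cur_tok, last_states):
            line, column = token_line_col(self.text, cur_tok)
            self.errors = getattr(self, 'errors', [])
            self.errors.append((line, column, cur_tok))
            # returning False makes recognize() stop here; returning True
            # would make it skip the offending token and keep going
            return False
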
jupyLR/parser.py
@@ -15,7 +15,7 @@ def rules(start, grammar, kw):
    edit_rule_commit = True
    next_edit_rule_commit = True
    kw.add(edit_rule)
-    for tokname, tokvalue in lr_grammar_scanner(grammar):
+    for tokname, tokvalue, tokpos in lr_grammar_scanner(grammar):
        if tokname == 'minus':
            next_edit_rule_commit = False
        if tokname == 'word':
@@ -46,8 +46,8 @@ def ruleset(rules):
class parser(object):

-    def __init__(self, start_sym, grammar):
-        self.kw_set = set()
+    def __init__(self, start_sym, grammar, scanner_kw=[]):
+        self.kw_set = set(scanner_kw)
        self.kw_set.add('$')
        self.R, counter = ruleset(rules(start_sym, grammar, self.kw_set))
        self.I = set((r, i) for r in xrange(counter)
jupyLR/stack.py
@@ -24,6 +24,7 @@ def __init__(self, A):
        self.active = []
        self.A = A
        self.count_active = 0
+        self.previously_active = []

    def enumerate_active(self):
        i = 0
@@ -80,6 +81,7 @@ def reduce(self, node, ruleidx):
    def merge(self):
        merged_s = {}
+        self.previously_active = self.active[:self.count_active]
        for node in self.active[self.count_active:]:
            state = node.data
            if state in merged_s:
jupyLR/tokenizer.py
@@ -1,4 +1,4 @@
-__all__ = ["TokenizerException", "Scanner", "make_scanner"]
+__all__ = ["TokenizerException", "Scanner", "make_scanner", "token_line_col"]
import re
from itertools import chain
@@ -9,6 +9,16 @@
#/python-lexical-analysis-and-tokenization
+def token_line_col(text, tok):
+    line = text.count('\n', 0, tok[2]) + 1
+    offset = text.rfind('\n', 0, tok[2])
+    if offset == -1:
+        column = tok[2] + 1
+    else:
+        column = tok[2] - offset
+    return line, column
+
+
class TokenizerException(Exception):
    pass
@@ -22,8 +32,9 @@ def tokenize_iter(text, token_re, discard_names={}, discard_values={}):
        pos = m.end()
        tokname = m.lastgroup
        tokvalue = m.group(tokname)
+        tokpos = m.start()
        if tokname not in discard_names and tokvalue not in discard_values:
-            yield tokname, tokvalue
+            yield tokname, tokvalue, tokpos
    if pos != len(text):
        msg = 'tokenizer stopped at pos %r of %r in "%s" at "%s"' % (
            pos, len(text), text, text[pos:pos + 3])
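
A worked example of the new helper, assuming the column computation above (the token offset is 0-based, line and column are 1-based; the text and token are made up for illustration):

    text = "ab\ncd"
    tok = ('word', 'd', 4)             # 'd' starts at offset 4
    # one newline occurs before offset 4, so line = 2;
    # the last newline is at offset 2, so column = 4 - 2 = 2
    print token_line_col(text, tok)    # -> (2, 2)
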
sandbox.py
@@ -22,4 +22,4 @@
#print pal.action_to_str()
-b = Automaton('E', 'E = a E = a a E = E E', Scanner(a='a'))
+b = Automaton('E', 'E = a E = a a E = E E', Scanner(a='a', b='b'))
