Merge branch 'rowcol' into 0.9.x-rowcol

- bring in (mostly) correct line/column number reporting
calmjs · Jun 13, 2017 · 124ab8a · 124ab8a
2 parents e988052 + a6c5eaf
commit 124ab8a
Show file tree

Hide file tree

Showing 10 changed files with 1,320 additions and 766 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -1,6 +1,25 @@
 Changelog
 =========
 
+0.9.1 - Unreleased
+------------------
+
+- Corrected the line number reporting for the lexer, and corrected the
+  propagation of that to the parser and the Node subclasses.  Fixes the
+  incorrect implementation added by `moses-palmer/slimit@8f9a39c7769
+  <https://github.com/moses-palmer/slimit/commit/8f9a39c7769>`_ (where
+  the line numbers are tabulated incorrectly when comments are present),
+  and also the yacc tracking added by `moses-palmer/slimit@6aa92d68e0
+  <https://github.com/moses-palmer/slimit/commit/6aa92d68e0>`_ (where
+  the custom lexer class does not provide the position attributes
+  required by ply).
+- Implemented bookkeeping of column number.
+- The repr form of Node now shows the line/col number info by default;
+  the visit method of the ReprVisitor class has not been changed, only
+  the invocation of it via the callable form has, as that is the call
+  target for __repr__.  This is a good time to mention that named
+  methods afford the most control for usage as documented already.
+
 0.9.0 - 2017-06-09
 ------------------
 

diff --git a/src/calmjs/parse/asttypes.py b/src/calmjs/parse/asttypes.py
@@ -26,14 +26,34 @@
 
 
 class Node(object):
-    def __init__(self, children=None, p=None):
+    def __init__(self, children=None):
         self._children_list = [] if children is None else children
-        self.setpos(p)
-
-    def setpos(self, p):
-        self.lexpos = None if p is None else p.lexpos(1)
-        self.lineno = None if p is None else p.lineno(1)
-        # print 'setpos', self, p, self.lexpos, self.lineno
+        self.lexpos = self.lineno = self.colno = None
+
+    def setpos(self, p, idx=1):
+        self.lexpos = p.lexpos(idx)
+        self.lineno = p.lineno(idx)
+        # YaccProduction does not provide helpers for colno, so ask the
+        # lexer for a lookup_colno helper; fall back to 0 without one.
+        self.colno = (
+            p.lexer.lookup_colno(self.lineno, self.lexpos) if callable(
+                getattr(p.lexer, 'lookup_colno', None)) else 0
+        )
+
+        # the very ugly debugger invocation for locating the special
+        # cases that are required
+
+        # if not self.lexpos and not self.lineno:
+        #     print('setpos', self.__class__.__name__, p.stack,
+        #           self.lexpos, self.lineno, self.colno)
+        #     # uncomment when yacc_tracking is True
+        #     # import pdb;pdb.set_trace()
+        #     # uncomment when yacc_tracking is False
+        #     # import sys
+        #     # from traceback import extract_stack
+        #     # _src = extract_stack(sys._getframe(1), 1)[0].line
+        #     # if '# require yacc_tracking' not in _src:
+        #     #     import pdb;pdb.set_trace()
 
     def __iter__(self):
         for child in self.children():
@@ -175,11 +195,10 @@ class VarStatement(Node):
 
 
 class VarDecl(Node):
-    def __init__(self, identifier, initializer=None, p=None):
+    def __init__(self, identifier, initializer=None):
         self.identifier = identifier
         self.identifier._mangle_candidate = True
         self.initializer = initializer
-        self.setpos(p)
 
     def children(self):
         return [self.identifier, self.initializer]
@@ -368,7 +387,7 @@ def __init__(self, elements):
         self.elements = elements
 
     def children(self):
-        return self.elements
+        return [self.elements]
 
 
 class Debugger(Node):

diff --git a/src/calmjs/parse/lexers/es5.py b/src/calmjs/parse/lexers/es5.py
@@ -36,6 +36,7 @@
     COMBINING_MARK,
     CONNECTOR_PUNCTUATION,
 )
+from calmjs.parse.utils import format_lex_token
 
 # See "Regular Expression Literals" at
 # http://www.mozilla.org/js/language/js20-2002-04/rationale/syntax.html
@@ -109,20 +110,43 @@ class Lexer(object):
     http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-262.pdf
     """
     def __init__(self):
+        self.lexer = None
         self.prev_token = None
         self.cur_token = None
         self.cur_token_real = None
         self.next_tokens = []
         self.token_stack = [[None, []]]
+        self.newline_idx = [0]
         self.build()
 
+    @property
+    def lineno(self):
+        return self.lexer.lineno if self.lexer else 0
+
+    @property
+    def lexpos(self):
+        return self.lexer.lexpos if self.lexer else 0
+
+    @property
+    def last_newline_lexpos(self):
+        return self.newline_idx[-1]
+
     def build(self, **kwargs):
         """Build the lexer."""
         self.lexer = ply.lex.lex(object=self, **kwargs)
 
     def input(self, text):
         self.lexer.input(text)
 
+    def _set_pos(self, token):
+        lines = token.value.splitlines(True)
+        lexpos = token.lexpos
+        for line in lines:
+            if line[-1:] in '\r\n':
+                lexpos += len(line)
+                self.lexer.lineno += 1
+                self.newline_idx.append(lexpos)
+
     def token(self):
         if self.next_tokens:
             return self.next_tokens.pop()
@@ -139,15 +163,15 @@ def token(self):
             except IndexError:
                 tok = self._get_update_token()
                 if tok is not None and tok.type == 'LINE_TERMINATOR':
-                    lexer.lineno += len(tok.value.splitlines())
+                    self._set_pos(tok)
                     continue
                 else:
                     return tok
 
             if char != '/' or (char == '/' and next_char in ('/', '*')):
                 tok = self._get_update_token()
                 if tok.type in DIVISION_SYNTAX_MARKERS:
-                    lexer.lineno += len(tok.value.splitlines())
+                    self._set_pos(tok)
                     continue
                 else:
                     return tok
@@ -227,8 +251,11 @@ def _get_update_token(self):
                 # TODO actually give up earlier than this with the first
                 # mismatch.
                 raise ECMASyntaxError(
-                    "Mismatched '%s' at line %d" % (
-                        self.cur_token.value, self.cur_token.lineno)
+                    "Mismatched '%s' at %d:%d" % (
+                        self.cur_token.value,
+                        self.cur_token.lineno,
+                        self._get_colno(self.cur_token),
+                    )
                 )
 
         # insert semicolon before restricted tokens
@@ -240,18 +267,44 @@ def _get_update_token(self):
                                          'RETURN', 'THROW']):
             return self._create_semi_token(self.cur_token)
 
-        return self.cur_token
+        return self._set_colno(self.cur_token)
+
+    def _set_colno(self, token):
+        if token:
+            token.colno = self._get_colno(token)
+        return token
+
+    def _get_colno(self, token):
+        # apply an offset of 1 so the column is 1-based, matching what
+        # commonly used text editors display.
+        return token.lexpos - self.last_newline_lexpos + 1
+
+    def lookup_colno(self, lineno, lexpos):
+        """
+        Look up a colno from the lineno and lexpos.
+        """
+
+        # apply an offset of 1 so the column is 1-based, matching what
+        # commonly used text editors display.
+        return lexpos - self.newline_idx[lineno - 1] + 1
 
     def _create_semi_token(self, orig_token):
         token = ply.lex.LexToken()
         token.type = 'SEMI'
         token.value = ';'
         if orig_token is not None:
             token.lineno = orig_token.lineno
+            # TODO figure out whether/how to normalize this with the
+            # actual length of the original token...
+            # Though, if actual use case boils down to error reporting,
+            # line number is sufficient, and leaving it as 0 means it
+            # shouldn't get dealt with during source map generation.
+            token.colno = 0
             token.lexpos = orig_token.lexpos
         else:
             token.lineno = 0
             token.lexpos = 0
+            token.colno = 0
         return token
 
     # iterator protocol
@@ -350,8 +403,8 @@ def next(self):
 
     def t_regex_error(self, token):
         raise ECMARegexSyntaxError(
-            "Error parsing regular expression '%s' at %s" % (
-                token.value, token.lineno)
+            "Error parsing regular expression '%s' at %s:%s" % (
+                token.value, token.lineno, self._get_colno(token))
         )
 
     # Punctuators
@@ -517,6 +570,9 @@ def t_ID(self, token):
         return token
 
     def t_error(self, token):
-        # TODO figure out how to report column instead of lexpos.
-        raise ECMASyntaxError('Illegal character %r at %s:%s after %s' % (
-            token.value[0], token.lineno, token.lexpos, self.cur_token))
+        raise ECMASyntaxError(
+            'Illegal character %r at %s:%s after %s' % (
+                token.value[0], token.lineno, self._get_colno(token),
+                format_lex_token(self.cur_token),
+            )
+        )