From b9234211753f7b575e77bcec5ecce7c59e96291b Mon Sep 17 00:00:00 2001 From: Juanjo Alvarez Date: Thu, 16 Nov 2017 13:48:34 +0100 Subject: [PATCH] Added static typing to astimprover Signed-off-by: Juanjo Alvarez --- fixtures/regenerate_tests.sh | 13 -- fixtures/rolenumbers.txt | 73 ------ .../python_driver/astimprove.py | 217 +++++++++--------- .../python_driver/requestprocessor.py | 4 +- .../python_package/python_driver/version.py | 2 +- 5 files changed, 116 insertions(+), 193 deletions(-) delete mode 100644 fixtures/regenerate_tests.sh delete mode 100644 fixtures/rolenumbers.txt diff --git a/fixtures/regenerate_tests.sh b/fixtures/regenerate_tests.sh deleted file mode 100644 index 58b7519d..00000000 --- a/fixtures/regenerate_tests.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -# Run this script from the parent directory to regenerate the integration tests with the new UAST numeric codes - -LASTIMAGE=dev-`git log -1 --sparse|head -1|cut -f2 -d " "|cut -b1-7` -BASEDIR=$PWD -cd tests/sources -for SOURCE in $(ls *.py) -do - echo $SOURCE - cd $BASEDIR - docker run -it --rm -v $BASEDIR:/code bblfsh/python-driver:$LASTIMAGE /opt/driver/bin/driver parse-native /code/tests/sources/$SOURCE > tests/native/$SOURCE.json - docker run -it --rm -v $BASEDIR:/code bblfsh/python-driver:$LASTIMAGE /opt/driver/bin/driver parse-uast /code/tests/sources/$SOURCE > tests/uast/$SOURCE.json -done diff --git a/fixtures/rolenumbers.txt b/fixtures/rolenumbers.txt deleted file mode 100644 index 0e0fc379..00000000 --- a/fixtures/rolenumbers.txt +++ /dev/null @@ -1,73 +0,0 @@ - SimpleIdentifier Role = iota - QualifiedIdentifier - Expression - Statement - File - PackageDeclaration - ImportDeclaration - ImportPath - ImportAlias - FunctionDeclaration - TypeDeclaration - VisibleFromInstance - VisibleFromType - VisibleFromSubtype - VisibleFromPackage - VisibleFromSubpackage - VisibleFromModule - VisibleFromFriend - VisibleFromWorld - If - IfCondition - IfBody - IfElse - Switch - SwitchCase - SwitchCaseCondition - SwitchCaseBody - SwitchDefault - For - ForInit - ForExpression - ForUpdate - ForBody - ForEach - While - WhileCondition - WhileBody - DoWhile - DoWhileCondition - DoWhileBody - Break - Continue - Block - BlockScope - Return - Try - TryBody - TryCatch - TryFinally - Throw - Assert - Call - CallReceiver - CallCallee - CallPositionalArgument - CallNamedArgument - CallNamedArgumentName - CallNamedArgumentValue - Noop - Literal - NullLiteral - StringLiteral - NumberLiteral - TypeLiteral - Type - PrimitiveType - Assignment - AssignmentVariable - AssignmentValue - This - Comment - Documentation - Whitespace diff --git a/native/python_package/python_driver/astimprove.py b/native/python_package/python_driver/astimprove.py index 31ba71be..2ebc2e7f 100644 --- a/native/python_package/python_driver/astimprove.py +++ b/native/python_package/python_driver/astimprove.py @@ -3,45 +3,61 @@ from the import pydetector module. 
""" -from __future__ import print_function - import token as token_module import tokenize from ast import literal_eval from codecs import encode -from six import StringIO +from copy import deepcopy +from io import BytesIO +from typing import Iterable, List, Dict, Tuple, Any, Union, \ + Set, cast, Optional __all__ = ["AstImprover"] -TOKEN_TYPE = 0 -TOKEN_VALUE = 1 -TOKEN_STARTLOC = 2 -TOKEN_ENDLOC = 3 -TOKEN_RAWVALUE = 4 +NOOP_TOKENS_LINE = {'COMMENT', 'INDENT', 'NL', 'NEWLINE'} -TOKENROW = 0 -TOKENCOL = 1 +Pos = Tuple[int, int] +Node = Dict[str, Any] +AstDict = Dict[Any, Any] +VisitResult = Union[Node, List[Node]] -NOOP_TOKENS_LINE = {'COMMENT', 'INDENT', 'NL', 'NEWLINE'} +class TokenPos(): + def __init__(self, row: int, col: int) -> None: + self.row = row + self.col = col -def _token_name(t): - return token_module.tok_name[t[TOKEN_TYPE]] +class Token(): + def __init__(self, type_: int, value: str, start: Tuple[int, int], + end: Tuple[int, int], rawvalue: str) -> None: + self.type = type_ + self.name = token_module.tok_name[type_] + self.value = value + self.start = TokenPos(*start) + self.end = TokenPos(*end) + self.rawvalue = rawvalue -def _create_tokenized_lines(codestr, tokens): + def __str__(self) -> str: + s = '%s, %s, %s' % (self.type, self.name, self.value) + s += '\n%d %d' % (self.start.row, self.start.col) + s += '\n%d %d' % (self.end.row, self.end.col) + return s + + +def _create_tokenized_lines(codestr: str, tokens: Iterable[Token]) -> List[List[Token]]: lines = codestr.splitlines() if codestr else [] - result = [] + result: List[List[Token]] = [] for i in range(0, len(lines) + 1): result.append([]) for token in tokens: # Save noops in the line of the starting row except for strings where # we save it in the last line (because they can be multiline) - if _token_name(token) == 'STRING': - line = token[TOKEN_ENDLOC][TOKENROW] - 1 + if token.name == 'STRING': + line = token.end.row - 1 else: - line = token[TOKEN_STARTLOC][TOKENROW] - 1 + line = token.start.row - 1 result[line].append(token) assert len(lines) + 1 == len(result), len(result) return result @@ -58,8 +74,8 @@ class LocationFixer(object): in a questionable way (sys.stdout.write -> gives the same column for the three). """ - def __init__(self, codestr, token_lines): - self._current_line = None + def __init__(self, codestr: str, token_lines: List[List[Token]]) -> None: + self._current_line = -1 # _lines will initially hold the same list of tokens per line as received (in a # dict so speed lookups), but the tokens inside will be removed as they're found @@ -67,43 +83,42 @@ def __init__(self, codestr, token_lines): # same name on the same line) self._lines = {idx: val for idx, val in enumerate(token_lines)} - def _pop_token(self, lineno, token_value): + def _pop_token(self, lineno: int, token_value: str) -> Token: tokensline = self._lines[lineno - 1] # Pop the first token with the same name in the same line for t in tokensline: - linetok_value = t[TOKEN_VALUE] - if _token_name(t) != 'STRING': - line_value = linetok_value + if t.name != 'STRING': + line_value = t.value else: - if linetok_value[0] == 'f' and linetok_value[1] in ('"', "'"): + if t.value[0] == 'f' and t.value[1] in ('"', "'"): # fstring: token identify as STRING but they parse into the AST as a # collection of nodes so the token_value is different. To find the # real token position we'll search inside the fstring token value. 
-                    tok_subpos = linetok_value.find(str(token_value))
+                    tok_subpos = t.value.find(token_value)
 
                     if tok_subpos != -1:
-                        real_col = t[TOKEN_STARTLOC][TOKENCOL] + tok_subpos
 
                         # We don't remove the fstring token from the line in this case; other
                         # nodes could match different parts of it
-                        return (t[TOKEN_TYPE], t[TOKEN_VALUE], (t[TOKEN_STARTLOC][0], real_col),
-                                t[TOKEN_ENDLOC], t[TOKEN_RAWVALUE])
+                        newtok = deepcopy(t)
+                        newtok.start.col = t.start.col + tok_subpos
+                        return newtok
 
                     raise TokenNotFoundException("Could not find token '{}' inside f-string '{}'"
-                                                 .format(token_value, linetok_value))
+                                                 .format(token_value, t.value))
                 else:
                     # normal string; they include the single or double quotes so we literal_eval
-                    line_value = literal_eval(linetok_value)
+                    line_value = literal_eval(t.value)
 
             if str(line_value) == str(token_value):
                 tokensline.remove(t)
                 return t
 
         raise TokenNotFoundException("Token named '{}' not found in line {}"
                                      .format(token_value, lineno))
 
-    def sync_node_pos(self, nodedict):
+    def sync_node_pos(self, nodedict: Node) -> None:
         """
         Check the column position, updating the column if needed (this changes the
         nodedict argument). Some Nodes have questionable column positions in the Python
@@ -137,10 +152,10 @@ def sync_node_pos(self, nodedict):
             # position in that case is fine (uses the last line in that case)
             return
 
-        nodedict["lineno"] = token[TOKEN_STARTLOC][TOKENROW]
-        nodedict["col_offset"] = token[TOKEN_STARTLOC][TOKENCOL]
-        nodedict["end_lineno"] = token[TOKEN_ENDLOC][TOKENROW]
-        nodedict["end_col_offset"] = token[TOKEN_ENDLOC][TOKENCOL]
+        nodedict["lineno"] = token.start.row
+        nodedict["col_offset"] = token.start.col
+        nodedict["end_lineno"] = token.end.row
+        nodedict["end_col_offset"] = token.end.col
 
 
 class NoopExtractor(object):
@@ -149,8 +164,8 @@ class NoopExtractor(object):
     like blanks and comments.
     """
-    def __init__(self, codestr, token_lines):
-        self._current_line = None
+    def __init__(self, codestr: str, token_lines: List[List[Token]]) -> None:
+        self._current_line = -1
         self._all_lines = tuple(token_lines)
         self.astmissing_lines = self._create_astmissing_lines()
 
@@ -158,43 +173,43 @@ def __init__(self, codestr, token_lines):
         # of every "real" node to avoid having this node duplicated on all semantic
         # nodes in the same line. It will contain just the
        # line numbers of already added sameline_noops
-        self._sameline_added_noops = set()
+        self._sameline_added_noops: Set[int] = set()
 
-    def _create_astmissing_lines(self):
+    def _create_astmissing_lines(self) -> List[Optional[Token]]:
         """
         Return a copy of line_tokens containing lines ignored by the AST
         (comments and blanks-only lines)
         """
-        lines = []
-        nl_token = (token_module.NEWLINE, '\n', (0, 0), (0, 0), '\n')
+        lines: List[Optional[Token]] = []
+        nl_token = Token(token_module.NEWLINE, '\n', (0, 0), (0, 0), '\n')
 
         for i, linetokens in enumerate(self._all_lines):
-            if len(linetokens) == 1 and _token_name(linetokens[0]) == 'NL':
+            if len(linetokens) == 1 and linetokens[0].name == 'NL':
                 lines.append(nl_token)
             else:
                 for token in linetokens:
-                    if _token_name(token) == 'COMMENT' and \
-                            token[TOKEN_RAWVALUE].lstrip().startswith('#'):
+                    if token.name == 'COMMENT' and \
+                            token.rawvalue.lstrip().startswith('#'):
                         lines.append(token)
                         break
                 else:
                     lines.append(None)
         assert len(lines) == len(self._all_lines)
 
-        for i, linetokens in enumerate(lines):
-            if linetokens:
+        for i, linetokens2 in enumerate(lines):
+            if linetokens2:
                 self._current_line = i
                 break
         else:
             self._current_line = len(lines)
         return lines
 
-    def add_noops(self, node, root):
+    def add_noops(self, node: Node, isRoot: bool) -> None:
         if not isinstance(node, dict):
-            return node
+            return
 
-        def _create_nooplines_list(startline, noops_previous):
-            nooplines = []
+        def _create_nooplines_list(startline: int, noops_previous: List[str]) -> List[Node]:
+            nooplines: List[Node] = []
             curline = startline
             for noopline in noops_previous:
                 nooplines.append({
@@ -221,22 +236,22 @@ def _create_nooplines_list(startline, noops_previous):
 
         # Other noops at the end of its significant line except the implicit
         # finishing newline
-        noops_sameline = self.sameline_remainder_noops(node)
-        joined_sameline = ''.join([x['value'] for x in noops_sameline])
+        noops_sameline: List[Token] = [i for i in self.sameline_remainder_noops(node) if i]
+        joined_sameline = ''.join([tok.value for tok in noops_sameline])
+
         if noops_sameline:
             node['noops_sameline'] = {
                 "ast_type": "SameLineNoops",
                 "lineno": node.get("lineno", 0),
-                "col_offset": noops_sameline[0]["colstart"],
+                "col_offset": noops_sameline[0].start.col,
                 "noop_line": joined_sameline,
                 "end_lineno": node.get("lineno", 0),
-                "end_col_offset": max(noops_sameline[-1]["colend"], 1)
+                "end_col_offset": max(noops_sameline[-1].end.col, 1)
             }
 
         # Finally, if this is the root node, add all noops after the last op node
-        if root:
-            noops_remainder, startline, endline, endcol =\
-                self.remainder_noops()
+        if isRoot:
+            noops_remainder, startline, endline, endcol = self.remainder_noops()
             if noops_remainder:
                 node['noops_remainder'] = {
                     "ast_type": "RemainderNoops",
@@ -247,77 +262,72 @@ def _create_nooplines_list(startline, noops_previous):
                     "lines": _create_nooplines_list(startline, noops_remainder)
                 }
 
-    def previous_nooplines(self, nodedict):
+    def previous_nooplines(self, nodedict: Node) -> Tuple[List[str], int, int, int]:
         """Return a list of the preceding comment and blank lines"""
         previous = []
-        first_lineno = None
-        lastline = None
-        lastcol = None
+        first_lineno = -1
+        lastline = -1
+        lastcol = -1
 
         lineno = nodedict.get('lineno')
         if lineno and self.astmissing_lines:
             while self._current_line < lineno:
                 token = self.astmissing_lines[self._current_line]
                 if token:
-                    s = token[TOKEN_RAWVALUE].rstrip() + '\n'
+                    s = token.rawvalue.rstrip() + '\n'
                     previous.append(s)
 
                     # take only the first line of the noops as the start and the last
                    # one (overwritten every iteration)
-                    if not first_lineno:
+                    if first_lineno == -1:
                         first_lineno = self._current_line + 1
                         lastline = self._current_line + 1
-                    lastcol = token[TOKEN_ENDLOC][TOKENCOL]
+                    lastcol = token.end.col
                 self._current_line += 1
         return previous, first_lineno, lastline, lastcol
 
-    def sameline_remainder_noops(self, nodedict):
+    def sameline_remainder_noops(self, nodedict: Node) -> List[Token]:
         """
-        Return a string containing the trailing (until EOL) noops for the
+        Return a list containing the trailing (until EOL) noop Tokens for the
         node, if any. The ending newline is implicit and thus not returned
         """
         # Without a line number for the node we can't know
         lineno = nodedict.get("lineno")
         if not lineno:
-            return ''
+            return []
 
         # Skip remainder comments already added to a node in this line to avoid every node
         # in the same line having it (which is not conceptually wrong, but not DRY)
         if lineno in self._sameline_added_noops:
-            return ''
+            return []
 
         # Module nodes have the remaining comments but since we put their first line as "1"
         # any comment on the first line would wrongly show as sameline comment for the module
         if nodedict["ast_type"] == 'Module':
-            return ''
+            return []
 
         tokens = self._all_lines[lineno - 1]
-        trailing = []
+        trailing: List[Token] = []
 
         for token in tokens:
-            if _token_name(token) not in NOOP_TOKENS_LINE:
+            if token.name not in NOOP_TOKENS_LINE:
                 # restart
                 trailing = []
             else:
-                trailing.append({
-                    'rowstart': token[TOKEN_STARTLOC][TOKENROW],
-                    'colstart': token[TOKEN_STARTLOC][TOKENCOL],
-                    'rowend': token[TOKEN_ENDLOC][TOKENROW],
-                    'colend': token[TOKEN_ENDLOC][TOKENCOL],
-                    'value': token[TOKEN_VALUE]
-                })
+                trailing.append(token)
+
         if not trailing:
-            return ''
+            return []
 
         self._sameline_added_noops.add(lineno)
-        nonewline_trailing = trailing[:-1] if trailing[-1]['value'] == '\n' else trailing
+        nonewline_trailing = trailing[:-1] if trailing[-1].value == '\n' else trailing
         return nonewline_trailing
 
-    def remainder_noops(self):
+    def remainder_noops(self) -> Tuple[List[str], int, int, int]:
         """return any remaining ignored lines."""
-        trailing = []
-        lastline = None
+        trailing: List[str] = []
+        lastline = -1
         lastcol = 1
         i = self._current_line
 
@@ -325,14 +335,11 @@ def remainder_noops(self):
         while i < len(self.astmissing_lines):
             token = self.astmissing_lines[i]
-            if token:
-                s = token[TOKEN_RAWVALUE]
-                trailing.append(s)
             i += 1
 
             if token:
+                trailing.append(token.rawvalue)
                 lastline = i
-                lastcol = len(token)
+                lastcol = token.end.col
             else:
                 lastcol = 1
             self._current_line = i
@@ -399,10 +406,10 @@ def remainder_noops(self):
 
 class AstImprover(object):
 
-    def __init__(self, codestr, astdict):
+    def __init__(self, codestr: str, astdict: AstDict) -> None:
         self._astdict = astdict
         # Tokenize and create the noop extractor and the position fixer
-        self._tokens = tokenize.generate_tokens(StringIO(codestr).readline)
+        self._tokens: List[Token] = [Token(*i) for i in
+                                     tokenize.tokenize(BytesIO(codestr.encode('utf-8')).readline)]
         token_lines = _create_tokenized_lines(codestr, self._tokens)
         self.noops_sync = NoopExtractor(codestr, token_lines)
         self.pos_sync = LocationFixer(codestr, token_lines)
@@ -414,11 +421,11 @@ def __init__(self, codestr, astdict):
 
         self.visit_Global = self.visit_Nonlocal = self._promote_names
 
-    def parse(self):
+    def parse(self) -> VisitResult:
         res = self.visit(self._astdict, root=True)
         return res
 
-    def visit(self, node, root=False):
+    def visit(self, node: Node, root: bool = False) -> VisitResult:
         # the ctx property always has a "Load"/"Store"/etc dictionary that
        # can be perfectly converted to a string value since they don't
         # hold anything more than the name
@@ -451,14 +458,14 @@ def visit(self, node, root=False):
 
         return visit_result
 
-    def visit_str(self, node):
+    def visit_str(self, node: Node) -> str:
         """
         This visits str fields inside nodes (which are represented as keys
         in the node dictionary), not Str AST nodes
         """
         return str(node)
 
-    def visit_Bytes(self, node):
+    def visit_Bytes(self, node: Node) -> VisitResult:
         try:
             s = node["s"].decode()
             encoding = 'utf8'
@@ -470,7 +477,7 @@ def visit_Bytes(self, node):
         node.update({"s": s, "encoding": encoding})
         return node
 
-    def _promote_names(self, node):
+    def _promote_names(self, node: Node) -> VisitResult:
         # Python AST by default stores global and nonlocal variable names
         # in a "names" array of strings. That breaks the structure of everything
         # else in the AST (dictionaries, properties or list of objects) so we
@@ -483,7 +490,7 @@ def _promote_names(self, node):
             node["names"] = names_as_nodes
         return node
 
-    def visit_NameConstant(self, node):
+    def visit_NameConstant(self, node: Node) -> Node:
         if "value" in node:
             repr_val = repr(node["value"])
             if repr_val in ("True", "False"):
@@ -495,26 +502,26 @@ def visit_NameConstant(self, node):
             node["ast_type"] = "NameConstant"
         return node
 
-    def visit_Num(self, node):
+    def visit_Num(self, node: Node) -> Node:
         # complex objects are not json-serializable
         if isinstance(node["n"], complex):
             node.update({"n": {"real": node["n"].real,
                                "imag": node["n"].imag}})
         return node
 
-    def visit_NoneType(self, node):
+    def visit_NoneType(self, node: Node) -> Node:
         ret = node if node else {}
         ret.update({"LiteralValue": "None",
                     "ast_type": "NoneLiteral"})
         return ret
 
-    def visit_other(self, node):
+    def visit_other(self, node: Node) -> VisitResult:
         for field in node.get("_fields", []):
             meth = getattr(self, "visit_" + node["ast_type"], self.visit_other_field)
             node[field] = meth(node[field])
 
         return node
 
-    def visit_other_field(self, node):
+    def visit_other_field(self, node: Node) -> VisitResult:
         if isinstance(node, dict):
             return self.visit(node)
         elif isinstance(node, list) or isinstance(node, tuple):
@@ -538,7 +545,9 @@ def visit_other_field(self, node):
     spec = importlib.util.spec_from_file_location("module.testmod",
                                                   "../test/fixtures/exported_dict.py")
     testmod = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(testmod)
-    testdict = testmod.testdict
-    pprint(AstImprover(codestr, testdict).parse())
+
+    if spec.loader:
+        spec.loader.exec_module(testmod)
+        testdict = testmod.testdict  # type: ignore
+        pprint(AstImprover(codestr, testdict).parse())
diff --git a/native/python_package/python_driver/requestprocessor.py b/native/python_package/python_driver/requestprocessor.py
index 34d0dc92..213c6556 100644
--- a/native/python_package/python_driver/requestprocessor.py
+++ b/native/python_package/python_driver/requestprocessor.py
@@ -3,7 +3,7 @@
 from pydetector import detector
 from traceback import format_exc
 from python_driver.version import __version__
-from python_driver.astimprove import AstImprover
+from python_driver.astimprove import AstImprover, VisitResult
 from typing import (Any, IO, NewType, Tuple, cast, List, Iterator,
                     Dict, Optional)
 # typing.AnyStr is bugged on this version of MyPy, so...:
@@ -81,7 +81,7 @@ def _send_response(self, response: Response) -> None:
         pass
 
     def _return_error(self, filepath: AnyStr='', status: AnyStr='error',
-                      ast: Optional[Dict[Any, Any]] = None) -> None:
+                      ast: Optional[VisitResult] = None) -> None:
         """
         Build and send to stdout an error response.
        Also log the errors to python_driver.log.
diff --git a/native/python_package/python_driver/version.py b/native/python_package/python_driver/version.py
index 7e49527e..439eb0cd 100644
--- a/native/python_package/python_driver/version.py
+++ b/native/python_package/python_driver/version.py
@@ -1 +1 @@
-__version__ = '1.0'
+__version__ = '1.1'
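
The heart of the patch is the Token/TokenPos wrapper, which replaces index-based access to
tokenize's 5-tuples (t[TOKEN_TYPE], t[TOKEN_STARTLOC][TOKENROW], ...) with named attributes.
Below is a minimal standalone sketch of how the wrapper consumes the tokenizer; the classes
mirror the ones added above, while the sample code string and the prints are only illustrative:

import token as token_module
import tokenize
from io import BytesIO
from typing import Tuple


class TokenPos():
    def __init__(self, row: int, col: int) -> None:
        self.row = row
        self.col = col


class Token():
    def __init__(self, type_: int, value: str, start: Tuple[int, int],
                 end: Tuple[int, int], rawvalue: str) -> None:
        self.type = type_
        self.name = token_module.tok_name[type_]  # e.g. 'NAME', 'OP', 'COMMENT'
        self.value = value
        self.start = TokenPos(*start)
        self.end = TokenPos(*end)
        self.rawvalue = rawvalue  # the full physical line the token came from


codestr = "x = 1  # a comment\n"
# Unlike the removed generate_tokens(StringIO(...)), tokenize.tokenize() works on
# bytes and yields an initial ENCODING token; each yielded TokenInfo is a
# (type, string, start, end, line) 5-tuple, so Token(*t) maps onto it directly,
# exactly as in the new AstImprover.__init__.
tokens = [Token(*t) for t in tokenize.tokenize(BytesIO(codestr.encode('utf-8')).readline)]
for tok in tokens:
    print(tok.name, repr(tok.value), (tok.start.row, tok.start.col))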
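
The f-string branch of _pop_token boils down to offset arithmetic: the tokenizer emits a single
STRING token for the whole f-string, but the AST splits its contents into separate nodes, so a
node's real column is the token's start column plus the offset of the node's value inside the
token text. A standalone sketch of that computation; all values here are made up for illustration:

# One STRING token covering the whole f-string, as the tokenizer reports it:
fstring_value = "f'total: {amount}'"  # what Token.value would hold
fstring_start_col = 8                 # what Token.start.col would hold (assumed)

# An AST node parsed out of the f-string whose position must be recovered:
node_value = "amount"

tok_subpos = fstring_value.find(node_value)  # offset inside the token text
if tok_subpos != -1:
    real_col = fstring_start_col + tok_subpos
    print(real_col)  # 18 (= 8 + 10), the real column of 'amount'
else:
    # the patch raises TokenNotFoundException here; LookupError stands in
    raise LookupError("node value not found inside the f-string token")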