ARROW-7874: [Python][Archery] Validate docstrings with numpydoc

Closes #6420 from kszucs/numpydoc and squashes the following commits: 183602d <Krisztián Szűcs> flake8 bc65d53 <Krisztián Szűcs> add support to validate only certain symbols like pyarrow.array bf07bad <Krisztián Szűcs> add instructions how to install latest numpydoc if validate function is not available 1830518 <Krisztián Szűcs> restrict packages of interest 25d530b <Krisztián Szűcs> flake8 45a01e3 <Krisztián Szűcs> improve symbol name 4455ac9 <Krisztián Szűcs> support methods 56a87bb <Krisztián Szűcs> example for the signature conversion a3d4428 <Krisztián Szűcs> remove cython typehints 0db0d3c <Krisztián Szűcs> use __doc__ to parse the callable's signature ab0b326 <Krisztián Szűcs> address review comments 017c72b <Krisztián Szűcs> flake8 28d4807 <Krisztián Szűcs> support blacklisting and whitelinsting verification rules a013a62 <Krisztián Szűcs> better validation output fd899db <Krisztián Szűcs> imitate proper __qualname__ for cython objects; provide more context with the messages; blacklist a couple of rules 11b04a5 <Krisztián Szűcs> support for numpydoc validate in archery Authored-by: Krisztián Szűcs <szucs.krisztian@gmail.com> Signed-off-by: Krisztián Szűcs <szucs.krisztian@gmail.com>
apache · Feb 24, 2020 · dc01a36 · dc01a36
1 parent c7c2e03
commit dc01a36
Show file tree

Hide file tree

Showing 3 changed files with 342 additions and 13 deletions.
diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
@@ -27,7 +27,7 @@
 from .benchmark.runner import BenchmarkRunner, CppBenchmarkRunner
 from .lang.cpp import CppCMakeDefinition, CppConfiguration
 from .utils.codec import JsonEncoder
-from .utils.lint import linter, LintValidationException
+from .utils.lint import linter, python_numpydoc, LintValidationException
 from .utils.logger import logger, ctx as log_ctx
 from .utils.source import ArrowSources
 from .utils.tmpdir import tmpdir
@@ -222,6 +222,8 @@ def build(ctx, src, build_dir, force, targets, **kwargs):
 @click.option("--with-flake8", default=True, type=BOOL,
               show_default=True,
               help="Lint python files with flake8.")
+@click.option("--with-numpydoc", default=False, type=BOOL,
+              show_default=True, help="Lint python files with numpydoc.")
 @click.option("--with-cmake-format", default=True, type=BOOL,
               show_default=True,
               help="Lint CMakeFiles.txt files with cmake-format.py.")
@@ -247,6 +249,35 @@ def lint(ctx, src, **kwargs):
         sys.exit(1)
 
 
+@archery.command(short_help="Lint python docstring with NumpyDoc")
+@click.argument('symbols', nargs=-1)
+@click.option("--src", metavar="<arrow_src>", default=ArrowSources.find(),
+              callback=validate_arrow_sources,
+              help="Specify Arrow source directory")
+@click.option("--whitelist", "-w", help="Allow only these rules")
+@click.option("--blacklist", "-b", help="Disallow these rules")
+def numpydoc(src, symbols, whitelist, blacklist):
+    """
+    Pass list of modules or symbols as arguments to restrict the validation.
+
+    By default all modules of pyarrow are tried to be validated.
+
+    Examples
+    --------
+    archery numpydoc pyarrow.dataset
+    archery numpydoc pyarrow.csv pyarrow.json pyarrow.parquet
+    archery numpydoc pyarrow.array
+    """
+    # The docstring above doubles as the command's --help text, so it is
+    # part of the user-facing output.
+    #
+    # Fall back to the built-in set of ignored rules when no (or an empty)
+    # --blacklist value was given on the command line.
+    # NOTE(review): an explicit --blacklist/--whitelist arrives here as a
+    # single string while the fallback is a set -- confirm the consumer
+    # handles both shapes the same way.
+    if not blacklist:
+        blacklist = {'GL01', 'SA01', 'EX01', 'ES01'}
+
+    try:
+        # NOTE(review): presumably ok() raises LintValidationException for a
+        # failed result -- confirm in utils.lint.
+        for outcome in python_numpydoc(symbols, whitelist=whitelist,
+                                       blacklist=blacklist):
+            outcome.ok()
+    except LintValidationException:
+        # report the failure through a non-zero exit status
+        sys.exit(1)
+
+
 @archery.group()
 @click.pass_context
 def benchmark(ctx):

diff --git a/dev/archery/archery/lang/python.py b/dev/archery/archery/lang/python.py
@@ -0,0 +1,210 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import inspect
+import tokenize
+from contextlib import contextmanager
+
+try:
+    from numpydoc.validate import Docstring, validate
+except ImportError:
+    have_numpydoc = False
+else:
+    have_numpydoc = True
+
+from ..utils.command import Command, default_bin
+
+
+class Flake8(Command):
+    """Command subclass bound to the flake8 executable."""
+
+    def __init__(self, flake8_bin=None):
+        # NOTE(review): default_bin presumably prefers the explicitly passed
+        # `flake8_bin` and otherwise falls back to "flake8" -- confirm in
+        # utils.command.
+        self.bin = default_bin(flake8_bin, "flake8")
+
+
+def _tokenize_signature(s):
+    """
+    Tokenize the text of a callable signature.
+
+    Parameters
+    ----------
+    s : str
+        Signature text, e.g. ``array(obj, bool safe=True)``.
+
+    Returns
+    -------
+    generator of tokenize.TokenInfo
+        The token stream produced by ``tokenize.tokenize``; its first token
+        is the ENCODING token.
+    """
+    # tokenize.tokenize() expects a readline-like callable returning bytes
+    # and decodes them as UTF-8 by default, so encode with UTF-8 as well; an
+    # ASCII-only encode would reject valid non-ASCII names or defaults.
+    lines = s.encode('utf-8').splitlines()
+    # the exhausted iterator raises StopIteration, which tokenize treats as
+    # the end of the input
+    return tokenize.tokenize(iter(lines).__next__)
+
+
+def _convert_typehint(tokens):
+    """
+    Drop cython style typehints from a tokenized signature.
+
+    Everything before the first opening bracket (the callable's name) is
+    discarded. From there on, whenever two NAME tokens directly follow each
+    other (like ``bool argument``) only the second one is kept.
+
+    Parameters
+    ----------
+    tokens : iterable of tokenize.TokenInfo
+        Token stream of the signature.
+
+    Yields
+    ------
+    tuple of (int, str)
+        Token type and token string pairs, suitable for
+        ``tokenize.untokenize``.
+
+    Raises
+    ------
+    ValueError
+        If more than two NAME tokens directly follow each other.
+    """
+    names = []
+    opening_bracket_reached = False
+    for token in tokens:
+        # omit the tokens before the opening bracket
+        if not opening_bracket_reached:
+            if token.string == '(':
+                opening_bracket_reached = True
+            else:
+                continue
+
+        if token.type == tokenize.NAME:
+            # collect consecutive NAME tokens until another token closes
+            # the run
+            names.append(token)
+            continue
+
+        if len(names) > 2:
+            raise ValueError(
+                'More than two NAME tokens follow each other: {}'.format(
+                    ' '.join(name.string for name in names)
+                )
+            )
+        if names:
+            # A single NAME is a plain identifier. Two NAME tokens mean a
+            # cython typehint like `bool argument`, so keep only the last
+            # one. Note that we could convert it to python typehints, but
+            # hints are not supported by _signature_fromstr.
+            yield (names[-1].type, names[-1].string)
+        names = []
+        yield (token.type, token.string)
+
+
+def inspect_signature(obj):
+    """
+    Custom signature inspection primarily for cython generated callables.
+
+    Cython puts the signatures to the first line of the docstrings, which we
+    can reuse to parse the python signature from, but some gymnastics are
+    required, like removing the cython typehints.
+
+    It converts the cython signature:
+        array(obj, type=None, mask=None, size=None, from_pandas=None,
+              bool safe=True, MemoryPool memory_pool=None)
+    To:
+        <Signature (obj, type=None, mask=None, size=None, from_pandas=None,
+                    safe=True, memory_pool=None)>
+
+    Parameters
+    ----------
+    obj : callable
+        Object whose docstring starts with its signature.
+
+    Returns
+    -------
+    inspect.Signature
+
+    Raises
+    ------
+    ValueError
+        If the object has no docstring or its first line is blank, so there
+        is nothing to parse a signature from.
+    """
+    doc = getattr(obj, '__doc__', None)
+    cython_signature = doc.splitlines()[0] if doc else ''
+    if not cython_signature.strip():
+        # fail with the same exception type inspect.signature uses when no
+        # signature can be determined, instead of an opaque AttributeError
+        raise ValueError(
+            'Cannot parse a signature from the docstring of {!r}: the '
+            'first line of the docstring is empty'.format(obj)
+        )
+    cython_tokens = _tokenize_signature(cython_signature)
+    python_tokens = _convert_typehint(cython_tokens)
+    python_signature = tokenize.untokenize(python_tokens)
+    # _signature_fromstr is a private helper of the inspect module
+    return inspect._signature_fromstr(inspect.Signature, obj, python_signature)
+
+
+class NumpyDoc:
+    """
+    Validate the docstrings of importable objects with numpydoc.
+
+    Parameters
+    ----------
+    symbols : iterable, optional
+        Qualified names (or already loaded objects) to validate, by default
+        only 'pyarrow'.
+
+    Raises
+    ------
+    RuntimeError
+        If ``numpydoc.validate`` could not be imported.
+    """
+
+    def __init__(self, symbols=None):
+        if not have_numpydoc:
+            raise RuntimeError(
+                'Numpydoc is not available, install the development version '
+                'with command: pip install '
+                'git+https://github.com/numpy/numpydoc'
+            )
+        self.symbols = set(symbols or {'pyarrow'})
+
+    def traverse(self, fn, obj, from_package):
+        """Apply a function on publicly exposed API components.
+
+        Recursively iterates over the members of the passed object. It omits
+        any '_' prefixed symbols and the ones defined outside of
+        `from_package`.
+
+        Parameters
+        ----------
+        fn : callable
+            Called once with every visited object, including `obj` itself.
+        obj : Any
+            Object to start the traversal from. Visited objects are tracked
+            in a set, so they must be hashable.
+        from_package : string
+            Predicate to only consider objects from this package: a member
+            is visited only if its ``__module__`` starts with this prefix.
+        """
+        todo = [obj]
+        seen = set()
+
+        while todo:
+            obj = todo.pop()
+            if obj in seen:
+                continue
+            seen.add(obj)
+
+            fn(obj)
+
+            for name in dir(obj):
+                if name.startswith('_'):
+                    continue
+
+                try:
+                    member = getattr(obj, name)
+                except AttributeError:
+                    # dir() may list names which cannot be resolved on the
+                    # object; skip them instead of aborting the traversal
+                    continue
+
+                module = getattr(member, '__module__', None)
+                if not (module and module.startswith(from_package)):
+                    continue
+
+                todo.append(member)
+
+    @contextmanager
+    def _apply_patches(self):
+        """
+        Patch Docstring class to bypass loading already loaded python objects.
+
+        Additionally replaces ``inspect.signature`` with a variant falling
+        back to parsing the signature from the docstring. Both patches are
+        reverted when the context exits.
+        """
+        orig_load_obj = Docstring._load_obj
+        orig_signature = inspect.signature
+
+        @staticmethod
+        def _load_obj(obj):
+            # By default it expects a qualname and imports the object, but
+            # we already have the loaded object after the API traversal.
+            if isinstance(obj, str):
+                return orig_load_obj(obj)
+            else:
+                return obj
+
+        def signature(obj):
+            # inspect.signature tries to parse __text_signature__ if other
+            # properties like __signature__ don't exist, but cython
+            # doesn't set that property despite that embedsignature cython
+            # directive is set. The only way to inspect a cython compiled
+            # callable's signature is to parse it from __doc__ while
+            # embedsignature directive is set during the build phase.
+            # So patch inspect.signature function to attempt to parse the
+            # first line of callable.__doc__ as a signature.
+            try:
+                return orig_signature(obj)
+            except Exception as orig_error:
+                try:
+                    return inspect_signature(obj)
+                except Exception:
+                    # the fallback failed too, report the original problem
+                    raise orig_error
+
+        try:
+            Docstring._load_obj = _load_obj
+            inspect.signature = signature
+            yield
+        finally:
+            Docstring._load_obj = orig_load_obj
+            inspect.signature = orig_signature
+
+    def validate(self, from_package='', rules_blacklist=None,
+                 rules_whitelist=None):
+        """
+        Validate the docstrings of the configured symbols and their members.
+
+        Parameters
+        ----------
+        from_package : string, default ''
+            Passed to `traverse`; the empty default accepts every member
+            having a non-empty ``__module__``.
+        rules_blacklist : container of str, optional
+            Error codes to drop from the results.
+        rules_whitelist : container of str, optional
+            If given, only these error codes are kept.
+
+        Returns
+        -------
+        list of tuple
+            ``(obj, result)`` pairs for the objects having at least one
+            remaining error, where ``result`` is the value returned by
+            numpydoc's validate function with its 'errors' entry filtered.
+        """
+        results = []
+
+        def callback(obj):
+            # `validate` resolves to the module level numpydoc function
+            # here, not to this method
+            result = validate(obj)
+
+            errors = []
+            for errcode, errmsg in result.get('errors', []):
+                if rules_whitelist and errcode not in rules_whitelist:
+                    continue
+                if rules_blacklist and errcode in rules_blacklist:
+                    continue
+                errors.append((errcode, errmsg))
+
+            if errors:
+                result['errors'] = errors
+                results.append((obj, result))
+
+        with self._apply_patches():
+            for symbol in self.symbols:
+                try:
+                    obj = Docstring._load_obj(symbol)
+                except (ImportError, AttributeError):
+                    print('{} is not available for import'.format(symbol))
+                else:
+                    self.traverse(callback, obj, from_package=from_package)
+
+        return results