Skip to content

Commit

Permalink
ARROW-7874: [Python][Archery] Validate docstrings with numpydoc
Browse files Browse the repository at this point in the history
Closes #6420 from kszucs/numpydoc and squashes the following commits:

183602d <Krisztián Szűcs> flake8
bc65d53 <Krisztián Szűcs> add support to validate only certain symbols like pyarrow.array
bf07bad <Krisztián Szűcs> add instructions how to install latest numpydoc if validate function is not available
1830518 <Krisztián Szűcs> restrict packages of interest
25d530b <Krisztián Szűcs> flake8
45a01e3 <Krisztián Szűcs> improve symbol name
4455ac9 <Krisztián Szűcs> support methods
56a87bb <Krisztián Szűcs> example for the signature conversion
a3d4428 <Krisztián Szűcs> remove cython typehints
0db0d3c <Krisztián Szűcs> use __doc__ to parse the callable's signature
ab0b326 <Krisztián Szűcs> address review comments
017c72b <Krisztián Szűcs> flake8
28d4807 <Krisztián Szűcs> support blacklisting and whitelisting verification rules
a013a62 <Krisztián Szűcs> better validation output
fd899db <Krisztián Szűcs> imitate proper __qualname__ for cython objects; provide more context with the messages; blacklist a couple of rules
11b04a5 <Krisztián Szűcs> support for numpydoc validate in archery

Authored-by: Krisztián Szűcs <szucs.krisztian@gmail.com>
Signed-off-by: Krisztián Szűcs <szucs.krisztian@gmail.com>
  • Loading branch information
kszucs committed Feb 24, 2020
1 parent c7c2e03 commit dc01a36
Show file tree
Hide file tree
Showing 3 changed files with 342 additions and 13 deletions.
33 changes: 32 additions & 1 deletion dev/archery/archery/cli.py
Expand Up @@ -27,7 +27,7 @@
from .benchmark.runner import BenchmarkRunner, CppBenchmarkRunner
from .lang.cpp import CppCMakeDefinition, CppConfiguration
from .utils.codec import JsonEncoder
from .utils.lint import linter, LintValidationException
from .utils.lint import linter, python_numpydoc, LintValidationException
from .utils.logger import logger, ctx as log_ctx
from .utils.source import ArrowSources
from .utils.tmpdir import tmpdir
Expand Down Expand Up @@ -222,6 +222,8 @@ def build(ctx, src, build_dir, force, targets, **kwargs):
@click.option("--with-flake8", default=True, type=BOOL,
show_default=True,
help="Lint python files with flake8.")
@click.option("--with-numpydoc", default=False, type=BOOL,
show_default=True, help="Lint python files with numpydoc.")
@click.option("--with-cmake-format", default=True, type=BOOL,
show_default=True,
help="Lint CMakeFiles.txt files with cmake-format.py.")
Expand All @@ -247,6 +249,35 @@ def lint(ctx, src, **kwargs):
sys.exit(1)


@archery.command(short_help="Lint python docstring with NumpyDoc")
@click.argument('symbols', nargs=-1)
@click.option("--src", metavar="<arrow_src>", default=ArrowSources.find(),
              callback=validate_arrow_sources,
              help="Specify Arrow source directory")
@click.option("--whitelist", "-w", help="Allow only these rules")
@click.option("--blacklist", "-b", help="Disallow these rules")
def numpydoc(src, symbols, whitelist, blacklist):
    """
    Pass list of modules or symbols as arguments to restrict the validation.
    By default all modules of pyarrow are tried to be validated.
    Examples
    --------
    archery numpydoc pyarrow.dataset
    archery numpydoc pyarrow.csv pyarrow.json pyarrow.parquet
    archery numpydoc pyarrow.array
    """
    # NOTE: `src` is accepted (and checked by its click callback) but is not
    # used in the body of this command.

    # When no -b/--blacklist value is given, skip this default set of rules.
    if not blacklist:
        blacklist = {'GL01', 'SA01', 'EX01', 'ES01'}

    try:
        # Calling ok() on each result is what may raise
        # LintValidationException; any such failure ends the process with
        # exit status 1.
        for outcome in python_numpydoc(symbols, whitelist=whitelist,
                                       blacklist=blacklist):
            outcome.ok()
    except LintValidationException:
        sys.exit(1)


@archery.group()
@click.pass_context
def benchmark(ctx):
Expand Down
210 changes: 210 additions & 0 deletions dev/archery/archery/lang/python.py
@@ -0,0 +1,210 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import inspect
import tokenize
from contextlib import contextmanager

try:
from numpydoc.validate import Docstring, validate
except ImportError:
have_numpydoc = False
else:
have_numpydoc = True

from ..utils.command import Command, default_bin


class Flake8(Command):
    """Command whose ``bin`` attribute points at the flake8 executable.

    Parameters
    ----------
    flake8_bin : optional
        Forwarded as the first argument of ``default_bin``; presumably an
        explicit path or name overriding the "flake8" default (confirm in
        ``utils.command``).
    """

    def __init__(self, flake8_bin=None):
        executable = default_bin(flake8_bin, "flake8")
        self.bin = executable


def _tokenize_signature(s):
    """Tokenize a signature string with the standard library tokenizer.

    Parameters
    ----------
    s : str
        Text to tokenize, e.g. the first line of a cython docstring.

    Returns
    -------
    generator of tokenize.TokenInfo
        The tokens produced by ``tokenize.tokenize``.

    Examples
    --------
    >>> [t.string for t in _tokenize_signature("f(a)")][1:5]
    ['f', '(', 'a', ')']
    """
    # tokenize.tokenize() consumes *bytes* lines through a readline-like
    # callable and assumes UTF-8 when no BOM or coding cookie is present.
    # Encoding as UTF-8 (instead of ASCII) keeps plain ASCII input unchanged
    # while no longer failing on signatures with non-ASCII characters.
    lines = s.encode('utf-8').splitlines()
    readline = iter(lines).__next__
    return tokenize.tokenize(readline)


def _convert_typehint(tokens):
    """Strip cython typehints from the tokens of a signature.

    Everything before the first opening bracket (e.g. the callable's name)
    is dropped. Within the remaining tokens, whenever two NAME tokens follow
    each other (a cython typehint such as ``bool argument``) only the second
    one, the argument name, is kept.

    Parameters
    ----------
    tokens : iterable of tokenize.TokenInfo
        Tokens of the cython signature.

    Yields
    ------
    tuple of (int, str)
        ``(type, string)`` pairs accepted by ``tokenize.untokenize``.

    Raises
    ------
    ValueError
        If more than two NAME tokens follow each other.
    """
    names = []
    opening_bracket_reached = False
    for token in tokens:
        # omit the tokens before the opening bracket
        if not opening_bracket_reached:
            if token.string != '(':
                continue
            opening_bracket_reached = True

        if token.type == tokenize.NAME:
            # buffer consecutive names until a non-NAME token closes the run
            names.append(token)
            continue

        if len(names) > 2:
            raise ValueError(
                'More than two NAME tokens follow each other: {}'.format(
                    ' '.join(name.string for name in names)
                )
            )
        if names:
            # A single name is emitted as is. For two names the first one is
            # a cython typehint like `bool argument`, so only the argument
            # name is emitted. Note that we could convert it to python
            # typehints, but hints are not supported by _signature_fromstr.
            argument = names[-1]
            yield (argument.type, argument.string)
        names = []
        yield (token.type, token.string)


def inspect_signature(obj):
    """
    Build a signature from the first docstring line of a callable.

    Intended primarily for cython generated callables: cython places the
    signature on the first line of the docstring, so it can be parsed from
    there once the cython typehints have been removed.

    For example the cython signature:

        array(obj, type=None, mask=None, size=None, from_pandas=None,
              bool safe=True, MemoryPool memory_pool=None)

    is converted to:

        <Signature (obj, type=None, mask=None, size=None, from_pandas=None,
                    safe=True, memory_pool=None)>
    """
    docstring_lines = obj.__doc__.splitlines()
    # tokenize the first line, drop the typehints, then turn the remaining
    # tokens back into source text
    signature_text = tokenize.untokenize(
        _convert_typehint(_tokenize_signature(docstring_lines[0]))
    )
    return inspect._signature_fromstr(inspect.Signature, obj, signature_text)


class NumpyDoc:
    """Validate the docstrings of importable symbols with numpydoc.

    Parameters
    ----------
    symbols : iterable of str, optional
        Names of the symbols to validate. When omitted or empty only
        'pyarrow' is validated.

    Raises
    ------
    RuntimeError
        If ``numpydoc.validate`` could not be imported.
    """

    def __init__(self, symbols=None):
        if not have_numpydoc:
            raise RuntimeError(
                'Numpydoc is not available, install the development version '
                'with command: pip install '
                'git+https://github.com/numpy/numpydoc'
            )
        self.symbols = set(symbols or {'pyarrow'})

    def traverse(self, fn, obj, from_package='pyarrow'):
        """Apply a function on publicly exposed API components.

        Recursively iterates over the members of the passed object. It omits
        any '_' prefixed and thirdparty (non pyarrow) symbols.

        Parameters
        ----------
        fn : callable
            Called with each visited object, starting with `obj` itself.
        obj : Any
            Root object of the traversal.
        from_package : string, default 'pyarrow'
            Predicate to only consider objects from this package: a member
            is followed only if its ``__module__`` starts with this string.
        """
        todo = [obj]
        seen = set()

        while todo:
            obj = todo.pop()
            if obj in seen:
                continue
            seen.add(obj)

            fn(obj)

            for name in dir(obj):
                if name.startswith('_'):
                    continue

                try:
                    member = getattr(obj, name)
                except AttributeError:
                    # dir() is allowed to list names which cannot actually
                    # be resolved on the object; skip those instead of
                    # aborting the whole traversal
                    continue

                module = getattr(member, '__module__', None)
                if not (module and module.startswith(from_package)):
                    continue

                todo.append(member)

    @contextmanager
    def _apply_patches(self):
        """
        Patch Docstring class to bypass loading already loaded python objects.

        While the context is active ``Docstring._load_obj`` and
        ``inspect.signature`` are replaced; both are restored on exit, even
        if the body raises.
        """
        orig_load_obj = Docstring._load_obj
        orig_signature = inspect.signature

        @staticmethod
        def _load_obj(obj):
            # By default it expects a qualname and imports the object, but we
            # have already loaded the object after the API traversal.
            if isinstance(obj, str):
                return orig_load_obj(obj)
            else:
                return obj

        def signature(obj):
            # inspect.signature tries to parse __text_signature__ if other
            # properties like __signature__ don't exist, but cython
            # doesn't set that property even when the embedsignature cython
            # directive is set. The only way to inspect a cython compiled
            # callable's signature is to parse it from __doc__, provided the
            # embedsignature directive was set during the build phase.
            # So patch the inspect.signature function to attempt to parse the
            # first line of callable.__doc__ as a signature.
            try:
                return orig_signature(obj)
            except Exception as orig_error:
                try:
                    return inspect_signature(obj)
                except Exception:
                    # the fallback failed as well, report the original error
                    raise orig_error

        try:
            Docstring._load_obj = _load_obj
            inspect.signature = signature
            yield
        finally:
            Docstring._load_obj = orig_load_obj
            inspect.signature = orig_signature

    def validate(self, from_package='', rules_blacklist=None,
                 rules_whitelist=None):
        """Validate the docstrings reachable from the configured symbols.

        Parameters
        ----------
        from_package : string, default ''
            Passed to `traverse`; the empty default matches any non-empty
            module name.
        rules_blacklist : container of str, optional
            Error codes to drop from the results.
        rules_whitelist : container of str, optional
            If given (and non-empty), only these error codes are kept.

        Returns
        -------
        list of (object, result) tuples
            One entry for each visited object with at least one remaining
            error; ``result['errors']`` only holds the remaining
            ``(errcode, errmsg)`` pairs.
        """
        results = []

        def callback(obj):
            result = validate(obj)

            errors = []
            for errcode, errmsg in result.get('errors', []):
                if rules_whitelist and errcode not in rules_whitelist:
                    continue
                if rules_blacklist and errcode in rules_blacklist:
                    continue
                errors.append((errcode, errmsg))

            if errors:
                result['errors'] = errors
                results.append((obj, result))

        with self._apply_patches():
            for symbol in self.symbols:
                try:
                    obj = Docstring._load_obj(symbol)
                except (ImportError, AttributeError):
                    print('{} is not available for import'.format(symbol))
                else:
                    self.traverse(callback, obj, from_package=from_package)

        return results

0 comments on commit dc01a36

Please sign in to comment.