Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-7874: [Python][Archery] Validate docstrings with numpydoc #6420

Closed
wants to merge 16 commits into from
Closed
20 changes: 19 additions & 1 deletion dev/archery/archery/cli.py
Expand Up @@ -27,7 +27,7 @@
from .benchmark.runner import BenchmarkRunner, CppBenchmarkRunner
from .lang.cpp import CppCMakeDefinition, CppConfiguration
from .utils.codec import JsonEncoder
from .utils.lint import linter, LintValidationException
from .utils.lint import linter, python_numpydoc, LintValidationException
from .utils.logger import logger, ctx as log_ctx
from .utils.source import ArrowSources
from .utils.tmpdir import tmpdir
Expand Down Expand Up @@ -222,6 +222,8 @@ def build(ctx, src, build_dir, force, targets, **kwargs):
@click.option("--with-flake8", default=True, type=BOOL,
show_default=True,
help="Lint python files with flake8.")
@click.option("--with-numpydoc", default=False, type=BOOL,
show_default=True, help="Lint python files with numpydoc.")
@click.option("--with-cmake-format", default=True, type=BOOL,
show_default=True,
help="Lint CMakeFiles.txt files with cmake-format.py.")
Expand All @@ -247,6 +249,22 @@ def lint(ctx, src, **kwargs):
sys.exit(1)


@archery.command(short_help="Lint python docstring with NumpyDoc")
@click.option("--src", metavar="<arrow_src>", default=ArrowSources.find(),
callback=validate_arrow_sources,
help="Specify Arrow source directory")
@click.option("--whitelist", "-w", help="Allow only these rules")
@click.option("--blacklist", "-b", help="Disallow these rules")
def numpydoc(src, whitelist, blacklist):
blacklist = blacklist or {'GL01', 'SA01', 'EX01', 'ES01'}
try:
results = python_numpydoc(whitelist=whitelist, blacklist=blacklist)
for result in results:
result.ok()
except LintValidationException:
sys.exit(1)


@archery.group()
@click.pass_context
def benchmark(ctx):
Expand Down
124 changes: 124 additions & 0 deletions dev/archery/archery/lang/python.py
@@ -0,0 +1,124 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import importlib
from contextlib import contextmanager

try:
from numpydoc.validate import Docstring, validate
except ImportError:
have_numpydoc = False
else:
have_numpydoc = True

from ..utils.command import Command, default_bin


class Flake8(Command):
def __init__(self, flake8_bin=None):
self.bin = default_bin(flake8_bin, "flake8")


class NumpyDoc:

def __init__(self, modules=None):
if not have_numpydoc:
raise RuntimeError(
'Numpydoc is not available, install the development version '
'of it via pip ...'
)
self.modules = set(modules or {'pyarrow'})

def traverse(self, fn, obj, from_package):
"""Apply a function on publicly exposed API components.

Recursively iterates over the members of the passed object. It omits
any '_' prefixed and thirdparty (non pyarrow) symbols.

Parameters
----------
obj : Any
from_package : string, default 'pyarrow'
Predicate to only consider objects from this package.
"""
todo = [obj]
seen = set()

while todo:
obj = todo.pop()
if obj in seen:
continue
else:
seen.add(obj)

fn(obj)

for name in dir(obj):
if name.startswith('_'):
continue

member = getattr(obj, name)
module = getattr(member, '__module__', None)
if not (module and module.startswith(from_package)):
continue

todo.append(member)

@contextmanager
def _patch_numpydoc(self):
"""
Patch Docstring class to bypass loading already loaded python objects.

By default it expects a qualname and import the object, but we have
already loaded object after the API traversal.
"""
original = Docstring._load_obj
try:
Docstring._load_obj = staticmethod(lambda obj: obj)
kszucs marked this conversation as resolved.
Show resolved Hide resolved
yield
finally:
Docstring._load_obj = original

def validate(self, rules_blacklist=None, rules_whitelist=None):
results = []

def callback(obj):
result = validate(obj)

errors = []
for errcode, errmsg in result.get('errors', []):
if rules_whitelist and errcode not in rules_whitelist:
continue
if rules_blacklist and errcode in rules_blacklist:
continue
errors.append((errcode, errmsg))

if len(errors):
result['errors'] = errors
results.append((obj, result))

with self._patch_numpydoc():
for module_name in self.modules:
try:
module = importlib.import_module(module_name)
except ImportError:
print('{} is not available for import'.format(module_name))
continue
else:
self.traverse(callback, module, module_name)

return results
102 changes: 90 additions & 12 deletions dev/archery/archery/utils/lint.py
Expand Up @@ -18,12 +18,15 @@
import gzip
import os

import click

from .command import Bash, Command, default_bin
from .cmake import CMake
from .git import git
from .logger import logger
from ..lang.cpp import CppCMakeDefinition, CppConfiguration
from ..lang.rust import Cargo
from ..lang.python import Flake8, NumpyDoc
from .rat import Rat, exclusion_from_globs
from .tmpdir import tmpdir

Expand Down Expand Up @@ -98,13 +101,8 @@ def cmake_linter(src, fix=False):
yield LintResult.from_cmd(cmake_format("--check"))


class Flake8(Command):
def __init__(self, flake8_bin=None):
self.bin = default_bin(flake8_bin, "flake8")


def python_linter(src):
""" Run flake8 linter on python/pyarrow, and dev/. """
"""Run flake8 linter on python/pyarrow, and dev/. """
logger.info("Running python linters")
flake8 = Flake8()

Expand All @@ -120,8 +118,85 @@ def python_linter(src):
check=False))


def python_numpydoc(whitelist=None, blacklist=None):
"""Run numpydoc linter on python.

Pyarrow must be available for import.
"""
logger.info("Running python docstring linters")
packages = {
'pyarrow',
'pyarrow.compute',
'pyarrow.csv',
'pyarrow.dataset',
kszucs marked this conversation as resolved.
Show resolved Hide resolved
'pyarrow.feather',
'pyarrow.flight',
'pyarrow.fs',
'pyarrow.gandiva',
'pyarrow.ipc',
'pyarrow.json',
'pyarrow.orc',
'pyarrow.parquet',
'pyarrow.plasma',
'pyarrow.types',
}
try:
numpydoc = NumpyDoc(packages)
except RuntimeError:
logger.error('Numpydoc is not available')
return

results = numpydoc.validate(
rules_whitelist=whitelist,
rules_blacklist=blacklist
)

if len(results) == 0:
yield LintResult(success=True)
return

number_of_violations = 0
for obj, result in results:
errors = result['errors']

# inspect doesn't play nice with cython generated source code,
# to use a hacky way to represent a proper __qualname__
doc = getattr(obj, '__doc__', '')
name = getattr(obj, '__name__', '')
qualname = getattr(obj, '__qualname__', '')
module = getattr(obj, '__module__', '')
instance = getattr(obj, '__self__', '')
if instance:
klass = instance.__class__.__name__
else:
klass = ''
if doc:
signature = doc.splitlines()[0]
else:
signature = ''

desc = '.'.join(filter(None, [module, klass, qualname or name]))

click.echo()
click.echo(click.style(desc, bold=True, fg='yellow'))
if signature:
click.echo(click.style('-> {}'.format(signature), fg='yellow'))

for error in errors:
number_of_violations += 1
click.echo('{}: {}'.format(*error))

msg = 'Total number of docstring violations: {}'.format(
number_of_violations
)
click.echo()
click.echo(click.style(msg, fg='red'))

yield LintResult(success=False)


def rat_linter(src, root):
""" Run apache-rat license linter. """
"""Run apache-rat license linter."""
logger.info("Running apache-rat linter")

if src.git_dirty:
Expand All @@ -145,14 +220,14 @@ def rat_linter(src, root):


def r_linter(src):
""" Run R linter. """
"""Run R linter."""
logger.info("Running r linter")
r_lint_sh = os.path.join(src.r, "lint.sh")
yield LintResult.from_cmd(Bash().run(r_lint_sh, check=False))


def rust_linter(src):
""" Run Rust linter. """
"""Run Rust linter."""
logger.info("Running rust linter")
cargo = Cargo()

Expand Down Expand Up @@ -181,7 +256,7 @@ def is_docker_image(path):


def docker_linter(src):
""" Run Hadolint docker linter. """
"""Run Hadolint docker linter."""
logger.info("Running docker linter")

hadolint = Hadolint()
Expand All @@ -199,11 +274,11 @@ def docker_linter(src):

def linter(src, with_clang_format=True, with_cpplint=True,
with_clang_tidy=False, with_iwyu=False,
with_flake8=True, with_cmake_format=True,
with_flake8=True, with_numpydoc=False, with_cmake_format=True,
with_rat=True, with_r=True, with_rust=True,
with_docker=True,
fix=False):
""" Run all linters. """
"""Run all linters."""
with tmpdir(prefix="arrow-lint-") as root:
build_dir = os.path.join(root, "cpp-build")

Expand All @@ -223,6 +298,9 @@ def linter(src, with_clang_format=True, with_cpplint=True,
if with_flake8:
results.extend(python_linter(src))

if with_numpydoc:
results.extend(python_numpydoc())

if with_cmake_format:
results.extend(cmake_linter(src, fix=fix))

Expand Down