fastai/gen_doc/gen_notebooks.py

"`gen_doc.gen_notebooks` generates notebook documentation from module functions and links to correct places"
import pkgutil, inspect, sys,os, importlib,json,enum,warnings,nbformat,re
from IPython.core.display import display, Markdown
from nbconvert.preprocessors import ExecutePreprocessor
from nbformat.sign import NotebookNotary
from pathlib import Path
from .core import *
from .nbdoc import *

# Public API of this module (the names exported by a star-import).
__all__ = [
    'create_module_page',
    'update_module_page',
    'import_mod',
    'link_nb',
    'update_notebooks',
    'generate_missing_metadata',
    'update_nb_metadata',
]

def get_empty_notebook():
    "Default notebook with the minimum metadata."
    #TODO: check python version and nbformat
    kernelspec = {'display_name': 'Python 3', 'language': 'python', 'name': 'python3'}
    language_info = {'codemirror_mode': {'name': 'ipython', 'version': 3},
                     'file_extension': '.py',
                     'mimetype': 'text/x-python',
                     'name': 'python',
                     'nbconvert_exporter': 'python',
                     'pygments_lexer': 'ipython3',
                     'version': '3.6.6'}
    # A fresh dict is built on every call, so callers may mutate the result freely
    return {'metadata': {'kernelspec': kernelspec, 'language_info': language_info},
            'nbformat': 4,
            'nbformat_minor': 2}

def get_md_cell(source, metadata=None):
    "Markdown cell containing `source` with `metadata` (an empty dict when `metadata` is None)."
    if metadata is None: metadata = {}
    return dict(cell_type='markdown', metadata=metadata, source=source)

def get_empty_cell(ctype='markdown'):
    "Cell of type `ctype` with empty metadata and an empty source list."
    return dict(cell_type=ctype, metadata={}, source=[])

def get_code_cell(code, hidden=False):
    "Trusted, output-less code cell containing `code`; `hidden` sets its `hide_input` metadata."
    cell_metadata = {'hide_input': hidden, 'trusted': True}
    return dict(cell_type='code', execution_count=0, metadata=cell_metadata,
                source=code, outputs=[])

def get_doc_cell(func_name):
    "Hidden code cell whose source is the `show_doc` call for `func_name`."
    return get_code_cell(f"show_doc({func_name})", hidden=True)

def get_global_vars(mod):
    "Return a dict mapping each top-level assigned name in the source file of `mod` (except `__all__`) to its markdown description."
    # https://stackoverflow.com/questions/8820276/docstring-for-variable/31764368#31764368
    import ast,re
    with open(mod.__file__, 'r') as f: fstr = f.read()
    flines = fstr.splitlines()
    d = {}
    for node in ast.walk(ast.parse(fstr)):
        if isinstance(node,ast.Assign) and hasattr(node.targets[0], 'id'):
            key,lineno = node.targets[0].id,node.targets[0].lineno
            # AST line numbers are 1-based while `flines` is 0-based
            codestr = flines[lineno-1]
            # anchored at column 0, so indented (non top-level) assignments never match
            match = re.match(rf"^({key})\s*=\s*.*", codestr)
            if match and match.group(1) != '__all__': # only top level assignment
                d[key] = f'`{codestr}` {get_source_link(mod, lineno)}'
    return d

def write_nb(nb, nb_path, mode='w'):
    "Serialize the notebook dict `nb` as nbformat version 4 and write it to `nb_path` opened with `mode`."
    with open(nb_path, mode) as f:
        node = nbformat.from_dict(nb)
        f.write(nbformat.writes(node, version=4))

class ExecuteShowDocPreprocessor(ExecutePreprocessor):
    "An ExecutePreprocessor that only executes code cells matching `IMPORT_RE` or `SHOW_DOC_RE`"
    def preprocess_cell(self, cell, resources, index):
        "Execute `cell` through the parent class when it qualifies; otherwise return it and `resources` untouched."
        is_code = 'source' in cell and cell.cell_type == "code"
        if not is_code: return cell, resources
        src = cell['source']
        if not (IMPORT_RE.search(src) or SHOW_DOC_RE.search(src)): return cell, resources
        return super().preprocess_cell(cell, resources, index)

def execute_nb(fname, metadata=None, save=True, show_doc_only=False):
    "Execute notebook `fname` with `metadata` for preprocessing; when `save`, write it back to `fname` and sign it."
    # Any module used in the notebook that isn't inside must be in the same directory as this script
    with open(fname) as f: nb = nbformat.read(f, as_version=4)
    # `show_doc_only` restricts execution to the cells accepted by ExecuteShowDocPreprocessor
    if show_doc_only: ep_class = ExecuteShowDocPreprocessor
    else:             ep_class = ExecutePreprocessor
    ep = ep_class(timeout=600, kernel_name='python3')
    ep.preprocess(nb, metadata or {})
    if not save: return
    with open(fname, 'wt') as f: nbformat.write(nb, f)
    NotebookNotary().sign(nb)

def _symbol_skeleton(name):
    "Cells documenting symbol `name`: its `show_doc` code cell followed by a markdown cell holding the name in backticks."
    doc_cell = get_doc_cell(name)
    md_cell = get_md_cell(f"`{name}`")
    return [doc_cell, md_cell]

def create_module_page(mod, dest_path, force=False):
    """Create the documentation notebook for module `mod` in path `dest_path`.

    The notebook is written, executed, and its path is returned. Unless `force` is True the file
    is opened in exclusive-creation mode ('x'), so an already existing notebook is not overwritten.
    """
    nb = get_empty_notebook()
    mod_name = mod.__name__
    strip_name = strip_fastai(mod_name)
    init_cell = [get_md_cell(f'# {strip_name}'), get_md_cell('Type an introduction of the package here.')]
    cells = [get_code_cell(f'from fastai.gen_doc.nbdoc import *\nfrom {mod_name} import * ', True)]

    gvar_map = get_global_vars(mod)
    # Only exported globals are documented; skip the section header when none of them qualifies
    gvar_cells = [get_md_cell(gvar_map[name]) for name in get_exports(mod) if name in gvar_map]
    if gvar_cells: cells += [get_md_cell('### Global Variable Definitions:')] + gvar_cells

    for ft_name in get_ft_names(mod, include_inner=True):
        if not hasattr(mod, ft_name):
            warnings.warn(f"Module {strip_name} doesn't have a function named {ft_name}.")
            continue
        cells += _symbol_skeleton(ft_name)
    nb['cells'] = init_cell + cells + [get_md_cell(UNDOC_HEADER)]

    doc_path = get_doc_path(mod, dest_path)
    write_nb(nb, doc_path, 'w' if force else 'x')
    execute_nb(doc_path)
    return doc_path

_default_exclude = ['.ipynb_checkpoints', '__pycache__', '__init__.py', 'imports']

def get_module_names(path_dir, exclude=None):
    "Search a given `path_dir` and return all the modules contained inside except those in `exclude`"
    if exclude is None: exclude = _default_exclude
    files = sorted(path_dir.glob('*'), key=lambda x: (x.is_dir(), x.name), reverse=True) # directories first
    res = [f'{path_dir.name}']
    for f in files:
        if f.is_dir() and f.name in exclude: continue # exclude directories
        if any(f.name.endswith(ex) for ex in exclude): continue # exclude extensions

        if f.suffix == '.py': res.append(f'{path_dir.name}.{f.stem}')
        # forward `exclude` so a caller-supplied list also applies inside sub-directories
        elif f.is_dir(): res += [f'{path_dir.name}.{name}' for name in get_module_names(f, exclude)]
    return res

def read_nb(fname):
    "Read the notebook file `fname` and return it parsed as an nbformat version 4 notebook"
    with open(fname,'r') as f:
        raw = f.read()
    return nbformat.reads(raw, as_version=4)

SHOW_DOC_RE = re.compile(r"show_doc\(([\w\.]*)")
def read_nb_content(cells, mod_name):
    "Map every name passed to `show_doc` in a code cell to the index of the last such cell (`mod_name` is unused)."
    # A repeated name keeps its first insertion slot but ends up with the index of its last cell
    return {name: idx
            for idx, cell in enumerate(cells)
            if cell['cell_type'] == 'code'
            for name in SHOW_DOC_RE.findall(cell['source'])}

def read_nb_types(cells):
    "Map the name assigned at the start of each markdown cell (optionally after `<code>` or a backtick) to that cell's index."
    assign_re = re.compile(r"^(?:<code>|`)?(\w*)\s*=\s*")
    found = {}
    for idx, cell in enumerate(cells):
        if cell['cell_type'] != 'markdown': continue
        m = assign_re.match(cell['source'])
        if m: found[m.group(1)] = idx
    return found

def link_markdown_cells(cells, modules):
    "Replace, in place, the source of every markdown cell with the result of `link_docstring` for `modules`."
    for cell in cells:
        if cell['cell_type'] != 'markdown': continue
        cell['source'] = link_docstring(modules, cell['source'])

def get_insert_idx(pos_dict, name):
    "Return the position stored for the first key of `pos_dict` that does not sort before `name` (case-insensitive), or -1."
    for key, pos in pos_dict.items():
        # keys are scanned in dict order; stop at the first one that is not lower than `name`
        if not key.lower() < name.lower(): return pos
    return -1

def update_pos(pos_dict, start_key, nbr=2):
    "Shift by `nbr`, in place, the position of every key not sorting before `start_key` (case-insensitive); return `pos_dict`."
    for key in pos_dict:
        # only values are reassigned, so the dict keeps its size while being iterated
        if key.lower() >= start_key.lower(): pos_dict[key] += nbr
    return pos_dict

def insert_cells(cells, pos_dict, ft_name, append=False):
    "Add a doc cell and an empty cell for `ft_name` to `cells` (at its computed position, or at the end) and update `pos_dict`."
    idx = get_insert_idx(pos_dict, ft_name)
    if append or idx == -1:
        # no insertion point (or appending was requested): positions in `pos_dict` are unaffected
        cells += [get_doc_cell(ft_name), get_empty_cell()]
        return cells, pos_dict
    cells.insert(idx, get_doc_cell(ft_name))
    cells.insert(idx+1, get_empty_cell())
    return cells, update_pos(pos_dict, ft_name, 2)

def get_doc_path(mod, dest_path):
    "Path of the `.ipynb` file for `mod` inside `dest_path`, named after `strip_fastai(mod.__name__)`."
    nb_name = f'{strip_fastai(mod.__name__)}.ipynb'
    return os.path.join(dest_path, nb_name)

def generate_missing_metadata(dest_file):
    "Append an `update_nb_metadata(...)` code cell for `dest_file` to the sibling `jekyll_metadata.ipynb` unless one already exists."
    fn = Path(dest_file)
    meta_fn = fn.parent/'jekyll_metadata.ipynb'
    if not (fn.exists() and meta_fn.exists()):
        print('Could not find notebooks:', fn, meta_fn)
        return
    metadata_nb = read_nb(meta_fn)
    if has_metadata_cell(metadata_nb['cells'], fn.name): return

    # Re-emit the jekyll metadata already stored in the notebook as keyword arguments of the call
    jekyll_md = read_nb(fn)['metadata'].get('jekyll', {})
    fmt_params = ''.join(f',\n    {k}={stringify(v)}' for k,v in jekyll_md.items())
    call_src = f"update_nb_metadata('{fn.name}'{fmt_params})"
    metadata_nb['cells'].append(get_code_cell(call_src, hidden=False))
    write_nb(metadata_nb, meta_fn)

def update_nb_metadata(nb_path=None, title=None, summary=None, keywords='fastai', overwrite=True, **kwargs):
    "Store the non-None values of `title`, `summary`, `keywords` and `kwargs` as the notebook's jekyll metadata, then save and sign it (`overwrite` is unused)."
    nb = read_nb(nb_path)
    candidates = dict(title=title, summary=summary, keywords=keywords)
    candidates.update(kwargs)
    data = {k:v for k,v in candidates.items() if v is not None} # remove none values
    if not data: return
    nb['metadata']['jekyll'] = data
    write_nb(nb, nb_path)
    NotebookNotary().sign(nb)

def has_metadata_cell(cells, fn):
    "Return the first cell of `cells` whose source contains `update_nb_metadata('<fn>'`, or None when there is none."
    # `fn` is a literal file name: escape it so '.', '+', ... match themselves instead of acting as regex syntax
    pattern = re.compile(r"update_nb_metadata\('" + re.escape(fn) + "'")
    for c in cells:
        if pattern.search(c['source']): return c
    return None

def stringify(s):
    "Wrap `s` in single quotes when it is a string; return any other value unchanged."
    if isinstance(s, str): return f"'{s}'"
    return s

IMPORT_RE = re.compile(r"from (fastai[\.\w_]*)")
def get_imported_modules(cells, nb_module_name=''):
    "Finds all submodules of notebook - sorted by submodules > top level modules > manual imports. This gives notebook imports priority"
    module_names = get_top_level_modules()
    nb_imports = [m.group(1) for cell in cells for m in IMPORT_RE.finditer(cell['source']) if cell['cell_type'] == 'code']
    parts = nb_module_name.split('.')
    parent_modules = ['.'.join(parts[:x+1]) for x in range_of(parts)] # Imports parent modules - a.b.c = [a, a.b, a.b.c]
    loaded = []
    for mod_name in module_names + nb_imports + parent_modules:
        mod = import_mod(mod_name, ignore_errors=True)
        # names for which `import_mod` returns None are dropped
        if mod is not None: loaded.append(mod)
    return loaded

def get_top_level_modules(num_levels=1):
    "Module names found under the `fastai` package directory with at most `num_levels` dots, most dots first."
    mod_dir = Path(import_mod('fastai').__file__).parent
    shallow = [name for name in get_module_names(mod_dir) if name.count('.') <= num_levels]
    shallow.sort(key=lambda s: s.count('.'), reverse=True) # Submodules first (sorted by periods)
    return shallow

NEW_FT_HEADER = '## New Methods - Please document or move to the undocumented section'
UNDOC_HEADER = '## Undocumented Methods - Methods moved below this line will intentionally be hidden'
def parse_sections(cells):
    "Split `cells` into (documented, undocumented, new) lists, switching list at each header cell; an empty section defaults to its header cell."
    documented, undocumented, fresh = [], [], []
    target = documented
    for cell in cells:
        if cell['cell_type'] == 'markdown':
            src = cell['source']
            # a header cell opens its section and is stored as that section's first cell
            if re.match(UNDOC_HEADER, src): target = undocumented
            if re.match(NEW_FT_HEADER, src): target = fresh
        target.append(cell)
    if not undocumented: undocumented = [get_md_cell(UNDOC_HEADER)]
    if not fresh: fresh = [get_md_cell(NEW_FT_HEADER)]
    return documented, undocumented, fresh

def remove_undoc_cells(cells):
    "Return only the first section produced by `parse_sections`, i.e. the cells before any section header."
    return parse_sections(cells)[0]

# currently code vbox sub-cells mainly
def remove_code_cell_jupyter_widget_state_elem(cells):
    "Drop, in place, every code-cell output whose `data` holds a jupyter widget view; return `cells`."
    widget_mime = 'application/vnd.jupyter.widget-view+json'
    def _is_widget(out): return 'data' in out and widget_mime in out.data
    for c in cells:
        if c['cell_type'] != 'code' or 'outputs' not in c: continue
        c['outputs'] = [out for out in c['outputs'] if not _is_widget(out)]
    return cells

def update_module_page(mod, dest_path='.'):
    "Update the documentation notebook of module `mod` found in `dest_path` and return the notebook path."
    doc_path = get_doc_path(mod, dest_path)
    strip_name = strip_fastai(mod.__name__)
    nb = read_nb(doc_path)
    cells = nb['cells']

    link_markdown_cells(cells, get_imported_modules(cells, mod.__name__))

    # Refresh the markdown cell of every exported global variable, appending a new cell when none exists
    type_dict = read_nb_types(cells)
    gvar_map = get_global_vars(mod)
    for name in get_exports(mod):
        if name in gvar_map:
            gvar_cell = get_md_cell(gvar_map[name])
            if name in type_dict: cells[type_dict[name]] = gvar_cell
            else: cells.append(gvar_cell)

    # Functions of the module that no `show_doc` cell mentions yet
    pos_dict = read_nb_content(cells, strip_name)
    ft_names = get_ft_names(mod, include_inner=True)
    new_fts = list(set(ft_names) - set(pos_dict.keys()))
    if new_fts: print(f'Found new fuctions for {mod}. Please document:\n{new_fts}')
    existing, undoc_cells, new_cells = parse_sections(cells)
    for ft_name in new_fts: new_cells += [get_doc_cell(ft_name), get_empty_cell()]
    # `new_cells` always starts with its header, so more than one cell means there is something to add
    if len(new_cells) > 1: nb['cells'] = existing + undoc_cells + new_cells

    write_nb(nb, doc_path)
    return doc_path

def link_nb(nb_path):
    "Link the markdown cells of the notebook at `nb_path`, save it, then sign the notebook as re-read from disk."
    nb = read_nb(nb_path)
    nb_cells = nb['cells']
    modules = get_imported_modules(nb_cells, Path(nb_path).stem)
    link_markdown_cells(nb_cells, modules)
    write_nb(nb, nb_path)
    saved = read_nb(nb_path)
    NotebookNotary().sign(saved)

def get_module_from_notebook(doc_path):
    "Module name for a notebook path: the file stem prefixed with `fastai.`. Assume it belongs to fastai directory"
    stem = Path(doc_path).stem
    return 'fastai.' + stem

def check_nbconvert_version():
    "Assert that the installed nbconvert is at least version 5.4.0."
    from nbconvert import version_info
    min_version = (5,4,0)
    assert version_info >= min_version, "Please update nbconvert to >=5.4 for consistent .html output"

def update_notebooks(source_path, dest_path=None, update_html=True, document_new_fns=False,
                     update_nb_links=True, html_path=None, force=False):
    """`source_path` can be a directory or a file. Assume all modules reside in the fastai directory.

    - a notebook file: optionally add new functions via `update_module_page` (`document_new_fns`),
      call `generate_missing_metadata`, optionally relink it and re-execute its show_doc cells
      (`update_nb_links`), and optionally convert it to HTML in `html_path` (`update_html`). The
      conversion is skipped when the existing HTML is newer than the notebook, unless `force`.
    - a name starting with 'fastai.': taken as a module name; its notebook in `dest_path` is created
      when missing and then processed like a file. `dest_path` is required in this case.
    - a directory: every `*.ipynb` file directly inside is processed, in sorted order.

    `force` is forwarded to the recursive calls so it also applies to module and directory sources.
    """
    from .convert2html import convert_nb
    source_path = Path(source_path)

    if source_path.is_file():
        dest_path = source_path.parent if dest_path is None else Path(dest_path)
        html_path = dest_path/'..'/'docs' if html_path is None else Path(html_path)
        doc_path = source_path
        assert source_path.suffix == '.ipynb', 'Must update from notebook or module'
        if document_new_fns:
            mod = import_mod(get_module_from_notebook(source_path))
            if not mod: print('Could not find module for path:', source_path)
            elif mod.__file__.endswith('__init__.py'): pass # package __init__ modules are not updated
            else: update_module_page(mod, dest_path)
        generate_missing_metadata(doc_path)
        if update_nb_links:
            print(f'Updating notebook {doc_path}. Please wait...')
            link_nb(doc_path)
            execute_nb(doc_path, {'metadata': {'path': doc_path.parent}}, show_doc_only=True)
        if update_html:
            check_nbconvert_version()
            html_fn = html_path/doc_path.with_suffix('.html').name
            if not force and html_fn.is_file():
                # skip the conversion when the html was modified after the notebook
                in_mod  = os.path.getmtime(doc_path)
                out_mod = os.path.getmtime(html_fn)
                if in_mod < out_mod: return
            convert_nb(doc_path, html_path)

    elif (source_path.name.startswith('fastai.')):
        # Do module update
        assert dest_path is not None, 'To update a module, you must specify a destination folder for where notebook resides'
        mod = import_mod(source_path.name)
        if not mod: return print('Could not find module for:', source_path)
        doc_path = Path(dest_path)/(strip_fastai(mod.__name__)+'.ipynb')
        if not doc_path.exists():
            print('Notebook does not exist. Creating:', doc_path)
            create_module_page(mod, dest_path)
        update_notebooks(doc_path, dest_path=dest_path, update_html=update_html, document_new_fns=document_new_fns,
                         update_nb_links=update_nb_links, html_path=html_path, force=force)
    elif source_path.is_dir():
        for f in sorted(Path(source_path).glob('*.ipynb')):
            update_notebooks(f, dest_path=dest_path, update_html=update_html, document_new_fns=document_new_fns,
                             update_nb_links=update_nb_links, html_path=html_path, force=force)
    else: print('Could not resolve source file:', source_path)