# Translating [Markdown] to [Python]

A primary translation in literate programming is the tangle step that converts the literate program into 
the programming language. The original implementation converts `".WEB"` files to valid Pascal - `".PAS"` - files.
The `pidgy` approach begins with [Markdown] files and produces proper [Python] files as the outcome. The rest of this 
document configures how [IPython] acknowledges the transformation and the heuristics that translate [Markdown] to [Python].

[Markdown]: #
[Python]: #

In [1]:
    import typing, mistune, IPython, pidgy.util, ast, textwrap, markdown_it
    __all__ = 'tangle', 'Tangle'

The `pidgy` tangle workflow has three steps:

1. Block-level lexical analysis to tokenize [Markdown].
2. Normalize the tokens to compacted `"code" and not "code"` tokens.
3. Translate the normalized tokens to a string of valid [Python] code.

[Markdown]: #
[Python]: #

## Block level lexical analysis.

`pidgy` uses a modified `mistune.BlockLexer` to create block level tokens
for a [Markdown] source. A specific `pidgy` addition is 
a `doctest` block object; `doctest` blocks are testable strings that are ignored by the tangle
step. The tokens are to be normalized and translated to [Python] strings.

<details><summary><code>BlockLexer</code></summary>

In [2]:
    class BaseRenderer(pidgy.util.BaseRenderer):
        """Shared helpers for turning block tokens back into source text.

        ``QUOTES`` (a pair of block-quote delimiters) and ``noncode`` are used
        by the methods below but are not defined on this class; subclasses
        have to provide them.
        """

        def quote(self, str, trailing=''):
            """Wrap the non-blank part of ``str`` in block quotations.

            Leading whitespace, trailing whitespace and trailing backslashes
            stay outside of the quotes, and ``trailing`` is inserted directly
            after the closing quote.  A blank string is returned unchanged.
            """
            # Indexing with a bool: fall back to the second delimiter when the
            # first one already occurs somewhere in the text.
            quote, length = self.QUOTES[self.QUOTES[0] in str], len(str)
            left, right = length - len(str.lstrip()), len(str.rstrip())
            if not str[left:right].strip(): return str
            # Bounded by ``left``: content made only of backslashes would
            # otherwise step past index 0 and end in an IndexError.
            while right > left and str[right-1] == '\\':
                right -= 1
            return str[:left] + quote + str[left:right] + quote + trailing + str[right:]

        def measure_base_indent(self, tokens, env):
            """Store the starting ``base_indent`` in ``env``.

            The value comes from ``pidgy.util.lead_indent`` applied to the
            source lines of the first ``code_block`` token; without such a
            token it is 4.
            """
            first_code = self.get_next_code_token(tokens, -1)
            if first_code and first_code.type == 'code_block':
                env['base_indent'] = pidgy.util.lead_indent(env['src'][slice(*first_code.map)])
            else:
                env['base_indent'] = 4

        def get_next_code_token(self, tokens, idx):
            """Return the first ``code_block`` token after position ``idx``, or ``None``."""
            for token in tokens[idx+1:]:
                if token.type in {'code_block'}:
                    return token
            return None

        def hanging_indent(self, str, env):
            """Insert ``env['extra_indent']`` spaces after the leading whitespace of ``str``."""
            start = len(str)-len(str.lstrip())
            return str[:start] + ' '* env['extra_indent'] + str[start:]

        def indent(self, str, env):
            """Indent the lines of ``str`` by ``env['base_indent']`` spaces using ``textwrap.indent``."""
            return textwrap.indent(str, ' ' *env['base_indent'])

        def token_to_str(self, tokens, idx, env):
            """Return the source lines covered by ``tokens[idx].map`` joined together.

            An empty string is returned when ``idx`` is past the end of
            ``tokens`` or the token is falsy or has no ``map``.
            """
            if idx < len(tokens):
                if tokens[idx] and tokens[idx].map:
                    return ''.join(env['src'][slice(*tokens[idx].map)])
            return ""

        def update_env(self, code, tokens, idx, env):
            """Record indentation and quoting state in ``env`` after emitting ``code``.

            Sets ``extra_indent``, ``base_indent``, ``continued`` (the code
            ends with a backslash) and ``quoted`` (the code, ignoring trailing
            backslashes, ends with one of ``QUOTES``).
            """
            upcoming = self.get_next_code_token(tokens, idx)
            rstrip = code.rstrip()
            extra_indent = 0
            if upcoming:
                # Measured against the base indent currently stored in ``env``,
                # i.e. before it is replaced in the update below.
                extra_indent = max(0, pidgy.util.lead_indent(env['src'][slice(*upcoming.map)]) -env['base_indent'])
            if not extra_indent and rstrip.endswith(":"):
                # Code ending in a colon gets one extra indentation level.
                extra_indent += 4
            env.update(
                extra_indent=extra_indent,
                base_indent=pidgy.util.trailing_indent(code),
                continued=rstrip.endswith('\\'), 
                quoted=rstrip.rstrip('\\').endswith(self.QUOTES)
            )

        def render(self, tokens, options, env):
            """Render ``tokens`` into one dedented source string.

            ``env`` is reset first and the tokens are passed through
            ``pidgy.util.filter_tangle_tokens`` and
            ``pidgy.util.reconfigure_tokens``.  When no tokens remain, the whole
            source is returned as a single quoted block followed by ``;``.
            Otherwise the ``RendererHTML.render`` output is passed through
            ``pidgy.util.continuation`` and the text after the last token
            (``self.noncode`` with ``idx == len(tokens)``) is appended.
            """
            env.update(base_indent=0, quoted=False, extra_indent=0, continued=False)
            tokens = pidgy.util.reconfigure_tokens(pidgy.util.filter_tangle_tokens(tokens), env)
            self.measure_base_indent(tokens, env)
            if not tokens:
                return self.quote(''.join(env['src']), trailing=';')
            return textwrap.dedent(pidgy.util.continuation(
                markdown_it.renderer.RendererHTML.render(self, tokens, options, env), env
            ) + "\n" + self.noncode(tokens, len(tokens), env))

In [3]:
    class PythonRender(BaseRenderer):
        """Renderer whose token handlers return Python source text for Markdown blocks."""
        # Block-quote delimiters; BaseRenderer.quote picks the second one when
        # the first already occurs in the text being quoted.
        QUOTES = '"""', "'''"
    
        def noncode(self, tokens, idx, env): 
            """Return the source text that sits between the previous token and token ``idx``.

            With ``idx == len(tokens)`` the text after the last token is used
            instead.  The text is dedented with ``pidgy.util.dedent_block``,
            re-indented with ``hanging_indent``/``indent`` and, unless
            ``env['quoted']`` is set, wrapped by ``quote``; a ``;`` follows the
            closing quote only when there is no current token.
            """
            # Defaults describe the "past the last token" case.
            token, range, prior = None, slice(None), slice(*tokens[-1].map)
            if idx < len(tokens):
                token = tokens[idx]
                # For the first token there is no predecessor, so start at line 0.
                range, prior = slice(*tokens[idx].map), slice(*tokens[idx-1].map) if idx else slice(0,0)                
            
            non_code = pidgy.util.dedent_block(''.join(env['src'][prior.stop:range.start]))
            non_code = self.indent(self.hanging_indent(non_code, env), env)
            if not env.get('quoted', False):
                non_code = self.quote(non_code, trailing=';' if token is None else '')
            return non_code
                
        def update_env(self, code, tokens, idx, env):
            """Record indentation and quoting state in ``env`` after emitting ``code``.

            ``base_indent`` is stored first, so ``extra_indent`` is measured
            against the indent of the code that was just emitted.
            """
            next = self.get_next_code_token(tokens, idx)
            env.update(base_indent=pidgy.util.trailing_indent(code))

            extra_indent = 0
            if next:
                extra_indent = max(0, pidgy.util.lead_indent(env['src'][slice(*next.map)]) -env['base_indent'])
            # Code ending in a colon gets one extra indentation level.
            if not extra_indent and code.rstrip().endswith(":"):
                extra_indent += 4
            rstrip = code.rstrip()
            env.update(
                extra_indent=extra_indent,
                continued=rstrip.endswith('\\'), 
                quoted=rstrip.rstrip('\\').endswith(self.QUOTES)
            )
        
        def code_block(self, tokens, idx, options, env):
            """Return the preceding non-code text followed by the code block's source.

            The block source is passed through ``pidgy.util.quote_docstrings``.
            """
            code = self.noncode(tokens, idx, env) + pidgy.util.quote_docstrings(self.token_to_str(tokens, idx, env))
            # update_env has no return value, so the `or` always yields `code`.
            return self.update_env(code, tokens, idx, env) or code

        
        def fence(self, tokens, idx, options, env):
            "We'll only receive fences without a lang."
            # Fence source goes through pidgy.util.unfence and is indented by four spaces.
            code =  self.noncode(tokens, idx, env) + textwrap.indent(
                pidgy.util.quote_docstrings(pidgy.util.unfence(self.token_to_str(tokens, idx, env))), ' '*4
            )
            # update_env has no return value, so the `or` always yields `code`.
            return self.update_env(code, tokens, idx, env) or code

        def front_matter(self, tokens, idx, options, env):
            """Return a statement that loads the front matter into ``locals()``.

            ``+++`` markup is loaded with ``toml.loads`` and ``---`` markup with
            ``yaml.safe_load``; with any other markup the token source is
            returned as-is.  The result is indented with ``indent``.
            """
            token, code = tokens[idx], self.token_to_str(tokens, idx, env)
            if token.markup == '+++':
                code = F'''locals().update(__import__('toml').loads("""{code}""".partition('+++')[2].rpartition('+++')[0]))\n'''
            elif token.markup == '---':
                code = F'''locals().update(__import__('yaml').safe_load("""{code}""".partition('---')[2].rpartition('---')[0]))\n'''            
            return self.indent(code, env)

            
        def reference(self, tokens, idx, options, env, *, re='link_item'):
            """Return source that collects the reference definitions of a token.

            While ``env['quoted']`` is set the token source is returned
            unchanged.  Otherwise a dict comprehension over
            ``pidgy.util.<re>.finditer`` of the quoted token source is built;
            unless ``env['continued']`` is set, it is wrapped in an assignment
            that merges the result into ``__annotations__``.
            """
            token, code = tokens[idx], self.token_to_str(tokens, idx, env)
            if env['quoted']:
                return code
            
            expr  = "{"+F"""x.group(1): x.group(2).rstrip() for x in __import__('pidgy').util.{re}.finditer({
                self.quote(textwrap.dedent(code), trailing=")}").rstrip()
            }"""
            if not env['continued']:
                expr = """locals()["__annotations__"] = {**%s, **locals().get('__annotations__', {})}"""%expr
            code = self.noncode(tokens, idx, env) + self.indent(expr + "\n", env)
            return code
        
        def footnote_reference_open(self, tokens, idx, options, env):
            """Handle footnote definitions like ``reference``, using ``pidgy.util.footnote_item``."""
            return self.reference(tokens, idx, options, env, re='footnote_item')
        
        def bullet_list_open(self, tokens, idx, options, env):
            """Return source for a Markdown list.

            While ``env['quoted']`` is set the token source is returned
            unchanged.  When ``env['continued']`` is set, a list comprehension
            over ``pidgy.util.list_item.finditer`` of the quoted source is
            returned.  Otherwise the list is returned as a quoted string
            followed by ``;``.
            """
            token, code = tokens[idx], self.token_to_str(tokens, idx, env)
            if env['quoted']:
                return code
            if env['continued']:
                return self.indent(
                    (F"""[x.group().rstrip().partition(' ')[2] for x in __import__('pidgy').util.list_item.finditer({
                        self.quote(textwrap.dedent(code), trailing=')]')
                    }\n"""), env)
            code = self.quote(textwrap.dedent(code), trailing=';')
            code = self.indent(self.hanging_indent(code, env), env)
            return code

        # Ordered lists are handled exactly like bullet lists.
        ordered_list_open = bullet_list_open 

In [4]:
    @pidgy.implementation
    def tangle(str:str)->str:
        """Translate Markdown source into Python source using a fresh ``Tangle``.

        ``str`` may be a string, an iterable of string fragments, or a falsy
        value (treated as empty); the pieces are joined before translation.
        """
        return Tangle().stringify(''.join(str or []))

In [5]:
    class pidgyManager(IPython.core.inputtransformer2.TransformerManager):
        """``TransformerManager`` that tangles every cell before the inherited transforms run."""

        def transform_cell(self, cell):
            """Tangle ``cell`` and pass the result to the parent's ``transform_cell``."""
            # Zero-argument super(): ``super(type(self), self)`` resolves against the
            # runtime class and recurses forever as soon as this class is subclassed.
            return super().transform_cell(tangle(str=cell))

</details>

</details>

## Flattening the tokens to a [Python] string.

The tokenizer controls the translation of markdown strings to python strings.  Our major constraint is that the Markdown input should retain line numbers.

<details><summary><code>Flatten</code></summary>

Append the lexer for nested rules.

In [6]:
        class Tangle(markdown_it.MarkdownIt):
            """``MarkdownIt`` parser set up to render Markdown source as Python source."""

            def __init__(self, *args, **kwargs):
                """Create the parser and register the extra block rules.

                ``renderer_cls`` defaults to ``PythonRender``.  Front-matter
                rules for the ``-`` and ``+`` markers and a footnote-definition
                rule are added, and the ``html_block`` rule is disabled.
                """
                import functools

                kwargs.setdefault('renderer_cls', PythonRender)
                super().__init__(*args, **kwargs)
                # One front-matter rule per marker character, each placed ahead of "code".
                for marker in "-+":
                    self.block.ruler.before(
                        "code",
                        "front_matter",
                        functools.partial(pidgy.util.frontMatter, marker),
                        {"alt": ["paragraph", "reference", "blockquote", "list"]},
                    )
                self.block.ruler.before(
                    "reference", "footnote_def", markdown_it.extensions.footnote.index.footnote_def, {"alt": ["paragraph", "reference"]}
                )
                self.disable('html_block')

            def parse(self, src, env=None, normalize=False):
                """Tokenize ``src`` and return the block tokens.

                The source is first passed through
                ``pidgy.util.enforce_blanklines`` and its lines (line endings
                kept) are stored in ``env['src']``.  With ``normalize`` set, the
                tokens are additionally passed through
                ``pidgy.util.filter_tangle_tokens`` and
                ``pidgy.util.reconfigure_tokens``.
                """
                src = pidgy.util.enforce_blanklines(src)
                if env is None:
                    env = markdown_it.utils.AttrDict()
                env.update(src=src.splitlines(True))
                tokens = super().parse(src, env)
                if normalize:
                    tokens = pidgy.util.reconfigure_tokens(pidgy.util.filter_tangle_tokens(tokens), env)
                return tokens

            def render(self, src, env=None):
                """Render ``src`` with the configured renderer, creating ``env`` when none is given."""
                if env is None:
                    env = markdown_it.utils.AttrDict()
                return super().render(src, env)

            def stringify(self, src, env=None):
                """Render ``src`` to a string and return it.

                ``env`` is handed on to ``render`` so a caller-supplied mapping
                is the one that gets used and filled in; a new one is created
                only when ``env`` is ``None``.
                """
                if env is None:
                    env = markdown_it.utils.AttrDict()
                return self.render(src, env)

## More `pidgy` language features

`pidgy` experiments with extra language features for python, using the same system
that IPython uses to add features like line and cell magics.

Recently, IPython introduced a convention that allows top level await statements outside of functions. Building off of this convenience, `pidgy` allows for top-level __return__ and __yield__ statements.  These statements are replaced with an IPython display statement.

In [7]:
    class ExtraSyntax(ast.NodeTransformer):
        """Rewrite ``return``/``yield`` statements outside of functions into display calls.

        Each such statement becomes ``__import__('IPython').display.display(...)``.
        Function definitions are returned untouched, so statements inside
        them keep their normal meaning.
        """

        def visit_FunctionDef(self, node):
            """Return ``node`` as-is so nothing inside a function is rewritten."""
            return node

        visit_AsyncFunctionDef = visit_FunctionDef

        def visit_Return(self, node):
            """Build the display call statement from ``node.value``.

            A tuple value is spread over several positional arguments, any
            other value becomes the single argument, and a bare statement
            (``node.value is None``) produces a call without arguments.
            """
            replace = ast.parse('''__import__('IPython').display.display()''').body[0]
            value = node.value
            if value is None:
                # Bare ``return``/``yield``: ``None`` is not a valid argument node.
                replace.value.args = []
            elif isinstance(value, ast.Tuple):
                replace.value.args = value.elts
            else:
                replace.value.args = [value]
            return ast.copy_location(replace, node)

        def visit_Expr(self, node):
            """Replace an expression statement that is a ``yield``/``yield from``.

            Any other node is returned unchanged.
            """
            # ``ast.Expression`` (aliased to this method below) has ``body``
            # instead of ``value``, hence the defensive lookup.
            value = getattr(node, 'value', None)
            if isinstance(value, (ast.Yield, ast.YieldFrom)):
                return ast.copy_location(self.visit_Return(value), node)
            return node

        visit_Expression = visit_Expr

We know naming is hard, there is no point focusing on it. `pidgy` allows authors
to use emojis as variables in python. They add extra color and expression to the narrative.

In [8]:
    def demojize(lines, delimiters=('_', '_')):
        """Rewrite characters the tokenizer rejects (such as emoji) as identifier text.

        ``lines`` is a string or an iterable of string fragments; a falsy value
        is treated as empty.  Every token that ``tokenize`` reports as an
        ``ERRORTOKEN`` is passed through ``emoji.demojize`` with the given
        ``delimiters``, has ``-`` and ``’`` replaced by ``_``, and is appended
        to the preceding ``NAME`` token when there is one (otherwise it
        becomes a ``NAME`` token of its own).

        Returns the re-assembled source, or ``None`` when the source cannot be
        tokenized or put back together (best effort).  A missing ``emoji``
        package still raises ``ImportError``.
        """
        import io
        import tokenize

        source = ''.join(lines or [])
        tokens = []
        try:
            for token in tokenize.tokenize(io.BytesIO(source.encode()).readline):
                if token.type != tokenize.ERRORTOKEN:
                    tokens.append(token)
                    continue
                # Third-party dependency, only needed once such a token shows up.
                import emoji
                string = emoji.demojize(token.string, delimiters=delimiters
                                       ).replace('-', '_').replace("’", "_")
                if tokens and tokens[-1].type == tokenize.NAME:
                    # Extend the previous name; its recorded position is kept as it was.
                    previous = tokens[-1]
                    tokens[-1] = tokenize.TokenInfo(
                        previous.type, previous.string + string, previous.start, previous.end, previous.line)
                else:
                    tokens.append(
                        tokenize.TokenInfo(
                            tokenize.NAME, string, token.start, token.end, token.line))
            return tokenize.untokenize(tokens).decode()
        except ImportError:
            # A missing dependency is a setup problem, not a property of the source.
            raise
        except Exception:
            # Best effort: failures are reported as ``None``.  KeyboardInterrupt and
            # SystemExit are not ``Exception`` subclasses and propagate.
            return None

In [9]:
    def init_json():
        """Register ``yes``/``true``, ``no``/``false`` and ``null`` as builtin names.

        After the call these names resolve to ``True``, ``False`` and ``None``
        everywhere in the process.
        """
        import builtins
        aliases = (
            ('yes', True),
            ('true', True),
            ('no', False),
            ('false', False),
            ('null', None),
        )
        for name, value in aliases:
            setattr(builtins, name, value)