In [None]:
#| default_exp utils

# utils

> an xml parsing util

In [None]:
#| hide
from nbdev.showdoc import *
import ast
from typing import List, Union, Tuple
from textwrap import indent

In [None]:
#| export
import re

In [None]:
#| exporti
class TagNotFoundError(Exception):
    """Signals that the requested tag does not occur anywhere in the XML text."""


class NoContentError(Exception):
    """Signals that the requested tag exists but there is nothing inside it."""


class MultipleTagsError(Exception):
    """Signals that the requested tag occurs more than once in the XML text."""

In [None]:
#| export
def parse_from_xml(input_text: str, tag_name: str) -> str:
    """
    Parse content from an XML-like string for a specific tag.

    The tag name is matched literally (regex metacharacters in it are
    escaped) and only attribute-less tags of the form ``<tag>...</tag>`` are
    recognised. Markdown code-fence markers (```` ```python ```` and
    ```` ``` ````) are removed from the extracted content, which is then
    stripped of surrounding whitespace.

    Args:
        input_text (str): The input text containing XML-like tags.
        tag_name (str): The name of the tag to parse.

    Returns:
        str: The content within the specified tag.

    Raises:
        TagNotFoundError: If the specified tag is not found in the text.
        MultipleTagsError: If multiple instances of the tag are found.
        NoContentError: If the tag is present but contains no content.
    """
    # Escape the name so characters such as '.', '+' or '(' are treated as
    # literal text rather than as regex syntax.
    tag = re.escape(tag_name)
    pattern = rf"<{tag}>(.*?)</{tag}>"
    # DOTALL lets the (non-greedy) content span multiple lines.
    matches = re.findall(pattern, input_text, re.DOTALL)

    if not matches:
        raise TagNotFoundError(f"Tag '{tag_name}' not found in the text.")

    if len(matches) > 1:
        raise MultipleTagsError(
            f"Multiple instances of tag '{tag_name}' found in the text."
        )

    content = matches[0].strip()
    # Drop markdown code fences that may wrap the payload.
    content = content.replace("```python", "").replace("```", "").strip()

    if not content:
        raise NoContentError(
            f"Tag '{tag_name}' is present but contains no content."
        )

    return content


::: {.content-hidden}

## nbdev module cleaning

We want to pass the `ocm.py` module to Claude so that it can use the framework to construct solutions. By default, `nbdev`'s output module file contains lots of extra comment lines, which would be wasteful to send to Claude. Also, we use patching, so some methods are defined separately from their classes. We should try to clean this up programmatically.

Let's make a simple script that represents an output from nbdev. We'll include patched methods and decorators
:::

In [None]:
#| hide
# Toy stand-in for an nbdev-exported module, used to develop the cleaning steps below.
# It contains the AUTOGENERATED header, `# %%` cell markers, two classes, two `@patch`ed
# functions (one with `as_prop=True`) and some plain top-level statements.
script = """\
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_core.ipynb.

from fastcore.utils import *
import numpy as np

# %% ../nbs/00_core.ipynb 8
class MyClass:
    "A toy class"
    def __init__(self,
                 data: np.ndarray  # a 2-d numpy array
                ):
        # store the data
        self.data = data

    def some_method(self):
        ...

# %% ../nbs/00_core.ipynb 13
class MyOtherClass:
    \"\"\"
    Some other class
    \"\"\"
    ...

# %% ../nbs/00_core.ipynb 17
@patch
def __eq__(self: MyClass, other: MyClass) -> bool:
    \"\"\"
    Checks for equality
    Uses np.array_equal
    \"\"\"
    return np.array_equal(self.data, other.data)  # return the test result

# %% ../nbs/00_core.ipynb 23
my_class = MyClass(np.ones((2,3)))

# %% ../nbs/00_core.ipynb 27
@patch(as_prop=True)
def shape(self: MyClass) -> Tuple[int, int]:
    return self.data.shape

# %% ../nbs/00_core.ipynb 32
shape = my_class.shape
"""
print(script)

# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_core.ipynb.

from fastcore.utils import *
import numpy as np

# %% ../nbs/00_core.ipynb 8
class MyClass:
    "A toy class"
    def __init__(self,
                 data: np.ndarray  # a 2-d numpy array
                ):
        # store the data
        self.data = data

    def some_method(self):
        ...

# %% ../nbs/00_core.ipynb 13
class MyOtherClass:
    """
    Some other class
    """
    ...

# %% ../nbs/00_core.ipynb 17
@patch
def __eq__(self: MyClass, other: MyClass) -> bool:
    """
    Checks for equality
    Uses np.array_equal
    """
    return np.array_equal(self.data, other.data)  # return the test result

# %% ../nbs/00_core.ipynb 23
my_class = MyClass(np.ones((2,3)))

# %% ../nbs/00_core.ipynb 27
@patch(as_prop=True)
def shape(self: MyClass) -> Tuple[int, int]:
    return self.data.shape

# %% ../nbs/00_core.ipynb 32
shape = my_class.shape



::: {.content-hidden}

The first and easiest step is to strip out the nbdev comments.
:::

In [None]:
#| hide
# Drop every line that begins with an nbdev marker (cell marker, autogenerated header, `__all__`)
script = '\n'.join(
    line
    for line in script.split('\n')
    if not line.startswith(('# %%', '# AUTOGENERATED!', '__all__'))
)
print(script)


from fastcore.utils import *
import numpy as np

class MyClass:
    "A toy class"
    def __init__(self,
                 data: np.ndarray  # a 2-d numpy array
                ):
        # store the data
        self.data = data

    def some_method(self):
        ...

class MyOtherClass:
    """
    Some other class
    """
    ...

@patch
def __eq__(self: MyClass, other: MyClass) -> bool:
    """
    Checks for equality
    Uses np.array_equal
    """
    return np.array_equal(self.data, other.data)  # return the test result

my_class = MyClass(np.ones((2,3)))

@patch(as_prop=True)
def shape(self: MyClass) -> Tuple[int, int]:
    return self.data.shape

shape = my_class.shape



::: {.content-hidden}

Let's use the `ast` library to split the script into chunks representing top-level statements

:::

In [None]:
#| hide
def script_splitter(
    script: str  # the nbdev-like script with the special comment lines stripped out
) -> Tuple[List[str], List[str]]:  # List of statements and a separate list of their types ('assign', 'class name', 'patch' etc.)
    """
    Extract top-level statements from a Python script string.

    Each chunk runs from the first line of a top-level statement (including
    any decorator lines above a class or function) up to, but not including,
    the first line of the next statement, so blank lines and comments that
    follow a statement stay attached to it. The last chunk ends at the last
    line of the final statement.

    The returned types list is parallel to the statements list and holds:
    'patch' for chunks whose text starts with '@patch', the class or function
    name for other class/function definitions, and 'assign/import' for
    everything else.
    """
    tree = ast.parse(script)
    lines = script.splitlines()

    # to make it easy to keep the intended spacing between code chunks, we calculate the end line as the start of the next chunk
    def _find_start(n):
        # ast line numbers are 1-based. A decorated def/class reports the line of the
        # `def`/`class` keyword, so take the earliest decorator line when there is one.
        first = n.lineno
        for decorator in getattr(n, 'decorator_list', ()):
            first = min(first, decorator.lineno)
        return first - 1

    statements = []
    types = []
    body = tree.body
    for i, node in enumerate(body):
        start_line = _find_start(node)
        if i < len(body) - 1:
            end_line = _find_start(body[i + 1])
        else:
            # end_lineno is 1-based and inclusive, i.e. already the exclusive 0-based slice end
            end_line = getattr(node, 'end_lineno', None) or start_line

        # Extract the statement text
        if start_line == end_line:
            statement = lines[start_line]
        else:
            statement = '\n'.join(lines[start_line:end_line])

        statements.append(statement)

        if statement.startswith('@patch'):
            types.append('patch')
        elif isinstance(node, (ast.ClassDef, ast.FunctionDef)):
            # take the name from the AST: the chunk's first line may be a decorator
            types.append(node.name)
        else:
            types.append('assign/import')

    return statements, types

In [None]:
#| hide
statements, types = script_splitter(script)
print('\n****\n'.join(statements))

from fastcore.utils import *
****
import numpy as np

****
class MyClass:
    "A toy class"
    def __init__(self,
                 data: np.ndarray  # a 2-d numpy array
                ):
        # store the data
        self.data = data

    def some_method(self):
        ...

****
class MyOtherClass:
    """
    Some other class
    """
    ...

****
@patch
def __eq__(self: MyClass, other: MyClass) -> bool:
    """
    Checks for equality
    Uses np.array_equal
    """
    return np.array_equal(self.data, other.data)  # return the test result

****
my_class = MyClass(np.ones((2,3)))

****
@patch(as_prop=True)
def shape(self: MyClass) -> Tuple[int, int]:
    return self.data.shape

****
shape = my_class.shape


In [None]:
#| hide
print(types)

['assign/import', 'assign/import', 'MyClass', 'MyOtherClass', 'patch', 'assign/import', 'patch', 'assign/import']


In [None]:
#| hide
# Keep only the chunks that were classified as patches, then inspect the second one
patches = [stmt for stmt, kind in zip(statements, types) if kind == 'patch']
p = patches[1]
print(p)

In [None]:
#| hide
def find_classname(patch: str) -> Union[str, None]:
    """
    Return the class a patched function targets, read from its `self: <Class>` annotation.

    Args:
        patch: Source text of a `@patch`-decorated function.

    Returns:
        The annotated class name, or only its first dotted component when the
        annotation is a dotted path (e.g. `pkg.Cls` -> `pkg`). `None` when no
        `self: ...` annotation followed by `,` or `)` is found.
    """
    # The annotation must be followed by ',' or ')'; whitespace (including a newline)
    # is allowed in between so `self: MyClass )` and multi-line signatures still match.
    pattern = r'\bself\s*:\s*([\w.]+)(?=\s*[,)])'

    match = re.search(pattern, patch)
    if match is None:
        return None
    # If it's a nested class path, return just the first part
    return match.group(1).split('.')[0]

In [None]:
#| hide
find_classname(p)

'MyClass'

::: {.content-hidden}

When the class name is referenced in the patch, it isn't quoted but needs to be when moved into the main class definition. Let's write a function to clean up a patch

:::

In [None]:
#| hide
def clean_patch(patch: str, classname: str) -> str:
    """
    Rewrite a `@patch`-decorated function so it can be pasted into the body of `classname`.

    The rewrite is purely textual:

    * the `self: <classname>` annotation is reduced to a bare `self`;
    * other `: <classname>` annotations and `-> <classname>` return annotations
      are quoted, since the class name is not bound yet while its own body runs;
    * the first line (the `@patch...` decorator) is dropped;
    * `@property` takes its place when that first line mentions `as_prop`.

    Args:
        patch: Source text of the patched function; its first line is the decorator.
        classname: Name of the class the function is patched onto.

    Returns:
        The method source, not yet indented for the class body.
    """
    name = re.escape(classname)

    # `self: MyClass` -> `self`. Any dotted remainder (`self: Outer.Inner`) is consumed too,
    # and a longer identifier that merely starts with the class name is left alone.
    pattern_self = rf'\bself\s*:\s*{name}(?:\.\w+)*(?!\w)'
    transformed = re.sub(pattern_self, 'self', patch)

    # Then, handle other parameter annotations and the return annotation.
    # The lookahead skips: longer identifiers (`MyClassExtra`), dotted paths, calls and
    # subscripts (`MyClass(...)`, `MyClass[...]`), and a name directly followed by a quote.
    # NOTE: this is a text substitution, so matching text in the function body or
    # docstring is rewritten as well.
    pattern_others = rf'(:|->)\s*{name}(?![\w.(\["\'])'
    transformed = re.sub(
        pattern_others,
        lambda m: f"{m.group(1)} '{classname}'",
        transformed,
    )

    # Drop the decorator line
    transformed = '\n'.join(transformed.split('\n')[1:])
    if 'as_prop' in patch.split('\n')[0]:
        # `@patch(as_prop=True)` attaches the function as a property
        transformed = '@property\n' + transformed

    return transformed

In [None]:
#| hide
print(clean_patch(p, find_classname(p)))

@decorator
def shape(self) -> Tuple[int, int]:
    return self.data.shape



In [None]:
#| hide
# Fold every patched function into the statement of the class it targets
for p in patches:
    name = find_classname(p)  # class named in the patch's `self: <Class>` annotation
    idx = types.index(name)  # position of that class's chunk; raises ValueError if `name` is not in `types`

    # append the cleaned method, indented one level so it sits inside the class body
    statements[idx] += '\n' + indent(clean_patch(p, name), prefix='    ')

In [None]:
#| hide
print('\n'.join([s for s, t in zip(statements, types) if t != 'patch']))

from fastcore.utils import *
import numpy as np

class MyClass:
    "A toy class"
    def __init__(self,
                 data: np.ndarray  # a 2-d numpy array
                ):
        # store the data
        self.data = data

    def some_method(self):
        ...

    def __eq__(self, other: 'MyClass') -> bool:
        """
        Checks for equality
        Uses np.array_equal
        """
        return np.array_equal(self.data, other.data)  # return the test result

    @decorator
    def shape(self) -> Tuple[int, int]:
        return self.data.shape

class MyOtherClass:
    """
    Some other class
    """
    ...

my_class = MyClass(np.ones((2,3)))

shape = my_class.shape


::: {.content-hidden}

Looks good. Let's put it all together

:::

In [None]:
#| hide
def nbdev_cleaner(script):
    """
    Strip nbdev boilerplate from an exported module and fold `@patch`ed functions into their classes.

    Steps:
      1. remove lines starting with '# %%', '# AUTOGENERATED!' or 'from fastcore.utils';
      2. split the remainder into top-level statements with `script_splitter`;
      3. append each patched function, cleaned with `clean_patch` and indented,
         to the statement of the class named in its `self: <Class>` annotation;
      4. collapse every run of three or more newlines into a single blank line.

    A patch whose target class cannot be determined, or is not defined in the
    script (e.g. a patch onto an imported class), is left in the output unchanged.

    Args:
        script: Source text of an nbdev-exported module.

    Returns:
        The cleaned module source.
    """
    skip_prefixes = ('# %%', '# AUTOGENERATED!', 'from fastcore.utils')
    script = '\n'.join(
        line for line in script.split('\n') if not line.startswith(skip_prefixes)
    )

    statements, types = script_splitter(script)

    merged = set()  # positions of patch chunks that were folded into a class
    for pos, kind in enumerate(types):
        if kind != 'patch':
            continue
        patch_src = statements[pos]
        name = find_classname(patch_src)
        if name is None or name not in types:
            # nothing to merge into; keep the patch where it is
            continue
        idx = types.index(name)
        statements[idx] += '\n' + indent(clean_patch(patch_src, name), prefix='    ')
        merged.add(pos)

    new_script = '\n'.join(
        stmt for pos, stmt in enumerate(statements) if pos not in merged
    )
    # collapse runs of 3+ newlines (i.e. 2+ blank lines) into one blank line, anywhere in the text
    new_script = re.sub(r'\n{3,}', '\n\n', new_script)

    return new_script

In [None]:
#| hide
print(nbdev_cleaner(script))

import numpy as np

class MyClass:
    "A toy class"
    def __init__(self,
                 data: np.ndarray  # a 2-d numpy array
                ):
        # store the data
        self.data = data

    def some_method(self):
        ...

    def __eq__(self, other: 'MyClass') -> bool:
        """
        Checks for equality
        Uses np.array_equal
        """
        return np.array_equal(self.data, other.data)  # return the test result

    @decorator
    def shape(self) -> Tuple[int, int]:
        return self.data.shape

class MyOtherClass:
    """
    Some other class
    """
    ...

my_class = MyClass(np.ones((2,3)))

shape = my_class.shape


In [None]:
#| hide
# Read the exported module as UTF-8 (the default encoding of Python source files)
# rather than relying on the platform's locale encoding.
with open('../arcsolver/ocm.py', 'r', encoding='utf-8') as f:
    raw_script = f.read()

In [None]:
#| hide
print(raw_script)

"""Primitive classes for constructing object-centric models (OCMs) for ARC tasks"""

# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_ocm.ipynb.

# %% auto 0
__all__ = ['Vector', 'Color', 'Direction', 'Object', 'Rectangle', 'Line', 'Bitmap', 'Grid', 'ShapeExtractor', 'PatternMatcher',
           'EnclosureFiller', 'CyclicPattern']

# %% ../nbs/01_ocm.ipynb 5
from fastcore.utils import *
from enum import Enum
from typing import ClassVar, List, Optional, Tuple, Union
import numpy as np
from pydantic import BaseModel, Field, field_validator, model_validator
from scipy import ndimage

# %% ../nbs/01_ocm.ipynb 8
class Vector(BaseModel):
    "2D vector for positions, sizes, and directions."
    i: int
    j: int

    model_config = {"frozen": True}

    def __init__(self, i: int, j: int):
        super().__init__(i=i, j=j)

    def to_array(self) -> np.ndarray:
        return np.array([self.i, self.j], dtype=int)

    @classmethod
    def from_array(cls, a: Union[np.ndarray, List, Tuple

In [None]:
#| hide
cleaned_script = nbdev_cleaner(raw_script)
print(cleaned_script)

"""Primitive classes for constructing object-centric models (OCMs) for ARC tasks"""

__all__ = ['Vector', 'Color', 'Direction', 'Object', 'Rectangle', 'Line', 'Bitmap', 'Grid', 'ShapeExtractor', 'PatternMatcher',
           'EnclosureFiller', 'CyclicPattern']

from enum import Enum
from typing import ClassVar, List, Optional, Tuple, Union
import numpy as np
from pydantic import BaseModel, Field, field_validator, model_validator
from scipy import ndimage

class Vector(BaseModel):
    "2D vector for positions, sizes, and directions."
    i: int
    j: int

    model_config = {"frozen": True}

    def __init__(self, i: int, j: int):
        super().__init__(i=i, j=j)

    def to_array(self) -> np.ndarray:
        return np.array([self.i, self.j], dtype=int)

    @classmethod
    def from_array(cls, a: Union[np.ndarray, List, Tuple]) -> 'Vector':
        if not isinstance(a, np.ndarray): a = np.array(a)
        if a.shape != (2,) or a.dtype != int:
            raise ValueError("Need 1D arr

In [None]:
#| hide
# Write the cleaned module as UTF-8 so the result does not depend on the
# platform's locale encoding.
with open('../arcsolver/ocm_cleaned.py', 'w', encoding='utf-8') as f:
    f.write(cleaned_script)

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()