# Process Lecture Notes Markdown to JSON Hierarchy

## Convert a Markdown lecture notes file into a hierarchical JSON structure.

## Rules implemented:
1) Headings starting with '#' define hierarchy levels (# -> 1, ## -> 2, etc.).
2) Each section node stores:
   - id: hierarchical identifier like "1", "1.2", "2.1.3"
   - level: integer heading level (1 for '#', 2 for '##', ...)
   - content: raw Markdown content belonging to that section
   - title: heading text (extra field; helpful for inspection)
   - children: nested subsections
3) A line that starts with '#' and ENDS with ':code' is NOT treated as a heading;
   it is preserved verbatim in the current section content.
4) Headings are ignored within fenced code blocks (``` or ~~~) and display-math blocks ($$ and `\[\]` on its own line).
5) Any text before the first heading is stored as the `content` of the synthetic root node (id "0", level 0, title "__ROOT__").


In [None]:
import json
import re, os
import sys
from pathlib import Path
from typing import List, Dict, Any, Optional, Union, Iterable, Tuple

## Parse Markdown to JSON Tree

In [None]:
HEADING_RE = re.compile(r'^(?P<hashes>#{1,})(?P<sp>\s+)(?P<title>.*)$')

# Optional "<digits>:" line-number prefix. Only horizontal whitespace is consumed
# after the colon so that a numbered blank line ("12:\n") keeps its line ending.
_LINE_NUMBER_RE = re.compile(r'^\d+:[^\S\r\n]*')

# Trailing ':code' / ':markdown' marker; ignored when probing for block delimiters.
_TRAILING_MARKER_RE = re.compile(r'\s*:(?:code|markdown)\s*$')

# Fenced-code delimiter: three or more backticks or tildes, then an info string.
_FENCE_RE = re.compile(r'^(?P<fence>`{3,}|~{3,})(?P<info>.*)$')

# Display-math opener (alone on its line) -> the delimiter that closes the block.
_MATH_CLOSERS = {"$$": "$$", "\\[": "\\]"}


def parse_markdown_to_tree(md_text: str) -> Dict[str, Any]:
    """
    Parse Markdown into a hierarchical section tree.

    Each heading line ('#', '##', ...) opens a section node with the keys
    "id", "level", "type", "title", "content" and "children". Lines that are
    not headings are appended verbatim (line ending included) to the content
    of the most recently opened section, or to the root if no heading has
    been seen yet. Heading lines themselves are not stored in any content.

    A line is NOT treated as a heading when:
      * it starts with '#' and ends with ':code';
      * it lies inside a fenced code block (``` or ~~~);
      * it lies inside a display-math block delimited by '$$', or by a
        backslash-bracket pair, each alone on its line.

    A leading "<digits>:" line-number prefix is removed from every line
    before any other processing.

    Section ids are dotted per-level counters ("1", "1.2", ...). If a level
    is skipped (for example '##' with no preceding '#'), the missing
    position is rendered as 0 (e.g. "0.1") and the section is attached to
    the nearest open section with a smaller level.

    Args:
        md_text: The complete Markdown source.

    Returns:
        The root dict:
        {
          "id": "0",
          "level": 0,
          "type": "section",
          "title": "__ROOT__",
          "content": "<preamble-if-any>",
          "children": [ ... section nodes ... ]
        }
    """
    # Root node (level 0) holds optional preamble content and all top-level sections
    root: Dict[str, Any] = {
        "id": "0",
        "level": 0,
        "type": "section",
        "title": "__ROOT__",
        "content": "",
        "children": []
    }

    # Path from the root to the section currently receiving content
    node_stack: List[Dict[str, Any]] = [root]

    # Counters for hierarchical ids, e.g., [2, 1, 3] -> "2.1.3"
    counters: List[int] = []

    # Content lines collected for node_stack[-1]. Content only ever goes to the
    # most recently opened section, so one buffer flushed at each heading is
    # enough and avoids repeated string concatenation.
    pending: List[str] = []

    def flush_content() -> None:
        if pending:
            node_stack[-1]["content"] += "".join(pending)
            pending.clear()

    def start_section(level: int, title: str) -> Dict[str, Any]:
        """Create a section node, attach it to its parent and make it current."""
        flush_content()

        # Drop counters of deeper levels, pad skipped levels with 0, then count
        del counters[level:]
        counters.extend([0] * (level - len(counters)))
        counters[level - 1] += 1

        node = {
            "id": ".".join(str(x) for x in counters),
            "level": level,
            "type": "section",
            "title": title.strip(),
            "content": "",
            "children": []
        }

        # Close every open section that is not an ancestor of the new one.
        # Comparing levels (rather than stack depth) keeps siblings as siblings
        # even when an intermediate heading level was skipped. The root has
        # level 0 and heading levels are >= 1, so the root is never popped.
        while node_stack[-1]["level"] >= level:
            node_stack.pop()

        node_stack[-1]["children"].append(node)
        node_stack.append(node)
        return node

    open_fence: Optional[str] = None    # delimiter that opened the current code fence
    math_closer: Optional[str] = None   # delimiter that ends the current math block

    for raw_line in md_text.splitlines(keepends=True):
        # Remove any line-number prefix from the raw line
        cleaned_line = _LINE_NUMBER_RE.sub("", raw_line)
        stripped = cleaned_line.rstrip("\r\n")

        # Delimiter detection ignores indentation and a trailing dataset marker
        probe = _TRAILING_MARKER_RE.sub("", stripped).strip()

        if open_fence is not None:
            closing = _FENCE_RE.match(probe)
            if (closing
                    and closing.group("fence")[0] == open_fence[0]
                    and len(closing.group("fence")) >= len(open_fence)
                    and not closing.group("info").strip()):
                open_fence = None
            pending.append(cleaned_line)
            continue

        if math_closer is not None:
            if probe == math_closer:
                math_closer = None
            pending.append(cleaned_line)
            continue

        opening = _FENCE_RE.match(probe)
        if opening:
            fence = opening.group("fence")
            # A backtick fence cannot have backticks in its info string;
            # such a line is inline code, not a fence opener.
            if fence[0] != "`" or "`" not in opening.group("info"):
                open_fence = fence
                pending.append(cleaned_line)
                continue

        if probe in _MATH_CLOSERS:
            math_closer = _MATH_CLOSERS[probe]
            pending.append(cleaned_line)
            continue

        # Special rule: '#...' ending with ':code' is content, kept verbatim
        if stripped.startswith("#") and stripped.endswith(":code"):
            pending.append(cleaned_line)
            continue

        heading = HEADING_RE.match(stripped)
        if heading:
            start_section(len(heading.group("hashes")), heading.group("title"))
        else:
            # Regular content goes into the current node (root if no heading yet)
            pending.append(cleaned_line)

    flush_content()
    return root

## Remove :code and :markdown Suffixes

Strip trailing ':code' and ':markdown' suffixes from JSON lecture notes.

In [None]:
# Matches optional whitespace + ':' + (code|markdown) at end of a line
SUFFIX_RE = re.compile(r"\s*:(?:code|markdown)\s*$", re.IGNORECASE)

# Keys cleaned directly on a section node, and keys cleaned on each content element
TARGET_TOP_KEYS = {"title", "content"}
TARGET_ELEMENT_KEYS = {"raw", "clean"}


def strip_suffix_from_line(line: str) -> str:
    """Return `line` without a trailing ':code' / ':markdown' marker.

    The marker is matched case-insensitively, together with any whitespace
    directly before and after it. Lines without a marker come back unchanged.
    """
    # The pattern is anchored at the end, so a hit always runs to the end of
    # the string and cutting at its start removes exactly the marker.
    hit = SUFFIX_RE.search(line)
    if hit is None:
        return line
    return line[:hit.start()]

def strip_suffix_from_text_block(text: str) -> str:
    """Strip the trailing marker from every line of a multi-line string.

    The text is split on '\\n', each piece is passed through
    `strip_suffix_from_line`, and the pieces are joined with '\\n' again,
    so the number of lines is unchanged.
    """
    cleaned_lines = map(strip_suffix_from_line, text.split("\n"))
    return "\n".join(cleaned_lines)

def process_content_elements(elems: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Strip markers from the 'raw' and 'clean' strings of each element.

    Elements are modified in place; entries that are not dicts and values
    that are not strings are left untouched. The same list is returned.
    """
    for element in elems:
        if isinstance(element, dict):
            # Only the targeted keys that this element actually has
            for key in TARGET_ELEMENT_KEYS & element.keys():
                value = element[key]
                if isinstance(value, str):
                    element[key] = strip_suffix_from_text_block(value)
    return elems

def process_node(node: Dict[str, Any]) -> Dict[str, Any]:
    """
    Clean one section node and, recursively, all of its children.

    String values under 'title' and 'content' have their markers stripped, a
    'content_elements' list is cleaned via `process_content_elements`, and
    every dict found in a 'children' list is processed the same way.
    The node is modified in place and returned.
    """
    # Targeted string fields on the node itself
    for key in TARGET_TOP_KEYS.intersection(node):
        value = node[key]
        if isinstance(value, str):
            node[key] = strip_suffix_from_text_block(value)

    # Per-element fields, when elements have already been extracted
    elements = node.get("content_elements")
    if isinstance(elements, list):
        node["content_elements"] = process_content_elements(elements)

    # Descend into subsections
    children = node.get("children")
    if isinstance(children, list):
        for child in children:
            if isinstance(child, dict):
                process_node(child)

    return node

def process_json(obj: Union[Dict[str, Any], List[Any]]) -> Union[Dict[str, Any], List[Any]]:
    """
    Clean a parsed JSON document whose top level is a dict or a list.

    A dict is cleaned with `process_node`. For a list, a new list is returned
    in which each dict item has been cleaned and every other item is passed
    through. Any other value is returned unchanged.
    """
    if isinstance(obj, list):
        return [process_node(item) if isinstance(item, dict) else item for item in obj]
    return process_node(obj) if isinstance(obj, dict) else obj

## Break Down to Individual Elements

Parse a lecture-notes-like JSON file and, for every object with a 'content' key,
add a new sibling key 'content_elements' that lists the identified elements.

In [None]:
# NOTE: raw string on purpose -- the text below contains LaTeX backslashes
# (\(, \[, \begin, \end) that are not valid Python escape sequences.
r"""
Parse a lecture-notes-like JSON file and, for every object with a 'content' key,
add a new sibling key 'content_elements' that lists the identified elements.

Element types detected (line-granularity, with basic multi-line grouping):
- 'code'      : lines marked with ':code' or between triple-backtick fences
- 'markdown'  : lines marked with ':markdown'
- 'math'      : LaTeX-style math ($...$, $$...$$, \(...\), \[...\], \begin{equation}...\end{equation})
- 'image'     : Markdown images ![alt](url) or <img ... src="...">
- 'video'     : <iframe ...> embeds or URLs from video domains (YouTube, Vimeo, etc.)
- 'link'      : bare URLs that are not images/videos
- 'html'      : other HTML tags (non-img/non-iframe) on the line
- 'text'      : anything else after marker stripping
- 'other'     : fallback

Each element preserves:
- 'raw'    : the original line or grouped block (verbatim)
- 'clean'  : a cleaned version (markers removed; trimmed)
- 'meta'   : metadata such as line numbers, detected markers, URLs, language hints
"""

# ---------- Regexes for detection ----------

# Markdown image: ![alt](url)
RE_MD_IMAGE = re.compile(r'!\[[^\]]*\]\((?P<url>[^)]+)\)')

# HTML <img ... src="...">
RE_HTML_IMAGE = re.compile(r'<img\s+[^>]*src=["\'](?P<url>[^"\']+)["\'][^>]*>', re.IGNORECASE)

# HTML <iframe ... src="...">
RE_IFRAME = re.compile(r'<iframe\s+[^>]*src=["\'](?P<url>[^"\']+)["\'][^>]*>', re.IGNORECASE)

# Bare URL (very permissive): starts with http://, https:// or www. and runs
# until whitespace or one of ) < > ]
RE_URL = re.compile(
    r'(?P<url>(?:https?://|www\.)[^\s)<>\]]+)', re.IGNORECASE
)

# Code fence ``` (the opener may carry a language tag; the closer is bare)
RE_FENCE_OPEN = re.compile(r'^\s*```(?P<lang>[A-Za-z0-9_\-+]*)\s*$')
RE_FENCE_CLOSE = re.compile(r'^\s*```\s*$')

# LaTeX math (inline and display); the lookbehinds skip escaped dollars (\$)
RE_INLINE_MATH = re.compile(r'(?<!\\)\$(?P<expr>[^$]+?)(?<!\\)\$')
RE_BLOCK_MATH_INLINE = re.compile(r'(?<!\\)\$\$(?P<expr>.+?)(?<!\\)\$\$')
RE_BEGIN_ENV = re.compile(r'\\begin\{(?P<env>equation\*?|align\*?|gather\*?|multline\*?)\}')
RE_END_ENV = re.compile(r'\\end\{(?P<env>equation\*?|align\*?|gather\*?|multline\*?)\}')
RE_PAREN_MATH = re.compile(r'\\\((?P<expr>.+?)\\\)')
RE_BRACKET_MATH = re.compile(r'\\\[(?P<expr>.+?)\\\]')

# Generic HTML element with a matching closing tag on the same line
# (img/iframe are handled separately by the patterns above)
RE_HTML_GENERIC = re.compile(r'<([a-zA-Z][a-zA-Z0-9]*)\b[^>]*>.*</\1\s*>')

# ':code' / ':markdown' markers (suffixes in this dataset)
RE_MARKER_SUFFIX = re.compile(r'(?P<core>.*?)(?::(?P<marker>code|markdown))\s*$')

# Video domains and file extensions
VIDEO_DOMAINS = (
    'youtube.com', 'youtu.be', 'vimeo.com', 'dailymotion.com', 'video.google.com'
)
VIDEO_EXTS = ('.mp4', '.mov', '.webm', '.mkv', '.avi', '.m4v')


# ---------- Utilities ----------

def _has_video_signature(url: str) -> bool:
    """Tell whether `url` looks like a video, compared case-insensitively.

    True when the URL contains one of VIDEO_DOMAINS anywhere in its text
    (a plain substring test, not a host comparison) or ends with one of
    VIDEO_EXTS.
    """
    lowered = url.lower()
    mentions_video_host = any(domain in lowered for domain in VIDEO_DOMAINS)
    # str.endswith accepts a tuple and checks every extension in one call
    return mentions_video_host or lowered.endswith(VIDEO_EXTS)


def _strip_marker_suffix(line: str) -> Tuple[str, Optional[str]]:
    """Split a trailing ':code' / ':markdown' marker off a line.

    Trailing newlines are removed first. Returns (text, marker): with a
    marker present, `text` is everything before the marker's colon and
    `marker` is 'code' or 'markdown'; otherwise `text` is the line itself
    and `marker` is None.
    """
    text = line.rstrip('\n')
    found = RE_MARKER_SUFFIX.match(text)
    if found is None:
        return text, None
    return found.group('core'), found.group('marker')


def _first_url(line: str) -> Optional[str]:
    """Return the first bare URL matched by RE_URL in `line`, or None if there is none."""
    hit = RE_URL.search(line)
    return hit.group('url') if hit else None


def _is_html_other_than_img_iframe(line: str) -> bool:
    """Heuristically detect HTML markup that is neither an <img> nor an <iframe>.

    A line holding an <iframe> or <img> tag is never reported, even when it
    also contains other tags; otherwise the result says whether
    RE_HTML_GENERIC finds an element with a matching closing tag.
    """
    has_media_tag = RE_IFRAME.search(line) or RE_HTML_IMAGE.search(line)
    return not has_media_tag and bool(RE_HTML_GENERIC.search(line))


# ---------- Element construction ----------

def _make_element(
    etype: str,
    raw: str,
    clean: str,
    line_start: int,
    line_end: Optional[int] = None,
    url: Optional[str] = None,
    marker: Optional[str] = None,
    lang: Optional[str] = None
) -> Dict[str, Any]:
    """Assemble one content-element dict.

    Args:
        etype: Element type label, stored under "type".
        raw: Source text of the element, stored unchanged.
        clean: Cleaned text; surrounding whitespace is trimmed before storing.
        line_start: First line covered by the element.
        line_end: Last line covered; defaults to `line_start`.
        url: URL associated with the element, if any.
        marker: Marker found on the line, if any.
        lang: Language hint of a code fence, if any.

    Returns:
        A dict with the keys "type", "raw", "clean" and "meta"; "meta" holds
        "line_start", "line_end", "marker", "url" and "lang".
    """
    # A single-line element ends on the line where it starts
    last_line = line_start if line_end is None else line_end
    meta = {
        "line_start": line_start,
        "line_end": last_line,
        "marker": marker,
        "url": url,
        "lang": lang,
    }
    return {"type": etype, "raw": raw, "clean": clean.strip(), "meta": meta}


# ---------- Core parsing (per content string) ----------

def _classify_line(text: str, marker: Optional[str]) -> Tuple[str, Optional[str]]:
    """Return (element type, url) for one marker-stripped, trimmed line.

    Checks run in priority order: image, video, math, ':code' marker, generic
    HTML, link, ':markdown' marker, and finally plain text. A URL is reported
    only for image, video and link elements.
    """
    image = RE_MD_IMAGE.search(text) or RE_HTML_IMAGE.search(text)
    if image:
        return "image", image.group('url')

    iframe = RE_IFRAME.search(text)
    if iframe:
        return "video", iframe.group('url')

    url = _first_url(text)
    if url and _has_video_signature(url):
        return "video", url

    if (RE_BLOCK_MATH_INLINE.search(text) or RE_INLINE_MATH.search(text)
            or RE_PAREN_MATH.search(text) or RE_BRACKET_MATH.search(text)):
        return "math", None

    if marker == 'code':
        return "code", None

    if _is_html_other_than_img_iframe(text):
        return "html", None

    if url:
        return "link", url

    if marker == 'markdown':
        return "markdown", None

    # Everything else, including empty lines (kept to preserve structure)
    return "text", None


def parse_content_to_elements(content: str) -> List[Dict[str, Any]]:
    """
    Convert a raw content string into a list of element dicts.

    The content is split into lines. Lines between triple-backtick fences are
    grouped into a single 'code' element (fence lines included), and lines from
    a LaTeX \\begin{...} up to the next \\end{...} are grouped into a single
    'math' element. An environment that opens and closes on the same line
    becomes a one-line 'math' element. Every other line becomes its own
    element, typed by `_classify_line` after its ':code' / ':markdown' marker
    has been split off.

    Blocks still open at the end of the content are emitted as they are.
    Line numbers in each element's "meta" are 1-based.

    Args:
        content: Raw section content; may be empty.

    Returns:
        The elements in document order (an empty list for empty content).
    """
    if not content:
        return []

    lines = content.splitlines()
    elements: List[Dict[str, Any]] = []

    in_code_fence = False
    code_fence_lang: Optional[str] = None
    code_buffer: List[str] = []
    code_start_line = 0

    in_math_env = False
    math_env_buffer: List[str] = []
    math_start_line = 0

    def flush_code_block(end_line_idx: int) -> None:
        """Emit the buffered fence block as one 'code' element and reset state."""
        nonlocal in_code_fence, code_buffer, code_fence_lang
        raw = "\n".join(code_buffer)
        elements.append(
            _make_element(
                etype="code",
                raw=raw,
                clean=raw,
                line_start=code_start_line,
                line_end=end_line_idx,
                lang=code_fence_lang,
            )
        )
        in_code_fence = False
        code_buffer = []
        code_fence_lang = None

    def flush_math_block(end_line_idx: int) -> None:
        """Emit the buffered environment as one 'math' element and reset state."""
        nonlocal in_math_env, math_env_buffer
        raw = "\n".join(math_env_buffer)
        elements.append(
            _make_element(
                etype="math",
                raw=raw,
                clean=raw,
                line_start=math_start_line,
                line_end=end_line_idx,
            )
        )
        in_math_env = False
        math_env_buffer = []

    for line_num, line in enumerate(lines, start=1):  # 1-based line numbers
        # Code fence handling
        if in_code_fence:
            code_buffer.append(line)
            if RE_FENCE_CLOSE.match(line):
                flush_code_block(line_num)
            continue

        fence = RE_FENCE_OPEN.match(line)
        if fence:
            in_code_fence = True
            code_fence_lang = fence.group('lang') or None
            code_buffer = [line]  # include fence line in raw
            code_start_line = line_num
            continue

        # Math environment handling (LaTeX \begin{equation} ... \end{equation})
        if in_math_env:
            math_env_buffer.append(line)
            if RE_END_ENV.search(line):
                flush_math_block(line_num)
            continue

        begin = RE_BEGIN_ENV.search(line)
        if begin:
            in_math_env = True
            math_env_buffer = [line]
            math_start_line = line_num
            # An \end{...} after the \begin{...} on this same line closes the
            # block right away; otherwise the following lines would be
            # swallowed until some later \end{...}.
            if RE_END_ENV.search(line, begin.end()):
                flush_math_block(line_num)
            continue

        # Strip ':code' / ':markdown' markers (dataset-specific), then classify
        core, marker = _strip_marker_suffix(line)
        text = core.strip()
        etype, url = _classify_line(text, marker)
        elements.append(
            _make_element(etype, raw=line, clean=text, line_start=line_num, url=url, marker=marker)
        )

    # Flush any unclosed blocks (defensive)
    if in_code_fence:
        flush_code_block(len(lines))
    if in_math_env:
        flush_math_block(len(lines))

    return elements


# ---------- Tree traversal (mutates in-place) ----------

def add_elements_recursively(node: Dict[str, Any]) -> None:
    """
    Attach 'content_elements' to a node and to all of its descendants.

    For every dict whose 'content' is a string, the parsed element list is
    stored under 'content_elements'. Items of a 'children' list are visited
    the same way; anything that is not a dict is skipped. Works in place.
    """
    if not isinstance(node, dict):
        return

    content = node.get('content')
    if isinstance(content, str):
        node['content_elements'] = parse_content_to_elements(content)

    # Walk the subsections, if there are any
    children = node.get('children')
    if isinstance(children, list):
        for child in children:
            add_elements_recursively(child)

## Run Pipeline on Lecture Notes
### From Markdown Lecture Notes to Hierarchical JSON

In [None]:
input_md_path = Path("./data/lecture_notes_8.md")
output_json_path = Path("./data/lecture_notes_8.json")

In [None]:
md_text = input_md_path.read_text(encoding="utf-8", errors="replace")

In [None]:
json_tree = parse_markdown_to_tree(md_text)

In [None]:
json_tree

In [None]:
json_tree_elements_cleaned = process_json(json_tree)

In [None]:
json_tree_elements_cleaned

In [None]:
add_elements_recursively(json_tree_elements_cleaned)

In [None]:
json_tree_elements_cleaned

In [None]:
with output_json_path.open("w", encoding="utf-8") as f:
    json.dump(json_tree_elements_cleaned, f, ensure_ascii=False, indent=2)