In [1]:
# | default_exp content_parser


In [2]:
# | export
import re
import yaml
import json
from pathlib import Path
from urllib.parse import urlparse
from datetime import datetime


In [3]:
# | test
from fastcore.test import test_eq


In [4]:
# | export
def parse_metadata(content: str) -> dict:
    """Extract metadata from content frontmatter.

    The frontmatter is the text between the first two ``---`` delimiters and
    is parsed as YAML.

    Args:
        content: Raw document text.

    Returns:
        The parsed frontmatter mapping. When ``pagetitle`` is present and
        ``title`` is not, ``title`` is filled in from ``pagetitle``. An empty
        dict is returned when there is no opening/closing delimiter pair or
        when the frontmatter does not parse to a mapping (e.g. it is empty).
    """
    # maxsplit=2 is enough: only the text between the first two delimiters matters.
    parts = content.split("---", 2)
    if len(parts) < 3:
        # No complete frontmatter block; previously this raised IndexError.
        return {}

    metadata = yaml.safe_load(parts[1])
    if not isinstance(metadata, dict):
        # Empty frontmatter loads as None and scalars/lists are not metadata;
        # previously the `in` checks below raised TypeError on None.
        return {}

    if "title" not in metadata and "pagetitle" in metadata:
        metadata["title"] = metadata["pagetitle"]
    return metadata



In [5]:
# | export
def parse_notebook_metadata(content: str) -> dict:
    """Extract frontmatter metadata from a Jupyter notebook.

    Args:
        content: The notebook file contents as a JSON string.

    Returns:
        The result of ``parse_metadata`` on the first cell when that cell is
        a markdown cell whose text starts with ``---``; otherwise ``{}``.
    """
    cells = json.loads(content).get("cells")
    if not cells:
        return {}

    # Only the very first cell is inspected for YAML frontmatter.
    leading_cell = cells[0]
    if leading_cell.get("cell_type") != "markdown":
        return {}

    text = "".join(leading_cell.get("source", []))
    return parse_metadata(text) if text.startswith("---") else {}


In [6]:
# | test
from nbdev.qmd import meta


In [7]:
# | test
# Test parse_metadata on a Markdown sample and parse_notebook_metadata on a notebook sample
from pathlib import Path

sample_dir = Path("sample")
if not sample_dir.exists():
    # Fall back to the parent directory when the kernel runs from a subfolder
    sample_dir = Path("../sample")

# Read the samples as UTF-8 explicitly so the result does not depend on the
# platform's default text encoding.
with open(sample_dir / "example.md", "r", encoding="utf-8") as file:
    content = file.read()

metadata = parse_metadata(content)

# content is .ipynb
with open(sample_dir / "design_questions.ipynb", "r", encoding="utf-8") as f:
    nb_content = f.read()
nb_metadata = parse_notebook_metadata(nb_content)
print(nb_metadata)


{}


In [8]:
#| export
def is_frontmatter(cell):
    """Return True when `cell` is a markdown cell whose text starts with ``---``."""
    if cell.get("cell_type") != "markdown":
        return False
    return "".join(cell.get("source", [])).startswith("---")

In [9]:
#| export
def is_visible_code(cell, is_quarto=False):
    """Return True when `cell` is a code cell that should be included.

    Non-code cells are never visible. When `is_quarto` is true, a code cell is
    hidden if its source contains ``#| echo: false`` or ``#| include: false``.
    """
    if cell.get("cell_type") != "code":
        return False
    if is_quarto:
        source = "".join(cell.get("source", []))
        hiding_directives = ("#| echo: false", "#| include: false")
        return not any(directive in source for directive in hiding_directives)
    return True


In [10]:
#| export
def extract_notebook_content(content: str, is_quarto: bool = False) -> str:
    """Join the text of a notebook's markdown and visible code cells.

    Args:
        content: The notebook file contents as a JSON string.
        is_quarto: Forwarded to ``is_visible_code`` to decide which code cells
            are included.

    Returns:
        The sources of the kept cells, one cell per line group, joined with
        newlines. Cells recognised by ``is_frontmatter`` are skipped.
    """
    kept_sources = []
    for cell in json.loads(content).get("cells", []):
        if is_frontmatter(cell):
            continue
        if is_visible_code(cell, is_quarto) or cell.get("cell_type") == "markdown":
            kept_sources.append("".join(cell.get("source", [])))
    return "\n".join(kept_sources)


In [11]:
# | export
def remove_metadata(content: str) -> str:
    """Remove frontmatter from content.

    Args:
        content: Raw document text.

    Returns:
        The text after the closing ``---`` delimiter, stripped of surrounding
        whitespace. The content is returned unchanged when it does not begin
        with a ``---`` delimiter (ignoring a BOM and leading whitespace) or
        when no closing delimiter is found.
    """
    # Tolerate a byte-order mark or blank lines before the opening delimiter.
    body = content.lstrip("\ufeff").lstrip()
    if not body.startswith("---"):
        # Without this guard, a later "---" (table separator, horizontal rule)
        # was mistaken for the closing delimiter and the text before it was lost.
        return content

    end = body.find("---", 3)
    return body[end + 3 :].strip() if end != -1 else content


In [12]:
# | export
# NOTE(review): check_length, check_title_length and check_desc_length are
# defined again in later cells of this notebook, and those later definitions
# are the ones in effect. The defaults here are kept identical to them so the
# two copies cannot disagree; the duplicates should be consolidated.
def check_length(text: str, min_len: int, max_len: int) -> dict:
    """Check if text length falls within the optimal range.

    Args:
        text: The text to measure.
        min_len: Smallest length (inclusive) considered optimal.
        max_len: Largest length (inclusive) considered optimal.

    Returns:
        A dict with ``length`` (character count) and ``optimal_length``
        (True when ``min_len <= length <= max_len``).
    """
    length = len(text)
    return {"length": length, "optimal_length": min_len <= length <= max_len}


def check_title_length(title: str, min_len: int = 50, max_len: int = 60) -> dict:
    """Check a title's length; see ``check_length`` for the returned dict."""
    return check_length(title, min_len, max_len)


def check_desc_length(description: str, min_len: int = 150, max_len: int = 160) -> dict:
    """Check a description's length; see ``check_length`` for the returned dict."""
    return check_length(description, min_len, max_len)


In [13]:
# | export
def extract_headers(file_path: str) -> list[dict]:
    """Extract all headers with metadata.

    A header is a line that, once stripped of surrounding whitespace, starts
    with one to six ``#`` characters followed by a space.

    Args:
        file_path: Path of the text file to scan; it is read as UTF-8.

    Returns:
        One dict per header, in file order, with keys ``type`` (``"h1"`` to
        ``"h6"``), ``line_number`` (1-based), ``content`` (the header text)
        and ``length`` (character count of ``content``).
    """
    # NOTE(review): lines inside fenced code blocks or YAML frontmatter that
    # start with "# " are also reported as headers -- confirm whether that is
    # intended before changing it.
    heading_re = re.compile(r"(#{1,6}) (.*)")
    # Optional closing run of '#': only stripped when it stands alone or is
    # preceded by whitespace, so text such as "C#" keeps its trailing '#'.
    closing_re = re.compile(r"(?:^|\s+)#+$")

    headings = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line_number, line in enumerate(file, start=1):
            match = heading_re.match(line.strip())
            if match is None:
                continue
            marker, text = match.groups()
            content = closing_re.sub("", text.strip()).strip()
            headings.append(
                {
                    "type": f"h{len(marker)}",
                    "line_number": line_number,
                    "content": content,
                    "length": len(content),
                }
            )
    return headings


In [14]:
# | test
# The sample document is expected to contain two h1 headers, the first being the greeting.
sample_markdown_path = str(sample_dir / "example.md")
headers = extract_headers(sample_markdown_path)
h1_headers = [h for h in headers if h["type"] == "h1"]
test_eq(len(h1_headers), 2)
test_eq(headers[0]["content"], "This is me Kareem")


In [15]:
# | export
def check_length(text: str, min_len: int, max_len: int) -> dict:
    """Report a text's length and whether it lies in ``[min_len, max_len]``.

    Returns a dict with ``length`` (character count) and ``optimal_length``
    (True when the length is within the inclusive range).
    """
    n_chars = len(text)
    within_range = min_len <= n_chars <= max_len
    return {"length": n_chars, "optimal_length": within_range}


def check_title_length(title: str, min_len: int = 50, max_len: int = 60) -> dict:
    """Apply ``check_length`` to a title using the title length defaults."""
    return check_length(title, min_len, max_len)

In [16]:
# | export
def check_desc_length(description: str, min_len: int = 150, max_len: int = 160) -> dict:
    """Report a description's length and whether it lies in ``[min_len, max_len]``."""
    n_chars = len(description)
    within_range = n_chars >= min_len and n_chars <= max_len
    return {"length": n_chars, "optimal_length": within_range}


In [17]:
# | export
def check_content_length(content: str) -> dict:
    """Count whitespace-separated words and flag whether there are at least 300.

    Returns a dict with ``word_count`` and ``is_sufficient``.
    """
    tokens = content.split()
    total = len(tokens)
    enough = total >= 300
    return {"word_count": total, "is_sufficient": enough}


In [18]:
# | hide
from pprint import pprint


In [19]:
# | hide

# Demo of the length checks on the sample document loaded in an earlier cell.
# `metadata` is the parsed frontmatter and `content` is the raw file text
# (frontmatter included), so the word count below covers the whole file.
title = metadata.get("title") or metadata.get("pagetitle", "")
# NOTE(review): "excertp" looks like a misspelling of "excerpt"; it may be a
# deliberate fallback for content that carries the misspelled key -- confirm.
description = metadata.get("description") or metadata.get("excerpt") or metadata.get("excertp", "")
pprint(check_title_length(title))
pprint(check_desc_length(description))
pprint(check_content_length(content))


{'length': 33, 'optimal_length': False}
{'length': 146, 'optimal_length': False}
{'is_sufficient': False, 'word_count': 181}


In [20]:
# | export
def extract_links(content: str) -> dict[str, dict]:
    """Extract all links with metadata.

    Markdown links (``[text](url)``) are collected line by line. When
    BeautifulSoup is installed, HTML ``<a href=...>`` links are collected too.

    Args:
        content: Document text to scan.

    Returns:
        A dict keyed by URL. Each value holds ``titles`` (the link texts) and
        ``lines`` (1-based line numbers, or ``-1`` for HTML links whose line
        is unknown), one entry per occurrence. The pattern also matches the
        ``[alt](url)`` part of Markdown images.
    """
    links: dict[str, dict] = {}

    def _record(url: str, title: str, line_number: int) -> None:
        # One place for the "create entry on first sight, then append" logic.
        entry = links.setdefault(url, {"titles": [], "lines": []})
        entry["titles"].append(title)
        entry["lines"].append(line_number)

    # Optional quoted link title, e.g. [text](url "Title"); it is not part of the URL.
    link_title_re = re.compile(r"""\s+(?:"[^"]*"|'[^']*')\s*$""")

    for line_number, line in enumerate(content.split("\n"), start=1):
        for match in re.finditer(r"\[(.*?)\]\((.*?)\)", line):
            title, target = match.groups()
            url = link_title_re.sub("", target).strip()
            _record(url, title, line_number)

    # Also extract HTML links; bs4 is optional, so only its absence is tolerated.
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        return links

    soup = BeautifulSoup(content, "html.parser")
    for a in soup.find_all("a", href=True):
        url = a.get("href", "").strip()
        if not url:
            continue
        _record(url, a.get_text(strip=True), -1)  # HTML line unknown
    return links


In [21]:
# | test
# Both sample URLs must be found among the extracted links.
links = extract_links(content)
for expected_url in ("https://emdadelgaz.com", "https://awazly.com/"):
    test_eq(expected_url in links, True)


In [22]:
# | export
def extract_images(content: str) -> list[dict]:
    """Extract images with alt text.

    Args:
        content: Markdown text to scan for ``![alt](url)`` images.

    Returns:
        One dict per image, in order of appearance, with keys ``alt_text``
        and ``url``. An optional quoted title (``![alt](url "Title")``) is
        dropped so that ``url`` holds only the image location.
    """
    # Optional quoted image title; it is not part of the URL.
    image_title_re = re.compile(r"""\s+(?:"[^"]*"|'[^']*')\s*$""")

    images = []
    for alt, target in re.findall(r"\!\[(.*?)\]\((.*?)\)", content):
        url = image_title_re.sub("", target).strip()
        images.append({"alt_text": alt, "url": url})
    return images


In [23]:
# | export
def imgs_missing_alts(images: list[dict]) -> list[str]:
    """Collect the ``url`` of every image whose ``alt_text`` is missing or empty."""
    urls_without_alt = []
    for image in images:
        if image.get("alt_text"):
            continue
        urls_without_alt.append(image["url"])
    return urls_without_alt


In [24]:
# | test
# The sample document contains exactly one image, with a known alt text.
images = extract_images(content)
test_eq(len(images), 1)
first_image = images[0]
test_eq(first_image["alt_text"], "Iron man photo")


In [25]:
# | export
# URL prefixes that never point at a page: in-page anchors and non-web schemes.
SPECIAL_PREFIXES = ("#", "mailto:", "tel:", "javascript:")
# File extensions (lower-case) used to recognise image URLs.
IMAGE_EXTS = (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".webp")


def is_special_url(url: str) -> bool:
    """Check if URL is an anchor, mailto, tel, or javascript link"""
    return url.startswith(SPECIAL_PREFIXES)


def filter_internal_links(urls: list[str], domain: str) -> list[str]:
    """Filter for internal links (excludes images and special URLs).

    A URL is internal when it names no host at all (relative paths such as
    ``/about`` or ``page.html``) or when its host equals `domain`. Hosts are
    compared case-insensitively.

    Args:
        urls: URLs to classify.
        domain: Host name of the site, e.g. ``"example.com"``.

    Returns:
        The internal URLs, in their original order.
    """
    site_host = domain.lower()
    internal = []
    for url in urls:
        if is_special_url(url) or url.lower().endswith(IMAGE_EXTS):
            continue
        # Decide by the parsed host instead of a "http" prefix test, so that
        # protocol-relative ("//cdn.host/x") and other absolute URLs are judged
        # by host, and relative paths that merely begin with "http" stay internal.
        host = urlparse(url).netloc.lower()
        if not host or host == site_host:
            internal.append(url)
    return internal



In [26]:
# | export
def filter_external_links(urls: list[str], domain: str) -> list[str]:
    """Filter for external links only.

    Args:
        urls: URLs to classify.
        domain: Host name of the site, passed to ``filter_internal_links``.

    Returns:
        The URLs, in their original order, that are not internal, are not
        image URLs and are not special (anchor/mailto/tel/javascript) URLs.
    """
    # A set makes each membership test constant-time instead of scanning a list.
    internal = set(filter_internal_links(urls, domain))
    return [
        url for url in urls
        if url not in internal
        and not url.lower().endswith(IMAGE_EXTS)
        and not is_special_url(url)
    ]

In [27]:
# | export
def normalize_text(text: str) -> str:
    """Collapse every run of whitespace to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()


In [28]:
# | export
def detect_phone_numbers(text: str) -> list[str]:
    """Find phone-number-like digit groups and return each with whitespace removed.

    A match is an optional ``+`` country prefix followed by groups of three,
    three and three-or-four digits, optionally separated by whitespace.
    """
    pattern = re.compile(r"(\+\d{1,3})?\s*?(\d{3})\s*?(\d{3})\s*?(\d{3,4})")
    # groups(default="") mirrors findall, which yields "" for an absent prefix.
    return ["".join(match.groups(default="")) for match in pattern.finditer(text)]


In [29]:
# | test
# The sample document must yield its known phone number.
phones = detect_phone_numbers(content)
has_expected_phone = "+966503139675" in phones
test_eq(has_expected_phone, True)


In [30]:
# | export
def calculate_similarity(text1: str, text2: str) -> float:
    """Return the ``difflib.SequenceMatcher`` ratio of the two texts (no junk filter)."""
    import difflib

    matcher = difflib.SequenceMatcher(isjunk=None, a=text1, b=text2)
    return matcher.ratio()


In [31]:
# | export
def get_file_paths(pattern: str) -> list[str]:
    """Return the paths matched by a glob pattern, with ``**`` matched recursively."""
    from glob import glob as find_matches

    matched_paths = find_matches(pattern, recursive=True)
    return matched_paths


In [32]:
# | export
def get_file_name(file_path: str) -> str:
    """Return the final path component of `file_path` without its last suffix."""
    path = Path(file_path)
    return path.stem


In [33]:
# | export
def get_markdown_files(directory: str) -> list[str]:
    """Get all markdown filenames (without extension) from directory.

    Args:
        directory: Directory to list (not recursive).

    Returns:
        The name of every entry ending in ``.md`` with that suffix removed,
        in the order reported by ``os.listdir``.
    """
    import os

    suffix = ".md"
    # Slice the suffix off instead of str.replace, which also removed any
    # ".md" occurring earlier in the name (e.g. "a.md.notes.md").
    return [
        name[: -len(suffix)]
        for name in os.listdir(directory)
        if name.endswith(suffix)
    ]


In [34]:
# | export
def arabic_to_slug(text: str) -> str:
    """Transliterate Arabic text into a hyphen-separated, URL-friendly slug.

    The text is trimmed and lower-cased, each mapped character is replaced by
    its Latin counterpart (a space becomes ``-``), unmapped characters are
    kept as they are, runs of hyphens are collapsed to one, and leading or
    trailing hyphens are removed.
    """
    transliteration = str.maketrans(
        {
            "ا": "a",
            "ب": "b",
            "ت": "t",
            "ث": "th",
            "ج": "j",
            "ح": "h",
            "خ": "kh",
            "د": "d",
            "ذ": "th",
            "ر": "r",
            "ز": "z",
            "س": "s",
            "ش": "sh",
            "ص": "s",
            "ض": "d",
            "ط": "t",
            "ظ": "z",
            "ع": "",
            "غ": "gh",
            "ف": "f",
            "ق": "q",
            "ك": "k",
            "ل": "l",
            "م": "m",
            "ن": "n",
            "ه": "h",
            "و": "w",
            "ي": "y",
            "ة": "h",
            " ": "-",
        }
    )

    hyphenated = text.strip().lower().translate(transliteration)
    # Collapse any run of two or more hyphens into a single one.
    collapsed = re.sub(r"-{2,}", "-", hyphenated)
    return collapsed.strip("-")


In [35]:
# | export
def map_files_to_slugs(directory: str) -> dict[str, str]:
    """Build a ``{filename: slug}`` dict for the markdown files in `directory`.

    Filenames come from ``get_markdown_files`` and slugs from ``arabic_to_slug``.
    """
    slug_by_filename = {}
    for filename in get_markdown_files(directory):
        slug_by_filename[filename] = arabic_to_slug(filename)
    return slug_by_filename


In [36]:
#| export
def get_page_content(file_path: str, is_quarto: bool = False) -> str:
    """Read a file and return its text content.

    Args:
        file_path: Path of the file to read; it is decoded as UTF-8.
        is_quarto: Forwarded to ``extract_notebook_content`` for ``.ipynb`` files.

    Returns:
        For paths ending in ``.ipynb``, the result of
        ``extract_notebook_content``; for any other file, the text with its
        frontmatter removed by ``remove_metadata``.
    """
    # Explicit UTF-8 keeps the result independent of the platform's default
    # text encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        raw = f.read()
    # str() lets path-like objects be passed as well as plain strings.
    if str(file_path).endswith(".ipynb"):
        return extract_notebook_content(raw, is_quarto=is_quarto)
    return remove_metadata(raw)
