In [None]:
# | default_exp content_parser


In [1]:
# | export
import re
import yaml
from pathlib import Path
from typing import Dict, List, Tuple
from urllib.parse import urlparse
from datetime import datetime

In [2]:
# | test
from fastcore.test import test_eq


In [3]:
# | export
def parse_metadata(content: str) -> Dict:
    """Extract metadata from content frontmatter.

    Frontmatter is the YAML block delimited by ``---`` at the very start of
    the content.

    Args:
        content: Full text of a document.

    Returns:
        The parsed frontmatter, or an empty dict when the content has no
        leading frontmatter block or the block is empty.
    """
    # Tolerate a UTF-8 BOM and blank lines before the opening delimiter.
    text = content.lstrip("\ufeff").lstrip()
    if not text.startswith("---"):
        # A "---" further down is a horizontal rule or table row, not
        # frontmatter, so there is nothing to parse.
        return {}
    # parts[0] is the empty string before the opening delimiter and parts[1]
    # is the YAML body (up to the closing delimiter, or to the end if missing).
    parts = text.split("---", 2)
    # safe_load returns None for an empty document; normalise that to {}.
    return yaml.safe_load(parts[1]) or {}


In [4]:
# | export
def parse_notebook_metadata(content: str) -> Dict:
    """Read YAML frontmatter from the first cell of a Jupyter notebook.

    Only the very first cell is inspected, and only when it is a markdown
    cell whose text begins with ``---``. Any other notebook yields ``{}``.
    """
    import json

    cells = json.loads(content).get("cells")
    if not cells:
        return {}

    leading_cell = cells[0]
    if leading_cell.get("cell_type") != "markdown":
        return {}

    # Cell source is stored as a list of line fragments; stitch it back together.
    text = "".join(leading_cell.get("source", []))
    return parse_metadata(text) if text.startswith("---") else {}


In [48]:
# | test
# Test parse_metadata on the sample markdown file, then parse_notebook_metadata
# on the sample notebook.
# NOTE(review): `meta` is imported here but not referenced in this cell.
from nbdev.qmd import meta
with open("../sample/example.md", "r") as file:
    content = file.read()

metadata = parse_metadata(content)

# Alternative input: markdown content (currently disabled).
# with open("../sample/example.md", "r") as f:
#     content = f.read()

# Active input: notebook (.ipynb) content.
# NOTE(review): this overwrites the `content` and `metadata` values produced
# from the markdown sample above, so later cells see the notebook values.
with open("../sample/design_questions.ipynb", "r") as f:
    content = f.read()
metadata = parse_notebook_metadata(content)
print(metadata)
# Disabled assertions for the markdown sample's frontmatter:
# metadata = parse_metadata(content)
# test_eq(metadata["title"], "Kareem Elkhateb SEO Trend Example")
# test_eq(str(metadata["publishDate"]), "2024-01-27")
# test_eq(metadata["tags"], ["Astrojs", "Rust", "C++", "C#", "Camel_Space", "Horse Case"])


{}


In [49]:

# `json` is only imported inside parse_notebook_metadata, so it must be
# imported here for this cell to run on a fresh kernel.
import json

# Decode the raw .ipynb text held in `content` into a dict for inspection.
notebook = json.loads(content)
notebook 


{'cells': [{'cell_type': 'markdown',
   'metadata': {},
   'source': ['## Design Basic SEO \n',
    '\n',
    '### Design Ideas\n',
    '\n',
    '1. Collect all URls and then apply the separeted external and internal \n',
    '\n',
    '2. How to deal with different types of urls \n',
    '   1. images\n',
    '   2. Videos\n',
    '   3. Markdown internal photo\n']},
  {'cell_type': 'markdown',
   'metadata': {},
   'source': ['\n',
    '### Design Questions\n',
    '\n',
    '1. How to read files and work with them efficiently to handle issues of \n',
    '   1. Get out of memory \n',
    '   2. Slow execution \n',
    '\n',
    '2. Should I remove the frontmatter while processing the data and make in a different API ! \n',
    '\n',
    '3. Should i design the API to Work the Md content or read the file content with file path! ']},
  {'cell_type': 'markdown',
   'metadata': {},
   'source': ['## Design Keywords\n',
    '\n',
    '### Design Concepts\n',
    '\n',
    '1. Should I S

In [68]:

def extract_notebook_content(content: str, is_quarto: bool = False) -> str:
    """Extract the readable text of a Jupyter notebook.

    Args:
        content: Raw ``.ipynb`` file text (JSON).
        is_quarto: When True, code cells are included as well, except those
            carrying a ``#| echo: false`` or ``#| include: false`` option.

    Returns:
        The text of the selected cells joined by blank lines. A leading
        markdown cell that holds YAML frontmatter (starts with ``---``) is
        left out. Returns ``""`` when the notebook has no cells.
    """
    import json

    hidden_markers = ("#| echo: false", "#| include: false")
    # Parse the argument itself rather than relying on a notebook-level global.
    cells = json.loads(content).get("cells") or []

    parts = []
    for index, cell in enumerate(cells):
        cell_type = cell.get("cell_type")
        # Source is a list of line fragments (or a single string); join works for both.
        text = "".join(cell.get("source", []))

        if cell_type == "markdown":
            # Only the very first cell can hold frontmatter.
            if index == 0 and text.startswith("---"):
                continue
            parts.append(text)
        elif cell_type == "code" and is_quarto:
            # Skip code the rendered document would not show.
            if any(marker in text for marker in hidden_markers):
                continue
            parts.append(text)

    # A blank line between cells keeps the last line of one cell from being
    # glued onto the first line (often a heading) of the next.
    return "\n\n".join(parts)


In [6]:
# | export
def remove_metadata(content: str) -> str:
    """Remove frontmatter from content.

    Args:
        content: Full text of a document.

    Returns:
        The text after the closing ``---`` of a leading frontmatter block,
        stripped of surrounding whitespace. The content is returned unchanged
        when it does not start with a frontmatter block or the block is never
        closed.
    """
    # Tolerate a UTF-8 BOM and blank lines before the opening delimiter.
    text = content.lstrip("\ufeff").lstrip()
    if not text.startswith("---"):
        # Without this guard a horizontal rule or table separator later in the
        # body would be mistaken for the closing delimiter and cut the document.
        return content
    # Search past the three characters of the opening delimiter.
    end = text.find("---", 3)
    return text[end + 3 :].strip() if end != -1 else content


In [36]:
# | test
# Test remove_metadata.
# NOTE(review): this rebinds `content` to its own stripped value, so re-running
# the cell feeds already-stripped text back into remove_metadata.
content = remove_metadata(content)
content


'{\n "cells": [\n  {\n   "cell_type": "markdown",\n   "metadata": {},\n   "source": [\n    "## Design Basic SEO \\n",\n    "\\n",\n    "### Design Ideas\\n",\n    "\\n",\n    "1. Collect all URls and then apply the separeted external and internal \\n",\n    "\\n",\n    "2. How to deal with different types of urls \\n",\n    "   1. images\\n",\n    "   2. Videos\\n",\n    "   3. Markdown internal photo\\n",\n    "### Design Questions\\n",\n    "\\n",\n    "1. How to read files and work with them efficiently to handle issues of \\n",\n    "   1. Get out of memory \\n",\n    "   2. Slow execution \\n",\n    "\\n",\n    "2. Should I remove the frontmatter while processing the data and make in a different API ! \\n",\n    "\\n",\n    "3. Should i design the API to Work the Md content or read the file content with file path! "\n   ]\n  },\n  {\n   "cell_type": "markdown",\n   "metadata": {},\n   "source": [\n    "## Design Keywords\\n",\n    "\\n",\n    "### Design Concepts\\n",\n    "\\n",\

In [41]:
# Display the current value of `content`.
content

'{\n "cells": [\n  {\n   "cell_type": "markdown",\n   "metadata": {},\n   "source": [\n    "## Design Basic SEO \\n",\n    "\\n",\n    "### Design Ideas\\n",\n    "\\n",\n    "1. Collect all URls and then apply the separeted external and internal \\n",\n    "\\n",\n    "2. How to deal with different types of urls \\n",\n    "   1. images\\n",\n    "   2. Videos\\n",\n    "   3. Markdown internal photo\\n",\n    "### Design Questions\\n",\n    "\\n",\n    "1. How to read files and work with them efficiently to handle issues of \\n",\n    "   1. Get out of memory \\n",\n    "   2. Slow execution \\n",\n    "\\n",\n    "2. Should I remove the frontmatter while processing the data and make in a different API ! \\n",\n    "\\n",\n    "3. Should i design the API to Work the Md content or read the file content with file path! "\n   ]\n  },\n  {\n   "cell_type": "markdown",\n   "metadata": {},\n   "source": [\n    "## Design Keywords\\n",\n    "\\n",\n    "### Design Concepts\\n",\n    "\\n",\

In [8]:
# | export
def extract_headers(file_path: str) -> List[Dict]:
    """Extract all markdown headers with metadata.

    Args:
        file_path: Path of the markdown file to scan.

    Returns:
        One dict per heading, in file order, with keys ``type`` ("h1".."h6"),
        ``line_number`` (1-based), ``content`` (heading text) and ``length``
        (character count of ``content``).
    """
    # One to six hashes followed by whitespace, then the heading text.
    heading_pattern = re.compile(r"(#{1,6})[ \t]+(.*)")
    headings = []
    open_fence = None  # fence marker of the code block currently open, if any

    # utf-8-sig reads UTF-8 with or without a BOM, independent of the platform locale.
    with open(file_path, "r", encoding="utf-8-sig") as file:
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.strip()

            # Track fenced code blocks so "# comment" lines inside them are
            # not reported as headings.
            marker = line[:3]
            if marker in ("```", "~~~"):
                if open_fence is None:
                    open_fence = marker
                elif marker == open_fence:
                    open_fence = None
                continue
            if open_fence is not None:
                continue

            match = heading_pattern.match(line)
            if match is None:
                continue

            hashes, text = match.groups()
            # Drop an optional closing "#" run only when it is set off by
            # whitespace, so text such as "C#" keeps its trailing hash.
            content = re.sub(r"(?:^|[ \t]+)#+$", "", text).strip()
            headings.append(
                {
                    "type": f"h{len(hashes)}",
                    "line_number": line_number,
                    "content": content,
                    "length": len(content),
                }
            )
    return headings


In [9]:
# | test
# Headers parsed from the sample markdown file: expect two h1 entries and a
# known text for the first heading.
headers = extract_headers("../sample/example.md")
test_eq(len([h for h in headers if h["type"] == "h1"]), 2)
test_eq(headers[0]["content"], "This is me Kareem")


In [10]:
# | export


def check_title_length(title: str, min_length: int = 50, max_length: int = 60) -> Dict:
    """Measure a title and report whether its length is in the target range.

    Args:
        title: Title text to measure.
        min_length: Smallest length (inclusive) considered optimal.
        max_length: Largest length (inclusive) considered optimal.

    Returns:
        A dict with ``length`` (character count) and ``optimal_lenth``
        (True when ``min_length <= length <= max_length``). The misspelled
        key is kept as-is because callers read it by that name.
    """
    length = len(title)
    return {"length": length, "optimal_lenth": min_length <= length <= max_length}


In [11]:
# | export


def check_desc_length(description: str, min_length: int = 150, max_length: int = 160) -> Dict:
    """Measure a description and report whether its length is in the target range.

    Args:
        description: Description text to measure.
        min_length: Smallest length (inclusive) considered optimal.
        max_length: Largest length (inclusive) considered optimal.

    Returns:
        A dict with ``length`` (character count) and ``optimal_lenth``
        (True when ``min_length <= length <= max_length``). The misspelled
        key is kept as-is because callers read it by that name.
    """
    length = len(description)
    return {"length": length, "optimal_lenth": min_length <= length <= max_length}


In [12]:
# | export
def check_content_length(content: str, min_words: int = 300) -> Dict:
    """Count words in content and compare against a minimum.

    Args:
        content: Text to measure; words are whitespace-separated tokens.
        min_words: Word count (inclusive) at which the content is
            considered sufficient.

    Returns:
        A dict with ``word_count`` and ``is_sufficient``
        (True when ``word_count >= min_words``).
    """
    words = len(content.split())
    return {"word_count": words, "is_sufficient": words >= min_words}


In [13]:
# | hide
from pprint import pprint

# Run the three length checks against the `metadata` and `content` values
# left behind by earlier cells.
pprint(check_title_length(metadata["title"]))
# NOTE(review): "excertp" looks like a misspelling of "excerpt" — confirm it
# matches the key actually used in the sample frontmatter.
pprint(check_desc_length(metadata["excertp"]))
pprint(check_content_length(content))


{'length': 33, 'optimal_lenth': False}
{'length': 146, 'optimal_lenth': False}
{'is_sufficient': False, 'word_count': 124}


In [14]:
# | export
def extract_links(content: str) -> Dict[str, Dict]:
    """Collect every ``[text](url)`` occurrence, grouped by URL.

    Each URL maps to the list of link texts it appeared with and the
    1-based line numbers of those appearances, in document order.
    """
    link_pattern = re.compile(r"\[(.*?)\]\((.*?)\)")
    found: Dict[str, Dict] = {}
    for index, text in enumerate(content.split("\n")):
        for hit in link_pattern.finditer(text):
            entry = found.setdefault(hit.group(2), {"titles": [], "lines": []})
            entry["titles"].append(hit.group(1))
            entry["lines"].append(index + 1)
    return found


In [15]:
# | test
# Both sample URLs should appear as keys of the extracted link map.
links = extract_links(content)
test_eq("https://emdadelgaz.com" in links, True)
test_eq("https://awazly.com/" in links, True)


In [16]:
# | export
def extract_images(content: str) -> List[Dict]:
    """Find every ``![alt](url)`` image reference in the content.

    Returns one dict per image, in document order, with the keys
    ``alt_text`` and ``url``.
    """
    images = []
    for hit in re.finditer(r"\!\[(.*?)\]\((.*?)\)", content):
        images.append({"alt_text": hit.group(1), "url": hit.group(2)})
    return images


In [17]:
# | export
def imgs_missing_alts(images: List[Dict]) -> List[str]:
    """Return URLs of images missing alt text.

    Args:
        images: Image dicts with a ``url`` key and an optional ``alt_text`` key.

    Returns:
        The ``url`` of every image whose alt text is absent, empty, or
        made up only of whitespace, in input order.
    """
    return [
        img["url"]
        for img in images
        # Whitespace-only alt text carries no description, so treat it as missing.
        if not str(img.get("alt_text") or "").strip()
    ]


In [18]:
# | test
# The sample content is expected to hold exactly one image with a known alt text.
images = extract_images(content)
test_eq(len(images), 1)
test_eq(images[0]["alt_text"], "Iron man photo")


In [19]:
# | export
def filter_internal_links(urls: List[str], domain: str) -> List[str]:
    """Filter for internal links (excludes images).

    Args:
        urls: Link targets as found in the content.
        domain: Host name of the site, e.g. ``"example.com"``.

    Returns:
        The URLs, in input order, that are either relative paths or
        http(s)/protocol-relative URLs whose host equals ``domain``
        (compared case-insensitively). Empty targets, ``#`` anchors, image
        files and other schemes such as ``mailto:`` or ``tel:`` are left out.
    """
    image_exts = (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".webp")
    site_host = domain.lower()
    internal = []

    for url in urls:
        # Skip empty targets and same-page anchors
        if not url or url.startswith("#"):
            continue
        try:
            parsed = urlparse(url)
        except ValueError:
            # Malformed URL (e.g. unbalanced IPv6 brackets): cannot be classified.
            continue
        # Skip images; test the path so a query string does not hide the extension
        if parsed.path.lower().endswith(image_exts):
            continue
        if parsed.netloc:
            # Absolute or protocol-relative URL: internal only on the same host
            if parsed.scheme in ("", "http", "https") and parsed.netloc.lower() == site_host:
                internal.append(url)
        elif not parsed.scheme:
            # No host and no scheme: a relative path, which is internal
            internal.append(url)
        # Anything else (mailto:, tel:, ...) is not a page on this site.

    return internal


In [20]:
# | export
def filter_external_links(urls: List[str], domain: str) -> List[str]:
    """Filter for external links only.

    Args:
        urls: Link targets as found in the content.
        domain: Host name of the site, e.g. ``"example.com"``.

    Returns:
        The URLs, in input order, that are http(s) or protocol-relative
        links to a host other than ``domain`` (compared case-insensitively).
        Empty targets, ``#`` anchors, relative paths, image files and other
        schemes such as ``mailto:`` are left out.
    """
    image_exts = (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".webp")
    site_host = domain.lower()
    external = []

    for url in urls:
        # Same-page anchors and empty targets point nowhere external.
        if not url or url.startswith("#"):
            continue
        try:
            parsed = urlparse(url)
        except ValueError:
            # Malformed URL (e.g. unbalanced IPv6 brackets): cannot be classified.
            continue
        # Exclude images; test the path so a query string does not hide the extension
        if parsed.path.lower().endswith(image_exts):
            continue
        # Only web links count; an empty scheme with a host is protocol-relative.
        if parsed.scheme not in ("", "http", "https"):
            continue
        # Exclude internal: relative paths have no host, same-site links share it
        if parsed.netloc and parsed.netloc.lower() != site_host:
            external.append(url)

    return external


In [21]:
# | export
def normalize_text(text: str) -> str:
    """Collapse every run of whitespace to one space and trim both ends."""
    # split() with no argument drops leading/trailing whitespace and treats
    # any whitespace run as a single separator.
    tokens = text.split()
    return " ".join(tokens)


In [22]:
# | export
def detect_phone_numbers(text: str) -> List[str]:
    """Extract phone numbers from text.

    A number is an optional ``+`` country code (1-3 digits) followed by
    digit groups of 3, 3 and 3-4, optionally separated by whitespace.

    Args:
        text: Text to scan.

    Returns:
        Each match with its separators removed, in order of appearance.
    """
    phone_regex = re.compile(
        # The lookbehind/lookahead pin the match to a whole digit run, so a
        # slice of a longer number (ID, timestamp) is not reported as a phone.
        r"(?<![\d+])(\+\d{1,3})?\s*?(\d{3})\s*?(\d{3})\s*?(\d{3,4})(?!\d)"
    )
    groups = phone_regex.findall(text)
    return ["".join(g) for g in groups]


In [23]:
# | test
# The sample content is expected to contain this phone number.
phones = detect_phone_numbers(content)
test_eq("+966503139675" in phones, True)


In [24]:
# | export
def calculate_similarity(text1: str, text2: str) -> float:
    """Return the difflib similarity ratio of two texts (0.0 to 1.0)."""
    import difflib

    matcher = difflib.SequenceMatcher(isjunk=None, a=text1, b=text2)
    return matcher.ratio()


In [25]:
# | export
def get_file_paths(pattern: str) -> List[str]:
    """Get file paths matching pattern.

    Args:
        pattern: Glob pattern; ``**`` matches any depth of directories.

    Returns:
        The matching paths in sorted order.
    """
    import glob

    # glob does not promise any particular order; sort so repeated runs and
    # different machines produce the same list.
    return sorted(glob.glob(pattern, recursive=True))


In [26]:
# | export
def get_file_name(file_path: str) -> str:
    """Return the final path component with its last suffix removed."""
    target = Path(file_path)
    return target.stem


In [27]:
# | export
def get_markdown_files(directory: str) -> List[str]:
    """Get all markdown filenames (without extension) from directory.

    Args:
        directory: Directory to list (not searched recursively).

    Returns:
        Sorted names of the entries ending in ``.md``, with that trailing
        extension removed.
    """
    import os

    suffix = ".md"
    # Cut only the trailing extension; str.replace would also delete ".md"
    # occurring earlier in the name (e.g. "guide.md.old.md").
    # Entries such as ".obsidian" are excluded by the suffix test itself.
    # os.listdir order is arbitrary, so sort for a stable result.
    return sorted(
        name[: -len(suffix)]
        for name in os.listdir(directory)
        if name.endswith(suffix)
    )


In [28]:
# | export
def arabic_to_slug(text: str) -> str:
    """Transliterate Arabic text into a hyphen-separated URL slug.

    The input is trimmed and lower-cased, each mapped Arabic letter is
    replaced by its Latin spelling, spaces become hyphens, and unmapped
    characters pass through unchanged. Hyphen runs are collapsed and
    leading/trailing hyphens removed.
    """
    transliteration = str.maketrans(
        {
            "ا": "a", "ب": "b", "ت": "t", "ث": "th", "ج": "j", "ح": "h", "خ": "kh",
            "د": "d", "ذ": "th", "ر": "r", "ز": "z", "س": "s", "ش": "sh", "ص": "s",
            "ض": "d", "ط": "t", "ظ": "z", "ع": "", "غ": "gh", "ف": "f", "ق": "q",
            "ك": "k", "ل": "l", "م": "m", "ن": "n", "ه": "h", "و": "w", "ي": "y",
            "ة": "h", " ": "-",
        }
    )

    raw_slug = text.strip().lower().translate(transliteration)
    # Splitting on "-" and dropping the empty pieces both collapses hyphen
    # runs and removes hyphens at either end.
    pieces = [piece for piece in raw_slug.split("-") if piece]
    return "-".join(pieces)


In [29]:
# | export
def map_files_to_slugs(directory: str) -> Dict[str, str]:
    """Build a filename -> slug lookup for the markdown files in a directory."""
    slugs: Dict[str, str] = {}
    for name in get_markdown_files(directory):
        slugs[name] = arabic_to_slug(name)
    return slugs
