In [None]:
# | default_exp content_parser


In [None]:
# | export
import re
import yaml
from pathlib import Path
from typing import Dict, List, Tuple
from urllib.parse import urlparse
from datetime import datetime

In [None]:
# | test
from fastcore.test import test_eq


In [None]:
# | export
def parse_metadata(content: str) -> Dict:
    """Extract metadata from the YAML frontmatter at the top of ``content``.

    Frontmatter is the text between the first two ``---`` delimiters, and it
    must be the first non-blank thing in the document.

    Args:
        content: Full document text.

    Returns:
        The parsed frontmatter mapping. An empty dict is returned when the
        document has no complete leading frontmatter block, or when the block
        is empty or does not parse to a mapping.

    Raises:
        yaml.YAMLError: If the frontmatter block is not valid YAML.
    """
    # Split at most twice: text before the opening delimiter, the YAML, the body.
    parts = content.split("---", 2)
    if len(parts) < 3:
        # No opening/closing delimiter pair, so there is nothing to parse.
        return {}
    # Anything other than blanks (or a BOM) before the first delimiter means
    # the "---" is a horizontal rule in the body, not frontmatter.
    if parts[0].lstrip("\ufeff").strip():
        return {}
    metadata = yaml.safe_load(parts[1])
    # safe_load returns None for an empty block and scalars/lists for odd input.
    return metadata if isinstance(metadata, dict) else {}


In [None]:
# | export
def parse_notebook_metadata(content: str) -> Dict:
    """Read YAML frontmatter from the first cell of a serialized notebook.

    ``content`` is the notebook's JSON text. Metadata is only looked for in
    the very first cell, and only when that cell is markdown whose joined
    source begins with ``---``; in every other case an empty dict is returned.
    """
    import json

    cells = json.loads(content).get("cells")
    if not cells:
        return {}

    leading_cell = cells[0]
    if leading_cell.get("cell_type") != "markdown":
        return {}

    # The cell source is joined into a single string before inspection.
    text = "".join(leading_cell.get("source", []))
    return parse_metadata(text) if text.startswith("---") else {}


NameError: name 'Dict' is not defined

In [None]:
# | test
# Parse the sample post's frontmatter once and verify the fields it declares.
# `content` and `metadata` are reused by later cells in this notebook.
with open("../sample/example.md", "r", encoding="utf-8") as f:
    content = f.read()

metadata = parse_metadata(content)
test_eq(metadata["title"], "Kareem Elkhateb SEO Trend Example")
test_eq(str(metadata["publishDate"]), "2024-01-27")
test_eq(metadata["tags"], ["Astrojs", "Rust", "C++", "C#", "Camel_Space", "Horse Case"])


In [None]:
# | export
def remove_metadata(content: str) -> str:
    """Remove a leading frontmatter block from ``content``.

    Args:
        content: Full document text.

    Returns:
        The text after the closing ``---`` delimiter with surrounding
        whitespace stripped. ``content`` is returned unchanged when it does
        not begin with a frontmatter block or the block is never closed.
    """
    candidate = content.lstrip()
    # Only treat "---" as frontmatter when it opens the document; otherwise a
    # horizontal rule further down would cause the text above it to be dropped.
    if not candidate.startswith("---"):
        return content
    # Start searching after the opening delimiter to find the closing one.
    end = candidate.find("---", 3)
    return candidate[end + 3 :].strip() if end != -1 else content


In [None]:
# | test
# Strip the frontmatter from the sample text and display what is left.
# NOTE: this rebinds `content`; the cells further down that read `content`
# therefore operate on the stripped text.
content = remove_metadata(content)
content


'# This is me Kareem\n\n# This is Kareem Also\n\nMy name is kareem and i am going to help all you!\n\n## How do you know me!\n\nI know you by just saying you are just a shity person!\n\n## oh no! iron man!\n\n![Iron man photo](~/assets/images/28.png)\nThis is a fancy photo of Iron man!!\n\nIf you want to call IronMan you can find him in: +01013646887 and **+966503139675** there is also 01005134688 .\nAre you series!\nThe Hulk is here!\n\n## References\n\n[main_website](https://emdadelgaz.com)\n[main_website_again](https://emdadelgaz.com)\n[about_website](https://emdadelgaz/about.com)\n[contact_page](http://emdadelgaz/contact.net)\n[awazly_website](https://awazly.com/)\n\n### Books\n\n1. Clean code\n2. Data Integartions\n3. Batman\n\n#### nbdev is super cool!\n\n##### Test Deriven Developement is a life changing!\n\n###### I am an Love with best girl in the whole world!'

In [None]:
# | export
def extract_headers(file_path: str) -> List[Dict]:
    """Extract all ATX-style (``#`` to ``######``) headers from a file.

    Args:
        file_path: Path of the text file to scan; it is read as UTF-8.

    Returns:
        One dict per header, in file order, with the keys ``type``
        (``"h1"`` .. ``"h6"``), ``line_number`` (1-based), ``content``
        (the heading text) and ``length`` (number of characters in
        ``content``).
    """
    headings = []
    # Explicit encoding so non-ASCII headings do not depend on the platform default.
    with open(file_path, "r", encoding="utf-8") as file:
        for line_number, line in enumerate(file, start=1):
            # One to six hashes followed by a space mark a heading.
            match = re.match(r"(#{1,6}) (.*)", line.strip())
            if match is None:
                continue
            level = len(match.group(1))
            # Drop an optional closing run of hashes ("## Title ##") but keep
            # hashes that are part of the text itself ("## Learn C#").
            content = re.sub(r"(?:^|\s+)#+$", "", match.group(2)).strip()
            headings.append(
                {
                    "type": f"h{level}",
                    "line_number": line_number,
                    "content": content,
                    "length": len(content),
                }
            )
    return headings


In [None]:
# | test
# The sample post is expected to yield exactly two h1 entries, and the first
# extracted header should read "This is me Kareem".
headers = extract_headers("../sample/example.md")
test_eq(len([h for h in headers if h["type"] == "h1"]), 2)
test_eq(headers[0]["content"], "This is me Kareem")


In [None]:
# | export


def check_title_length(title: str, min_length: int = 50, max_length: int = 60) -> Dict:
    """Report a title's length and whether it falls in the target range.

    Args:
        title: Title text to measure.
        min_length: Smallest length (inclusive) counted as optimal.
        max_length: Largest length (inclusive) counted as optimal.

    Returns:
        A dict with ``length`` (character count) and ``optimal_lenth``
        (True when ``min_length <= length <= max_length``). The key spelling
        is kept as-is because it is part of the returned data.
    """
    length = len(title)
    return {"length": length, "optimal_lenth": min_length <= length <= max_length}


In [None]:
# | export


def check_desc_length(
    description: str, min_length: int = 150, max_length: int = 160
) -> Dict:
    """Report a description's length and whether it falls in the target range.

    Args:
        description: Description text to measure.
        min_length: Smallest length (inclusive) counted as optimal.
        max_length: Largest length (inclusive) counted as optimal.

    Returns:
        A dict with ``length`` (character count) and ``optimal_lenth``
        (True when ``min_length <= length <= max_length``). The key spelling
        is kept as-is because it is part of the returned data.
    """
    length = len(description)
    return {"length": length, "optimal_lenth": min_length <= length <= max_length}


In [None]:
# | export
def check_content_length(content: str, min_words: int = 300) -> Dict:
    """Count whitespace-separated words in ``content``.

    Args:
        content: Text to measure.
        min_words: Minimum word count regarded as sufficient.

    Returns:
        A dict with ``word_count`` and ``is_sufficient`` (True when the
        count is at least ``min_words``).
    """
    words = len(content.split())
    return {"word_count": words, "is_sufficient": words >= min_words}


In [None]:
# | hide
from pprint import pprint

# Print the length reports for the sample post's title, excerpt and body.
# NOTE(review): "excertp" looks like a misspelling of "excerpt" — confirm it
# matches the key actually used in the sample file's frontmatter before renaming.
pprint(check_title_length(metadata["title"]))
pprint(check_desc_length(metadata["excertp"]))
pprint(check_content_length(content))


{'length': 33, 'optimal_lenth': False}
{'length': 146, 'optimal_lenth': False}
{'is_sufficient': False, 'word_count': 124}
{}


In [None]:
# | export
def extract_links(content: str) -> Dict[str, Dict]:
    """Collect every ``[title](url)`` occurrence, grouped by URL.

    The text is scanned line by line (split on ``"\\n"``). The result maps
    each URL to a dict holding two parallel lists: ``titles`` (the bracket
    text of each occurrence) and ``lines`` (the 1-based line number of each
    occurrence). The ``[alt](url)`` part of image syntax matches the same
    pattern, so image targets are included as well.
    """
    link_pattern = re.compile(r"\[(.*?)\]\((.*?)\)")
    found: Dict[str, Dict] = {}
    for number, text in enumerate(content.split("\n"), start=1):
        for title, url in link_pattern.findall(text):
            # First sighting of a URL creates its record; later ones append to it.
            record = found.setdefault(url, {"titles": [], "lines": []})
            record["titles"].append(title)
            record["lines"].append(number)
    return found


In [None]:
# | test
# Both of these URLs appear as markdown links in the stripped sample body,
# so they must be present as keys of the extracted mapping.
links = extract_links(content)
test_eq("https://emdadelgaz.com" in links, True)
test_eq("https://awazly.com/" in links, True)


In [None]:
# | export
def extract_images(content: str) -> List[Dict]:
    """Find every ``![alt](url)`` image reference in ``content``.

    Returns one dict per occurrence, in document order, with the keys
    ``alt_text`` and ``url``.
    """
    images = []
    for found in re.finditer(r"\!\[(.*?)\]\((.*?)\)", content):
        alt, url = found.groups()
        images.append({"alt_text": alt, "url": url})
    return images


In [None]:
# | export
def imgs_missing_alts(images: List[Dict]) -> List[str]:
    """List the ``url`` of each image whose ``alt_text`` is absent or empty."""
    missing = []
    for image in images:
        if image.get("alt_text"):
            continue
        missing.append(image["url"])
    return missing


In [None]:
# | test
# The stripped sample body is expected to contain exactly one image, whose
# alt text is "Iron man photo".
images = extract_images(content)
test_eq(len(images), 1)
test_eq(images[0]["alt_text"], "Iron man photo")


[]


In [None]:
# | export
def filter_internal_links(urls: List[str], domain: str) -> List[str]:
    """Filter for internal links (excludes images and in-page anchors).

    Args:
        urls: Link targets to classify.
        domain: Host name of the site, compared case-insensitively against
            each absolute URL's network location.

    Returns:
        The URLs, in input order, that are either relative paths or
        http(s)/protocol-relative URLs pointing at ``domain``. Image files,
        ``#`` anchors and other schemes (``mailto:``, ``tel:`` ...) are
        left out.
    """
    image_exts = (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".webp")
    site = domain.lower()
    internal = []

    for url in urls:
        # Skip images
        if url.lower().endswith(image_exts):
            continue
        # Skip anchors
        if url.startswith("#"):
            continue
        parsed = urlparse(url)
        if parsed.scheme in ("http", "https") or url.startswith("//"):
            # Absolute or protocol-relative: internal only on the same host.
            if parsed.netloc.lower() == site:
                internal.append(url)
        elif not parsed.scheme:
            # No scheme at all means a relative path, which is internal.
            internal.append(url)
        # Any other scheme (mailto:, tel:, ...) is not a page on this site.

    return internal


NameError: name 'List' is not defined

In [None]:
# | export
def filter_external_links(urls: List[str], domain: str) -> List[str]:
    """Filter for external links only.

    Args:
        urls: Link targets to classify.
        domain: Host name of the site, compared case-insensitively against
            each URL's network location.

    Returns:
        The http(s) URLs, in input order, whose host differs from
        ``domain``. Images, in-page ``#`` anchors, relative paths and
        non-http schemes are never reported as external.
    """
    image_exts = (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".webp")
    site = domain.lower()
    external = []

    for url in urls:
        # Exclude images
        if url.lower().endswith(image_exts):
            continue
        parsed = urlparse(url)
        # Requiring an http(s) scheme keeps anchors and relative paths out,
        # which a plain "not internal" test would wrongly let through.
        if parsed.scheme in ("http", "https") and parsed.netloc.lower() != site:
            external.append(url)

    return external


In [None]:
# | export
def normalize_text(text: str) -> str:
    """Collapse each run of whitespace to one space and trim both ends."""
    whitespace_run = re.compile(r"\s+")
    collapsed = whitespace_run.sub(" ", text)
    return collapsed.strip()


In [None]:
# | export
def detect_phone_numbers(text: str) -> List[str]:
    """Find phone-number-like digit groups in ``text``.

    Each match consists of an optional ``+`` country prefix followed by three
    digit groups that may be separated by whitespace; the groups are glued
    together without separators in the returned strings.
    """
    pattern = r"(\+\d{1,3})?\s*?(\d{3})\s*?(\d{3})\s*?(\d{3,4})"
    numbers = []
    for found in re.finditer(pattern, text):
        # An absent country prefix contributes an empty string.
        numbers.append("".join(found.groups(default="")))
    return numbers


In [None]:
# | test
# The stripped sample body mentions several numbers; check that the one
# written as +966503139675 is among the detected values.
phones = detect_phone_numbers(content)
test_eq("+966503139675" in phones, True)


In [None]:
# | export
def calculate_similarity(text1: str, text2: str) -> float:
    """Return the ``difflib.SequenceMatcher`` ratio of the two texts."""
    import difflib

    # No junk filter is supplied (isjunk=None).
    matcher = difflib.SequenceMatcher(None, text1, text2)
    return matcher.ratio()


In [None]:
# | export
def get_file_paths(pattern: str) -> List[str]:
    """Expand a glob ``pattern`` into the list of matching paths.

    Matching is done with ``recursive=True``, so ``**`` in the pattern spans
    any number of directory levels.
    """
    from glob import glob as expand

    matches = expand(pattern, recursive=True)
    return matches


In [None]:
# | export
def get_file_name(file_path: str) -> str:
    """Return the final path component of ``file_path`` minus its last suffix."""
    path = Path(file_path)
    return path.stem


In [None]:
# | export
def get_markdown_files(directory: str) -> List[str]:
    """Get all markdown filenames (without extension) from directory.

    Args:
        directory: Directory to list; it is not searched recursively.

    Returns:
        The name of every entry ending in ``.md`` with that trailing
        extension removed, in the order ``os.listdir`` reports them.
    """
    import os

    suffix = ".md"
    # Slice off only the trailing extension: str.replace would also delete
    # ".md" occurring elsewhere in the name (e.g. "notes.md.draft.md").
    # Entries such as ".obsidian" never end in ".md", so no extra check is needed.
    return [
        name[: -len(suffix)]
        for name in os.listdir(directory)
        if name.endswith(suffix)
    ]


In [None]:
# | export
def arabic_to_slug(text: str) -> str:
    """Transliterate Arabic text into a URL-friendly slug.

    The input is trimmed and lower-cased, each mapped Arabic letter is
    replaced by its Latin stand-in, spaces become hyphens, and any other
    character is kept as it is. Runs of hyphens are then reduced to one and
    hyphens at either end are removed.
    """
    replacements = dict(
        [
            ("ا", "a"),
            ("ب", "b"),
            ("ت", "t"),
            ("ث", "th"),
            ("ج", "j"),
            ("ح", "h"),
            ("خ", "kh"),
            ("د", "d"),
            ("ذ", "th"),
            ("ر", "r"),
            ("ز", "z"),
            ("س", "s"),
            ("ش", "sh"),
            ("ص", "s"),
            ("ض", "d"),
            ("ط", "t"),
            ("ظ", "z"),
            ("ع", ""),
            ("غ", "gh"),
            ("ف", "f"),
            ("ق", "q"),
            ("ك", "k"),
            ("ل", "l"),
            ("م", "m"),
            ("ن", "n"),
            ("ه", "h"),
            ("و", "w"),
            ("ي", "y"),
            ("ة", "h"),
            (" ", "-"),
        ]
    )

    pieces = []
    for char in text.strip().lower():
        # Characters without a mapping pass through unchanged.
        pieces.append(replacements.get(char, char))
    transliterated = "".join(pieces)

    # Splitting on "-" and dropping the empty parts collapses repeated
    # hyphens and trims leading/trailing ones in a single step.
    return "-".join(part for part in transliterated.split("-") if part)


In [None]:
# | export
def map_files_to_slugs(directory: str) -> Dict[str, str]:
    """Build a filename-to-slug mapping for the markdown files in ``directory``.

    Keys are the names returned by ``get_markdown_files``; each value is
    that name passed through ``arabic_to_slug``.
    """
    slugs: Dict[str, str] = {}
    for name in get_markdown_files(directory):
        slugs[name] = arabic_to_slug(name)
    return slugs
