In [None]:
# | default_exp seo_content_analysis

In [None]:
# | export
from seo_rat.content_parser import extract_headers, remove_metadata

In [None]:
# | export
def calculate_keyword_density(content: str, keyword: str) -> dict:
    """Count case-insensitive occurrences of ``keyword`` in ``content``.

    Matching is a plain substring search on the lower-cased text, so the
    keyword is also found inside longer words, and overlapping occurrences
    are each counted.

    Args:
        content: Text to search.
        keyword: Phrase to look for. An empty keyword yields no matches.

    Returns:
        A dict with:
        - ``keyword``: the keyword exactly as passed in.
        - ``count``: number of occurrences found.
        - ``density``: ``count`` divided by the number of whitespace-separated
          words in ``content``, times 100 (``0.0`` when there are no words).
        - ``positions``: start offsets of each occurrence in the lower-cased
          content.
    """
    content_lower = content.lower()
    keyword_lower = keyword.lower()

    positions: list[int] = []
    # str.find("") succeeds at every offset, which would report
    # len(content) + 1 bogus matches; an empty keyword matches nothing.
    if keyword_lower:
        pos = content_lower.find(keyword_lower)
        while pos != -1:
            positions.append(pos)
            # Advance by one character (not by the keyword length) so that
            # overlapping occurrences are still reported.
            pos = content_lower.find(keyword_lower, pos + 1)

    total_words = len(content.split())
    count = len(positions)
    density = (count / total_words * 100) if total_words else 0.0

    return {
        "keyword": keyword,
        "count": count,
        "density": density,
        "positions": positions,
    }

In [None]:
# | export
def check_h1_count(
    headers: list[dict], title: str | None = None, is_quarto: bool = False
) -> dict:
    """Check H1 count â€” for Quarto, the title frontmatter field acts as H1"""
    h1s = [h for h in headers if h["type"] == "h1"]

    if is_quarto and title:
        return {"h1_count": 1, "has_single_h1": True, "h1_source": "title"}

    return {"h1_count": len(h1s), "has_single_h1": len(h1s) == 1}

In [None]:
# | export
def keyword_in_first_section(content: str, keyword: str, percent: int = 10) -> bool:
    """Return whether ``keyword`` occurs in the leading ``percent``% of ``content``.

    The leading section is measured in characters (truncated to a whole
    number) and the comparison is case-insensitive.
    """
    cutoff = int(len(content) * percent / 100)
    opening = content[:cutoff].lower()
    needle = keyword.lower()
    return needle in opening

In [None]:
# | export
def check_paragraph_length(content: str) -> dict:
    """Check average paragraph length"""
    paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]
    sentences_per_para = [len(p.split(". ")) for p in paragraphs]
    avg = sum(sentences_per_para) / len(sentences_per_para) if paragraphs else 0

    return {
        "avg_sentences_per_paragraph": avg,
        "is_optimal": 2 <= avg <= 4,
    }

In [None]:
# | export
def keyword_in_metadata(metadata: dict, keyword: str) -> dict:
    """Report which metadata fields mention ``keyword`` (case-insensitive).

    The ``title``, ``excerpt`` and ``description`` entries are each converted
    to a string before searching; a missing entry counts as empty text.
    """
    needle = keyword.lower()

    def _mentions(field: str) -> bool:
        # str() lets non-string metadata values be searched as text.
        value = metadata.get(field, "")
        return needle in str(value).lower()

    return {
        "in_title": _mentions("title"),
        "in_excerpt": _mentions("excerpt"),
        "in_description": _mentions("description"),
    }

In [None]:
# | export
def keyword_in_alt_texts(images: list[dict], keyword: str) -> bool:
    """Return whether ``keyword`` appears in any image's alt text.

    The comparison is case-insensitive. Images whose ``"alt_text"`` entry is
    missing, ``None`` or otherwise empty are treated as having no alt text.

    Args:
        images: Image records; the alt text is read from the ``"alt_text"`` key.
        keyword: Phrase to look for.

    Returns:
        ``True`` if at least one alt text contains the keyword, else ``False``.
    """
    kw = keyword.lower()  # lower-case once rather than once per image
    # `or ""` covers an explicit None value, which dict.get's default does
    # not replace and which would otherwise fail on .lower().
    return any(kw in str(img.get("alt_text") or "").lower() for img in images)

In [None]:
# | export
def analyze_header_distribution(headers: list[dict]) -> dict:
    """Tally headers by their ``"type"`` and express each tally as a share.

    Returns a dict with ``"counts"`` (occurrences per header type, in order
    of first appearance) and ``"percentages"`` (each count as a percentage of
    the total number of headers). Both are empty when ``headers`` is empty.
    """
    counts: dict[str, int] = {}
    for header in headers:
        kind = header["type"]
        if kind in counts:
            counts[kind] += 1
        else:
            counts[kind] = 1

    header_total = len(headers)
    shares = {}
    # This loop only runs when at least one header was counted, so
    # header_total is never zero here.
    for kind, occurrences in counts.items():
        shares[kind] = occurrences / header_total * 100

    return {"counts": counts, "percentages": shares}

In [None]:
# | test
from fastcore.test import test_eq
from pprint import pprint
from seo_rat.content_parser import parse_metadata, extract_images


In [None]:
# | test
from pathlib import Path

# Smoke tests for the analysis helpers, run against sample/example.md.

# Use ./sample when it exists, otherwise fall back to ../sample.
sample_dir = Path("sample")
if not sample_dir.exists():
    sample_dir = Path("../sample")

sample_path = sample_dir / "example.md"
# Decode explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding.
content = sample_path.read_text(encoding="utf-8")

# keyword density
density = calculate_keyword_density(content, "Kareem")
test_eq(density["count"] > 0, True)
test_eq("density" in density, True)

# h1 count (extract_headers takes the file path, not the text)
headers = extract_headers(str(sample_path))
h1_check = check_h1_count(headers)
test_eq(h1_check["h1_count"], 2)
test_eq(h1_check["has_single_h1"], False)

# keyword in first section
test_eq(keyword_in_first_section(content, "Kareem", percent=10), True)

# paragraph length
para_check = check_paragraph_length(content)
test_eq("avg_sentences_per_paragraph" in para_check, True)

# keyword in metadata
metadata = parse_metadata(content)
kw_meta = keyword_in_metadata(metadata, "Kareem")
test_eq("in_title" in kw_meta, True)

# keyword in alt texts
images = extract_images(content)
test_eq(isinstance(keyword_in_alt_texts(images, "Kareem"), bool), True)

# header distribution
dist = analyze_header_distribution(headers)
test_eq("counts" in dist, True)
test_eq("percentages" in dist, True)

pprint(dist)
