In [None]:
# | default_exp seo_analysis

In [None]:
# | export
from typing import Dict, List, Tuple
from urllib.parse import urlparse
from sqlmodel import Session, select, create_engine, SQLModel
from seo_rat.article import Article
from seo_rat.content_parser import extract_headers, remove_metadata


In [None]:
# | export
def calculate_keyword_density(content: str, keyword: str) -> Dict:
    """Calculate keyword density and positions.

    Matching is case-insensitive and substring based. Overlapping occurrences
    are each counted because the scan resumes one character after every hit.

    Args:
        content: Text to scan.
        keyword: Word or phrase to look for. An empty keyword yields no matches.

    Returns:
        Dict with:
            "keyword": the keyword exactly as passed in,
            "count": number of occurrences found,
            "density": occurrences per 100 whitespace-separated words
                (0 when the content has no words),
            "positions": start offsets of each occurrence in the lower-cased
                content.
    """
    content_lower = content.lower()
    keyword_lower = keyword.lower()

    positions = []
    # str.find("") succeeds at every offset, which would report
    # len(content) + 1 bogus hits; an empty keyword is treated as "not found".
    if keyword_lower:
        pos = content_lower.find(keyword_lower)
        while pos != -1:
            positions.append(pos)
            # Step by one (not by len(keyword)) so overlapping hits are kept.
            pos = content_lower.find(keyword_lower, pos + 1)

    # Density is relative to the whitespace-separated word count.
    total_words = len(content.split())
    count = len(positions)
    density = (count / total_words * 100) if total_words > 0 else 0

    return {
        "keyword": keyword,
        "count": count,
        "density": density,
        "positions": positions,
    }


In [None]:
# | test
from fastcore.test import test_eq
from pprint import pprint

# Read test file
# The path is relative, so it resolves against the kernel's working directory.
# `content` is the shared fixture used by the test cells further down.
with open("../sample/example.md", "r") as f:
    content = f.read()


In [None]:
# |test
# Test keyword density
density = calculate_keyword_density(content, "Kareem")

# The sample must mention the keyword at least once, and the result dict must
# expose a "density" entry (its numeric value is not pinned here).
test_eq(density["count"] > 0, True)
test_eq("density" in density, True)


In [None]:
# | export
def check_h1_count(
    headers: List[Dict], title: str = None, is_quarto: bool = False
) -> Dict:
    """Count the H1 headers and report whether there is exactly one.

    Args:
        headers: Header dicts; each must carry a "type" key such as "h1".
        title: Document title, only consulted when ``is_quarto`` is true.
        is_quarto: When true and ``title`` is non-empty, the title is treated
            as the single H1 and the headers found in the body are ignored.

    Returns:
        Dict with "h1_count" and "has_single_h1"; the Quarto branch also adds
        "h1_source": "title".
    """
    h1_total = 0
    for header in headers:
        if header["type"] == "h1":
            h1_total += 1

    # For Quarto, title acts as H1
    quarto_title_is_h1 = bool(is_quarto and title)
    if not quarto_title_is_h1:
        return {"h1_count": h1_total, "has_single_h1": h1_total == 1}

    return {"h1_count": 1, "has_single_h1": True, "h1_source": "title"}


In [None]:
# | test
# Test H1 count
# extract_headers is given the file path here, not the text held in `content`.
headers = extract_headers("../sample/example.md")
h1_check = check_h1_count(headers)
# The sample file has two H1 headings, so the single-H1 check must fail.
test_eq(h1_check["h1_count"], 2)
test_eq(h1_check["has_single_h1"], False)


In [None]:
# | hide
# Debug aid: dump the header dicts returned by extract_headers for inspection.
pprint(headers)


[{'content': 'This is me Kareem',
  'length': 17,
  'line_number': 15,
  'type': 'h1'},
 {'content': 'This is Kareem Also',
  'length': 19,
  'line_number': 17,
  'type': 'h1'},
 {'content': 'How do you know me!',
  'length': 19,
  'line_number': 21,
  'type': 'h2'},
 {'content': 'oh no! iron man!', 'length': 16, 'line_number': 25, 'type': 'h2'},
 {'content': 'References', 'length': 10, 'line_number': 34, 'type': 'h2'},
 {'content': 'Books', 'length': 5, 'line_number': 42, 'type': 'h3'},
 {'content': 'nbdev is super cool!',
  'length': 20,
  'line_number': 48,
  'type': 'h4'},
 {'content': 'Test Deriven Developement is a life changing!',
  'length': 45,
  'line_number': 50,
  'type': 'h5'},
 {'content': 'I am an Love with best girl in the whole world!',
  'length': 47,
  'line_number': 52,
  'type': 'h6'}]


In [None]:
# | hide
# Debug aid: show the dict produced by check_h1_count for the sample file.
pprint(h1_check)


{'h1_count': 2, 'has_single_h1': False}


In [None]:
# | export
def keyword_in_first_section(content: str, keyword: str, percent: int = 10) -> bool:
    """Tell whether `keyword` occurs in the leading `percent`% of `content`.

    The cut-off is measured in characters and truncated to a whole number;
    the comparison ignores case.
    """
    cutoff = int(len(content) * percent / 100)
    leading_text = content[:cutoff].lower()
    needle = keyword.lower()
    return needle in leading_text


In [None]:
# | test
# The keyword is expected within the first 10% of the sample's characters.
in_start = keyword_in_first_section(content, "Kareem", percent=10)
pprint(in_start)
test_eq(in_start, True)


True


In [None]:
# | export
def check_paragraph_length(content: str) -> Dict:
    """Check average paragraph length, measured in sentences.

    Paragraphs are the non-blank chunks separated by a blank line ("\\n\\n").
    A sentence ends at ".", "!" or "?" followed by whitespace (a space or a
    line break), or at the end of the paragraph.

    Args:
        content: Text to analyze.

    Returns:
        Dict with:
            "avg_sentences_per_paragraph": mean sentence count per paragraph
                (0 when there are no paragraphs),
            "is_optimal": True when that mean lies in the inclusive range 2-4.
    """
    import re

    paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]

    # Split after any sentence terminator, not just ". ": otherwise
    # "Really? Yes!" and sentences ending at a line break count as one.
    sentence_break = re.compile(r"(?<=[.!?])\s+")
    sentences_per_para = [
        len([s for s in sentence_break.split(p) if s]) for p in paragraphs
    ]
    avg = sum(sentences_per_para) / len(sentences_per_para) if paragraphs else 0

    return {
        "avg_sentences_per_paragraph": avg,
        "is_optimal": 2 <= avg <= 4,  # 2-4 sentences ideal
    }


In [None]:
# | test
# Smoke check only: the result is printed, nothing is asserted.
para_check = check_paragraph_length(content)
print(f"Avg sentences/paragraph: {para_check['avg_sentences_per_paragraph']:.1f}")
print(f"Optimal: {para_check['is_optimal']}")


Avg sentences/paragraph: 1.2
Optimal: False


In [None]:
# | export
def keyword_in_metadata(metadata: Dict, keyword: str) -> Dict:
    """Check if keyword is in title, excerpt, description.

    The comparison is case-insensitive. Non-string values are converted with
    ``str()``; a missing key or an explicit ``None`` value is treated as empty
    text.

    Args:
        metadata: Mapping that may contain "title", "excerpt", "description".
        keyword: Keyword to look for.

    Returns:
        Dict of booleans keyed "in_title", "in_excerpt", "in_description".
    """
    needle = keyword.lower()

    def _mentions(field: str) -> bool:
        value = metadata.get(field)
        # str(None) is the text "None", which would falsely match keywords
        # such as "none" or "on" whenever a field is present but empty.
        text = "" if value is None else str(value)
        return needle in text.lower()

    return {
        "in_title": _mentions("title"),
        "in_excerpt": _mentions("excerpt"),
        "in_description": _mentions("description"),
    }


In [None]:
# | hide
from seo_rat.content_parser import parse_metadata

# Parse the sample's metadata and show which fields mention the keyword.
# Nothing is asserted; the result is only printed.
metadata = parse_metadata(content)
test_key_in_metadata = keyword_in_metadata(metadata, "Kareem")
pprint(test_key_in_metadata)

{'in_description': False, 'in_excerpt': False, 'in_title': True}


In [None]:
# | export
def keyword_in_alt_texts(images: List[Dict], keyword: str) -> bool:
    """Check if keyword appears in any image alt text.

    The comparison is case-insensitive. Images whose "alt_text" entry is
    missing or ``None`` are treated as having empty alt text rather than
    raising.

    Args:
        images: Image dicts, normally carrying an "alt_text" entry.
        keyword: Keyword to look for.

    Returns:
        True if at least one alt text contains the keyword, else False
        (including when ``images`` is empty).
    """
    needle = keyword.lower()
    # `.get(...) or ""` covers both a missing key and an explicit None, the
    # usual shape of an image that was written without alt text.
    return any(
        needle in str(img.get("alt_text") or "").lower() for img in images
    )


In [None]:
# | hide
from seo_rat.content_parser import extract_images

# Dump the images found in the sample, then check their alt texts for the
# keyword. Nothing is asserted; both results are only printed.
images = extract_images(content)
pprint(images)
test_keyword_in_alt_texts = keyword_in_alt_texts(images, "Kareem")
pprint(test_keyword_in_alt_texts)


[{'alt_text': 'Iron man photo', 'url': '~/assets/images/28.png'}]
False


In [None]:
# | export
def analyze_header_distribution(headers: List[Dict]) -> Dict:
    """Summarize how the headers are spread across header types.

    Returns a dict with "counts" (occurrences per header type, in order of
    first appearance) and "percentages" (each count as a share of all
    headers, on a 0-100 scale).
    """
    counts = {}
    for header in headers:
        level = header["type"]
        if level in counts:
            counts[level] += 1
        else:
            counts[level] = 1

    n_headers = len(headers)
    shares = {}
    for level, occurrences in counts.items():
        # Guard kept for parity; with no headers the loop body never runs.
        shares[level] = (occurrences / n_headers * 100) if n_headers > 0 else 0

    return {"counts": counts, "percentages": shares}


In [None]:
# | hide
# Re-extract the sample's headers and print their distribution.
# Nothing is asserted; the result is only printed.
headers = extract_headers("../sample/example.md")
header_distribution = analyze_header_distribution(headers)
pprint(header_distribution)


{'counts': {'h1': 2, 'h2': 3, 'h3': 1, 'h4': 1, 'h5': 1, 'h6': 1},
 'percentages': {'h1': 22.22222222222222,
                 'h2': 33.33333333333333,
                 'h3': 11.11111111111111,
                 'h4': 11.11111111111111,
                 'h5': 11.11111111111111,
                 'h6': 11.11111111111111}}


In [None]:
# | export
def detect_duplicate_content(
    session: Session, file_path: str, similarity_threshold: float = 0.8
) -> Dict:
    """Find similar articles by comparing content.

    The file at ``file_path`` is compared against the file of every Article
    returned by the session, after both texts go through ``remove_metadata``
    and ``normalize_text``.

    Args:
        session: Open database session used to list the articles.
        file_path: Path of the file to check.
        similarity_threshold: Minimum ``calculate_similarity`` score for an
            article to be reported (scale presumably 0-1; confirm against
            ``calculate_similarity``).

    Returns:
        Dict with "has_duplicates" (bool) and "similar_articles", a list of
        {"file_path", "similarity"} dicts for every article at or above the
        threshold.

    Raises:
        OSError: If ``file_path`` or any article's file cannot be opened.
    """
    import os

    from seo_rat.content_parser import (
        remove_metadata,
        normalize_text,
        calculate_similarity,
    )

    def _canonical(path: str) -> str:
        # Absolute, normalized form so "./a.md", "a.md" and "/abs/a.md" are
        # recognized as the same file.
        return os.path.normcase(os.path.abspath(path))

    def _load_normalized(path: str) -> str:
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(path, "r", encoding="utf-8") as f:
            return normalize_text(remove_metadata(f.read()))

    current_content = _load_normalized(file_path)
    current_key = _canonical(file_path)

    articles = session.exec(select(Article)).all()
    similar = []

    for article in articles:
        # Skip the file being checked. A raw string comparison would miss the
        # same file stored under a different spelling and report it as its
        # own duplicate.
        if _canonical(article.file_path) == current_key:
            continue

        other_content = _load_normalized(article.file_path)
        similarity = calculate_similarity(current_content, other_content)

        if similarity >= similarity_threshold:
            similar.append({"file_path": article.file_path, "similarity": similarity})

    return {"has_duplicates": len(similar) > 0, "similar_articles": similar}


In [None]:
# | test
# Website must be imported before create_all so its table is registered in
# SQLModel.metadata; otherwise Article's foreign key to website.id cannot be
# resolved (NoReferencedTableError). insert_article is needed below as well.
from seo_rat.models import Website
from seo_rat.article import insert_article

engine = create_engine("sqlite:///:memory:")
SQLModel.metadata.create_all(engine)

with Session(engine) as session:
    # Create website
    website = Website(url="https://test.com", name="Test", lang="en")
    session.add(website)
    session.commit()
    session.refresh(website)

    # Create a second similar file (an exact copy of the sample).
    # It is left on disk on purpose: later cells re-open article files by path.
    with open("../sample/example2.md", "w") as f:
        f.write(content)
    article1 = insert_article(session, website.id, "../sample/example.md")
    article2 = insert_article(session, website.id, "../sample/example2.md")

    # The copy must be reported as a duplicate of the original.
    result = detect_duplicate_content(session, "../sample/example.md")
    test_eq(result["has_duplicates"], True)


NoReferencedTableError: Foreign key associated with column 'article.website_id' could not find table 'website' with which to generate a foreign key to target column 'id'

In [None]:
# | export
def analyze_keyword_cannibalization(session: Session, keyword: str) -> Dict:
    """Find articles competing for same keyword.

    Articles are selected by an exact match of ``Article.focus_keyword``
    against ``keyword``. With two or more matches, each article's file is read
    and its keyword density is computed on the text after ``remove_metadata``.

    Args:
        session: Open database session used to query the articles.
        keyword: Focus keyword to look up.

    Returns:
        Dict with "has_cannibalization", "keyword" and "count" (number of
        matching articles). When there is cannibalization it also has
        "articles": a list of {"file_path", "density", "count"} dicts.

    Raises:
        OSError: If a matching article's file cannot be opened.
    """

    articles = session.exec(
        select(Article).where(Article.focus_keyword == keyword)
    ).all()

    # Zero or one article cannot compete with itself; skip all file reads.
    if len(articles) <= 1:
        return {
            "has_cannibalization": False,
            "keyword": keyword,
            "count": len(articles),
        }

    results = []
    for article in articles:
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(article.file_path, "r", encoding="utf-8") as f:
            content = remove_metadata(f.read())

        density = calculate_keyword_density(content, keyword)
        results.append(
            {
                "file_path": article.file_path,
                "density": density["density"],
                "count": density["count"],
            }
        )

    return {
        "has_cannibalization": True,
        "keyword": keyword,
        "count": len(articles),
        "articles": results,
    }


In [None]:
# | hide

# Relies on a `session` name left in the kernel namespace by an earlier cell;
# the recorded output for this cell is a NameError because none was defined.
test_analyze_keyword_cannibalization = analyze_keyword_cannibalization(
    session, "Kareem"
)
pprint(test_analyze_keyword_cannibalization)


NameError: name 'session' is not defined

In [None]:
# | export
def analyze_content_groups(session: Session, similarity_threshold: float = 0.8) -> Dict:
    """Group similar articles together.

    Articles are visited in query order. Each article not yet assigned to a
    group is compared with every other unassigned article; those scoring at
    or above the threshold join its group and are not visited again. Only
    groups with at least one similar article are reported.

    Args:
        session: Open database session used to list the articles.
        similarity_threshold: Minimum ``calculate_similarity`` score for two
            articles to be grouped (scale presumably 0-1; confirm against
            ``calculate_similarity``).

    Returns:
        Dict with "total_articles", "groups" (each
        {"main_article", "similar_articles": [{"file_path", "similarity"}]})
        and "duplicate_groups" (the number of groups).

    Raises:
        OSError: If an article's file cannot be opened.
    """
    from seo_rat.content_parser import (
        remove_metadata,
        normalize_text,
        calculate_similarity,
    )

    articles = session.exec(select(Article)).all()
    groups = []
    processed = set()

    # Normalized text per file path. Without this cache every file would be
    # re-read and re-normalized once per pairwise comparison.
    normalized_by_path = {}

    def _normalized(path: str) -> str:
        if path not in normalized_by_path:
            # Explicit UTF-8 so decoding does not depend on the platform locale.
            with open(path, "r", encoding="utf-8") as f:
                normalized_by_path[path] = normalize_text(remove_metadata(f.read()))
        return normalized_by_path[path]

    for article in articles:
        if article.id in processed:
            continue

        main_content = _normalized(article.file_path)

        group = {"main_article": article.file_path, "similar_articles": []}

        for other in articles:
            if other.id == article.id or other.id in processed:
                continue

            other_content = _normalized(other.file_path)
            similarity = calculate_similarity(main_content, other_content)

            if similarity >= similarity_threshold:
                group["similar_articles"].append(
                    {"file_path": other.file_path, "similarity": similarity}
                )
                processed.add(other.id)

        # An article with no matches stays unprocessed, so a later main
        # article can still be compared against it.
        if group["similar_articles"]:
            groups.append(group)
            processed.add(article.id)

    return {
        "total_articles": len(articles),
        "groups": groups,
        "duplicate_groups": len(groups),
    }


In [None]:
# | hide

# Relies on a `session` name left in the kernel namespace by an earlier cell;
# the recorded output for this cell is a NameError because none was defined.
test_analyze_content_groups = analyze_content_groups(session, 0.8)
pprint(test_analyze_content_groups)


NameError: name 'session' is not defined

In [None]:
# |hide
def get_num_heads(h_elements):
    """
    Return a list containing the length of each value in `h_elements`.

    Takes the heading info from `get_heads_info`
    """
    #! Update this to work with the new dict structure
    return [len(entries) for entries in h_elements.values()]

In [None]:
# | test
# Test detect_duplicate_content (needs session)
import os
import tempfile

from sqlmodel import create_engine, Session, SQLModel
from seo_rat.models import Website
from seo_rat.article import Article, insert_article

engine = create_engine("sqlite:///:memory:")
SQLModel.metadata.create_all(engine)

# delete=False keeps the file on disk after the handle is closed so it can be
# re-opened by path; it is removed explicitly in the `finally` below.
with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f1:
    f1.write(content)
    path1 = f1.name

try:
    with Session(engine) as session:
        # Create test articles
        website = Website(url="https://test.com", name="Test", lang="en")
        session.add(website)
        session.commit()

        article1 = insert_article(session, website.id, path1)

        # Only one article exists, so this checks the shape of the result,
        # not that a duplicate is found.
        result = detect_duplicate_content(session, path1)
        test_eq("has_duplicates" in result, True)
finally:
    # Do not leave a stray temp file behind on every test run.
    os.unlink(path1)
