In [1]:
# | default_exp seo_analysis

In [8]:
# | export
from typing import Dict, List, Tuple
from urllib.parse import urlparse
from sqlmodel import Session, select
from seo_rat.article import Article

In [9]:
# | export
def calculate_keyword_density(content: str, keyword: str) -> Dict:
    """Count case-insensitive occurrences of a keyword and compute its density.

    Matching is substring-based on the lowercased text, and overlapping
    occurrences are each counted (the search resumes one character after
    every hit).

    Args:
        content: Text to search.
        keyword: Word or phrase to look for. An empty keyword yields no
            matches.

    Returns:
        A dict with:
            "keyword": the keyword exactly as passed in,
            "count": number of occurrences found,
            "density": ``count`` divided by the number of
                whitespace-separated words in ``content``, as a percentage
                (0 when ``content`` has no words),
            "positions": start offsets of each occurrence within the
                lowercased content.
    """
    content_lower = content.lower()
    keyword_lower = keyword.lower()

    positions = []
    # str.find("") succeeds at every offset, which would report a meaningless
    # count of len(content) + 1; an empty keyword is treated as "no matches".
    if keyword_lower:
        pos = 0
        while (pos := content_lower.find(keyword_lower, pos)) != -1:
            positions.append(pos)
            pos += 1  # step by one so overlapping hits are still found

    total_words = len(content.split())
    count = len(positions)
    density = (count / total_words * 100) if total_words > 0 else 0

    return {
        "keyword": keyword,
        "count": count,
        "density": density,
        "positions": positions,
    }


# | export
def check_h1_count(headers: List[Dict]) -> Dict:
    """Report how many H1 headings are present and what they contain.

    Args:
        headers: Header dicts; each must have a "type" key, and every entry
            whose type is "h1" must also have a "content" key.

    Returns:
        A dict with "h1_count", "has_single_h1" (True only when exactly one
        H1 exists) and "h1_contents" (the "content" of each H1, in order).
    """
    top_level = list(filter(lambda header: header["type"] == "h1", headers))
    total = len(top_level)
    contents = [header["content"] for header in top_level]
    return {
        "h1_count": total,
        "has_single_h1": total == 1,
        "h1_contents": contents,
    }


# | export
def keyword_in_first_section(content: str, keyword: str, percent: int = 10) -> bool:
    """Tell whether the keyword occurs within the leading part of the content.

    The leading part is the first ``percent`` percent of ``content`` measured
    in characters (rounded down); the comparison is case-insensitive.
    """
    needle = keyword.lower()
    cutoff = int(len(content) * percent / 100)
    opening = content[:cutoff]
    return needle in opening.lower()


In [10]:
# | export
def keyword_in_metadata(metadata: Dict, keyword: str) -> Dict:
    """Check whether the keyword occurs in the title, excerpt and description.

    Matching is a case-insensitive substring test. Non-string values are
    converted with ``str()``; a missing field or an explicit ``None`` is
    treated as empty text.

    Args:
        metadata: Mapping that may contain "title", "excerpt" and
            "description" entries.
        keyword: Word or phrase to look for.

    Returns:
        A dict with boolean flags "in_title", "in_excerpt" and
        "in_description".
    """
    needle = keyword.lower()

    def _mentions(field: str) -> bool:
        value = metadata.get(field)
        # str(None) is the text "None", which would falsely match keywords
        # such as "none" or "on"; a None value carries no text at all.
        text = "" if value is None else str(value)
        return needle in text.lower()

    return {
        "in_title": _mentions("title"),
        "in_excerpt": _mentions("excerpt"),
        "in_description": _mentions("description"),
    }


# | export
def keyword_in_alt_texts(images: List[Dict], keyword: str) -> bool:
    """Check whether the keyword appears in any image's alt text.

    Matching is a case-insensitive substring test.

    Args:
        images: Image dicts; the alt text is read from the "alt_text" key.
        keyword: Word or phrase to look for.

    Returns:
        True if at least one image's alt text contains the keyword,
        otherwise False (including when ``images`` is empty).
    """
    needle = keyword.lower()
    # An image without alt text (key absent or value None) simply cannot
    # contain the keyword; it must not abort the check for the other images.
    return any(needle in (img.get("alt_text") or "").lower() for img in images)


# | export
def analyze_header_distribution(headers: List[Dict]) -> Dict:
    """Summarise how the headers are spread across heading types.

    Args:
        headers: Header dicts; each must have a "type" key.

    Returns:
        A dict with "counts" (occurrences per header type, in order of first
        appearance) and "percentages" (each type's share of all headers,
        0-100).
    """
    tally = {}
    for header in headers:
        level = header["type"]
        if level in tally:
            tally[level] += 1
        else:
            tally[level] = 1

    header_total = len(headers)
    shares = {}
    for level, occurrences in tally.items():
        if header_total > 0:
            shares[level] = occurrences / header_total * 100
        else:
            shares[level] = 0

    return {"counts": tally, "percentages": shares}


In [11]:
# | export
def detect_duplicate_content(
    session: Session, file_path: str, similarity_threshold: float = 0.8
) -> Dict:
    """Find stored articles whose content is similar to the given file.

    The file at ``file_path`` and every ``Article`` returned by the session
    are read from disk, passed through ``remove_metadata`` and
    ``normalize_text``, and compared with ``calculate_similarity`` (all three
    come from ``seo_rat.content_parser``).

    Args:
        session: Database session used to load all ``Article`` rows.
        file_path: Path of the article to check.
        similarity_threshold: Minimum score (inclusive) for an article to be
            reported as similar.

    Returns:
        A dict with "has_duplicates" (bool) and "similar_articles", a list of
        ``{"file_path", "similarity"}`` dicts for every article at or above
        the threshold.

    Raises:
        OSError: If ``file_path`` or any stored article's file cannot be read.
    """
    import os

    from seo_rat.content_parser import (
        remove_metadata,
        normalize_text,
        calculate_similarity,
    )

    def _canonical(path) -> str:
        """Return a comparable form of a path (absolute, normalised)."""
        return os.path.normcase(os.path.abspath(path))

    # Read current article
    with open(file_path, "r") as f:
        current_content = normalize_text(remove_metadata(f.read()))

    # Compare canonical paths rather than raw strings: otherwise the same
    # file spelled differently (relative vs absolute, "./a.md" vs "a.md")
    # is not skipped and gets reported as a duplicate of itself.
    current_key = _canonical(file_path)

    # Get all other articles
    articles = session.exec(select(Article)).all()
    similar = []

    for article in articles:
        if _canonical(article.file_path) == current_key:
            continue

        with open(article.file_path, "r") as f:
            other_content = normalize_text(remove_metadata(f.read()))

        similarity = calculate_similarity(current_content, other_content)

        if similarity >= similarity_threshold:
            similar.append({"file_path": article.file_path, "similarity": similarity})

    return {"has_duplicates": len(similar) > 0, "similar_articles": similar}


In [12]:
# | export
def analyze_keyword_cannibalization(session: Session, keyword: str) -> Dict:
    """Report the articles whose focus keyword equals ``keyword``.

    Cannibalization is flagged when more than one article shares the focus
    keyword. Only in that case are the article files read and a per-article
    keyword count and density included under "articles".

    Returns:
        A dict with "has_cannibalization", "keyword" and "count"; when
        cannibalization is detected it also has "articles", a list of
        ``{"file_path", "density", "count"}`` dicts.
    """
    # NOTE(review): this local import shadows the module-level
    # calculate_keyword_density defined in this file - confirm that the
    # content_parser version is the intended one.
    from seo_rat.content_parser import remove_metadata, calculate_keyword_density

    query = select(Article).where(Article.focus_keyword == keyword)
    competing = session.exec(query).all()
    competing_total = len(competing)

    report = {
        "has_cannibalization": competing_total > 1,
        "keyword": keyword,
        "count": competing_total,
    }
    if not report["has_cannibalization"]:
        return report

    per_article = []
    for entry in competing:
        with open(entry.file_path, "r") as handle:
            raw_text = handle.read()
        body = remove_metadata(raw_text)

        stats = calculate_keyword_density(body, keyword)
        per_article.append(
            {
                "file_path": entry.file_path,
                "density": stats["density"],
                "count": stats["count"],
            }
        )

    report["articles"] = per_article
    return report


In [13]:
# | export
def analyze_content_groups(session: Session, similarity_threshold: float = 0.8) -> Dict:
    """Group stored articles whose content is similar to one another.

    Articles are visited in the order the session returns them. Each article
    not yet assigned to a group becomes a candidate "main" article and is
    compared against every other unassigned article; those scoring at or
    above ``similarity_threshold`` join its group and are excluded from
    later comparisons. A candidate with no matches forms no group and stays
    available to join a later article's group.

    Args:
        session: Database session used to load all ``Article`` rows.
        similarity_threshold: Minimum score (inclusive) for two articles to
            be grouped.

    Returns:
        A dict with "total_articles", "groups" (a list of
        ``{"main_article", "similar_articles"}`` dicts, where each similar
        entry is ``{"file_path", "similarity"}``) and "duplicate_groups"
        (the number of groups).

    Raises:
        OSError: If an article's file cannot be read.
    """
    from seo_rat.content_parser import (
        remove_metadata,
        normalize_text,
        calculate_similarity,
    )

    articles = session.exec(select(Article)).all()
    groups = []
    processed = set()

    # The pairwise comparison would otherwise re-read and re-normalise the
    # same file once per candidate main article. Cache the normalised text
    # per path, filled lazily so files are still first read at the same
    # point as before.
    normalized_by_path = {}

    def _normalized_content(path):
        """Return the normalised, metadata-free text of the file at ``path``."""
        if path not in normalized_by_path:
            with open(path, "r") as f:
                normalized_by_path[path] = normalize_text(remove_metadata(f.read()))
        return normalized_by_path[path]

    for article in articles:
        if article.id in processed:
            continue

        main_content = _normalized_content(article.file_path)

        group = {"main_article": article.file_path, "similar_articles": []}

        for other in articles:
            if other.id == article.id or other.id in processed:
                continue

            other_content = _normalized_content(other.file_path)

            similarity = calculate_similarity(main_content, other_content)

            if similarity >= similarity_threshold:
                group["similar_articles"].append(
                    {"file_path": other.file_path, "similarity": similarity}
                )
                processed.add(other.id)

        # Only articles that actually matched something are marked as
        # processed; unmatched ones may still join a later group.
        if group["similar_articles"]:
            groups.append(group)
            processed.add(article.id)

    return {
        "total_articles": len(articles),
        "groups": groups,
        "duplicate_groups": len(groups),
    }


In [14]:
# |hide
def get_num_heads(h_elements):
    """
    Return a list containing the length of each value in `h_elements`.

    Takes the heading info from `get_heads_info`; lengths are listed in the
    mapping's iteration order.
    """
    #! Update this to work with the new dict structure
    return [len(entries) for entries in h_elements.values()]