In [None]:
# | default_exp seo_report


In [None]:
# | export
from typing import Dict, List
from sqlmodel import Session, select
from seo_rat.article import get_article_by_path, insert_article, Article
from seo_rat.sqlite_db import SQLiteDB
from seo_rat.models import Website
from seo_rat.content_mapper import map_all_urls_to_files
from seo_rat.content_parser import (
    parse_notebook_metadata,
    filter_external_links,
    filter_internal_links,
    extract_links,
)


In [None]:
# | export
def sync_articles_to_db(
    session: Session, website_id: int, url_file_mapping: Dict[str, str]
):
    """Register every mapped source file as an article for `website_id`.

    Args:
        session: database session handed to the article helpers.
        website_id: ID of the website the new articles are attached to.
        url_file_mapping: URL -> local file path; entries whose path is
            empty/None are ignored.

    Paths for which `get_article_by_path` already returns a row are left
    untouched, so calling this repeatedly does not insert a path twice.
    """
    for mapped_path in url_file_mapping.values():
        # URL with no matching local file: nothing to register.
        if not mapped_path:
            continue
        # Already known to the database: do not insert again.
        if get_article_by_path(session, mapped_path):
            continue
        insert_article(session, website_id, mapped_path)


In [None]:
# | hide
# Manual smoke test: list the registered websites, map the live sitemap to
# local source files, sync them into the database and print every Article row.
db = SQLiteDB()
with db.get_session() as session:
    websites = session.exec(select(Website)).all()
    for w in websites:
        print(f"ID: {w.id}, URL: {w.url}")
# NOTE(review): the sitemap URL, the absolute local directory and the site URL
# are hard-coded for the author's machine; this cell will not run elsewhere
# without editing them.
url_mapping = map_all_urls_to_files(
    "https://kareemai.com/sitemap.xml",
    "/home/kobo/Desktop/obsidian_valuts/logseq/karem-site/",
    "https://kareemai.com/",
)

# NOTE(review): `session` is used below (and by later cells) after its `with`
# block has exited — confirm SQLiteDB.get_session() leaves it usable.
# website_id=2 is hard-coded; it matches the website ID printed by the loop above.
sync_articles_to_db(session, website_id=2, url_file_mapping=url_mapping)
articles = session.exec(select(Article)).all()
for article in articles:
    print(article)


ID: 2, URL: https://kareemai.com
secondary_keywords=None website_id=2 file_path='/home/kobo/Desktop/obsidian_valuts/logseq/karem-site/papers.qmd' id=1 focus_keyword=None created_at=datetime.datetime(2026, 1, 18, 14, 46, 57, 901517)
secondary_keywords=None website_id=2 file_path='/home/kobo/Desktop/obsidian_valuts/logseq/karem-site/oss/opensource.qmd' id=2 focus_keyword=None created_at=datetime.datetime(2026, 1, 18, 14, 46, 57, 914530)
secondary_keywords=None website_id=2 file_path='/home/kobo/Desktop/obsidian_valuts/logseq/karem-site/til/tils/2025-12-15.md' id=3 focus_keyword=None created_at=datetime.datetime(2026, 1, 18, 14, 46, 57, 923431)
secondary_keywords=None website_id=2 file_path='/home/kobo/Desktop/obsidian_valuts/logseq/karem-site/til/tils/2025-12-13.md' id=4 focus_keyword=None created_at=datetime.datetime(2026, 1, 18, 14, 46, 57, 933589)
secondary_keywords=None website_id=2 file_path='/home/kobo/Desktop/obsidian_valuts/logseq/karem-site/til/tils/2025-06-06-til.qmd' id=5 focu

In [None]:
# | export


def find_duplicate_metadata(
    session: Session, field: str, similarity_threshold: float = 0.9
) -> List[Dict]:
    """Find pairs of articles whose `field` metadata is identical or very similar.

    Every Article row in the database is loaded, the metadata of its source
    file is parsed (`.ipynb` via `parse_notebook_metadata`, `.md`/`.qmd` via
    `parse_metadata`) and the values of `field` are compared pairwise with
    `calculate_similarity`.

    Args:
        session: database session used to load all Article rows.
        field: metadata key to compare, e.g. "title" or "description".
        similarity_threshold: minimum similarity (inclusive) for a pair to be
            reported.

    Returns:
        One dict per matching pair with the keys "file1", "file2",
        "<field>1", "<field>2" and "similarity". Each unordered pair is
        reported once, with file1 < file2, ordered by file path.

    Raises:
        OSError: if the source file of a supported article cannot be opened.
    """
    from itertools import combinations

    from seo_rat.content_parser import parse_metadata, calculate_similarity

    articles = session.exec(select(Article)).all()
    values = {}

    # Collect the field value of every parseable article.
    for article in articles:
        path = article.file_path
        if path.endswith(".ipynb"):
            parser = parse_notebook_metadata
        elif path.endswith((".md", ".qmd")):
            parser = parse_metadata
        else:
            # Unsupported file type: there is nothing to parse, so don't open it.
            continue

        # Explicit encoding so decoding does not depend on the machine's locale.
        with open(path, "r", encoding="utf-8") as f:
            value = parser(f.read()).get(field)

        # A missing or empty value is not a "duplicate" of another missing
        # value; comparing them would only pair up pages that lack the field.
        if value:
            values[path] = value

    # Compare each unordered pair exactly once. Paths are unique dict keys, so
    # sorting the (path, value) tuples never has to compare the values.
    duplicates = []
    for (path1, value1), (path2, value2) in combinations(sorted(values.items()), 2):
        similarity = calculate_similarity(value1, value2)
        if similarity >= similarity_threshold:
            duplicates.append(
                {
                    "file1": path1,
                    "file2": path2,
                    f"{field}1": value1,
                    f"{field}2": value2,
                    "similarity": similarity,
                }
            )

    return duplicates


In [None]:
# | hide
# Ad-hoc check: report description pairs at a looser 0.7 threshold.
# Relies on `session` created by the earlier setup cell.
find_duplicate_metadata(session, "description", similarity_threshold=0.7)

[]

In [None]:
# | export


def analyze_links(content: str, domain: str) -> Dict:
    """Summarise the links found in `content`, split by `domain` membership.

    The URLs are the keys returned by `extract_links`; they are then passed
    through `filter_internal_links` and `filter_external_links`.

    Returns:
        Dict with "total_links", "internal_count", "external_count" and the
        two filtered collections under "internal_links" / "external_links".
    """
    urls = [*extract_links(content).keys()]
    internal_urls = filter_internal_links(urls, domain)
    external_urls = filter_external_links(urls, domain)

    return dict(
        total_links=len(urls),
        internal_count=len(internal_urls),
        external_count=len(external_urls),
        internal_links=internal_urls,
        external_links=external_urls,
    )


In [None]:
# | test

from seo_rat.content_parser import remove_metadata

# Smoke test for analyze_links on the first Article row.
# Relies on `session` created by the earlier setup cell.
article = session.exec(select(Article)).first()
print(article)
# remove_metadata presumably strips the front-matter block so only the page
# body is analysed — confirm against seo_rat.content_parser.
with open(article.file_path, "r") as f:
    content = remove_metadata(f.read())

# "kareemai.com" is the domain used to split links into internal / external.
link_analysis = analyze_links(content, "kareemai.com")
print(link_analysis)

print(f"Total: {link_analysis['total_links']}")
print(f"Internal: {link_analysis['internal_count']}")
print(f"External: {link_analysis['external_count']}")


secondary_keywords=None website_id=2 file_path='/home/kobo/Desktop/obsidian_valuts/logseq/karem-site/papers.qmd' id=1 focus_keyword=None created_at=datetime.datetime(2026, 1, 18, 14, 46, 57, 901517)
{'total_links': 4, 'internal_count': 0, 'external_count': 4, 'internal_links': [], 'external_links': ['https://www.researchgate.net/publication/392372128_QARI-OCR_High-Fidelity_Arabic_Text_Recognition_through_Multimodal_Large_Language_Model_Adaptation?_sg%5B0%5D=Ddb9kwrghDtknsstHB3jlx1WzktpoFiMyRPptkC8aa65gsBxoJELnyU0eWOEMdJZc7iDzAWz_H6F-tcBgH5cEFeAxcJSKj_KPbQmRJ4U.0revArIdJ55OwTpYSzjXCjzvTRcpqs6fjm7JpaoCoUTXQ3fpCpx0DPLOF7UnmV8HWDrYSY9_QqO1XfHUzMtApQ&_tp=eyJjb250ZXh0Ijp7ImZpcnN0UGFnZSI6ImxvZ2luIiwicGFnZSI6InByb2ZpbGUiLCJwcmV2aW91c1BhZ2UiOiJwcm9maWxlIiwicG9zaXRpb24iOiJwYWdlQ29udGVudCJ9fQ', "https://www.researchgate.net/publication/375102394_Explainable_Artificial_Intelligence_of_Multi-level_Stacking_Ensemble_for_Detection_of_Alzheimer's_Disease_based_on_Particle_Swarm_Optimization_and_the_Su

In [None]:
from seo_rat.content_parser import (
    parse_metadata,
    parse_notebook_metadata,
    remove_metadata,
    extract_headers,
    extract_images,
    check_desc_length,
    check_content_length,
    check_title_length,
    imgs_missing_alts,
)
from seo_rat.seo_analysis import check_h1_count

In [None]:
# | export
def generate_seo_report(session: Session, website_id: int, domain: str) -> Dict:
    """Generate a per-page and site-wide SEO report for one website.

    Args:
        session: database session used to load the website's Article rows.
        website_id: only articles with this `website_id` get a page report.
        domain: domain used by `analyze_links` to split internal/external links.

    Returns:
        Dict with "total_pages", "pages_analyzed", "page_reports" (one dict
        per successfully analysed page), "duplicate_titles",
        "duplicate_descriptions", "issues" (human-readable strings) and a
        "summary" of counts.

    A page whose analysis raises is reported on stdout and left out of
    "page_reports"; it still counts towards "total_pages". A page without a
    title or description is analysed with an empty string for that field and
    gets a "Missing ..." entry in "issues".
    """
    # Imported locally (as find_duplicate_metadata does for its helpers): the
    # notebook cell that imports these names carries no export directive, so
    # they would otherwise be undefined in the exported module.
    from seo_rat.content_parser import (
        parse_metadata,
        remove_metadata,
        extract_headers,
        extract_images,
        check_desc_length,
        check_content_length,
        check_title_length,
        imgs_missing_alts,
    )
    from seo_rat.seo_analysis import check_h1_count

    articles = session.exec(
        select(Article).where(Article.website_id == website_id)
    ).all()

    page_reports = []
    issues = []

    for article in articles:
        try:
            # Explicit encoding so decoding does not depend on the machine's locale.
            with open(article.file_path, "r", encoding="utf-8") as f:
                raw_content = f.read()

            # Parse metadata
            if article.file_path.endswith(".ipynb"):
                metadata = parse_notebook_metadata(raw_content)
            else:
                metadata = parse_metadata(raw_content)

            # Read both fields tolerantly: indexing with metadata[...] raised
            # KeyError and dropped the whole page from the report.
            title = metadata.get("title") or ""
            description = metadata.get("description") or ""

            content = remove_metadata(raw_content)
            # NOTE(review): extract_headers receives the file path while the
            # other extractors receive the content — confirm that is intended.
            headers = extract_headers(article.file_path)
            images = extract_images(content)

            # Run checks
            report = {
                "file_path": article.file_path,
                "title_check": check_title_length(title),
                "description_check": check_desc_length(description),
                "content_check": check_content_length(content),
                "h1_check": check_h1_count(headers, title=title, is_quarto=True),
                "missing_alts": imgs_missing_alts(images),
                "link_analysis": analyze_links(content, domain),
            }

            # Collect issues
            if not title:
                issues.append(f"{article.file_path}: Missing title")
            if not description:
                issues.append(f"{article.file_path}: Missing description")
            if not report["h1_check"]["has_single_h1"]:
                issues.append(f"{article.file_path}: Multiple or no H1")
            if not report["content_check"]["is_sufficient"]:
                issues.append(f"{article.file_path}: Content too short")

            page_reports.append(report)

        except Exception as e:
            # Best effort: one unreadable or unparsable page must not abort the report.
            print(f"Error analyzing {article.file_path}: {e}")

    # Site-wide checks
    # NOTE(review): find_duplicate_metadata is called without a website filter,
    # so these lists may include articles of other websites — confirm intent.
    duplicate_titles = find_duplicate_metadata(session, "title", 0.9)
    duplicate_descriptions = find_duplicate_metadata(session, "description", 0.9)

    return {
        "total_pages": len(articles),
        "pages_analyzed": len(page_reports),
        "page_reports": page_reports,
        "duplicate_titles": duplicate_titles,
        "duplicate_descriptions": duplicate_descriptions,
        "issues": issues,
        "summary": {
            "total_issues": len(issues),
            "duplicate_titles_count": len(duplicate_titles),
            "duplicate_descriptions_count": len(duplicate_descriptions),
        },
    }


In [None]:
# Build the full report for the website with ID 2 and print the headline counts.
# Relies on `session` created by the earlier setup cell.
report = generate_seo_report(session, website_id=2, domain="kareemai.com")
print(f"Total pages: {report['total_pages']}")
print(f"Issues found: {report['summary']['total_issues']}")


Error analyzing /home/kobo/Desktop/obsidian_valuts/logseq/karem-site/index.qmd: 'title'
Total pages: 51
Issues found: 4


In [None]:
# Spot-check one page report. NOTE(review): index 13 is a magic number; which
# page it selects depends on the Article row order and on which pages were
# skipped during analysis.
report["page_reports"][13]

{'file_path': '/home/kobo/Desktop/obsidian_valuts/logseq/karem-site/blog/posts/minishlab/pyversity_qdrant.ipynb',
 'title_check': {'length': 53, 'optimal_lenth': True},
 'description_check': {'length': 110, 'optimal_lenth': False},
 'content_check': {'word_count': 6920, 'is_sufficient': True},
 'h1_check': {'h1_count': 1, 'has_single_h1': True, 'h1_source': 'title'},
 'missing_alts': [],
 'link_analysis': {'total_links': 2,
  'internal_count': 0,
  'external_count': 2,
  'internal_links': [],
  'external_links': ['https://github.com/Pringled/pyversity',
   'https://qdrant.tech/blog/mmr-diversity-aware-reranking/#step-3-setting-up-qdrant-for-visual-fashion-search']}}

In [None]:
# Displays the entire report dict.
# NOTE(review): the output is very large; consider showing report["summary"] instead.
report

{'total_pages': 51,
 'pages_analyzed': 50,
 'page_reports': [{'file_path': '/home/kobo/Desktop/obsidian_valuts/logseq/karem-site/papers.qmd',
   'title_check': {'length': 59, 'optimal_lenth': True},
   'description_check': {'length': 189, 'optimal_lenth': False},
   'content_check': {'word_count': 243, 'is_sufficient': False},
   'h1_check': {'h1_count': 1, 'has_single_h1': True, 'h1_source': 'title'},
   'missing_alts': [],
   'link_analysis': {'total_links': 4,
    'internal_count': 0,
    'external_count': 4,
    'internal_links': [],
    'external_links': ['https://www.researchgate.net/publication/392372128_QARI-OCR_High-Fidelity_Arabic_Text_Recognition_through_Multimodal_Large_Language_Model_Adaptation?_sg%5B0%5D=Ddb9kwrghDtknsstHB3jlx1WzktpoFiMyRPptkC8aa65gsBxoJELnyU0eWOEMdJZc7iDzAWz_H6F-tcBgH5cEFeAxcJSKj_KPbQmRJ4U.0revArIdJ55OwTpYSzjXCjzvTRcpqs6fjm7JpaoCoUTXQ3fpCpx0DPLOF7UnmV8HWDrYSY9_QqO1XfHUzMtApQ&_tp=eyJjb250ZXh0Ijp7ImZpcnN0UGFnZSI6ImxvZ2luIiwicGFnZSI6InByb2ZpbGUiLCJwcmV2aW91