In [1]:
# | default_exp index_tracking


In [2]:
# | export
from sqlmodel import Session, select
from seo_rat.models import IndexStatus
from seo_rat.gsc_client import GSCAuth
from googleapiclient.discovery import build
from datetime import datetime
import httpx
from bs4 import BeautifulSoup
import time

In [3]:
# | export
def inspect_url_status(auth: GSCAuth, site_url: str, page_url: str) -> dict:
    """Ask the Search Console URL Inspection API how a page is indexed.

    Args:
        auth: Provides the credentials (via ``get_credentials()``) used to
            build the ``searchconsole`` v1 client.
        site_url: Property identifier sent as ``siteUrl``.
        page_url: Page sent as ``inspectionUrl``.

    Returns:
        A dict with the keys ``verdict``, ``coverage_state``,
        ``last_crawl_time``, ``indexing_state`` and ``robots_txt_state``.
        ``verdict`` falls back to ``"UNKNOWN"`` when absent from the
        response; the other values are ``None`` when absent.
    """
    gsc = build("searchconsole", "v1", credentials=auth.get_credentials())
    payload = {"inspectionUrl": page_url, "siteUrl": site_url, "languageCode": "en-US"}
    api_reply = gsc.urlInspection().index().inspect(body=payload).execute()

    # Both nesting levels are optional in the reply, hence the chained defaults.
    index_result = api_reply.get("inspectionResult", {}).get("indexStatusResult", {})

    summary = {"verdict": index_result.get("verdict", "UNKNOWN")}
    # Map our snake_case field names onto the API's camelCase keys.
    for field_name, api_key in (
        ("coverage_state", "coverageState"),
        ("last_crawl_time", "lastCrawlTime"),
        ("indexing_state", "indexingState"),
        ("robots_txt_state", "robotsTxtState"),
    ):
        summary[field_name] = index_result.get(api_key)
    return summary


In [4]:
# | test
# Smoke test for inspect_url_status. It goes through the real Search Console
# client, so it presumably needs working GSC credentials and network access.
from fastcore.test import test_eq
from seo_rat.index_tracking import (
    inspect_url_status,
    store_index_status,
    get_index_status,
)
from pprint import pprint
from seo_rat.gsc_client import GSCAuth
from seo_rat.sqlite_db import SQLiteDB
from sqlmodel import Session, create_engine, SQLModel

# `auth` is reused by the later cells in this notebook.
auth = GSCAuth()

# Test inspect_url_status
status = inspect_url_status(auth, "sc-domain:kareemai.com", "https://kareemai.com/")
print(f"Verdict: {status['verdict']}")
# Only asserts that the "verdict" key is present, not what its value is.
test_eq("verdict" in status, True)

pprint(status)


Verdict: PASS
{'coverage_state': 'Submitted and indexed',
 'indexing_state': 'INDEXING_ALLOWED',
 'last_crawl_time': '2026-02-24T04:23:19Z',
 'robots_txt_state': 'ALLOWED',
 'verdict': 'PASS'}


In [5]:
# | export
def store_index_status(session: Session, auth: GSCAuth, site_url: str, page_url: str):
    """Run one URL inspection and persist the outcome as a new history row.

    Every call adds a fresh ``IndexStatus`` record (nothing is updated in
    place) and commits the session immediately.

    Args:
        session: Database session the row is added to and committed on.
        auth: Credentials holder forwarded to ``inspect_url_status``.
        site_url: Search Console property the page belongs to.
        page_url: Page to inspect and record.
    """
    inspection = inspect_url_status(auth, site_url, page_url)
    history_row = IndexStatus(site_url=site_url, page_url=page_url, **inspection)
    session.add(history_row)
    session.commit()


In [8]:
# | hide
# `db` is reused by the later cells in this notebook.
db = SQLiteDB()
with db.get_session() as session:
    # Runs a fresh inspection and commits one new IndexStatus row.
    store_index_status(session, auth, "sc-domain:kareemai.com", "https://kareemai.com/")
    print("Stored in ./data/seo.db")


# NOTE(review): `status` is the dict produced by the earlier inspect_url_status
# test cell, not the row that was just stored.
pprint(status)


Stored in ./data/seo.db
{'coverage_state': 'Submitted and indexed',
 'indexing_state': 'INDEXING_ALLOWED',
 'last_crawl_time': '2026-02-24T04:23:19Z',
 'robots_txt_state': 'ALLOWED',
 'verdict': 'PASS'}


In [9]:
# | export
def get_index_status(
    session: Session,
    site_url: str,
    verdict: str | None = None,
) -> list[IndexStatus]:
    """Get the latest index status per page for one site.

    Args:
        session: Database session used to run the query.
        site_url: Only rows recorded for this property are considered.
        verdict: Optional verdict to keep (e.g. ``"PASS"``). A falsy value
            (``None`` or ``""``) disables the filter.

    Returns:
        The ``IndexStatus`` rows whose ``checked_at`` equals the newest
        ``checked_at`` of their page within ``site_url``. If several rows
        of a page share that timestamp, all of them are returned.
    """
    from sqlalchemy import func

    # Newest check time per page, restricted to the requested site.
    latest = (
        select(
            IndexStatus.page_url,
            func.max(IndexStatus.checked_at).label("max_checked"),
        )
        .where(IndexStatus.site_url == site_url)
        .group_by(IndexStatus.page_url)
        .subquery()
    )

    query = (
        select(IndexStatus)
        .join(
            latest,
            (IndexStatus.page_url == latest.c.page_url)
            & (IndexStatus.checked_at == latest.c.max_checked),
        )
        # The join matches on page_url + timestamp only, so the site filter
        # must be repeated here; otherwise a row stored for a different
        # property with the same page_url and checked_at would be returned.
        .where(IndexStatus.site_url == site_url)
    )
    if verdict:
        query = query.where(IndexStatus.verdict == verdict)
    return list(session.exec(query).all())


In [10]:
# | hide
with db.get_session() as session:
    # No verdict filter is passed, so the latest row of every page is returned.
    test_index_status = get_index_status(
        session,
        "sc-domain:kareemai.com",
    )
    pprint(test_index_status)

[IndexStatus(site_url='sc-domain:kareemai.com', page_url='https://kareemai.com/', coverage_state='Submitted and indexed', indexing_state='INDEXING_ALLOWED', checked_at=datetime.datetime(2026, 2, 28, 7, 38, 35, 12388), verdict='PASS', id=1, last_crawl_time='2026-02-24T04:23:19Z', robots_txt_state='ALLOWED')]


In [13]:
# | export
def get_not_indexed_pages(session: Session, site_url: str) -> list[IndexStatus]:
    """Return the latest status rows of pages whose verdict is not ``"PASS"``.

    Only the most recent row per page (as selected by ``get_index_status``)
    is examined.
    """
    not_indexed = []
    for latest_row in get_index_status(session, site_url):
        if latest_row.verdict == "PASS":
            continue
        not_indexed.append(latest_row)
    return not_indexed


In [17]:
#| export
def get_not_indexed_by_reason(session: Session, site_url: str) -> dict[str, list[IndexStatus]]:
    """Bucket the not-indexed pages of a site by their coverage state.

    Returns:
        A dict mapping each coverage state to the rows reporting it. Rows
        with an empty or missing coverage state are filed under
        ``"Unknown"``.
    """
    by_reason = {}
    for status_row in get_not_indexed_pages(session, site_url):
        bucket_key = status_row.coverage_state if status_row.coverage_state else "Unknown"
        if bucket_key not in by_reason:
            by_reason[bucket_key] = []
        by_reason[bucket_key].append(status_row)
    return by_reason


In [18]:
# | export
def fetch_sitemap_urls(sitemap_url: str) -> list[str]:
    """Fetch all URLs listed in ``<loc>`` elements of a sitemap XML.

    Args:
        sitemap_url: Address of the sitemap document.

    Returns:
        The text of every ``<loc>`` element in document order, with
        surrounding whitespace removed and empty entries dropped.

    Raises:
        httpx.HTTPStatusError: If the final response has a 4xx/5xx status.
        httpx.HTTPError: For transport-level failures (raised by ``httpx.get``).
    """
    # httpx does not follow redirects unless asked to; without this a
    # redirecting sitemap URL would yield an empty body and an empty list.
    response = httpx.get(sitemap_url, follow_redirects=True)
    # Fail loudly instead of parsing an error page as if it were a sitemap.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "xml")
    # Pretty-printed sitemaps may wrap the URL in newlines/indentation.
    stripped = (loc.get_text(strip=True) for loc in soup.find_all("loc"))
    return [url for url in stripped if url]


In [19]:
# | export
def store_all_index_status(
    session: Session,
    auth: GSCAuth,
    site_url: str,
    sitemap_url: str,
) -> dict:
    """Check and store index status for all pages in a sitemap.

    Each URL is inspected and stored independently; a failure for one URL
    is recorded and the loop continues with the next one.

    Args:
        session: Database session used for every stored row.
        auth: Credentials holder forwarded to the inspection call.
        site_url: Search Console property the pages belong to.
        sitemap_url: Sitemap whose ``<loc>`` entries are checked.

    Returns:
        ``{"successful": [url, ...], "failed": [{"url": ..., "error": ...}, ...]}``.
    """
    urls = fetch_sitemap_urls(sitemap_url)
    total = len(urls)
    results = {"successful": [], "failed": []}

    for i, url in enumerate(urls, 1):
        print(f"Checking {i}/{total}: {url}")
        try:
            store_index_status(session, auth, site_url, url)
            results["successful"].append(url)
        except Exception as e:
            # If the failure happened during flush/commit the session stays
            # in a failed state until rolled back, which would make every
            # following URL fail as well. Rolling back is harmless when the
            # error came from the API call instead.
            session.rollback()
            results["failed"].append({"url": url, "error": str(e)})
        # Pause between API calls; there is nothing to wait for after the last one.
        if i < total:
            time.sleep(1)

    return results

In [27]:
#| hide
# Open a dedicated session rather than relying on the `session` name left over
# from an earlier `with` block, so the cell does not depend on hidden kernel state.
with db.get_session() as session:
    reasons = get_not_indexed_by_reason(session, "sc-domain:kareemai.com")
    for reason, pages in reasons.items():
        print(f"\n{reason} ({len(pages)} pages):")
        for p in pages:
            print(f"  {p.page_url}")
    # Printed while the session is still open, as the other demo cells do.
    pprint(reasons)


Discovered - currently not indexed (6 pages):
  https://kareemai.com/til/tils/2025-12-15.html
  https://kareemai.com/til/tils/2025-06-06-til.html
  https://kareemai.com/til/tils/2025-05-25-til.html
  https://kareemai.com/blog/posts/nlp/embedding_world/sparse_embedding/bm25_from_scratch.html
  https://kareemai.com/blog/posts/nlp/embedding_world/sparse_embedding/bm25_arabic_qdrant.html
  https://kareemai.com/blog/posts/minishlab/pyversity_qdrant.html

URL is unknown to Google (1 pages):
  https://kareemai.com/til/tils/2025-12-13.html

Crawled - currently not indexed (2 pages):
  https://kareemai.com/blog/posts/seo/seo_rat_journey.html
  https://kareemai.com/blog/posts/dspy/Opencode.html

Alternate page with proper canonical tag (2 pages):
  https://kareemai.com/til/index.html
  https://kareemai.com/index.html


{'Discovered - currently not indexed': [IndexStatus(site_url='sc-domain:kareemai.com', page_url='https://kareemai.com/til/tils/2025-12-15.html', coverage_state='Discovered - currently not indexed', indexing_state='INDEXING_STATE_UNSPECIFIED', checked_at=datetime.datetime(2026, 2, 28, 7, 41, 6, 194623), verdict='NEUTRAL', id=4, last_crawl_time=None, robots_txt_state='ROBOTS_TXT_STATE_UNSPECIFIED'),
  IndexStatus(site_url='sc-domain:kareemai.com', page_url='https://kareemai.com/til/tils/2025-06-06-til.html', coverage_state='Discovered - currently not indexed', indexing_state='INDEXING_STATE_UNSPECIFIED', checked_at=datetime.datetime(2026, 2, 28, 7, 41, 21, 265349), verdict='NEUTRAL', id=6, last_crawl_time=None, robots_txt_state='ROBOTS_TXT_STATE_UNSPECIFIED'),
  IndexStatus(site_url='sc-domain:kareemai.com', page_url='https://kareemai.com/til/tils/2025-05-25-til.html', coverage_state='Discovered - currently not indexed', indexing_state='INDEXING_STATE_UNSPECIFIED', checked_at=datetime.da

In [22]:
# | hide
# Use a session scoped to this cell instead of the `session` name leaked from
# an earlier `with` block.
with db.get_session() as session:
    test_storing_all_index_status = store_all_index_status(
        session,
        auth,
        site_url="sc-domain:kareemai.com",
        sitemap_url="https://kareemai.com/sitemap.xml",
    )

Checking 1/53: https://kareemai.com/papers.html
Checking 2/53: https://kareemai.com/oss/opensource.html
Checking 3/53: https://kareemai.com/til/tils/2025-12-15.html
Checking 4/53: https://kareemai.com/til/tils/2025-12-13.html
Checking 5/53: https://kareemai.com/til/tils/2025-06-06-til.html
Checking 6/53: https://kareemai.com/til/tils/2025-05-25-til.html
Checking 7/53: https://kareemai.com/til/tils/2025-05-21-til.html
Checking 8/53: https://kareemai.com/til/tils/2025-05-19-til.html
Checking 9/53: https://kareemai.com/til/tils/2025-05-17-til.html
Checking 10/53: https://kareemai.com/blog/posts/tools_reviews/super_productivity_app.html
Checking 11/53: https://kareemai.com/blog/posts/speech_recognition/my_dream_job_at_tarteel.html
Checking 12/53: https://kareemai.com/blog/posts/minishlab/ctranslate_maswray.html
Checking 13/53: https://kareemai.com/blog/posts/seo/seo_rat_journey.html
Checking 14/53: https://kareemai.com/blog/posts/seo/how_i_use_nlp_for_seo.html
Checking 15/53: https://karee

In [23]:
# | hide
with db.get_session() as session:
    # Latest row per page, keeping only those whose verdict is not "PASS".
    test_getting_not_indexed = get_not_indexed_pages(
        session, site_url="sc-domain:kareemai.com"
    )
    pprint(test_getting_not_indexed)

[IndexStatus(site_url='sc-domain:kareemai.com', page_url='https://kareemai.com/til/tils/2025-12-15.html', coverage_state='Discovered - currently not indexed', indexing_state='INDEXING_STATE_UNSPECIFIED', checked_at=datetime.datetime(2026, 2, 28, 7, 41, 6, 194623), verdict='NEUTRAL', id=4, last_crawl_time=None, robots_txt_state='ROBOTS_TXT_STATE_UNSPECIFIED'),
 IndexStatus(site_url='sc-domain:kareemai.com', page_url='https://kareemai.com/til/tils/2025-12-13.html', coverage_state='URL is unknown to Google', indexing_state='INDEXING_STATE_UNSPECIFIED', checked_at=datetime.datetime(2026, 2, 28, 7, 41, 13, 745126), verdict='NEUTRAL', id=5, last_crawl_time=None, robots_txt_state='ROBOTS_TXT_STATE_UNSPECIFIED'),
 IndexStatus(site_url='sc-domain:kareemai.com', page_url='https://kareemai.com/til/tils/2025-06-06-til.html', coverage_state='Discovered - currently not indexed', indexing_state='INDEXING_STATE_UNSPECIFIED', checked_at=datetime.datetime(2026, 2, 28, 7, 41, 21, 265349), verdict='NEUTRA

In [24]:
#| export
def submit_sitemap(auth: GSCAuth, site_url: str, sitemap_url: str):
    """Register a sitemap with Google Search Console for the given property.

    Args:
        auth: Provides the credentials used to build the API client.
        site_url: Property identifier passed as ``siteUrl``.
        sitemap_url: Sitemap address passed as ``feedpath``.

    Prints a confirmation line once the API call has returned.
    """
    credentials = auth.get_credentials()
    gsc = build("searchconsole", "v1", credentials=credentials)
    submission = gsc.sitemaps().submit(siteUrl=site_url, feedpath=sitemap_url)
    submission.execute()
    print(f"Submitted: {sitemap_url}")


In [29]:
#|hide
# Calls the Search Console sitemaps API for the sc-domain:kareemai.com property.
submit_sitemap(auth, "sc-domain:kareemai.com", "https://kareemai.com/til/index.xml")


Submitted: https://kareemai.com/til/index.xml


In [25]:
#| export
def get_index_history(session: Session, page_url: str) -> list[IndexStatus]:
    """Return every stored status row for a page, newest check first.

    Rows are matched on ``page_url`` only and ordered by ``checked_at``
    descending.
    """
    newest_first = IndexStatus.checked_at.desc()
    history_query = (
        select(IndexStatus)
        .where(IndexStatus.page_url == page_url)
        .order_by(newest_first)
    )
    return session.exec(history_query).all()

In [28]:
#|hide
# Use a session scoped to this cell instead of the `session` name leaked from
# an earlier `with` block.
with db.get_session() as session:
    history = get_index_history(session, "https://kareemai.com/")
    for h in history:
        print(f"{h.checked_at} | {h.verdict} | {h.coverage_state}")


2026-02-28 07:38:35.012388 | PASS | Submitted and indexed
