In [None]:
# | default_exp index_tracking


In [None]:
# | export
from sqlmodel import Session, select
from seo_rat.models import IndexStatus
from seo_rat.gsc_client import GSCAuth
from googleapiclient.discovery import build
from typing import List, Optional,Dict
from datetime import datetime


In [None]:
# | export
def inspect_url_status(auth: GSCAuth, site_url: str, page_url: str) -> dict:
    """Inspect a URL's indexing status through the Search Console API.

    Args:
        auth: Credential provider; ``auth.get_credentials()`` is handed to the
            API client.
        site_url: Search Console property identifier, sent as ``siteUrl``.
        page_url: URL to inspect, sent as ``inspectionUrl``.

    Returns:
        A dict with the keys ``verdict``, ``coverage_state``,
        ``last_crawl_time``, ``indexing_state`` and ``robots_txt_state``.
        ``verdict`` falls back to ``"UNKNOWN"`` when the response has none;
        the other values are ``None`` when absent from the response.
    """
    request = {"inspectionUrl": page_url, "siteUrl": site_url, "languageCode": "en-US"}

    # A new client is built on every call, and this function is invoked once
    # per sitemap URL; the context manager closes the client's HTTP
    # connections instead of leaving them open until garbage collection.
    with build("searchconsole", "v1", credentials=auth.get_credentials()) as service:
        response = service.urlInspection().index().inspect(body=request).execute()

    # Either nesting level may be missing; fall back to an empty mapping so
    # the lookups below yield their defaults instead of raising.
    result = response.get("inspectionResult", {}).get("indexStatusResult", {})

    return {
        "verdict": result.get("verdict", "UNKNOWN"),
        "coverage_state": result.get("coverageState"),
        "last_crawl_time": result.get("lastCrawlTime"),
        "indexing_state": result.get("indexingState"),
        "robots_txt_state": result.get("robotsTxtState"),
    }


In [None]:
# | test
from fastcore.test import test_eq
from seo_rat.index_tracking import (
    inspect_url_status,
    store_index_status,
    get_index_status,
)
from pprint import pprint
from seo_rat.gsc_client import GSCAuth
from seo_rat.sqlite_db import SQLiteDB
from sqlmodel import Session, create_engine, SQLModel

# Credential helper, constructed with no arguments; later cells reuse `auth`.
auth = GSCAuth()

# Test inspect_url_status
# This issues a real request to the Search Console API for the home page.
status = inspect_url_status(auth, "sc-domain:kareemai.com", "https://kareemai.com/")
print(f"Verdict: {status['verdict']}")
# Only asserts that the "verdict" key is present, not what its value is.
test_eq("verdict" in status, True)

# `status` is printed again by a later cell, so keep the name stable.
pprint(status)


Verdict: PASS
{'coverage_state': 'Submitted and indexed',
 'indexing_state': 'INDEXING_ALLOWED',
 'last_crawl_time': '2026-01-07T20:21:30Z',
 'robots_txt_state': 'ALLOWED',
 'verdict': 'PASS'}


In [None]:
# | export
def store_index_status(session: Session, auth: GSCAuth, site_url: str, page_url: str):
    """Fetch the current index status of ``page_url`` and persist it.

    The row is looked up by ``page_url`` alone. When one is found, its status
    fields are overwritten and ``checked_at`` is refreshed; otherwise a new
    ``IndexStatus`` row is added. The session is committed in both cases.
    """
    fields = inspect_url_status(auth, site_url, page_url)

    lookup = select(IndexStatus).where(IndexStatus.page_url == page_url)
    record = session.exec(lookup).first()

    if not record:
        # First time this page is seen: insert a fresh row.
        session.add(IndexStatus(site_url=site_url, page_url=page_url, **fields))
    else:
        # Known page: copy every inspected field onto the stored row.
        for attr in fields:
            setattr(record, attr, fields[attr])
        record.checked_at = datetime.now()

    session.commit()


In [None]:
# | hide
# Database wrapper, constructed with no arguments; later cells reuse `db`.
db = SQLiteDB()
with db.get_session() as session:
    # Performs a live inspection and upserts the result for the home page.
    store_index_status(session, auth, "sc-domain:kareemai.com", "https://kareemai.com/")
    # NOTE(review): the path in this message is hard-coded; presumably it
    # matches the SQLiteDB default location — confirm.
    print("Stored in ./data/seo.db")


# `status` is the dict produced by the earlier inspect_url_status cell; it is
# not re-read from the database here.
pprint(status)


Stored in ./data/seo.db
{'coverage_state': 'Submitted and indexed',
 'indexing_state': 'INDEXING_ALLOWED',
 'last_crawl_time': '2026-01-07T20:21:30Z',
 'robots_txt_state': 'ALLOWED',
 'verdict': 'PASS'}


In [None]:
# | export
def get_index_status(
    session: Session, site_url: str, verdict: Optional[str] = None
) -> List[IndexStatus]:
    """Return the stored index-status rows for a site.

    Args:
        session: Open database session used to run the query.
        site_url: Only rows whose ``site_url`` equals this value are returned.
        verdict: Optional verdict to filter on (e.g. ``"PASS"``). ``None`` or
            an empty string disables the filter.

    Returns:
        The matching ``IndexStatus`` rows.
    """
    stmt = select(IndexStatus).where(IndexStatus.site_url == site_url)
    # Truthiness check on purpose: both None and "" mean "no verdict filter".
    if verdict:
        stmt = stmt.where(IndexStatus.verdict == verdict)
    return session.exec(stmt).all()


In [None]:
# | hide
# Read back every stored row for the site (no verdict filter) and show it.
with db.get_session() as session:
    test_index_status = get_index_status(
        session,
        "sc-domain:kareemai.com",
    )
    pprint(test_index_status)

[IndexStatus(coverage_state='Submitted and indexed', id=1, page_url='https://kareemai.com/', indexing_state='INDEXING_ALLOWED', checked_at=datetime.datetime(2026, 1, 17, 0, 22, 58, 415336), site_url='sc-domain:kareemai.com', verdict='PASS', last_crawl_time='2026-01-07T20:21:30Z', robots_txt_state='ALLOWED'),
 IndexStatus(coverage_state='Submitted and indexed', id=2, page_url='https://kareemai.com/papers.html', indexing_state='INDEXING_ALLOWED', checked_at=datetime.datetime(2026, 1, 17, 0, 21, 52, 636178), site_url='sc-domain:kareemai.com', verdict='PASS', last_crawl_time='2025-12-13T05:14:51Z', robots_txt_state='ALLOWED'),
 IndexStatus(coverage_state='Discovered - currently not indexed', id=3, page_url='https://kareemai.com/oss/opensource.html', indexing_state='INDEXING_STATE_UNSPECIFIED', checked_at=datetime.datetime(2026, 1, 17, 0, 21, 59, 258333), site_url='sc-domain:kareemai.com', verdict='NEUTRAL', last_crawl_time=None, robots_txt_state='ROBOTS_TXT_STATE_UNSPECIFIED'),
 IndexStatu

In [None]:
# | export
def get_not_indexed_pages(session: Session, site_url: str) -> List[IndexStatus]:
    """Return the stored rows for ``site_url`` whose verdict is not ``"PASS"``."""
    belongs_to_site = IndexStatus.site_url == site_url
    not_passing = IndexStatus.verdict != "PASS"

    query = select(IndexStatus).where(belongs_to_site, not_passing)
    return session.exec(query).all()


In [None]:
# | export
def fetch_sitemap_urls(sitemap_url: str) -> List[str]:
    """Download a sitemap and return the text of every ``<loc>`` element.

    Args:
        sitemap_url: Address of the sitemap XML document.

    Returns:
        The non-empty ``<loc>`` values, in document order, with surrounding
        whitespace removed.

    Raises:
        httpx.HTTPStatusError: If the final response is not a success status.
    """
    import httpx
    from bs4 import BeautifulSoup

    # httpx does not follow redirects unless asked to; without this, a
    # redirecting sitemap URL would fail the status check below.
    response = httpx.get(sitemap_url, follow_redirects=True)
    # Fail loudly on error responses instead of parsing an error page and
    # silently returning an empty list.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "xml")
    # Pretty-printed sitemaps can wrap the URL in whitespace/newlines.
    candidates = (loc.text.strip() for loc in soup.find_all("loc"))
    return [url for url in candidates if url]


In [None]:
# | export
def store_all_index_status(
    session: Session, auth: GSCAuth, site_url: str, sitemap_url: str
):
    """Inspect every URL listed in the sitemap and store each result.

    URLs are processed in sitemap order; a progress line is printed before
    each one is checked.
    """
    page_urls = fetch_sitemap_urls(sitemap_url)
    total = len(page_urls)

    position = 0
    for page_url in page_urls:
        position += 1
        print(f"Checking {position}/{total}: {page_url}")
        store_index_status(session, auth, site_url, page_url)


In [None]:
# | hide
# Open a dedicated session for the bulk run, as the other cells do, rather
# than reusing the `session` name left over from an earlier `with` block
# that has already exited. `store_all_index_status` returns nothing, so its
# result is not bound to a variable.
with db.get_session() as session:
    store_all_index_status(
        session,
        auth,
        site_url="sc-domain:kareemai.com",
        sitemap_url="https://kareemai.com/sitemap.xml",
    )

Checking 1/51: https://kareemai.com/papers.html
Checking 2/51: https://kareemai.com/oss/opensource.html
Checking 3/51: https://kareemai.com/til/tils/2025-12-15.html
Checking 4/51: https://kareemai.com/til/tils/2025-12-13.html
Checking 5/51: https://kareemai.com/til/tils/2025-06-06-til.html
Checking 6/51: https://kareemai.com/til/tils/2025-05-25-til.html
Checking 7/51: https://kareemai.com/til/tils/2025-05-21-til.html
Checking 8/51: https://kareemai.com/til/tils/2025-05-19-til.html
Checking 9/51: https://kareemai.com/til/tils/2025-05-17-til.html
Checking 10/51: https://kareemai.com/blog/posts/tools_reviews/super_productivity_app.html
Checking 11/51: https://kareemai.com/blog/posts/ds_and_algo/master_ds.html
Checking 12/51: https://kareemai.com/blog/posts/courses/havrard CS197 AI research experiences.html
Checking 13/51: https://kareemai.com/blog/posts/speech_recognition/muaalm_quran_chance.html
Checking 14/51: https://kareemai.com/blog/posts/minishlab/pyversity_qdrant.html
Checking 15/5

In [None]:
# | hide
# List the stored rows for the site whose verdict is anything other than "PASS".
with db.get_session() as session:
    test_getting_not_indexed = get_not_indexed_pages(
        session, site_url="sc-domain:kareemai.com"
    )
    pprint(test_getting_not_indexed)

[IndexStatus(coverage_state='Discovered - currently not indexed', id=3, page_url='https://kareemai.com/oss/opensource.html', indexing_state='INDEXING_STATE_UNSPECIFIED', checked_at=datetime.datetime(2026, 1, 17, 0, 23, 11, 909950), site_url='sc-domain:kareemai.com', verdict='NEUTRAL', last_crawl_time=None, robots_txt_state='ROBOTS_TXT_STATE_UNSPECIFIED'),
 IndexStatus(coverage_state='Discovered - currently not indexed', id=4, page_url='https://kareemai.com/til/tils/2025-12-15.html', indexing_state='INDEXING_STATE_UNSPECIFIED', checked_at=datetime.datetime(2026, 1, 17, 0, 23, 18, 476308), site_url='sc-domain:kareemai.com', verdict='NEUTRAL', last_crawl_time=None, robots_txt_state='ROBOTS_TXT_STATE_UNSPECIFIED'),
 IndexStatus(coverage_state='Discovered - currently not indexed', id=5, page_url='https://kareemai.com/til/tils/2025-12-13.html', indexing_state='INDEXING_STATE_UNSPECIFIED', checked_at=datetime.datetime(2026, 1, 17, 0, 23, 25, 40007), site_url='sc-domain:kareemai.com', verdict=