# AP Analysis Database Overview

This notebook previews the structured outputs generated by the news edits pipeline. It assumes the AP database has already been processed into `out/ap/analysis.db` (a path relative to the directory the notebook is run from). Use the cells below to inspect table layouts, row counts, and a few starter summaries.

In [None]:
from pathlib import Path
import sqlite3
import pandas as pd

# Display settings for every frame rendered below:
# show all columns (no truncation) and round floats to three decimals.
pd.options.display.max_columns = None
pd.options.display.precision = 3


In [None]:
# Location of the processed AP analysis database, relative to the working directory.
DB_PATH = Path("out/ap/analysis.db")

# Fail fast if the database is missing: sqlite3.connect() would otherwise
# create a new, empty database file at this path. is_file() (rather than
# exists()) also rejects a directory that happens to carry this name.
if not DB_PATH.is_file():
    raise FileNotFoundError(f"Expected database at {DB_PATH.resolve()}")
DB_PATH


Connect to the SQLite database so we can explore its schema and contents.

In [None]:
# Open the SQLite database file located at DB_PATH.
conn = sqlite3.connect(DB_PATH)
# Have this connection return rows as sqlite3.Row objects.
conn.row_factory = sqlite3.Row
conn


## Available Tables

In [None]:
# Read the names of all tables from SQLite's catalog (sqlite_master),
# sorted alphabetically, into a one-column DataFrame.
tables = pd.read_sql(
    "SELECT name FROM sqlite_master "
    "WHERE type='table' "
    "ORDER BY name",
    conn,
)
tables


## Row Counts by Table

In [None]:
# Count the rows of every table listed in `tables`.
row_counts = []
for table in tables['name']:
    # Quote the table name as an SQL identifier (doubling any embedded double
    # quotes) so names with spaces or reserved words still form valid SQL.
    # The quoting is built outside the f-string: nesting double quotes inside
    # a double-quoted f-string terminates the literal early.
    quoted_table = '"' + table.replace('"', '""') + '"'
    count_sql = f"SELECT COUNT(*) AS rowcount FROM {quoted_table}"
    count = pd.read_sql(count_sql, conn)['rowcount'][0]
    row_counts.append({'table': table, 'rows': int(count)})

# Explicit columns keep sort_values('rows') valid even when no tables exist.
# Largest tables are shown first.
(
    pd.DataFrame(row_counts, columns=['table', 'rows'])
    .sort_values('rows', ascending=False)
    .reset_index(drop=True)
)


## Articles Snapshot

In [None]:
# Show the five rows of `articles` with the highest total_edits, including
# the first and final titles and the is_live_blog column.
pd.read_sql(
    """
    SELECT article_id, news_org, url, title_first, title_final, total_edits, is_live_blog
    FROM articles
    ORDER BY total_edits DESC
    LIMIT 5
    """,
    conn,
)


## Version Cadence by Article

In [None]:
# For each article_id in `versions`: the number of version rows plus the
# earliest and latest timestamp_utc. Only the ten articles with the most
# versions are returned.
pd.read_sql(
    """
    SELECT article_id, COUNT(*) AS version_count,
           MIN(timestamp_utc) AS first_seen,
           MAX(timestamp_utc) AS last_seen
    FROM versions
    GROUP BY article_id
    ORDER BY version_count DESC
    LIMIT 10
    """,
    conn,
)


## Top Sources by Mentions

In [None]:
# Group `source_mentions` by (source_canonical, source_type) and return the
# ten groups with the most rows. lede_mentions / title_mentions sum the
# is_in_lede / is_in_title columns.
# NOTE(review): the sums read as counts only if those columns are 0/1 flags —
# confirm against the schema.
pd.read_sql(
    """
    SELECT source_canonical, source_type,
           COUNT(*) AS mentions,
           SUM(is_in_lede) AS lede_mentions,
           SUM(is_in_title) AS title_mentions
    FROM source_mentions
    GROUP BY source_canonical, source_type
    ORDER BY mentions DESC
    LIMIT 10
    """,
    conn,
)


## Numeric Change Overview

In [None]:
# For each change_type in `pair_numeric_changes`: how many rows it has and
# the mean of the confidence column, most frequent change types first.
pd.read_sql(
    """
    SELECT change_type, COUNT(*) AS occurrences,
           AVG(confidence) AS avg_confidence
    FROM pair_numeric_changes
    GROUP BY change_type
    ORDER BY occurrences DESC
    """,
    conn,
)


## Close the Connection

In [None]:
# Close the SQLite connection held in `conn`.
conn.close()
