# AP Analysis Database Overview

This notebook previews the structured outputs generated by the news edits pipeline. It assumes the AP database has already been processed into `out/ap/analysis.db`; the connection cell opens it as `../out/ap/analysis.db`, resolved relative to the notebook's working directory. Use the cells below to inspect table layouts, row counts, and a few starter summaries.

In [1]:
from pathlib import Path
import sqlite3
import pandas as pd

# Notebook-wide display settings: show every column of wide frames and
# render floats with three digits of precision.
pd.options.display.max_columns = None
pd.options.display.precision = 3

In [6]:
# Location of the processed AP analysis database (relative path, so it is
# resolved against the kernel's working directory).
DB_PATH = Path("../out/ap/analysis.db")

# Fail early with an absolute path in the message instead of handing a
# missing file to sqlite3.
if DB_PATH.exists():
    conn = sqlite3.connect(DB_PATH)
    # Rows returned through the raw connection support access by column name.
    conn.row_factory = sqlite3.Row
else:
    raise FileNotFoundError(f"Expected database at {DB_PATH.resolve()}")

## Available Tables

In [7]:
# List every table recorded in the SQLite schema catalogue, alphabetically.
tables = pd.read_sql_query(
    sql="SELECT name FROM sqlite_master WHERE type='table' ORDER BY name",
    con=conn,
)
tables

Unnamed: 0,name
0,article_metrics
1,articles
2,entity_mentions
3,pair_anon_named_replacements
4,pair_claims
5,pair_frame_cues
6,pair_numeric_changes
7,pair_sources_added
8,pair_sources_removed
9,pair_title_events


## Row Counts by Table

In [8]:
# Count the rows of every table listed in `tables` and show the largest first.
row_counts = []
for table in tables['name']:
    # Identifiers cannot be bound as SQL parameters, so the name is embedded
    # as a double-quoted identifier; any embedded double quote is doubled so
    # the identifier stays well-formed.
    quoted_table = '"' + str(table).replace('"', '""') + '"'
    count_sql = f"SELECT COUNT(*) AS rowcount FROM {quoted_table}"
    count = pd.read_sql(count_sql, conn)['rowcount'].iloc[0]
    row_counts.append({'table': table, 'rows': int(count)})
# Explicit columns keep the sort from raising KeyError when the database
# contains no tables (an empty list would otherwise yield a column-less frame).
(
    pd.DataFrame(row_counts, columns=['table', 'rows'])
    .sort_values('rows', ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,table,rows
0,entity_mentions,441
1,pair_numeric_changes,21
2,source_mentions,21
3,pair_claims,10
4,sources_agg,8
5,pair_frame_cues,4
6,version_metrics,3
7,versions,3
8,pair_anon_named_replacements,2
9,version_pairs,2


In [20]:
# Show one example claim record as a plain dict: read the first five rows of
# pair_claims and pick the row labelled 1 (the second row returned).
(
    pd.read_sql_query(sql='select * from pair_claims limit 5', con=conn)
    .loc[1]
    .to_dict()
)

{'from_version_id': '1',
 'to_version_id': '11',
 'claim_id': 'C2',
 'proposition': 'Manning was an intelligence officer.',
 'status': 'contradicted',
 'change_note': "Manning's role was corrected from 'intelligence officer' to 'intelligence analyst'.",
 'confidence': 5.0}

## Articles Snapshot

In [10]:
# Five most-edited articles with their first and final titles.
# article_id is added as a tie-breaker: with LIMIT, rows that share the same
# total_edits would otherwise be returned in an unspecified order, so the
# preview could differ between runs.
pd.read_sql(
    """
    SELECT article_id, news_org, url, title_first, title_final, total_edits, is_live_blog
    FROM articles
    ORDER BY total_edits DESC, article_id
    LIMIT 5
    """,
    conn,
)

Unnamed: 0,article_id,news_org,url,title_first,title_final,total_edits,is_live_blog
0,1,ap,http://hosted.ap.org/dynamic/stories/U/US_OBAM...,News from The Associated Press,News from The Associated Press,2,0


## Version Cadence by Article

In [11]:
# Per-article version count with the earliest and latest timestamp_utc,
# limited to the ten articles with the most versions.
# article_id breaks ties so the ten rows kept by LIMIT are the same on
# every run when several articles have the same version count.
pd.read_sql(
    """
    SELECT article_id, COUNT(*) AS version_count,
           MIN(timestamp_utc) AS first_seen,
           MAX(timestamp_utc) AS last_seen
    FROM versions
    GROUP BY article_id
    ORDER BY version_count DESC, article_id
    LIMIT 10
    """,
    conn,
)

Unnamed: 0,article_id,version_count,first_seen,last_seen
0,1,3,2017-01-18 15:48:03.845167,2017-01-18 19:30:03.362584


## Top Sources by Mentions

In [12]:
# Ten most-mentioned sources, with how many of those mentions are flagged
# is_in_lede / is_in_title.
# Many sources share the same mention count, so the grouping columns are
# appended to ORDER BY; without them the rows kept by LIMIT 10 (and their
# order) are unspecified among ties.
pd.read_sql(
    """
    SELECT source_canonical, source_type,
           COUNT(*) AS mentions,
           SUM(is_in_lede) AS lede_mentions,
           SUM(is_in_title) AS title_mentions
    FROM source_mentions
    GROUP BY source_canonical, source_type
    ORDER BY mentions DESC, source_canonical, source_type
    LIMIT 10
    """,
    conn,
)


Unnamed: 0,source_canonical,source_type,mentions,lede_mentions,title_mentions
0,Barack Obama,government,3,0,0
1,Chase Strangio,civil_society,3,0,0
2,Josh Earnest,government,3,0,0
3,Neil Eggleston,government,3,0,0
4,Paul Ryan,government,3,0,0
5,White House officials,government,3,0,0
6,Melinda Taylor,civil_society,2,0,0
7,Sean Spicer,government,1,0,0


## Numeric Change Overview

In [13]:
# Tally numeric changes by change_type, most frequent first, alongside the
# mean confidence recorded for each type.
pd.read_sql_query(
    sql="""
    SELECT change_type, COUNT(*) AS occurrences,
           AVG(confidence) AS avg_confidence
    FROM pair_numeric_changes
    GROUP BY change_type
    ORDER BY occurrences DESC
    """,
    con=conn,
)


Unnamed: 0,change_type,occurrences,avg_confidence
0,update,19,5.0
1,refinement,1,5.0
2,correction,1,5.0
