# AP Analysis Database Overview

This notebook previews the structured outputs generated by the news edits pipeline. It assumes the AP database has already been processed into `out/ap/analysis.db`. Use the cells below to inspect table layouts, row counts, and a few starter summaries.

In [38]:
from pathlib import Path
import sqlite3
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 3)

In [39]:
DB_PATH = Path("../out/ap/analysis.db")
if not DB_PATH.exists():
    raise FileNotFoundError(f"Expected database at {DB_PATH.resolve()}")
conn = sqlite3.connect(DB_PATH)
conn.row_factory = sqlite3.Row

## Available Tables

In [40]:
tables = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name", conn)
tables

Unnamed: 0,name
0,article_metrics
1,articles
2,entity_mentions
3,pair_anon_named_replacements
4,pair_claims
5,pair_frame_cues
6,pair_numeric_changes
7,pair_source_transitions
8,pair_sources_added
9,pair_sources_removed


## Row Counts by Table

In [41]:
row_counts = []
for table in tables['name']:
    count_sql = f"SELECT COUNT(*) AS rowcount FROM \"{table}\""
    count = pd.read_sql(count_sql, conn)['rowcount'][0]
    row_counts.append({'table': table, 'rows': int(count)})
pd.DataFrame(row_counts).sort_values('rows', ascending=False).reset_index(drop=True)

Unnamed: 0,table,rows
0,entity_mentions,406350
1,versions,7769
2,version_metrics,7157
3,source_mentions,5043
4,pair_frame_cues,3622
5,pair_anon_named_replacements,3384
6,pair_source_transitions,3019
7,sources_agg,1934
8,articles,1602
9,article_metrics,1503


In [42]:
row_counts = []
for table in tables['name']:
    sql = f"SELECT * FROM \"{table}\" limit 5"
    res = pd.read_sql(sql, conn)

In [43]:
res

Unnamed: 0,version_id,article_id,news_org,version_num,timestamp_utc,title,char_len
0,1,1,ap,0,2017-01-18 15:48:03.845167,News from The Associated Press,7057
1,11,1,ap,1,2017-01-18 16:00:03.432431,News from The Associated Press,7170
2,112,1,ap,2,2017-01-18 19:30:03.362584,News from The Associated Press,6596
3,2,2,ap,0,2017-01-18 15:48:06.059903,News from The Associated Press,5319
4,88,2,ap,1,2017-01-18 18:45:03.881484,News from The Associated Press,5410


## Articles Snapshot

In [44]:
pd.read_sql(
    """
    SELECT article_id, news_org, url, title_first, title_final, total_edits, is_live_blog
    FROM articles
    ORDER BY total_edits DESC
    LIMIT 5
    """,
    conn,
)

Unnamed: 0,article_id,news_org,url,title_first,title_final,total_edits,is_live_blog
0,17,ap,http://hosted.ap.org/dynamic/stories/U/US_POLI...,News from The Associated Press,News from The Associated Press,18,0
1,1020,ap,http://hosted.ap.org/dynamic/stories/U/US_FEDE...,News from The Associated Press,News from The Associated Press,18,0
2,1060,ap,http://hosted.ap.org/dynamic/stories/U/US_CONG...,News from The Associated Press,News from The Associated Press,18,0
3,1115,ap,http://hosted.ap.org/dynamic/stories/A/AS_AUST...,News from The Associated Press,News from The Associated Press,18,0
4,1836,ap,http://hosted.ap.org/dynamic/stories/U/US_DAMA...,News from The Associated Press,News from The Associated Press,18,1


## Version Cadence by Article

In [45]:
pd.read_sql(
    """
    SELECT article_id, COUNT(*) AS version_count,
           MIN(timestamp_utc) AS first_seen,
           MAX(timestamp_utc) AS last_seen
    FROM versions
    GROUP BY article_id
    ORDER BY version_count DESC
    LIMIT 10
    """,
    conn,
)

Unnamed: 0,article_id,version_count,first_seen,last_seen
0,3085,19,2017-03-03 16:05:03.034706,2017-06-13 18:44:22.466007
1,2594,19,2017-02-24 06:45:53.852480,2017-08-09 14:55:08.533132
2,2485,19,2017-02-22 19:36:15.057070,2017-08-15 21:51:01.940473
3,2359,19,2017-02-20 22:39:20.298833,2017-11-02 06:32:37.304101
4,1959,19,2017-02-14 18:17:27.435989,2017-08-10 01:03:46.876207
5,1836,19,2017-02-13 02:08:25.604365,2017-02-17 03:55:14.020198
6,1115,19,2017-02-02 03:46:45.397450,2017-11-02 00:30:49.119688
7,1060,19,2017-02-01 18:04:49.393095,2017-02-07 23:47:46.045205
8,1020,19,2017-02-01 08:17:22.094850,2017-11-07 00:59:46.659978
9,17,19,2017-01-18 16:15:28.835009,2017-06-12 20:31:51.540050


## Top Sources by Mentions

In [47]:
pd.read_sql(
    """
    SELECT source_canonical, source_type,
           COUNT(*) AS mentions,
           SUM(is_in_lede) AS lede_mentions,
           SUM(is_in_title) AS title_mentions
    FROM source_mentions
    GROUP BY source_canonical, source_type
    ORDER BY mentions DESC
    LIMIT 10
    """,
    conn,
)


Unnamed: 0,source_canonical,source_type,mentions,lede_mentions,title_mentions
0,Donald Trump,government,184,17,0
1,Barack Obama,government,58,5,0
2,Donald Trump,individual,49,8,0
3,Sean Spicer,government,45,6,0
4,Aldo Fasci,government,40,0,0
5,Marine Le Pen,individual,31,4,0
6,Rick Perry,government,30,0,0
7,Christopher Geldart,government,27,0,0
8,Markeith Loyd,individual,25,1,0
9,officials,government,25,9,0
