In [23]:
from dotenv import load_dotenv
import os
import json

# Populate the process environment via python-dotenv before reading the key
# (presumably from a local `.env` file — confirm where the file is expected to live).
load_dotenv()

# API key later passed to `EventRegistry(apiKey=...)`.
# `os.getenv` yields None when the variable is not set, so a missing key is not detected here.
NEWSAPI_API_KEY = os.getenv("NEWSAPI_API_KEY")

In [24]:
class NewsSource:
    """A news outlet tracked by this notebook.

    Attributes:
        key: Short identifier; used elsewhere in the notebook to build file names
            such as ``<key>_articles.json``.
        domain: The outlet's web domain; passed as ``sourceUri`` when querying articles.
        mbfc_rating: Bias label attached to the outlet (one of the rating strings used
            in ``news_sources``; presumably a Media Bias/Fact Check rating — confirm).
    """

    def __init__(self, key: str, domain: str, mbfc_rating: str):
        # Plain value object: the three constructor arguments are stored unchanged.
        self.key, self.domain, self.mbfc_rating = key, domain, mbfc_rating

In [25]:
# Init all news sources
# The bare string below is a human-readable reference list (outlet name, domain, country notes);
# it is evaluated and discarded, so it has no effect at runtime.
"""
- Right
    - Fox News - [foxnews.com](http://foxnews.com)
    - Breitbart - [breitbart.com](http://breitbart.com)
    - Dailymail - [dailymail.co.uk](http://dailymail.co.uk) → UK
    - The Sun - [thesun.co.uk](http://thesun.co.uk/) → UK
- Right-Center
    - Wall Street Journal - [wsj.com](http://wsj.com)
    - New York Post - [nypost.com](http://nypost.com/)
    - Forbes - [forbes.com](http://forbes.com)
    - India Times - [indiatimes.com](http://indiatimes.com) → India
    - News Week - [newsweek.com](http://newsweek.com/)
- Neutral
    - Reuters - [reuters.com](http://reuters.com)
    - The Hill - [thehill.com](http://thehill.com)
- Left-Center
    - New York Times - [nytimes.com](http://nytimes.com)
    - Washington Post - [washingtonpost.com](http://washingtonpost.com)
    - USA Today - [usatoday.com](http://usatoday.com)
    - Buzz Feed - [buzzfeed.com](http://buzzfeed.com)
    - CBS News - [cbsnews.com](http://cbsnews.com)
    - SF Gate - [sfgate.com](http://sfgate.com/)
    - Bloomberg - [bloomberg.com](http://bloomberg.com)
- Left
    - CNN - [cnn.com](http://cnn.com)
    - People - [people.com](http://people.com)
"""
# One NewsSource per outlet, as (key, domain, mbfc_rating).
# The list order (grouped Right -> Left) is the order in which every later loop processes the sources.
news_sources = [
    # Right
    NewsSource("foxnews", "foxnews.com", "Right"),
    NewsSource("breitbart", "breitbart.com", "Right"),
    NewsSource("dailymail", "dailymail.co.uk", "Right"),
    NewsSource("thesun", "thesun.co.uk", "Right"),
    # Right-Center
    NewsSource("wsj", "wsj.com", "Right-Center"),
    NewsSource("nypost", "nypost.com", "Right-Center"),
    NewsSource("forbes", "forbes.com", "Right-Center"),
    NewsSource("indiatimes", "indiatimes.com", "Right-Center"),
    NewsSource("newsweek", "newsweek.com", "Right-Center"),
    # Neutral
    NewsSource("reuters", "reuters.com", "Neutral"),
    NewsSource("thehill", "thehill.com", "Neutral"),
    # Left-Center
    NewsSource("nytimes", "nytimes.com", "Left-Center"),
    NewsSource("washingtonpost", "washingtonpost.com", "Left-Center"),
    NewsSource("usatoday", "usatoday.com", "Left-Center"),
    NewsSource("buzzfeed", "buzzfeed.com", "Left-Center"),
    NewsSource("cbsnews", "cbsnews.com", "Left-Center"),
    NewsSource("sfgate", "sfgate.com", "Left-Center"),
    NewsSource("bloomberg", "bloomberg.com", "Left-Center"),
    # Left
    NewsSource("cnn", "cnn.com", "Left"),
    NewsSource("people", "people.com", "Left"),   
]

In [26]:
# Root of all generated artefacts; relative, so it resolves against the kernel's working directory.
BASE_PATH = "../../outputs/newsapi/"
# Per-source article dumps are written to and read from "<key>_articles.json" in this folder.
ARTICLES_PATH = BASE_PATH + "articles/"
# Per-source author tables ("<key>_authors.csv") and the merged "all_authors.csv" live in this folder.
AUTHORS_PATH = BASE_PATH + "authors/"

### Fetch The Articles

In [None]:
from eventregistry import EventRegistry, QueryArticlesIter
# Event Registry client, authenticated with the key read from the environment.
er = EventRegistry(apiKey=NEWSAPI_API_KEY)
# Intended upper bound on the number of articles fetched per source.
max_articles = 10000

In [None]:
# Create the output folder up front so a missing directory cannot make the
# write at the end of a long download fail.
os.makedirs(ARTICLES_PATH, exist_ok=True)

for source in news_sources:
    # English-language articles from this outlet's domain, dated 2025-01-01 or later.
    q = QueryArticlesIter(
        sourceUri = source.domain,
        lang = "eng",
        dateStart = "2025-01-01",
    )

    # Drain the result iterator (sorted by the API's "socialScore"), capped at the
    # configured `max_articles` rather than a duplicated literal.
    articles = list(q.execQuery(er, sortBy = "socialScore", maxItems = max_articles))

    # store the articles in a json file
    with open(f"{ARTICLES_PATH}{source.key}_articles.json", "w") as f:
        json.dump(articles, f, indent=4, ensure_ascii=False)


### Analyze Authors

In [9]:
import pandas as pd

In [26]:
# Fixed column layout for every per-source author table. Passing it explicitly to
# pd.DataFrame keeps the CSV header present even when a source has no authors,
# so the file stays readable by pd.read_csv.
AUTHOR_COLUMNS = ["name", "count", "source", "mbfc_rating", "percentage"]

for i, source in enumerate(news_sources):
    print(f"Processing {source.key}... {i+1}/{len(news_sources)}")
    # Load the articles
    with open(f"{ARTICLES_PATH}{source.key}_articles.json", "r") as f:
        articles = json.load(f)

    # author name -> number of article credits (an article with two authors adds one to each)
    authors2count = {}
    for article in articles:
        # "authors" may be absent or explicitly null; both mean "no authors"
        for author in article.get("authors") or []:
            author_name = author.get("name", "")
            authors2count[author_name] = authors2count.get(author_name, 0) + 1

    # Denominator for the share column: total credits across all authors of this source.
    # It is only used when at least one author exists, so it cannot be zero in the division.
    total_count = sum(authors2count.values())

    # One row per author; "percentage" is a fraction in [0, 1], not a value out of 100.
    author_rows = [
        {
            "name": author_name,
            "count": count,
            "source": source.key,
            "mbfc_rating": source.mbfc_rating,
            "percentage": count / total_count,
        }
        for author_name, count in authors2count.items()
    ]

    # Sort the authors by count (stable: ties keep first-seen order)
    author_rows.sort(key=lambda row: row["count"], reverse=True)

    # Write the results to a csv file
    authors_df = pd.DataFrame(author_rows, columns=AUTHOR_COLUMNS)
    authors_df.to_csv(f"{AUTHORS_PATH}{source.key}_authors.csv", index=False)

print("Done!")

Processing foxnews... 1/20
Processing breitbart... 2/20
Processing dailymail... 3/20
Processing thesun... 4/20
Processing wsj... 5/20
Processing nypost... 6/20
Processing forbes... 7/20
Processing indiatimes... 8/20
Processing newsweek... 9/20
Processing reuters... 10/20
Processing thehill... 11/20
Processing nytimes... 12/20
Processing washingtonpost... 13/20
Processing usatoday... 14/20
Processing buzzfeed... 15/20
Processing cbsnews... 16/20
Processing sfgate... 17/20
Processing bloomberg... 18/20
Processing cnn... 19/20
Processing people... 20/20
Done!


In [29]:
# Merge all the authors csv files (sorted by percentage)
# Frames are collected in a list and concatenated once, instead of growing a
# DataFrame inside the loop.
author_frames = []
for source in news_sources:
    csv_path = f"{AUTHORS_PATH}{source.key}_authors.csv"
    try:
        authors_df = pd.read_csv(csv_path)
    except FileNotFoundError:
        # The per-source table was never written.
        print(f"No author data for {source.key}[{source.domain}] (missing file: {csv_path})")
        continue
    except pd.errors.EmptyDataError:
        # The file exists but has no header/rows, i.e. the source produced no author rows.
        print(f"No author data for {source.key}[{source.domain}] (empty file: {csv_path})")
        continue

    if authors_df.empty:
        # Header-only table: nothing to merge.
        print(f"No author data for {source.key}[{source.domain}] (no rows in: {csv_path})")
        continue

    author_frames.append(authors_df)

if author_frames:
    merged_authors_df = pd.concat(author_frames).sort_values(by="percentage", ascending=False)
else:
    # Nothing could be loaded: still emit a well-formed, header-only CSV rather than
    # failing on a missing "percentage" column.
    merged_authors_df = pd.DataFrame(columns=["name", "count", "source", "mbfc_rating", "percentage"])

merged_authors_df.to_csv(f"{AUTHORS_PATH}all_authors.csv", index=False)

No articles found for forbes[forbes.com]
No articles found for usatoday[usatoday.com]
No articles found for buzzfeed[buzzfeed.com]
No articles found for cnn[cnn.com]


### Distribution Plots

In [27]:
# =========================
# News distributions panels
# =========================
# Requirements: pandas, matplotlib, numpy

import os, json, re
from typing import List, Tuple, Any, Dict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ---------- CONFIG ----------
# Figure output folder; created immediately (no error if it already exists).
FIG_DIR = BASE_PATH + "figures"              # where to save the PNGs
os.makedirs(FIG_DIR, exist_ok=True)

# Toggle if you want a log-scale x-axis for the author histogram (useful when long-tailed)
AUTHOR_HIST_LOGX = False

# Authors to ignore (case-insensitive). After cleaning, if an article's authors are all ignored, it counts as "no author".
# Entries must be lowercase: the lookup lowercases the candidate name before testing membership.
IGNORE_AUTHORS = {
    "reuters",
    "pa media",
    "editorial board",
    "associated press",
    "field level media",
    "more",

}
# Regex patterns to ignore (case-insensitive), applied to the cleaned name.
# The single pattern below matches names containing the whole word "desk" or "newsroom".
IGNORE_AUTHOR_PATTERNS = [
    re.compile(r"\b(desk|newsroom)\b", re.I),
]
# If True, drop authors where authors[].isAgency == True (when present in JSON).
IGNORE_AGENCY_FLAG = True

# Rating labels ordered from Right to Left.
# NOTE(review): not referenced by any code visible in this section — confirm whether it is still needed.
RATING_ORDER = ["Right", "Right-Center", "Neutral", "Left-Center", "Left"]

# ---------- AUTHOR EXTRACTION ----------
_SPLIT = re.compile(r'\s*(?:,| and | & |;|\|)\s*', re.IGNORECASE)

def _split_tokens(s: str) -> List[str]:
    return [t for t in _SPLIT.split(s) if t and t.strip()]

def _clean_name(s: str) -> str:
    s = (s or "").strip()
    if not s:
        return ""
    # Remove "By ..." prefix
    s = re.sub(r'^\s*by\s+', '', s, flags=re.IGNORECASE)
    low = s.lower()
    # Drop obvious junk
    if "http://" in low or "https://" in low or ".com" in low or ".co." in low:
        return ""
    # Convert emails to a readable token (left side)
    if "@" in s:
        s = s.split("@")[0].replace(".", " ").strip()
    # collapse whitespace
    s = re.sub(r'\s+', ' ', s)
    return s

def _is_ignored_author(name: str, author_obj: Dict[str, Any] | None = None) -> bool:
    if not name:
        return True
    if author_obj and IGNORE_AGENCY_FLAG:
        try:
            if bool(author_obj.get("isAgency")):
                return True
        except Exception:
            pass
    low = name.lower()
    if low in IGNORE_AUTHORS:
        return True
    for pat in IGNORE_AUTHOR_PATTERNS:
        if pat.search(name):
            return True
    return False

def normalize_authors_from_record(rec: Dict[str, Any]) -> List[str]:
    """
    Preferred: rec['authors'] = [{name, isAgency, ...}, ...]
    Fallbacks: rec['author'] (str or list), rec['byline'] (str)
    Returns a de-duplicated list of cleaned author names after applying ignore rules.
    """
    names: List[str] = []

    # Primary: authors[].name
    if isinstance(rec.get('authors'), list):
        for a in rec['authors']:
            if isinstance(a, dict):
                nm = _clean_name(a.get('name', ''))
                if nm and not _is_ignored_author(nm, a):
                    names.append(nm)
            else:
                nm = _clean_name(str(a))
                if nm and not _is_ignored_author(nm, None):
                    names.append(nm)

    # Fallback: author (str or list)
    if not names and ('author' in rec):
        auth = rec['author']
        if isinstance(auth, list):
            for a in auth:
                nm = _clean_name(str(a))
                if nm and not _is_ignored_author(nm, None):
                    names.append(nm)
        else:
            for p in _split_tokens(str(auth)):
                nm = _clean_name(p)
                if nm and not _is_ignored_author(nm, None):
                    names.append(nm)

    # Fallback: byline (str)
    if not names and isinstance(rec.get('byline'), str):
        for p in _split_tokens(rec['byline']):
            nm = _clean_name(p)
            if nm and not _is_ignored_author(nm, None):
                names.append(nm)

    # de-dup, preserve order
    seen, out = set(), []
    for n in names:
        if n and n not in seen:
            seen.add(n); out.append(n)
    return out

def explode_authors_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `df` with one row per (article, author) pair.

    Adds two columns:
      * 'authors_norm': a single normalised author name per row (missing when the
        article has no usable author);
      * 'has_author': True where 'authors_norm' is present.
    The input frame is left untouched and the result gets a fresh 0..n-1 index.
    """
    per_article_authors = df.apply(normalize_authors_from_record, axis=1)
    one_row_per_author = (
        df.assign(authors_norm=per_article_authors)
          .explode('authors_norm', ignore_index=True)
    )
    one_row_per_author['has_author'] = one_row_per_author['authors_norm'].notna()
    return one_row_per_author

# ---------- LOAD & METRICS ----------
def load_source_df(source_key: str) -> pd.DataFrame:
    """Load `<source_key>_articles.json` from ARTICLES_PATH into a DataFrame.

    The file may hold either a list of article records or an object whose
    'articles' entry holds that list. The returned frame always has a numeric
    'sentiment' column limited to [-1, 1]; unparsable or absent values become NaN.

    Raises:
        FileNotFoundError: when the JSON file does not exist (propagated from `open`).
    """
    json_path = os.path.join(ARTICLES_PATH, f"{source_key}_articles.json")
    with open(json_path, "r") as handle:
        payload = json.load(handle)

    # supports list of records OR {"articles":[...]}
    wrapped = isinstance(payload, dict) and 'articles' in payload
    records = payload['articles'] if wrapped else payload
    frame = pd.DataFrame(records)

    # ensure a sentiment column exists and is numeric in [-1, 1]
    if 'sentiment' not in frame.columns:
        frame['sentiment'] = np.nan
    numeric_sentiment = pd.to_numeric(frame['sentiment'], errors='coerce')
    frame['sentiment'] = numeric_sentiment.clip(-1, 1)
    return frame

def per_author_article_counts(exploded_author_series: pd.Series) -> pd.Series:
    """Count how many rows each author name occupies.

    Args:
        exploded_author_series: One author name per row.

    Returns:
        A Series indexed by author name holding the number of occurrences,
        most frequent first (missing values are not counted).
    """
    tally = exploded_author_series.value_counts()
    return tally

def author_hist_bins(counts_per_author: pd.Series,
                     max_int_bins: int = 40,
                     n_bins: int = 30) -> np.ndarray:
    """
    Bin edges for a histogram of (right-skewed) per-author article counts.

    - Empty input → empty array.
    - If max <= `max_int_bins` → integer-centred edges: [0.5, 1.5, ..., max+0.5],
      i.e. one bin per possible count.
    - Else → `n_bins` equal-width bins spanning [1, max]. That requires
      `n_bins + 1` edges; the earlier version passed the bin count as the number
      of edges and therefore produced one bin fewer than documented.

    Args:
        counts_per_author: Article count per author.
        max_int_bins: Largest maximum count for which one-bin-per-integer is used.
        n_bins: Number of bins in the linear fallback.

    Returns:
        1-D array of bin edges (size 0 when there is no data).
    """
    if counts_per_author.empty:
        return np.array([])
    max_k = int(counts_per_author.max())
    if max_k <= max_int_bins:
        return np.arange(0.5, max_k + 1.5, 1.0)
    # N bins are delimited by N + 1 edges.
    return np.linspace(1, max_k, n_bins + 1)

def author_count_stats(counts_per_author: pd.Series) -> Tuple[float, float]:
    """Mean and sample standard deviation of the per-author article counts.

    Returns:
        (mean, std) with std computed using ddof=1. A single observation gives
        std 0.0; an empty series gives (nan, nan).
    """
    n_authors = len(counts_per_author)
    if n_authors == 0:
        return (np.nan, np.nan)

    mean_count = float(counts_per_author.mean())
    if n_authors > 1:
        spread = float(counts_per_author.std(ddof=1))
    else:
        # The sample deviation is undefined for one value; report 0 instead of NaN.
        spread = 0.0
    return mean_count, spread

def sentiment_stats(sentiments: pd.Series) -> Dict[str, float]:
    """Summarise sentiment scores.

    Missing values are dropped and the rest is limited to [-1, 1] first.

    Returns:
        A dict with
          * 'mean', 'std': mean and sample standard deviation (ddof=1; 0.0 for one value);
          * 'neg': share of scores below -0.2;
          * 'neu': share of scores within [-0.2, 0.2];
          * 'pos': share of scores above 0.2.
        Without any score: mean/std are NaN and the shares are pos=0, neu=1, neg=0.
    """
    scores = sentiments.dropna().astype(float).clip(-1, 1)
    if len(scores) == 0:
        return {"mean": np.nan, "std": np.nan, "pos": 0.0, "neu": 1.0, "neg": 0.0}

    is_negative = scores < -0.2
    is_positive = scores > 0.2
    # No NaNs remain, so "neither negative nor positive" is exactly -0.2 <= score <= 0.2.
    is_neutral = ~(is_negative | is_positive)

    if len(scores) > 1:
        spread = float(scores.std(ddof=1))
    else:
        spread = 0.0

    return {
        "mean": float(scores.mean()),
        "std": spread,
        "pos": float(is_positive.mean()),
        "neu": float(is_neutral.mean()),
        "neg": float(is_negative.mean()),
    }

def _label_hbar_counts(ax):
    for p in ax.patches:
        w = p.get_width()
        if w > 0:
            ax.annotate(f'{int(w)}', (w, p.get_y()+p.get_height()/2),
                        ha='left', va='center', fontsize=9, xytext=(3,0), textcoords='offset points')

# ---------- PANELS ----------
def _plot_log_author_hist(ax, counts_per_author: pd.Series, title_prefix: str):
    """
    Bottom-right tile: histogram of log10(articles per author).
    Shows μ and σ in log space and highlights modal bin.
    """
    if counts_per_author.empty:
        ax.text(0.5, 0.5, "No author data", ha='center', va='center')
        ax.set_title(f"{title_prefix} Log Author Count Distribution")
        ax.set_xticks([]); ax.set_yticks([])
        return

    vals = counts_per_author.values.astype(float)
    vals = vals[vals > 0]  # safety
    log_vals = np.log10(vals)

    # bins: auto works well on log data; fall back to 20 if needed
    try:
        bins = np.histogram_bin_edges(log_vals, bins='auto')
    except Exception:
        bins = np.linspace(log_vals.min(), log_vals.max(), 20)

    n, b, patches = ax.hist(log_vals, bins=bins, edgecolor='black')
    # highlight modal bin
    if len(n) > 0:
        modal_idx = int(np.argmax(n))
        if 0 <= modal_idx < len(patches):
            patches[modal_idx].set_facecolor('tab:orange')

    mu_log = float(log_vals.mean()) if log_vals.size else np.nan
    sd_log = float(log_vals.std(ddof=1)) if log_vals.size > 1 else 0.0
    if not np.isnan(mu_log):
        ax.axvline(mu_log, linestyle='-', linewidth=2)

    ax.set_xlabel("log10(articles per author)")
    ax.set_ylabel("# of authors")
    ax.set_title(f"{title_prefix} Log Author Count Distribution")
    # annotate μ, σ (log space)
    ax.text(0.98, 0.95,
            f"μ(log10)={mu_log:.3f}\nσ(log10)={sd_log:.3f}",
            ha='right', va='top', transform=ax.transAxes, fontsize=10,
            bbox=dict(boxstyle="round,pad=0.3", alpha=0.1))

def make_panel_for_source(source: NewsSource, df: pd.DataFrame, save_dir: str):
    """
    Render and save the 2x2 distributions panel for a single news source.

    Tiles:
      (1,1) histogram of articles per author (linear x, or log x when AUTHOR_HIST_LOGX is set)
      (1,2) ten most frequent authors as horizontal bars
      (2,1) histogram of article sentiment
      (2,2) histogram of log10(articles per author)

    Args:
        source: The outlet being plotted; only `source.key` is read (titles and file name).
        df: Article records of that outlet. Not modified (a copy is used). May be empty
            or lack author/sentiment columns; the affected tiles then show a placeholder.
        save_dir: Folder in which `<key>_panel.png` is written.

    Returns:
        None. The figure is saved at 160 dpi and then closed.
    """
    # Standardize columns for safety
    df = df.copy()
    if 'sentiment' not in df.columns:
        df['sentiment'] = np.nan
    # Without any author-bearing column, give every article an empty author list.
    if not any(col in df.columns for col in ('authors','author','byline')):
        df['authors'] = [[] for _ in range(len(df))]

    total_articles = int(len(df))

    # Authors (with ignore filtering)
    author_lists = df.apply(normalize_authors_from_record, axis=1)
    # Articles whose author list is empty after cleaning/filtering.
    no_author_count = int((author_lists.apply(len) == 0).sum())

    # One row per (article, author); articles without authors keep one row with a missing name.
    exploded = df.copy()
    exploded['authors_norm'] = author_lists
    exploded = exploded.explode('authors_norm', ignore_index=True)
    exploded['has_author'] = exploded['authors_norm'].notna()

    top_authors = (exploded.loc[exploded['has_author'], 'authors_norm']
                   .value_counts()
                   .head(10))
    # Sum of the article counts of the (up to) ten authors shown.
    top10_total = int(top_authors.sum()) if not top_authors.empty else 0

    counts_per_author = per_author_article_counts(
        exploded.loc[exploded['has_author'], 'authors_norm']
    )
    mu_auth, sigma_auth = author_count_stats(counts_per_author)
    bins_auth = author_hist_bins(counts_per_author)

    # Sentiment (JSON only)
    s_stats = sentiment_stats(df['sentiment'])
    avg_sent, std_sent = s_stats['mean'], s_stats['std']

    # Plotting
    fig = plt.figure(figsize=(14, 10), constrained_layout=True)
    gs = fig.add_gridspec(2, 2, height_ratios=[1,1])
    ax11 = fig.add_subplot(gs[0,0])   # author count histogram (linear)
    ax12 = fig.add_subplot(gs[0,1])   # top 10 authors
    ax21 = fig.add_subplot(gs[1,0])   # sentiment histogram
    ax22 = fig.add_subplot(gs[1,1])   # log author count histogram (NEW)

    # (1,1) Author count histogram (linear x)
    if not counts_per_author.empty and bins_auth.size > 0:
        n, bins, patches = ax11.hist(counts_per_author.values, bins=bins_auth, edgecolor='black')
        ax11.set_xlabel("Articles per author")
        ax11.set_ylabel("# of authors")
        ax11.set_title(f"[{source.key}] Author→Article Count Distribution")
        # highlight modal bin
        if len(n) > 0:
            modal_idx = int(np.argmax(n))
            if 0 <= modal_idx < len(patches):
                patches[modal_idx].set_facecolor('tab:orange')
        # mean line
        if not np.isnan(mu_auth):
            ax11.axvline(mu_auth, linestyle='-', linewidth=2)
        if AUTHOR_HIST_LOGX:
            ax11.set_xscale('log')
        # annotation
        ax11.text(0.98, 0.95,
                  f"Total articles: {total_articles}\n"
                  f"No-author articles: {no_author_count}\n"
                  f"μ (per-author count): {mu_auth:.2f}\nσ: {sigma_auth:.2f}",
                  ha='right', va='top', transform=ax11.transAxes, fontsize=10,
                  bbox=dict(boxstyle="round,pad=0.3", alpha=0.1))
    else:
        # Placeholder tile when no author survives the filtering.
        ax11.text(0.5, 0.5, "No author data", ha='center', va='center')
        ax11.set_title(f"[{source.key}] Author→Article Count Distribution")
        ax11.set_xticks([]); ax11.set_yticks([])

    # (1,2) Top 10 authors
    if not top_authors.empty:
        # Reversed before plotting so the series' first (most frequent) author is plotted last.
        top_authors.iloc[::-1].plot(kind='barh', ax=ax12)
        ax12.set_xlabel("Article count")
        ax12.set_ylabel("Author")
        ax12.set_title(f"[{source.key}] Top 10 Authors (total={top10_total})")
        # highlight the top bar (rightmost)
        # The last patch belongs to the last plotted entry, i.e. the most frequent author.
        if len(ax12.patches) > 0:
            ax12.patches[-1].set_facecolor('tab:red')
        _label_hbar_counts(ax12)
    else:
        ax12.text(0.5, 0.5, "No authors found", ha='center', va='center')
        ax12.set_title(f"[{source.key}] Top 10 Authors (total=0)")
        ax12.set_xticks([]); ax12.set_yticks([])

    # (2,1) Sentiment histogram
    s = df['sentiment'].dropna().astype(float).clip(-1,1)
    if not s.empty:
        n_s, bins_s, patches_s = ax21.hist(s, bins=21, edgecolor='black')
        ax21.set_xlim(-1, 1)
        ax21.set_xlabel("Sentiment (JSON, [-1,1])")
        ax21.set_ylabel("# of articles")
        ax21.set_title(f"[{source.key}] Sentiment Distribution")
        # Dashed reference line at zero sentiment; solid line at the mean.
        ax21.axvline(0, linestyle='--', linewidth=1)
        if not np.isnan(avg_sent):
            ax21.axvline(avg_sent, linestyle='-', linewidth=2)
        # Shares use the same ±0.2 thresholds as sentiment_stats.
        ax21.text(0.98, 0.95,
                  (f"μ={avg_sent:.3f}\nσ={std_sent:.3f}\n"
                   f"pos>{0.2}: {s_stats['pos']*100:.1f}%\n"
                   f"-0.2≤neu≤0.2: {s_stats['neu']*100:.1f}%\n"
                   f"neg<-0.2: {s_stats['neg']*100:.1f}%")
                  if not np.isnan(avg_sent) else "μ=NaN\nσ=NaN",
                  ha='right', va='top', transform=ax21.transAxes, fontsize=10,
                  bbox=dict(boxstyle="round,pad=0.3", alpha=0.1))
    else:
        ax21.text(0.5, 0.5, "No sentiment data in JSON", ha='center', va='center')
        ax21.set_title(f"[{source.key}] Sentiment Distribution")
        ax21.set_xticks([]); ax21.set_yticks([])

    # (2,2) NEW: Log author count histogram
    _plot_log_author_hist(ax22, counts_per_author, f"[{source.key}]")

    fig.suptitle(f"{source.key} — Distributions Panel", fontsize=14)  # (kept exactly as you asked)
    out_path = os.path.join(save_dir, f"{source.key}_panel.png")
    fig.savefig(out_path, dpi=160)
    # Close explicitly: this runs once per source and figures would otherwise accumulate.
    plt.close(fig)

def make_panel_cumulative(all_records: List[Tuple[NewsSource, pd.DataFrame]], save_dir: str):
    """
    Render and save one 2x2 distributions panel over the articles of all sources combined.

    The tiles mirror the per-source panel: author-count histogram, top 10 authors,
    sentiment histogram and log10 author-count histogram. Author names are pooled
    by name across sources.

    Args:
        all_records: (source, articles frame) pairs. Entries whose frame is None or
            empty are skipped.
        save_dir: Folder in which `all_sources_panel.png` is written.

    Returns:
        None. The figure is saved at 160 dpi and closed. If no entry has data, a
        figure containing only a "No data across sources" message is saved instead.
    """
    frames = []
    for src, df in all_records:
        if df is None or df.empty:
            continue
        tmp = df.copy()
        # Tag each row with its origin. NOTE(review): these two columns are not read
        # again inside this function.
        tmp['__rating__'] = src.mbfc_rating
        tmp['__source__'] = src.key
        # ensure sentiment column exists
        if 'sentiment' not in tmp.columns:
            tmp['sentiment'] = np.nan
        frames.append(tmp)

    if not frames:
        # Nothing to aggregate: still produce the output file so callers find one.
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.text(0.5, 0.5, "No data across sources", ha='center', va='center')
        fig.savefig(os.path.join(save_dir, "all_sources_panel.png"), dpi=160)
        plt.close(fig)
        return

    big = pd.concat(frames, ignore_index=True)
    if 'sentiment' not in big.columns:
        big['sentiment'] = np.nan

    # authors (WITH ignore filtering)
    author_lists = big.apply(normalize_authors_from_record, axis=1)
    # One row per (article, author); articles without authors keep one row with a missing name.
    exploded = big.copy()
    exploded['authors_norm'] = author_lists
    exploded = exploded.explode('authors_norm', ignore_index=True)
    exploded['has_author'] = exploded['authors_norm'].notna()

    top_authors = (exploded.loc[exploded['has_author'], 'authors_norm']
                   .value_counts()
                   .head(10))
    top10_total = int(top_authors.sum()) if not top_authors.empty else 0

    counts_per_author = per_author_article_counts(
        exploded.loc[exploded['has_author'], 'authors_norm']
    )
    mu_auth, sigma_auth = author_count_stats(counts_per_author)
    bins_auth = author_hist_bins(counts_per_author)

    # sentiment
    big['sentiment'] = pd.to_numeric(big['sentiment'], errors='coerce').clip(-1, 1)
    s_stats = sentiment_stats(big['sentiment'])
    avg_sent, std_sent = s_stats['mean'], s_stats['std']

    # plotting
    fig = plt.figure(figsize=(14, 10), constrained_layout=True)
    gs = fig.add_gridspec(2, 2, height_ratios=[1,1])
    ax11 = fig.add_subplot(gs[0,0])
    ax12 = fig.add_subplot(gs[0,1])
    ax21 = fig.add_subplot(gs[1,0])
    ax22 = fig.add_subplot(gs[1,1])

    # (1,1) cumulative author histogram (linear)
    if not counts_per_author.empty and bins_auth.size > 0:
        n, bins, patches = ax11.hist(counts_per_author.values, bins=bins_auth, edgecolor='black')
        ax11.set_xlabel("Articles per author")
        ax11.set_ylabel("# of authors")
        ax11.set_title("[All] Author→Article Count Distribution")
        # Colour the fullest bin.
        if len(n) > 0:
            modal_idx = int(np.argmax(n))
            if 0 <= modal_idx < len(patches):
                patches[modal_idx].set_facecolor('tab:orange')
        # Vertical line at the mean per-author count.
        if not np.isnan(mu_auth):
            ax11.axvline(mu_auth, linestyle='-', linewidth=2)
        if AUTHOR_HIST_LOGX:
            ax11.set_xscale('log')
        ax11.text(0.98, 0.95,
                  f"μ (per-author count): {mu_auth:.2f}\nσ: {sigma_auth:.2f}",
                  ha='right', va='top', transform=ax11.transAxes, fontsize=10,
                  bbox=dict(boxstyle="round,pad=0.3", alpha=0.1))
    else:
        ax11.text(0.5, 0.5, "No author data", ha='center', va='center')
        ax11.set_title("[All] Author→Article Count Distribution")
        ax11.set_xticks([]); ax11.set_yticks([])

    # (1,2) cumulative top 10 authors
    if not top_authors.empty:
        # Reversed before plotting so the most frequent author is plotted last.
        top_authors.iloc[::-1].plot(kind='barh', ax=ax12)
        ax12.set_xlabel("Article count")
        ax12.set_ylabel("Author")
        ax12.set_title(f"[All] Top 10 Authors (total={top10_total})")
        # The last patch belongs to the most frequent author; colour it red.
        if len(ax12.patches) > 0:
            ax12.patches[-1].set_facecolor('tab:red')
        _label_hbar_counts(ax12)
    else:
        ax12.text(0.5, 0.5, "No authors found", ha='center', va='center')
        ax12.set_title(f"[All] Top 10 Authors (total=0)")
        ax12.set_xticks([]); ax12.set_yticks([])

    # (2,1) cumulative sentiment histogram
    s = big['sentiment'].dropna().astype(float).clip(-1,1)
    if not s.empty:
        n_s, bins_s, patches_s = ax21.hist(s, bins=21, edgecolor='black')
        ax21.set_xlim(-1, 1)
        ax21.set_xlabel("Sentiment [-1,1]")
        ax21.set_ylabel("# of articles")
        ax21.set_title("[All] Sentiment Distribution")
        # Dashed reference line at zero sentiment; solid line at the mean.
        ax21.axvline(0, linestyle='--', linewidth=1)
        if not np.isnan(avg_sent):
            ax21.axvline(avg_sent, linestyle='-', linewidth=2)
        ax21.text(0.98, 0.95,
                  (f"μ={avg_sent:.3f}\nσ={std_sent:.3f}"),
                  ha='right', va='top', transform=ax21.transAxes, fontsize=10,
                  bbox=dict(boxstyle="round,pad=0.3", alpha=0.1))
    else:
        ax21.text(0.5, 0.5, "No sentiment data in JSON", ha='center', va='center')
        ax21.set_title("[All] Sentiment Distribution")
        ax21.set_xticks([]); ax21.set_yticks([])

    # (2,2) NEW: cumulative log author count histogram
    _plot_log_author_hist(ax22, counts_per_author, "[All]")

    fig.suptitle("All Sources — Distributions Panel", fontsize=14)  # (kept)
    fig.savefig(os.path.join(save_dir, "all_sources_panel.png"), dpi=160)
    plt.close(fig)

# ---------- MAIN ----------
# Driver: build one panel per source, then one panel over all sources.
# The captured output below this cell shows the guard passing when run in the notebook.
if __name__ == "__main__":
    # (source, articles frame) pairs, collected for the cumulative panel.
    all_data: List[Tuple[NewsSource, pd.DataFrame]] = []

    for i, source in enumerate(news_sources):
        print(f"Processing {source.key}... {i+1}/{len(news_sources)}")
        try:
            df_src = load_source_df(source.key)
        except FileNotFoundError:
            print(f"  ⚠️ Missing file: {source.key}_articles.json — skipping.")
            # Create an empty frame with a sentiment column so downstream code is safe
            df_src = pd.DataFrame({'sentiment': []})
        all_data.append((source, df_src))
        # A panel is still written for a source without data (its tiles show placeholders).
        make_panel_for_source(source, df_src, FIG_DIR)

    # Empty frames are skipped inside the cumulative panel.
    make_panel_cumulative(all_data, FIG_DIR)
    print(f"✅ Done. Saved {len(news_sources)} per-source panels + 1 cumulative panel → {os.path.abspath(FIG_DIR)}")


Processing foxnews... 1/20
Processing breitbart... 2/20
Processing dailymail... 3/20
Processing thesun... 4/20
Processing wsj... 5/20
Processing nypost... 6/20
Processing forbes... 7/20
Processing indiatimes... 8/20
Processing newsweek... 9/20
Processing reuters... 10/20
Processing thehill... 11/20
Processing nytimes... 12/20
Processing washingtonpost... 13/20
Processing usatoday... 14/20
Processing buzzfeed... 15/20
Processing cbsnews... 16/20
Processing sfgate... 17/20
Processing bloomberg... 18/20
Processing cnn... 19/20
Processing people... 20/20
✅ Done. Saved 20 per-source panels + 1 cumulative panel → /Users/baturalpkabadayi/Dev/Research/JournalistLLM/outputs/newsapi/figures
