<a href="https://colab.research.google.com/github/bhussn/Bias-Velocity-Research-Project/blob/main/Narrative_Velocity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers torch numpy

In [None]:
# Step 1: Web Scrape Articles
import feedparser
from newspaper import Article
import csv
from datetime import datetime, timedelta, timezone
import requests
import re

# HTTP headers sent with the HEAD/GET probes in url_exists().
# NOTE(review): the contact URL in the User-Agent is a placeholder
# ("yourdomain.com") — replace it with a real contact address before use.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; NewsScraper/1.0; +http://yourdomain.com)'
}

# RSS feeds to scrape, nested as: topic -> ideology bucket -> outlet name -> feed URL.
# The outlet name is written to the CSV "outlet" column by the scraping loop,
# and the ideology bucket is written to "ideological_stance".
topics = {
    "immigration": {
        "conservative": {
            "Fox News Politics": "http://feeds.foxnews.com/foxnews/politics",
            "Fox News US Immigration": "https://www.foxnews.com/category/us/immigration/feed",
            "The Daily Caller": "https://dailycaller.com/feed/",
            "The Blaze": "https://www.theblaze.com/rss",
            "Breitbart": "http://feeds.feedburner.com/breitbart",
            "Breitbart Politics": "https://www.breitbart.com/politics/feed/",
            "National Review": "https://www.nationalreview.com/feed/",
            "The Washington Times": "https://www.washingtontimes.com/rss/feed/",
            "The Epoch Times": "https://www.theepochtimes.com/feed.xml",
            "Newsmax": "https://www.newsmax.com/rss/",
            "Townhall": "https://townhall.com/rss/rss.xml",
            "The Federalist": "https://thefederalist.com/feed/",
            "Daily Wire": "https://www.dailywire.com/rss",
            "One America News": "https://www.oann.com/feed/",
            "Washington Examiner": "https://www.washingtonexaminer.com/feed/rss",
            "American Thinker": "https://www.americanthinker.com/rss.xml",
            "The American Conservative": "https://www.theamericanconservative.com/feed/",
            "The Daily Signal": "https://www.dailysignal.com/feed/"
        },
        "moderate": {
            "Reuters": "https://www.reutersagency.com/feed/?best-topics=politics",
            "Associated Press Top News": "https://apnews.com/apf-topnews?format=rss",
            # NOTE(review): identical URL to "Associated Press Top News" above —
            # confirm the intended politics feed.
            "Associated Press Politics": "https://apnews.com/apf-topnews?format=rss",
            "NPR General": "https://www.npr.org/rss/rss.php?id=1001",
            "NPR Politics": "https://www.npr.org/rss/rss.php?id=1014",
            "USA Today Nation": "https://rssfeeds.usatoday.com/UsatodaycomNation-TopStories",
            "USA Today Politics": "https://rssfeeds.usatoday.com/UsatodaycomPolitics-TopStories",
            "PBS NewsHour": "https://www.pbs.org/newshour/feed/",
            "PBS Newshour Politics": "https://www.pbs.org/newshour/politics/feed/",
            # NOTE(review): the path suggests a podcast feed rather than articles — verify.
            "Bloomberg Politics": "https://www.bloomberg.com/feed/podcast/politics.xml",
            "Politico": "https://www.politico.com/rss/politics08.xml",
            "The Hill": "https://thehill.com/rss/syndicator/19109",
            "CBS News Politics": "https://www.cbsnews.com/latest/rss/politics",
            "ABC News Politics": "https://abcnews.go.com/abcnews/politicsheadlines",
            "The Wall Street Journal General": "https://www.wsj.com/xml/rss/3_7014.xml",
            # NOTE(review): identical URL to "The Wall Street Journal General" above —
            # confirm the intended politics feed.
            "The Wall Street Journal Politics": "https://www.wsj.com/xml/rss/3_7014.xml",
            "Financial Times": "https://www.ft.com/?format=rss",
            "The Christian Science Monitor": "https://www.csmonitor.com/feeds/rss",
            "Axios Politics": "https://www.axios.com/feed.xml",
            "BBC News US & Canada": "http://feeds.bbci.co.uk/news/world/us_and_canada/rss.xml",
            "Al Jazeera English": "https://www.aljazeera.com/xml/rss/all.xml"
        },
        "liberal": {
            "CNN Politics": "http://rss.cnn.com/rss/edition_politics.rss",
            "CNN Immigration": "http://rss.cnn.com/rss/edition_us_immigration.rss",
            "The Guardian Immigration": "https://www.theguardian.com/us-news/immigration/rss",
            "Mother Jones": "https://www.motherjones.com/feed/",
            "MSNBC Latest": "http://www.msnbc.com/feeds/latest",
            "HuffPost Politics": "https://www.huffpost.com/section/politics/feed",
            "Vox": "https://www.vox.com/rss/index.xml",
            "Daily Kos": "https://www.dailykos.com/rss/main",
            "Salon": "https://www.salon.com/feed/",
            "The New Republic": "https://newrepublic.com/rss.xml",
            "The Atlantic": "https://www.theatlantic.com/feed/all/",
            "Slate": "https://slate.com/feed",
            "ThinkProgress (Archive)": "https://archive.thinkprogress.org/feed/",
            "The Nation": "https://www.thenation.com/feed/",
            "Common Dreams": "https://www.commondreams.org/feed/rss.xml",
            "Raw Story": "https://www.rawstory.com/rss/",
            "Truthout": "https://truthout.org/feed/",
            "Democracy Now": "https://www.democracynow.org/democracynow.rss"
        }
    }
}

# Collection for an ideology bucket stops once it holds this many articles.
MAX_ARTICLES_PER_IDEOLOGY = 3
# Upper bound on articles taken from a single outlet within a bucket.
# NOTE(review): with the current values this cap (4) is larger than the
# per-ideology cap (3), so it can never be the limiting factor.
MAX_ARTICLES_PER_OUTLET = 4

# Immigration-related keywords/phrases matched (as whole words) against article
# titles. Each phrase is listed exactly once; casing here is for readability
# only because the list is lowercased below.
KEYWORDS = [
    # Enforcement agencies, detention and removal
    "ice",
    "immigration and customs enforcement",
    "immigration enforcement",
    "deportation",
    "border patrol",
    "customs and border protection",
    "cbp",
    "detention center",
    "immigrant detention",
    "immigration raids",
    "immigration crackdown",
    "immigration policy",
    "immigration reform",
    "immigration laws",
    "immigration agents",
    "immigration officials",
    "border security",
    "migrant detention",
    "immigration detention facility",
    "immigration court",
    "immigrant rights",
    "family separation",
    "sanctuary cities",
    "deportee",
    "ice agents",
    "undocumented immigrants",
    "migrant caravan",
    "asylum seekers",
    "border crossing",
    "illegal immigration",
    "immigration ban",
    "visa policy",
    "naturalization",
    "immigration detention center",
    "immigration raid",
    # General migration / status terms
    "migration policy",
    "refugee status",
    "immigration",
    "border",
    "asylum",
    "visa",
    "green card",
    "citizenship",
    "migrant policy",
    "border wall",
    "refugee",
    "work permit",
    "Title 42",
    "parole program",
    "mass migration",
    "sanctuary city",
    "amnesty",
    "open borders",
    "migrant surge",
    # Programs, proceedings and facilities
    "ICE raid",
    "removal proceedings",
    "immigration detention",
    "expedited removal",
    "temporary protected status",
    "TPS",
    "DACA",
    "Dreamers",
    "E-Verify",
    "customs enforcement",
    "ICE facility",
    "immigration prison",
    "deferred action",
    "catch and release",
    "migrant processing",
    "detention facility",
    "ICE detention",
]

# Normalise to lowercase and drop any repeated entries while keeping order, so
# each keyword is tested at most once per title.
KEYWORDS = list(dict.fromkeys(k.lower() for k in KEYWORDS))

def url_exists(url):
    """Return True if *url* answers with HTTP 200.

    A HEAD request (following redirects) is tried first. If it does not return
    200, a streamed GET is tried as a fallback. Any ``requests`` error
    (timeout, connection failure, invalid URL, ...) is treated as "does not
    exist" and yields False.
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=5, headers=HEADERS)
        if response.status_code == 200:
            return True
        # stream=True defers the body download; the context manager releases
        # the connection, which would otherwise stay open because the body is
        # never consumed.
        with requests.get(url, stream=True, timeout=5, headers=HEADERS) as response:
            return response.status_code == 200
    except requests.RequestException:
        return False

def to_naive(dt):
    """Return *dt* as a timezone-naive datetime.

    Aware datetimes are converted to UTC and then stripped of their tzinfo.
    Naive datetimes and ``None`` are returned unchanged.
    """
    # Nothing to convert: missing value or already naive.
    if dt is None or dt.tzinfo is None:
        return dt
    utc_dt = dt.astimezone(timezone.utc)
    return utc_dt.replace(tzinfo=None)

def is_recent(publish_date, days=30):
    """Return True if *publish_date* falls within the last *days* days.

    The date is normalised with ``to_naive`` (aware values become naive UTC)
    and compared against the current UTC time. ``None`` is never recent.
    """
    publish_date_naive = to_naive(publish_date)
    if publish_date_naive is None:
        return False
    # datetime.utcnow() is deprecated since Python 3.12; this yields the same
    # naive-UTC "now" value.
    now_naive = datetime.now(timezone.utc).replace(tzinfo=None)
    return publish_date_naive >= now_naive - timedelta(days=days)

def contains_keyword_in_title(title, keywords):
    """Return True if any keyword occurs in *title* as a whole word/phrase.

    Matching is case-insensitive on both sides, so callers need not lowercase
    the keywords beforehand. A missing or empty title never matches, and empty
    keywords are ignored (an empty pattern would otherwise match any title).
    """
    if not title:
        return False
    return any(
        # \b anchors keep e.g. "ice" from matching inside "police".
        re.search(r'\b' + re.escape(kw) + r'\b', title, flags=re.IGNORECASE)
        for kw in keywords
        if kw
    )

# --- Scrape feeds and write the article CSV --------------------------------
# Collects up to MAX_ARTICLES_PER_IDEOLOGY recent, keyword-matching articles
# per ideology bucket and writes them to news_bias_articles.csv.
output_rows = []
topic = "immigration"
ideologies = topics[topic]

counts = {ideo: 0 for ideo in ideologies}
outlet_counts = {ideo: {outlet: 0 for outlet in outlets} for ideo, outlets in ideologies.items()}

# URLs that were downloaded and then accepted or rejected. They are skipped on
# later encounters so nothing is downloaded twice; URLs whose download failed
# are not recorded and may be retried on a later pass.
evaluated_urls = set()

print(f"Starting scraping articles on '{topic}' topic...")

# Keep making passes over the feeds until every ideology has reached its
# quota, but stop as soon as a full pass adds nothing new — otherwise the loop
# would spin forever when the feeds cannot supply enough matching articles.
added_this_pass = True
while added_this_pass and any(counts[ideo] < MAX_ARTICLES_PER_IDEOLOGY for ideo in counts):
    added_this_pass = False
    for ideology, outlets in ideologies.items():
        if counts[ideology] >= MAX_ARTICLES_PER_IDEOLOGY:
            continue

        for outlet, feed_url in outlets.items():
            if outlet_counts[ideology][outlet] >= MAX_ARTICLES_PER_OUTLET:
                continue

            print(f"Fetching feed: {outlet} ({ideology})")
            feed = feedparser.parse(feed_url)
            for entry in feed.entries:
                if counts[ideology] >= MAX_ARTICLES_PER_IDEOLOGY or outlet_counts[ideology][outlet] >= MAX_ARTICLES_PER_OUTLET:
                    break

                # Entries without a link cannot be fetched; already-evaluated
                # URLs (duplicates across feeds or passes) are skipped before
                # any network work is done.
                url = entry.get("link")
                if not url or url in evaluated_urls:
                    continue

                if not url_exists(url):
                    continue

                try:
                    article = Article(url)
                    article.download()
                    article.parse()
                except Exception:
                    # Best effort: an article that fails to download or parse is skipped.
                    continue

                evaluated_urls.add(url)

                # Prefer the date extracted from the page; fall back to the
                # feed entry's parsed timestamp, which may be absent or None.
                publish_date = None
                if isinstance(article.publish_date, datetime):
                    publish_date = article.publish_date
                else:
                    parsed_time = entry.get("published_parsed")
                    if parsed_time:
                        publish_date = datetime(*parsed_time[:6])

                if not is_recent(publish_date):
                    continue

                title = article.title or entry.get("title", "")
                sample_text = article.text[:10000].strip() if article.text else ""

                if not sample_text:
                    continue

                # STRICT keyword check only in title (whole word matching)
                if not contains_keyword_in_title(title, KEYWORDS):
                    continue

                # Timezone-aware dates are converted to UTC before formatting
                # so timestamps from different outlets are comparable.
                # NOTE(review): naive dates are written as-is; their time zone
                # cannot be determined here.
                datetime_str = to_naive(publish_date).strftime("%Y-%m-%d %H:%M")

                output_rows.append({
                    "topic": topic,
                    "outlet": outlet,
                    "datetime": datetime_str,
                    "title": title,
                    "url": url,
                    "sample_text": sample_text,
                    "ideological_stance": ideology,
                    "factual_grounding": "",
                    "framing_choices": "",
                    "emotional_tone": "",
                    "source_transparency": ""
                })

                counts[ideology] += 1
                outlet_counts[ideology][outlet] += 1
                added_this_pass = True

                print(f"Added article ({counts[ideology]}/{MAX_ARTICLES_PER_IDEOLOGY}) from {outlet} ({ideology})")

            if counts[ideology] >= MAX_ARTICLES_PER_IDEOLOGY:
                print(f"Reached max articles for {ideology}")
                # No need to fetch this ideology's remaining feeds.
                break

print("Scraping done. Saving to CSV...")

csv_columns = [
    "topic", "outlet", "datetime", "title", "url", "sample_text",
    "ideological_stance", "factual_grounding", "framing_choices", "emotional_tone", "source_transparency"
]

with open("news_bias_articles.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(output_rows)

print("Done! Articles saved to news_bias_articles.csv")

In [None]:
# Step 2: Bias Scoring
import pandas as pd
from transformers import pipeline
import numpy as np
import time

# Maps the outlet name stored in the CSV "outlet" column to its ideology
# bucket. main() looks outlets up here with an exact string match (after
# stripping whitespace) and skips rows whose outlet is not listed.
OUTLET_TO_IDEOLOGY = {
    # Conservative outlets
    "Fox News Politics": "conservative",
    "Fox News US Immigration": "conservative",
    "The Daily Caller": "conservative",
    "The Blaze": "conservative",
    "Breitbart": "conservative",
    "Breitbart Politics": "conservative",
    "National Review": "conservative",
    "The Washington Times": "conservative",
    "The Epoch Times": "conservative",
    "Newsmax": "conservative",
    "Townhall": "conservative",
    "The Federalist": "conservative",
    "Daily Wire": "conservative",
    "One America News": "conservative",
    "Washington Examiner": "conservative",
    "American Thinker": "conservative",
    "The American Conservative": "conservative",
    "The Daily Signal": "conservative",

    # Moderate outlets
    "Reuters": "moderate",
    "Associated Press Top News": "moderate",
    "Associated Press Politics": "moderate",
    "NPR General": "moderate",
    "NPR Politics": "moderate",
    "USA Today Nation": "moderate",
    "USA Today Politics": "moderate",
    "PBS NewsHour": "moderate",
    "PBS Newshour Politics": "moderate",
    "Bloomberg Politics": "moderate",
    "Politico": "moderate",
    "The Hill": "moderate",
    "CBS News Politics": "moderate",
    "ABC News Politics": "moderate",
    "The Wall Street Journal General": "moderate",
    "The Wall Street Journal Politics": "moderate",
    "Financial Times": "moderate",
    "The Christian Science Monitor": "moderate",
    "Axios Politics": "moderate",
    "BBC News US & Canada": "moderate",
    "Al Jazeera English": "moderate",

    # Liberal outlets
    "CNN Politics": "liberal",
    "CNN Immigration": "liberal",
    "The Guardian Immigration": "liberal",
    "Mother Jones": "liberal",
    "MSNBC Latest": "liberal",
    "HuffPost Politics": "liberal",
    "Vox": "liberal",
    "Daily Kos": "liberal",
    "Salon": "liberal",
    "The New Republic": "liberal",
    "The Atlantic": "liberal",
    "Slate": "liberal",
    "ThinkProgress (Archive)": "liberal",
    "The Nation": "liberal",
    "Common Dreams": "liberal",
    "Raw Story": "liberal",
    "Truthout": "liberal",
    "Democracy Now": "liberal",
}


# Candidate labels for each bias dimension, passed to the zero-shot classifier.
# Label ORDER matters: score_text() assigns 0 / 50 / 100 to the first / second /
# third label and returns the probability-weighted average. Labels are kept
# lowercase because score_text() lowercases the labels the model returns.
BIAS_DIMENSIONS = {
    "ideological_stance": ["left", "center", "right"],
    "factual_grounding": ["low", "medium", "high"],
    "framing_choices": ["biased", "balanced", "unbiased"],
    "emotional_tone": ["neutral", "mild", "inflammatory"],
    "source_transparency": ["opaque", "moderate", "transparent"]
}

# Load the zero-shot classification pipeline once, when this cell runs, so
# score_text() can reuse it for every article.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Numeric ideology scores for the model's labels.
# NOTE(review): not referenced anywhere in the visible code — score_text()
# derives the same 0/50/100 values from label position instead.
model_label_to_score = {"left": 0, "center": 50, "right": 100}
# Numeric ideology scores for the outlet buckets in OUTLET_TO_IDEOLOGY; read by main().
outlet_label_to_score = {"liberal": 0, "moderate": 50, "conservative": 100}

def score_text(text, max_retries=3):
    """Score *text* on every dimension in BIAS_DIMENSIONS.

    For each dimension the zero-shot classifier is run over that dimension's
    labels. The labels are mapped to 0, 50, 100, ... by their position in
    BIAS_DIMENSIONS and the result is the probability-weighted average,
    rounded to two decimals.

    Args:
        text: The article text to classify.
        max_retries: Attempts per dimension before giving up.

    Returns:
        dict mapping dimension name -> score, or None for a dimension whose
        every attempt raised.
    """
    results = {}
    for dim, labels in BIAS_DIMENSIONS.items():
        # Keys are lowercased to match the lowercased labels returned by the model.
        label_to_score = {label.lower(): idx * 50 for idx, label in enumerate(labels)}
        results[dim] = None  # stays None if all attempts fail
        for attempt in range(max_retries):
            try:
                res = classifier(text, labels, multi_label=False)
                probs = np.asarray(res['scores'], dtype=float)
                probs = probs / probs.sum()  # normalize

                # The classifier returns labels sorted by score, so look each
                # one up by name rather than by position.
                weighted_score = sum(
                    prob * label_to_score[label.lower()]
                    for prob, label in zip(probs, res['labels'])
                )
                results[dim] = round(float(weighted_score), 2)
                break
            except Exception as e:
                print(f"Error scoring '{dim}', attempt {attempt + 1}/{max_retries}: {e}")
                # Pause only when another attempt will follow.
                if attempt < max_retries - 1:
                    time.sleep(1)
    return results

def main():
    """Score every article in news_bias_articles.csv and save the result.

    For each row with text and a known outlet, the model scores from
    score_text() are written to the bias-dimension columns, the outlet's
    ideology bucket to "ideology_label", and a 70/30 blend of outlet and model
    ideology to "combined_ideological_stance". Rows with empty text or an
    unknown outlet are skipped and keep missing scores. The output is written
    to news_bias_articles_scored.csv.
    """
    df = pd.read_csv("news_bias_articles.csv")
    total_rows = len(df)
    print(f"Total articles to process: {total_rows}")

    # Do NOT lowercase outlet names — keep as is for matching
    df['outlet'] = df['outlet'].str.strip()  # Just strip spaces, no lowercase

    # Score columns must be numeric. The scraper stores the ideology bucket as
    # a string in "ideological_stance"; coercing it here keeps skipped rows
    # from carrying text into a column that later steps treat as a number.
    for col in list(BIAS_DIMENSIONS.keys()) + ["combined_ideological_stance"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")
        else:
            df[col] = np.nan

    # The label column holds strings, so it needs object dtype; assigning a
    # string into a float NaN column is a deprecated incompatible-dtype setitem.
    if "ideology_label" in df.columns:
        df["ideology_label"] = df["ideology_label"].astype("object")
    else:
        df["ideology_label"] = pd.Series(index=df.index, dtype="object")

    weight_outlet = 0.7  # weight of outlet ideology in final score
    weight_model = 0.3   # weight of model predicted ideology in final score

    # `position` counts rows for progress output independently of the index labels.
    for position, (idx, row) in enumerate(df.iterrows(), start=1):
        text = row.get('sample_text', "")
        # Missing values arrive as NaN (a float), so require a non-blank string.
        if not isinstance(text, str) or not text.strip():
            print(f"Skipping empty text at index {idx}")
            continue

        outlet = row.get('outlet', "")
        outlet_label = OUTLET_TO_IDEOLOGY.get(outlet)
        if outlet_label is None:
            print(f"Unknown outlet ideology for '{outlet}' at index {idx}, skipping...")
            continue

        # Store outlet ideology label (lowercase)
        df.at[idx, "ideology_label"] = outlet_label

        # Get outlet ideology numeric score
        outlet_score = outlet_label_to_score[outlet_label]

        # Get model scores for all dimensions, including ideological_stance
        scores = score_text(text.strip())
        for dim in BIAS_DIMENSIONS:
            df.at[idx, dim] = scores.get(dim)

        # Combine outlet and model ideological stance scores
        model_score = scores.get("ideological_stance")
        if model_score is None:
            combined_score = outlet_score  # fallback if model failed
        else:
            combined_score = round(weight_outlet * outlet_score + weight_model * model_score, 2)

        df.at[idx, "combined_ideological_stance"] = combined_score

        if position % 10 == 0 or position == total_rows:
            print(f"Processed {position}/{total_rows} ({position / total_rows * 100:.1f}%)")

    df.to_csv("news_bias_articles_scored.csv", index=False)
    print("Saved scored CSV as news_bias_articles_scored.csv")

if __name__ == "__main__":
    main()


In [None]:
# Step 3: Set Narrative Clusters
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

def narrative_clustering_and_labeling(
    input_csv="news_bias_articles_scored.csv",
    output_csv="news_bias_articles_clustered_labeled.csv",
    n_clusters=3
):
    """Cluster articles per topic and label the clusters by mean ideology.

    Within each topic, articles with non-empty text are embedded with
    Sentence-BERT, the numeric ideological stance is appended as one extra
    feature, and KMeans assigns cluster ids. Clusters are ranked by mean
    stance and labelled from low to high. Articles without text keep
    cluster_id -1 and no label.

    Args:
        input_csv: Scored CSV with "topic", "sample_text" and
            "ideological_stance" columns.
        output_csv: Destination CSV; gains "cluster_id" and "cluster_label".
        n_clusters: Maximum clusters per topic (capped at the article count).
    """
    # Load scored CSV with ideological_stance scores
    df = pd.read_csv(input_csv)

    # Load Sentence-BERT model for embeddings
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Cluster assignments aligned to the DataFrame index, not list positions.
    cluster_ids = pd.Series(-1, index=df.index, dtype=int)
    cluster_labels = pd.Series(None, index=df.index, dtype=object)

    # Stance as a number, defaulting to the centre (50) when missing or
    # non-numeric. The same values feed both the clustering and the ranking.
    stance = pd.to_numeric(df['ideological_stance'], errors='coerce').fillna(50)
    texts = df['sample_text'].fillna("").astype(str)
    has_text = texts.str.strip() != ""

    # Human-readable bias labels depending on how many clusters were found.
    bias_labels_map = {
        3: ["Liberal", "Unbiased", "Conservative"],
        2: ["Liberal", "Conservative"],
        1: ["Unbiased"]
    }

    print(f"Clustering articles within each topic into {n_clusters} clusters each...")

    # Process each topic separately
    for topic in df['topic'].unique():
        # Only articles with text can be embedded.
        valid_idx = df.index[(df['topic'] == topic) & has_text]
        if len(valid_idx) == 0:
            print(f"No valid texts for topic '{topic}', skipping.")
            continue

        texts_nonempty = texts.loc[valid_idx].tolist()

        print(f"Computing embeddings for topic '{topic}' with {len(texts_nonempty)} articles...")
        embeddings = model.encode(texts_nonempty, show_progress_bar=True)

        # Combine embeddings with ideological stance as additional feature.
        # NOTE(review): stance spans 0-100 and is appended unscaled, so it may
        # outweigh the embedding dimensions in the distance — confirm intended.
        ideology_array = stance.loc[valid_idx].to_numpy(dtype=float).reshape(-1, 1)
        combined_features = np.hstack([embeddings, ideology_array])

        n_clust = min(n_clusters, len(texts_nonempty))
        print(f"Clustering topic '{topic}' into {n_clust} clusters using embeddings + ideology...")
        # n_init is pinned so results do not change with scikit-learn's default.
        kmeans = KMeans(n_clusters=n_clust, n_init=10, random_state=42)
        labels = kmeans.fit_predict(combined_features)

        cluster_ids.loc[valid_idx] = labels

        # Rank real clusters by mean stance. Text-less articles (cluster -1)
        # are excluded so they neither shift nor receive a bias label.
        cluster_means = stance.loc[valid_idx].groupby(labels).mean()
        sorted_clusters = [int(cid) for cid in cluster_means.sort_values().index]

        bias_labels = bias_labels_map.get(
            len(sorted_clusters),
            [f"Cluster_{i}" for i in range(len(sorted_clusters))]
        )
        cluster_label_map = dict(zip(sorted_clusters, bias_labels))

        print(f"Topic '{topic}' cluster label mapping: {cluster_label_map}")

        # Assign labels to the clustered articles; the rest stay None.
        cluster_labels.loc[valid_idx] = [cluster_label_map.get(int(cid)) for cid in labels]

    # Add cluster info columns to DataFrame
    df['cluster_id'] = cluster_ids
    df['cluster_label'] = cluster_labels

    # Save output CSV in current working directory
    df.to_csv(output_csv, index=False)
    print(f"Saved clustered and labeled articles to {output_csv}")

if __name__ == "__main__":
    narrative_clustering_and_labeling()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def score_to_label(score):
    """Translate a 0-100 ideology score into a bucket name.

    Missing scores give "Unknown"; scores up to 33 are "Liberal", above 33 and
    up to 66 "Moderate", and anything above 66 "Conservative".
    """
    if pd.isna(score):
        return "Unknown"
    if score > 66:
        return "Conservative"
    return "Liberal" if score <= 33 else "Moderate"

def analyze_velocity(input_csv="news_bias_articles_scored.csv"):
    """Print publication-timing summaries per ideology and plot them.

    Reads the scored CSV, buckets each article by its numeric
    "ideological_stance" via score_to_label(), then prints per-ideology
    summaries, daily counts, lag between each ideology's first article, and
    each outlet's first article time. Finally shows a stance-over-time scatter
    plot and a daily-count line plot.

    Args:
        input_csv: Scored CSV with "datetime", "outlet", "title" and
            "ideological_stance" columns.
    """
    from itertools import combinations

    ideologies = ['Liberal', 'Moderate', 'Conservative']
    # "Unknown" is included because score_to_label() emits it for missing
    # scores, and seaborn raises if a hue level has no palette entry.
    palette = {'Liberal': 'blue', 'Moderate': 'green', 'Conservative': 'red', 'Unknown': 'gray'}

    df = pd.read_csv(input_csv)
    # Coerce explicitly so unparseable values become NaT/NaN instead of
    # leaving the columns as strings.
    df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
    df['ideological_stance'] = pd.to_numeric(df['ideological_stance'], errors='coerce')

    # Map numeric ideological stance to label
    df['combined_ideology_label'] = df['ideological_stance'].apply(score_to_label)

    # Clean dataset
    df = df.dropna(subset=['datetime', 'combined_ideology_label', 'outlet'])

    # Sort by datetime ascending
    df = df.sort_values('datetime').copy()

    print(f"Total articles analyzed: {len(df)}\n")

    # === Per ideology summary ===
    first_article_times = {}
    for ideology in ideologies:
        sub = df[df['combined_ideology_label'] == ideology]
        if sub.empty:
            print(f"No articles found for ideology: {ideology}\n")
            continue

        start_time = sub['datetime'].min()
        end_time = sub['datetime'].max()
        first_article_times[ideology] = start_time

        print(f"Ideology: {ideology}")
        print(f"  Articles: {len(sub)}")
        print(f"  Time range: {start_time} to {end_time}")
        print(f"  First 3 articles:")
        print(sub[['datetime', 'outlet', 'title']].head(3).to_string(index=False))
        print(f"  Last 3 articles:")
        print(sub[['datetime', 'outlet', 'title']].tail(3).to_string(index=False))
        print()

    # === Publication counts per day per ideology ===
    df['date'] = df['datetime'].dt.date
    counts = df.groupby(['date', 'combined_ideology_label']).size().unstack(fill_value=0)
    print("Publication counts per day per ideology:")
    print(counts)
    print()

    # === Lag times between first article publications (in hours) per ideology ===
    print("Lag times between first article publications (hours):")
    for a, b in combinations(ideologies, 2):
        if a in first_article_times and b in first_article_times:
            lag = (first_article_times[b] - first_article_times[a]).total_seconds() / 3600
            print(f"  {a} -> {b}: {lag:.2f} hours")
    print()

    # === Per outlet first article times (to find initiators) ===
    print("First article publication times per outlet:")
    # sort=False keeps outlets in order of first appearance (earliest article first).
    for outlet, first_time in df.groupby('outlet', sort=False)['datetime'].min().items():
        print(f"  {outlet}: {first_time}")
    print()

    if df.empty:
        # Nothing to plot.
        return

    # === Visualization: Timeline scatter plot with ideological stance ===
    plt.figure(figsize=(14, 7))
    sns.scatterplot(
        data=df,
        x='datetime',
        y='ideological_stance',
        hue='combined_ideology_label',
        style='combined_ideology_label',
        palette=palette,
        s=100,
        alpha=0.7
    )
    plt.title("Publication Timeline: Ideological Stance Over Time")
    plt.xlabel("Publication DateTime")
    plt.ylabel("Ideological Stance Score (0=Liberal, 100=Conservative)")
    plt.legend(title="Ideology")
    plt.tight_layout()
    plt.show()

    # === Additional Visualization: Article count over time by ideology ===
    # Reshape the per-day table to long form (one row per date and ideology,
    # zeros included) so each line is drawn from correctly aligned counts.
    daily_counts = counts.reset_index().melt(
        id_vars='date',
        var_name='combined_ideology_label',
        value_name='article_count'
    )
    plt.figure(figsize=(14, 6))
    sns.lineplot(
        data=daily_counts,
        x='date',
        y='article_count',
        hue='combined_ideology_label',
        palette=palette,
        marker='o'
    )
    plt.title("Daily Publication Counts by Ideology")
    plt.xlabel("Date")
    plt.ylabel("Number of Articles")
    plt.tight_layout()
    plt.show()

if __name__ == "__main__":
    analyze_velocity()