In [1]:
from dotenv import load_dotenv
import os
import json

# Let python-dotenv populate the process environment before the key is read.
load_dotenv()

# API key for the news service; this is None when the variable is not set.
NEWSAPI_API_KEY = os.environ.get("NEWSAPI_API_KEY")

In [2]:
class NewsSource:
    """One news outlet tracked by this notebook.

    Attributes:
        key: Short identifier, used as the prefix of per-source output files.
        domain: The outlet's web domain (for example "reuters.com").
        mbfc_rating: Rating label for the outlet (for example "Right" or
            "Left-Center").
    """

    def __init__(self, key: str, domain: str, mbfc_rating: str):
        # Values are stored as given; nothing is validated or normalised.
        self.key, self.domain, self.mbfc_rating = key, domain, mbfc_rating

In [5]:
# Init all news sources
"""
- Right
    - Fox News - [foxnews.com](http://foxnews.com)
    - Breitbart - [breitbart.com](http://breitbart.com)
    - Dailymail - [dailymail.co.uk](http://dailymail.co.uk) → UK
    - The Sun - [thesun.co.uk](http://thesun.co.uk/) → UK
- Right-Center
    - Wall Street Journal - [wsj.com](http://wsj.com)
    - New York Post - [nypost.com](http://nypost.com/)
    - Forbes - [forbes.com](http://forbes.com)
    - India Times - [indiatimes.com](http://indiatimes.com) → India
    - News Week - [newsweek.com](http://newsweek.com/)
- Neutral
    - Reuters - [reuters.com](http://reuters.com)
    - The Hill - [thehill.com](http://thehill.com)
- Left-Center
    - New York Times - [nytimes.com](http://nytimes.com)
    - Washington Post - [washingtonpost.com](http://washingtonpost.com)
    - USA Today - [usatoday.com](http://usatoday.com)
    - Buzz Feed - [buzzfeed.com](http://buzzfeed.com)
    - CBS News - [cbsnews.com](http://cbsnews.com)
    - SF Gate - [sfgate.com](http://sfgate.com/)
    - Bloomberg - [bloomberg.com](http://bloomberg.com)
- Left
    - CNN - [cnn.com](http://cnn.com)
    - People - [people.com](http://people.com)
"""
# Build one NewsSource per outlet from (rating, outlets) groups, so each rating
# label is written once. The resulting order matches the listing above.
news_sources = [
    NewsSource(key, domain, mbfc_rating)
    for mbfc_rating, outlets in (
        ("Right", (
            ("foxnews", "foxnews.com"),
            ("breitbart", "breitbart.com"),
            ("dailymail", "dailymail.co.uk"),
            ("thesun", "thesun.co.uk"),
        )),
        ("Right-Center", (
            ("wsj", "wsj.com"),
            ("nypost", "nypost.com"),
            ("forbes", "forbes.com"),
            ("indiatimes", "indiatimes.com"),
            ("newsweek", "newsweek.com"),
        )),
        ("Neutral", (
            ("reuters", "reuters.com"),
            ("thehill", "thehill.com"),
        )),
        ("Left-Center", (
            ("nytimes", "nytimes.com"),
            ("washingtonpost", "washingtonpost.com"),
            ("usatoday", "usatoday.com"),
            ("buzzfeed", "buzzfeed.com"),
            ("cbsnews", "cbsnews.com"),
            ("sfgate", "sfgate.com"),
            ("bloomberg", "bloomberg.com"),
        )),
        ("Left", (
            ("cnn", "cnn.com"),
            ("people", "people.com"),
        )),
    )
    for key, domain in outlets
]

In [10]:
# Output locations. They stay plain strings ending in "/" because file names
# are appended to them directly elsewhere in the notebook.
BASE_PATH = "../../outputs/newsapi/"
ARTICLES_PATH = f"{BASE_PATH}articles/"
AUTHORS_PATH = f"{BASE_PATH}authors/"

### Fetch The Articles

In [None]:
from eventregistry import EventRegistry, QueryArticlesIter
# Cap on articles per source; the fetch loop's maxItems should match this value.
max_articles = 10_000

# API client, created with the key read from the environment earlier.
er = EventRegistry(apiKey=NEWSAPI_API_KEY)

In [None]:
# Create the output folder up front so a missing directory cannot throw away a
# finished download when the file is written.
os.makedirs(ARTICLES_PATH, exist_ok=True)

for source in news_sources:
    # English-language articles for this outlet, with dateStart 2025-01-01.
    q = QueryArticlesIter(
        sourceUri = source.domain,
        lang = "eng",
        dateStart = "2025-01-01",
    )

    # Results are requested sorted by social score and capped by the
    # notebook-level max_articles setting (previously a second hard-coded
    # 10000 that could drift from it).
    articles = list(q.execQuery(er, sortBy = "socialScore", maxItems = max_articles))

    # store the articles in a json file
    with open(f"{ARTICLES_PATH}{source.key}_articles.json", "w") as f:
        json.dump(articles, f, indent=4, ensure_ascii=False)


### Analyze Authors

In [9]:
import pandas as pd

In [26]:
# Make sure the output folder exists; otherwise to_csv fails on a fresh
# checkout after the articles have already been processed.
os.makedirs(AUTHORS_PATH, exist_ok=True)

for i, source in enumerate(news_sources):
    print(f"Processing {source.key}... {i+1}/{len(news_sources)}")
    # Load the articles saved for this source
    with open(f"{ARTICLES_PATH}{source.key}_articles.json", "r") as f:
        articles = json.load(f)

    # Count author credits per name. A dict keeps first-seen order, which the
    # stable sort below uses as the tie-break between equal counts.
    # Authors without a "name" field are grouped under the empty string.
    credits_by_name = {}
    for article in articles:
        for author in article.get("authors", []):
            author_name = author.get("name", "")
            credits_by_name[author_name] = credits_by_name.get(author_name, 0) + 1

    # "percentage" is each author's share of all author credits for this
    # source, stored as a fraction between 0 and 1.
    total_count = sum(credits_by_name.values())
    author_rows = [
        {
            "name": author_name,
            "count": credit_count,
            "source": source.key,
            "mbfc_rating": source.mbfc_rating,
            "percentage": credit_count / total_count,
        }
        for author_name, credit_count in credits_by_name.items()
    ]

    # Sort the authors by count (most credited first)
    authors2count = sorted(author_rows, key=lambda row: row["count"], reverse=True)

    # Write the results to a csv file. A source with no authors yields an
    # empty frame here, exactly as before.
    authors_df = pd.DataFrame(authors2count)
    authors_df.to_csv(f"{AUTHORS_PATH}{source.key}_authors.csv", index=False)

print("Done!")

Processing foxnews... 1/20
Processing breitbart... 2/20
Processing dailymail... 3/20
Processing thesun... 4/20
Processing wsj... 5/20
Processing nypost... 6/20
Processing forbes... 7/20
Processing indiatimes... 8/20
Processing newsweek... 9/20
Processing reuters... 10/20
Processing thehill... 11/20
Processing nytimes... 12/20
Processing washingtonpost... 13/20
Processing usatoday... 14/20
Processing buzzfeed... 15/20
Processing cbsnews... 16/20
Processing sfgate... 17/20
Processing bloomberg... 18/20
Processing cnn... 19/20
Processing people... 20/20
Done!


In [29]:
# Merge all the authors csv files (sorted by percentage)
# Frames are collected first and concatenated once, instead of growing the
# merged frame inside the loop.
author_frames = []
for source in news_sources:
    try:
        authors_df = pd.read_csv(f"{AUTHORS_PATH}{source.key}_authors.csv")
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Only "file missing" and "file has no columns" mean there is nothing
        # to merge for this source. Any other error (malformed CSV, interrupt)
        # is left to propagate instead of being reported as missing data.
        print(f"No articles found for {source.key}[{source.domain}]")
    else:
        author_frames.append(authors_df)

if author_frames:
    merged_authors_df = pd.concat(author_frames).sort_values(by="percentage", ascending=False)
else:
    # Nothing could be loaded: there is no "percentage" column to sort on, so
    # write an empty result rather than failing with a KeyError.
    merged_authors_df = pd.DataFrame()

merged_authors_df.to_csv(f"{AUTHORS_PATH}all_authors.csv", index=False)

No articles found for forbes[forbes.com]
No articles found for usatoday[usatoday.com]
No articles found for buzzfeed[buzzfeed.com]
No articles found for cnn[cnn.com]
