In [10]:
import pandas as pd
from dotenv import load_dotenv
load_dotenv()
import os
import requests
import json
from query_articles import query_articles

In [11]:
# NewsAPI key taken from the NEWS_API_KEY environment variable
# (load_dotenv() is called in the imports cell); None when the variable is unset.
API_KEY = os.environ.get("NEWS_API_KEY")

In [15]:
def get_sources_id_by_country(country, timeout=10):
    """Return the ids of the NewsAPI sources listed for one country.

    Args:
        country: Value sent as the ``country`` query parameter of the
            ``/v2/sources`` endpoint (the notebook uses ``"us"`` and ``"ca"``).
        timeout: Seconds to wait on the HTTP request before giving up;
            forwarded to ``requests.get``.

    Returns:
        list: The ``id`` field of every entry in the response's ``sources`` list.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    url = "https://newsapi.org/v2/sources"
    # The key travels in a header instead of the query string: the request URL
    # is embedded in HTTPError messages, so a key in the URL would end up in
    # tracebacks saved with the notebook's outputs.
    response = requests.get(
        url,
        params={"country": country},
        headers={"X-Api-Key": API_KEY},
        timeout=timeout,  # without a timeout a stalled connection blocks the cell forever
    )
    response.raise_for_status()
    return [source["id"] for source in response.json()["sources"]]

In [None]:
# Alternative, narrower query focused on political coverage (kept for reference):
# query = "Elon Musk AND (trump OR harris OR politics OR political OR election OR PAC OR endorse OR endorsement OR free speech OR censorship OR vote OR tariffs OR tariff OR efficiency)"
query = "Elon Musk"

# Number of source ids joined into the `sources` argument of each query_articles call.
# NOTE(review): 20 presumably matches NewsAPI's per-request source limit — confirm.
SOURCES_PER_REQUEST = 20

us_sources = get_sources_id_by_country("us")
ca_sources = get_sources_id_by_country("ca")
all_sources = us_sources + ca_sources
print(f"Found {len(all_sources)} sources")

# Collect one frame per batch and concatenate once at the end, instead of
# re-copying a growing DataFrame on every iteration.
batch_frames = []
for source_index in range(0, len(all_sources), SOURCES_PER_REQUEST):
    source_batch = all_sources[source_index:source_index + SOURCES_PER_REQUEST]
    print(f"Querying for sources: {source_batch}")
    batch_articles = query_articles(
        query,
        API_KEY,
        sort_by="popularity",
        nb_pages=5,
        sources=",".join(source_batch),
    )
    print(f"Query returned {batch_articles.shape[0]} articles")
    batch_frames.append(batch_articles)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# source was found.
all_articles = pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()
print(f"Total query returned {all_articles.shape[0]} articles")

# Clean the combined result of every batch. Cleaning the loop variable instead
# would keep only the articles of the last batch.
articles = all_articles.drop_duplicates()
articles = articles[articles["source"] != "[Removed]"]  # rows whose source is the "[Removed]" placeholder
articles = articles.reset_index(drop=True)

print(f"Cleaning left {articles.shape[0]} articles")
print(f"Articles from the following sources were found: {articles['source'].unique()}")
print("Here are the first 5 articles:")

# Export to CSV without the columns that are not needed downstream.
articles = articles.drop(columns=["index", "content", "url", "urlToImage", "publishedAt"])
articles.to_csv("articles.csv", index=False)
print(articles["title"].head())

Found 60 sources
Querying for sources: ['abc-news', 'al-jazeera-english', 'ars-technica', 'associated-press', 'axios', 'bleacher-report', 'bloomberg', 'breitbart-news', 'business-insider', 'buzzfeed', 'cbs-news', 'cnn', 'cnn-es', 'crypto-coins-news', 'engadget', 'entertainment-weekly', 'espn', 'espn-cric-info', 'fortune', 'fox-news']
