In [1]:
import pandas as pd
from dotenv import load_dotenv
load_dotenv()
import os
import requests
from time import sleep
import json
from query_articles import query_articles

In [2]:
# NewsAPI key taken from the NEWS_API_KEY environment variable (load_dotenv()
# runs in the import cell before this lookup); None when the variable is unset.
API_KEY = os.environ.get("NEWS_API_KEY")

In [3]:
def get_sources_id_by_country(country, timeout=10):
    """Return the ids of the NewsAPI sources listed for ``country``.

    Args:
        country: Value sent as the ``country`` query parameter of the
            ``/v2/sources`` endpoint (e.g. ``"us"``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            notebook kernel on a stalled connection.

    Returns:
        list: The ``id`` field of every entry in the response's ``sources``
        array, in the order returned by the API.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = "https://newsapi.org/v2/sources"
    response = requests.get(
        url,
        params={"apiKey": API_KEY, "country": country},
        timeout=timeout,
    )
    # Surface 4xx/5xx answers as exceptions instead of failing later on a
    # missing "sources" key.
    response.raise_for_status()
    return [source["id"] for source in response.json()["sources"]]

In [4]:
# Read the source catalogue and keep only the rows whose country is "us" or "ca".
sources = pd.read_csv("sources.csv").loc[
    lambda frame: frame["country"].isin(["us", "ca"])
]
# Plain Python list of the retained source ids, used to build the API query.
sources_id = sources["source_id"].to_list()

In [7]:
query = "Donald Trump"

# The selected source ids are passed to the query helper as one
# comma-separated string; build it once for both requests.
sources_param = ",".join(sources_id)

# Fetch two consecutive date windows separately, pausing between the calls
# (presumably to stay within the API rate limit -- TODO confirm).
articles_first = query_articles(
    query,
    API_KEY,
    sort_by="popularity",
    nb_pages=5,
    sources=sources_param,
    from_date="2024-10-29",
    to_date="2024-11-15",
)
sleep(15)
articles_second = query_articles(
    query,
    API_KEY,
    sort_by="popularity",
    nb_pages=5,
    sources=sources_param,
    from_date="2024-11-16",
    to_date="2024-11-30",
)
articles = pd.concat([articles_first, articles_second])
print(f"Total query returned {len(articles)} articles")

# Cleaning: drop exact duplicate rows, discard entries whose source name is
# the "[Removed]" placeholder, then renumber the rows from 0.
articles = (
    articles
    .drop_duplicates()
    .loc[lambda frame: frame["source_name"] != "[Removed]"]
    .reset_index(drop=True)
)

print(f"Cleaning left {len(articles)} articles")
print(f"Articles from the following sources were found: {articles['source_id'].unique()}")
print("Here are the first 5 articles:")

# Keep a full copy of the cleaned articles before trimming columns and rows.
articles.to_csv("articles_raw.csv", index=False)

# Working set: drop the listed columns and keep at most the first 500 rows.
columns_to_drop = ["index", "content", "url", "urlToImage", "publishedAt"]
articles = articles.drop(columns=columns_to_drop).head(500)
articles.to_csv("articles.csv", index=False)
print(articles["title"].head())

Total query returned 603 articles
Cleaning left 603 articles
Articles from the following sources were found: ['wired' 'business-insider' 'time' 'abc-news' 'cbc-news' 'cbs-news'
 'msnbc' 'usa-today' 'breitbart-news' 'the-verge' 'newsweek' 'fortune'
 'nbc-news' 'cnn']
Here are the first 5 articles:
0    Far-Right Donald Trump Supporters Celebrate Hi...
1    ICE Started Ramping Up Its Surveillance Arsena...
2    Did you need another reminder that Donald Trum...
3    Joe Rogan endorsed Donald Trump, saying Elon M...
4    How the US voted in every election, from Georg...
Name: title, dtype: object


In [None]:
# Overwrite sources.csv with the sources that actually appear in `articles`,
# together with how many articles each one contributed.
#
# The counts are merged onto the loaded `sources` table instead of replacing
# it: the loading cell filters on the "country" column, so writing only
# source_id/count would make that cell raise KeyError on the next
# Restart & Run All.
source_counts = articles.groupby("source_id").size().reset_index(name="count")
sources = (
    sources
    # Discard a "count" column left over from a previous run so the merge
    # does not produce count_x / count_y.
    .drop(columns=["count"], errors="ignore")
    # A right join keeps exactly one row per source found in `articles`,
    # in the same order as `source_counts`.
    .merge(source_counts, on="source_id", how="right")
)
sources.to_csv("sources.csv", index=False)