In [2]:
from newsapi import NewsApiClient
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment so the
# os.getenv() lookups in later cells can find the API keys. override=True asks
# python-dotenv to let .env values replace variables that are already set.
load_dotenv(override=True)

True

In [3]:
# Look up the NewsAPI credential in the process environment and tell the
# reader when it is absent (unset or empty) so the next cell's failure is explained.
news_api_key = os.environ.get("NEWS_API_KEY")
if news_api_key is None or news_api_key == "":
    print("NEWS_API_KEY environment variable not set.")

In [4]:
# NewsAPI client authenticated with the key read from the environment.
newsapi = NewsApiClient(api_key=news_api_key)

# Request English-language US top headlines, asking for up to 100 per page.
top_headlines = newsapi.get_top_headlines(country='us', language='en', page_size=100)

# Display only a short preview instead of dumping every article dict into the
# cell output; the complete list stays available as top_headlines["articles"].
top_headlines["articles"][:3]

[{'source': {'id': None, 'name': 'CBS Sports'},
  'author': 'Tyler Sullivan',
  'title': 'NFL preseason week 2 schedule, live updates, scores, highlights: Quinn Ewers tosses first TD - CBS Sports',
  'description': 'Everything to know about the second week of the preseason right here',
  'url': 'https://www.cbssports.com/nfl/news/nfl-preseason-week-2-schedule-live-updates-scores-highlights-dolphins-quinn-ewers-tosses-first-td/live/',
  'urlToImage': 'https://sportshub.cbsistatic.com/i/r/2025/08/10/d2975724-9aa8-4945-8cb3-e6c6b7a0d1ed/thumbnail/1200x675/37f63e27aecb5e5843e3b19f11455558/ewers.jpg',
  'publishedAt': '2025-08-16T19:59:38Z',
  'content': 'Week 2 of the NFL preseason kicked off on Friday with two games as the Titans topped the Falcons and the Seahawks upended the Chiefs. A total of 11 games are on tap for Saturday, starting with 5 game… [+1169 chars]'},
 {'source': {'id': 'associated-press', 'name': 'Associated Press'},
  'author': None,
  'title': 'Israel prepares to move P

In [5]:
import chromadb
from chromadb.utils import embedding_functions

In [6]:
# Embedding function handed to Chroma: documents and query texts are embedded
# with the named OpenAI model, using the API key taken from the environment.
ef = embedding_functions.OpenAIEmbeddingFunction(
    model_name="text-embedding-3-large",
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [7]:
# Persistent (on-disk) Chroma client; no path is passed, so the library's
# default storage location is used.
chroma_client = chromadb.PersistentClient()

# get_or_create_collection keeps this cell runnable both on a fresh store
# (where get_collection would raise because "headlines" does not exist yet)
# and on later runs (where create_collection would fail because it already does).
collection = chroma_client.get_or_create_collection(name="headlines", embedding_function=ef)

In [8]:
articles = top_headlines["articles"]

# Build one de-duplicated batch keyed by article URL (the URL is the Chroma id).
# Each document is "<title>\n\n<description>"; a missing/None field is treated
# as empty text instead of being embedded as the literal string "None".
documents_by_url = {}
for article in articles:
    url = article.get("url")
    if not url:
        # Without a URL there is no stable id to store the article under.
        continue
    title = article.get("title") or ""
    description = article.get("description") or ""
    document = f"{title}\n\n{description}".strip()
    if document:
        # setdefault keeps the first occurrence when the feed repeats a URL.
        documents_by_url.setdefault(url, document)

# A single batched upsert: one call instead of one per article, and re-running
# the cell refreshes existing ids rather than trying to add them a second time.
if documents_by_url:
    collection.upsert(
        ids=list(documents_by_url.keys()),
        documents=list(documents_by_url.values()),
    )

In [9]:
# Semantic search over the stored headlines: fetch the five entries closest to
# the query text, then show the raw response as the cell output.
results = collection.query(n_results=5, query_texts=["Bob Odenkirk"])

results

{'ids': [['http://www.hollywoodreporter.com/movies/movie-news/weapons-box-office-nobody-2-highest-to-lowest-1236346020/',
   'https://www.npr.org/2025/08/16/g-s1-83121/marwan-barghouti-video-famous-palestinian-prisoner',
   'https://www.axios.com/2025/08/12/trump-bls-ej-antoni-economists',
   'https://www.ufc.com/news/chicago-prelim-results-du-plessis-chimaev-ufc-319',
   'https://www.cbsnews.com/news/man-rescued-after-2-days-trapped-behind-waterfall-california/']],
 'embeddings': None,
 'documents': [["Box Office: ‘Nobody 2’ Can’t Stop ‘Weapons’ as Sydney Sweeney’s ‘Americana’ All-Out Bombs - The Hollywood Reporter\n\nSpike Lee's Denzel Washington-starrer 'Highest 2 Lowest' also may be struggling in its limited theatrical debut before landing relatively quickly on Apple TV+.",
   'Video shows prominent Palestinian prisoner for the first time in years - NPR\n\nThe world got a glimpse of Marwan Barghouti for the first time in years in a video of a far-right Israeli minister berating him

In [10]:
# Keep only the documents of the first (and only) query whose distance is at
# most 1.5, preserving the order in which the query returned them.
filtered_results = [
    doc
    for doc, dist in zip(results['documents'][0], results['distances'][0])
    if dist <= 1.5
]

filtered_results

["Box Office: ‘Nobody 2’ Can’t Stop ‘Weapons’ as Sydney Sweeney’s ‘Americana’ All-Out Bombs - The Hollywood Reporter\n\nSpike Lee's Denzel Washington-starrer 'Highest 2 Lowest' also may be struggling in its limited theatrical debut before landing relatively quickly on Apple TV+."]