In [2]:
from newsapi import NewsApiClient
from config import NEWS_API_KEY

# Init
newsapi = NewsApiClient(api_key=NEWS_API_KEY)

# /v2/everything
all_articles = newsapi.get_everything(q='bitcoin')

In [3]:
import pandas as pd

df = pd.DataFrame(all_articles['articles'])
print(df.columns)

Index(['source', 'author', 'title', 'description', 'url', 'urlToImage',
       'publishedAt', 'content'],
      dtype='object')


In [4]:
from pinecone import Pinecone
from config import PINECONE_API_KEY
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Load pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = 'news-articles-llms'
index = pc.Index(index_name)

articles = df.to_dict(orient='records')
for article in articles:
    text = article['title'] + ' ' + article['content']
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    embedding = embeddings[0].numpy()
    index.upsert([(article['url'], embedding.tolist())])



In [13]:
query = "What most have impacted Bitcoin price recently?"

inputs = tokenizer(query, return_tensors='pt', truncation=True, padding=True, max_length=512)
with torch.no_grad():
    outputs = model(**inputs)
embeddings = outputs.last_hidden_state.mean(dim=1)

query_embedding = embeddings[0].numpy().tolist()

query_embedding = np.clip(np.array(query_embedding, dtype=np.float32), -1.0, 1.0).tolist()

results = index.query(vector=[query_embedding], top_k=5)

relevant_news = results['matches']

print(relevant_news)

[{'id': 'https://readwrite.com/bitcoin-slumps-below-59000-amid-market-uncertainty/',
 'score': 0.661143482,
 'values': []}, {'id': 'https://readwrite.com/bitcoin-transaction-cost-hits-four-year-low/',
 'score': 0.635518,
 'values': []}, {'id': 'https://readwrite.com/bitcoin-plunge-below-63k-triggers-wave-of-liquidations/',
 'score': 0.61461252,
 'values': []}, {'id': 'https://readwrite.com/bitcoin-surges-following-assassination-attempt-on-donald-trump/',
 'score': 0.595728636,
 'values': []}, {'id': 'https://readwrite.com/cryptocurrency-market-tumbles-665m-liquidated-as-bitcoin-falls/',
 'score': 0.588863492,
 'values': []}]


In [11]:
summary = f"The most significant factors impacting {query} include:\n\n"

for article in relevant_news:
    summary += f"- {article['title']}: {article['content'][:200]}...\n"
    summary += f"  [Read more]({article['url']})\n\n"
 
print(summary)

PineconeApiAttributeError: ScoredVector has no attribute 'title' at ['['received_data', 'matches', 0]']['title']