# Weaviate crypto news processor

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import pandas as pd

# Do this to enable importing modules
src_path = os.path.join(os.path.abspath(""), "..")
sys.path.insert(0, src_path)

from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv()) # read local .env file


True

## Data crawling

In [18]:
from datetime import datetime, timedelta
from crawlers import CoinTelegraphCrawler


date_format="%Y-%m-%d"
coin_crawler = CoinTelegraphCrawler()
start_dt = datetime.now() - timedelta(days=200)
end_dt = datetime.now()
ct_articles = coin_crawler.extract(n_scrolls=50, start_date=start_dt.strftime(date_format), end_date=end_dt.strftime(date_format))

INFO:crawlers.coin_telegraph:Scraping articles from: https://cointelegraph.com/


In [19]:
df = pd.DataFrame.from_records([article.model_dump() for article in ct_articles])
df.to_csv("../data/articles.csv", index=False)

## Data Augmentation

In [2]:
df = pd.read_csv("../data/articles.csv")
df.head(3)

Unnamed: 0,source,title,content,summary,published_at
0,coin_telepraph,Bitcoin dips 2% as Mt. Gox wallets move over ...,"Bitcoin BTC $68,460 dipped 2% on May 28 aft...",Billions in Bitcoin reportedly belonging to c...,2024-05-28
1,coin_telepraph,Grayscale’s Ethereum ETF could bleed $110M da...,"Grayscale’s yet-to-launch spot Ether ETH $3,...",If Grayscale’s slated spot Ether ETF follows ...,2024-05-28
2,coin_telepraph,Empowering equality: How Web3 and AI technolo...,"Carrying digital content on blockchain, REVOX...",Integrating blockchain and AI revolutionizes ...,2024-05-27


In [None]:
# Use this cell if you want to extract entities from the dataset above

# from data_augmentation.data_formatter import DataFormatter
# from data_augmentation.openai_handler import OpenAIHandler

# response = []
# batch_size = 1
# for i in range(0, len(df), batch_size):
#     batch = df.iloc[i : i + batch_size]["content"].to_list()
#     initial_prompt = DataFormatter.format_prompt(batch, i)
#     response += OpenAIHandler().request(initial_prompt)

In [14]:
# Use this cell to chunk the articles

# from data_handlers.chunk_handler import ChunkingHandler

# total_chunk_list = []
# for index, item in df.iterrows():
#     total_chunk_list += ChunkingHandler.handle_article(index, item)


## Weaviate initialization

In [4]:
import weaviate
import weaviate.classes as wvc
import os
import requests
import json

client = weaviate.connect_to_wcs(
    cluster_url="news-db-h6x724lk.weaviate.network",
    skip_init_checks=True,
    auth_credentials=weaviate.auth.AuthApiKey(os.getenv("WCS_API_KEY")),
    headers={
        "X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]
    }
)

# client.close()

  client = weaviate.connect_to_wcs(


In [5]:
if client.collections.exists("Articles"):
    client.collections.delete("Articles")  # Replace with your collection name

Articles = client.collections.create(
    name="Articles",
    vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(),  
    generative_config=wvc.config.Configure.Generative.openai()
)

In [6]:
articles_objs = list()
for i, d in df.iterrows():
    articles_objs.append({
        "title": d["title"],
        "summary": d["summary"],
        "content": d["content"],
        "published_at": d["published_at"]
    })

articles = client.collections.get("Articles")
articles.data.insert_many(articles_objs)

BatchObjectReturn(all_responses=[UUID('2e140d38-0dba-4e5a-bb84-7c583c937d36'), UUID('ffd6a14c-9fa2-4a96-a58f-d01a221202f4'), UUID('96ffd763-2708-4234-8eb4-621f318de589'), UUID('0a8fab3c-90c5-4ace-b66d-9114ba123dec'), UUID('06356092-1fef-4a47-a435-1e59bcefa14c'), UUID('4f102e85-1d56-4a81-a7ef-4bfdbde6d8aa'), UUID('d181eebf-cfc9-429c-88c0-ab3fba57452a'), UUID('a532fe5c-6544-4e6d-83fa-825a9e29b310'), UUID('4cc1e259-503a-4996-8e1d-2b7584b6fdfe'), UUID('91487036-a498-485e-99be-4c5e988dbf1e'), UUID('9ccbd47d-96ec-455a-b3c9-3867fa19e418'), UUID('ecb0a2ec-3b6a-4337-baa4-056a427d65fc'), UUID('98b24246-47d9-4c43-bb80-3c11b912b0ac'), UUID('33c29af1-f473-47bb-a89b-65b767abc980'), UUID('12d63fe6-b95f-4fea-aaf1-a03587c6cbf6'), UUID('53e98413-db90-4d19-a9b1-5c92dfeb87b5'), UUID('e4321aac-be82-4d56-a477-616defc5be0d'), UUID('6aa78d5b-bcd6-4855-ac91-226c39d844f3'), UUID('d0a7f150-9b3f-4bf5-a927-6c45c120bda1'), UUID('74d49d40-5010-4a72-8233-938968ce90dc'), UUID('d08ab393-fe09-4c44-ab28-1aa768fc26fe')], 

## Semantic search

In [32]:
articles = client.collections.get("Articles")

user_query = "News about the bitcoin ETFs" 

response = articles.query.near_text(
    query="etfs",
    limit=5
)

print(response.objects[0].properties)  # Inspect the first object

{'title': ' Spot Ether ETFs are now officially legal in the US: Law Decoded  ', 'content': 'In a second landmark decision this year, the United States Securities and Exchange Commission has\xa0given the regulatory green light to spot Ether exchange-traded funds (ETFs) in the country. The SEC approved the 19b-4 filings from VanEck, BlackRock, Fidelity, Grayscale, Franklin Templeton, ARK 21Shares, Invesco Galaxy and Bitwise, approving the rule changes allowing spot Ether  ETH  $3,890  ETFs to be listed and traded on their respective exchanges.  ETH   $3,890  Unlike the spot Bitcoin  BTC  $68,460  ETFs approved via voting by a five-member committee including SEC Chair Gary Gensler, spot Ether ETFs were approved by the SEC’s Trading and Markets Division.  BTC   $68,460  Another major difference between the approval processes of the two crypto ETFs is that all 10 BTC ETFs started trading the day after their approval, as they also got S-1 form clearance. Spot Ether ETFs might be weeks or mon

In [42]:
response = articles.generate.near_text(
    query="banks",
    limit=5,
    single_prompt="Classify the sentiment of the text {content} from 0 to 10. Give your answer as a single number and do not include any text with it"
)

print(response.objects[0].generated)  # Inspect the generated text

7


In [43]:
for item in response.objects:
    print(item.generated)

7
5
7
7
5


In [15]:
# client.close()  # Close client gracefully