In [1]:
# ! sh ../bin/install_requirements_databricks.sh
# dbutils.library.restartPython()

In [2]:
import sys
import os

cwd = os.getcwd()
repo_path = os.path.abspath(os.path.join(cwd, '..'))
if repo_path not in sys.path:
    sys.path.append(repo_path)

In [None]:
import chromadb as db 
import pandas as pd
from langchain_chroma import Chroma

from lib.vector.structure import build_vector_db_structure
from lib.scraping.scrap import collect_rss_feed, extract_news_content_from_url_to_dataframe, load_rss_urls_from_config
from lib.embedding.custom_embedding import CustomHuggingFaceEmbeddings

In [3]:
READ_RSS = False        
SCRAP_ARTICLES_CONTENT = False
UPDATE = True

In [4]:
if READ_RSS:
    rss_urls = load_rss_urls_from_config(os.path.join(repo_path, 'config', 'rss_urls.yaml'))
    rss_feed_df = collect_rss_feed(rss_urls)
    rss_feed_df.to_csv(os.path.join(repo_path, 'data', 'rss_feed_df.csv'), index=False)

In [5]:
if SCRAP_ARTICLES_CONTENT:
    rss_feed_df = pd.read_csv(os.path.join(repo_path, 'data', 'rss_feed_df.csv'))
    rss_feed_df = extract_news_content_from_url_to_dataframe(rss_feed_df, url_column = 'Link', output_column = 'Content')
    rss_feed_df.to_csv(os.path.join(repo_path, 'data', 'rss_feed_with_content_df.csv'), index=False)


In [6]:
news_data = pd.read_csv(os.path.join(repo_path, 'data', 'rss_feed_with_content_df.csv')).dropna().reset_index()
metadatas_cols = ['Published','Link','Title','Source','Summary']

In [8]:
embedding_model = CustomHuggingFaceEmbeddings(model_name="thenlper/gte-small")  # sentence-transformers/all-MiniLM-l6-v2

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
db_path = os.path.join(os.path.dirname(os.path.abspath("")), "data", "vector_db")
chroma_client = db.PersistentClient(path=db_path)
collection_name = "news"
if collection_name not in [c.name for c in chroma_client.list_collections()]:
    chroma_client.create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine"},
        embedding_function=embedding_model,
    )
    collection_one = chroma_client.get_collection(name=collection_name)
    vect_db_structure = build_vector_db_structure(news_data, metadatas_cols, 'index', 'Content')
    collection_one.add(
        documents=vect_db_structure['datas'],
        metadatas=vect_db_structure['metadatas'],
        ids=vect_db_structure['ids']
    )
else:
    if UPDATE==True:
        collection_one = chroma_client.get_collection(name=collection_name)
        news_data['index'] = news_data['index'] + max([int(id) for id in collection_one.get()['ids']])
        vect_db_structure = build_vector_db_structure(news_data, metadatas_cols, 'index', 'Content')
    else:
        collection_one = chroma_client.get_collection(name=collection_name)

In [10]:
langchain_chroma = Chroma(
    client=chroma_client,
    collection_name="news",
    embedding_function=embedding_model,
)

print("There are", langchain_chroma._collection.count(), "in the collection")

There are 1013 in the collection


In [11]:
query="Tesla stocks"
docs_chroma = langchain_chroma.similarity_search_with_score(query, k=5)

In [12]:
docs_chroma[-1]

(Document(page_content='CNN — It’s time for investors to start making safer bets That’s what Howard Marks cochairman of Oaktree Capital told CNNMoney editoratlarge Richard Quest on “Markets Now” on Wednesday “Defense is more important than offense” right now said Marks the author of “Mastering the Market Cycle Getting the Odds on Your Side” Investors should consider taking a stake in utilities and decreasing their investments in more volatile tech stocks he said Defense is the name of the game for a few reasons Though stocks have been soaring Marks warned that we may be nearing the end of the bull cycle “I’m not saying get out” he said “I think that being out of the market is pretty dangerous today and I think it would be a mistake to raise cash” But more reliable stocks can protect investors from big losses if the climate changes Marks also pointed to the trade war with China as another reason for investors to tread carefully “We have a trade battle with China it’s probably going to g