In [None]:
# Run the repository's Databricks requirements-install script, then restart the
# Python process so that packages installed by the script can be imported.
! sh ../bin/install_requirements_databricks.sh
dbutils.library.restartPython()

In [None]:
import sys
import os

# Expose the directory one level above the working directory to the import
# system so that the project-local `lib` package can be resolved below.
cwd = os.getcwd()
repo_path = os.path.abspath(os.path.join(cwd, '..'))
if not any(entry == repo_path for entry in sys.path):
    sys.path.append(repo_path)

import chromadb as db 

from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
    HuggingFaceEmbeddings,
)
import pandas as pd
from lib.utils import clean_scraped_text
from lib.vector.structure import build_vector_db_structure

In [None]:
# Flag read by the vector-store cell further down: when the "news" collection
# already exists, True selects the update branch, False only opens the
# existing collection.
UPDATE = True

In [None]:
import pandas as pd
from lib.scraping.scrap import collect_rss_feed, extract_news_content_from_url_to_dataframe, load_rss_urls_from_config

# Load the feed URLs from config/rss_urls.yaml, collect the feeds into
# `rss_feed_df`, and save that frame to data/rss_feed_df.csv.
rss_config_file = os.path.join(repo_path, 'config', 'rss_urls.yaml')
rss_urls = load_rss_urls_from_config(rss_config_file)

rss_feed_df = collect_rss_feed(rss_urls)
rss_feed_csv = os.path.join(repo_path, 'data', 'rss_feed_df.csv')
rss_feed_df.to_csv(rss_feed_csv, index=False)


In [None]:
# Fill a 'Content' column from the URLs in the 'Link' column, then save the
# enriched frame to data/rss_feed_with_content_df.csv.
rss_feed_df = extract_news_content_from_url_to_dataframe(
    rss_feed_df,
    url_column='Link',
    output_column='Content',
)
content_csv = os.path.join(repo_path, 'data', 'rss_feed_with_content_df.csv')
rss_feed_df.to_csv(content_csv, index=False)


In [None]:
# Reload the saved articles and drop every row that has a missing value.
# reset_index() is called without drop=True on purpose: the previous row labels
# are kept in an 'index' column, which the vector-store cell passes on as the
# id column.
content_csv_path = os.path.join(repo_path, 'data', 'rss_feed_with_content_df.csv')
news_data = (
    pd.read_csv(content_csv_path)
    .dropna()
    .reset_index()
)

# Columns handed to build_vector_db_structure as the metadata columns.
metadatas_cols = ['Published', 'Link', 'Title', 'Source', 'Summary']

In [None]:
# Embedding model shared by the vector-store cell and the LangChain Chroma wrapper.
# Alternative models (disabled):
# embedding_model = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-l6-v2")
# embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-l6-v2")
embedding_model = HuggingFaceEmbeddings(model_name="thenlper/gte-small")

In [None]:
# Open the persistent Chroma store at <parent of cwd>/data/vector_db and bring
# the "news" collection in line with `news_data`:
#   * collection missing         -> create it and insert every article
#   * collection exists, UPDATE  -> append the articles under fresh ids
#   * collection exists, no flag -> just open it
db_path = os.path.join(os.path.dirname(os.path.abspath("")), "data", "vector_db")
chroma_client = db.PersistentClient(path=db_path)
collection_name = "news"


def _add_news(collection, news_df):
    """Embed the articles in ``news_df`` and insert them into ``collection``.

    The rows are converted with ``build_vector_db_structure`` (ids from the
    'index' column, documents from the 'Content' column, metadata from
    ``metadatas_cols``). Embeddings are computed here with ``embedding_model``
    and passed explicitly, so the stored vectors come from the same model that
    the LangChain Chroma wrapper uses to embed queries, instead of from
    whatever embedding function Chroma would apply to raw documents.

    Args:
        collection: Chroma collection that receives the records.
        news_df: DataFrame holding the 'index', 'Content' and metadata columns.

    Returns:
        The structure produced by ``build_vector_db_structure``.
    """
    structure = build_vector_db_structure(news_df, metadatas_cols, 'index', 'Content')
    documents = list(structure['datas'])
    if not documents:
        # Nothing to insert; skip the call rather than send an empty batch.
        return structure
    collection.add(
        documents=documents,
        embeddings=embedding_model.embed_documents(documents),
        metadatas=structure['metadatas'],
        ids=structure['ids'],
    )
    return structure


# Depending on the Chroma release, list_collections() yields Collection objects
# or plain collection names; accept both.
existing_names = {getattr(c, "name", c) for c in chroma_client.list_collections()}

if collection_name not in existing_names:
    # No embedding_function is registered on the collection: the LangChain
    # embedding object does not implement Chroma's embedding-function
    # interface, and _add_news always supplies the vectors itself.
    collection_one = chroma_client.create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine"},
    )
    vect_db_structure = _add_news(collection_one, news_data)
elif UPDATE:
    collection_one = chroma_client.get_collection(name=collection_name)
    # Only the ids are needed to find the next free id, so skip documents/metadata.
    existing_ids = collection_one.get(include=[])['ids']
    # Start one past the highest stored id so that the row with index 0 cannot
    # collide with it; an empty collection starts at 0.
    next_id = max((int(doc_id) for doc_id in existing_ids), default=-1) + 1
    # Shift ids on a copy so that re-running this cell does not shift them again.
    news_to_add = news_data.assign(index=news_data['index'] + next_id)
    vect_db_structure = _add_news(collection_one, news_to_add)
else:
    collection_one = chroma_client.get_collection(name=collection_name)

In [None]:
from langchain_chroma import Chroma

# Wrap the "news" collection of the existing Chroma client in LangChain's
# vector-store interface, embedding queries with `embedding_model`.
langchain_chroma = Chroma(
    embedding_function=embedding_model,
    collection_name="news",
    client=chroma_client,
)

# Report how many records the wrapped collection currently holds.
n_stored = langchain_chroma._collection.count()
print("There are", n_stored, "in the collection")

In [None]:
# Ask the vector store for the five best matches for the query, with scores.
query = "Tesla stocks"
docs_chroma = langchain_chroma.similarity_search_with_score(
    query=query,
    k=5,
)

In [None]:
# Display the last entry of the results returned for the query above.
docs_chroma[-1]