In [13]:
import sys
import os

# Make the repository root (the parent of the notebook's working directory)
# importable so that the `lib.*` modules below can be resolved.
cwd = os.getcwd()
repo_path = os.path.abspath(os.path.join(cwd, os.pardir))
if not any(entry == repo_path for entry in sys.path):
    sys.path.append(repo_path)

import chromadb as db 

from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
import pandas as pd
from lib.utils import clean_scraped_text
from lib.vector.structure import build_vector_db_structure

In [14]:
# Load the processed news export, keep only the columns needed downstream,
# keep a single row per article URL (the first one encountered), and clean
# the scraped article body with the project's `clean_scraped_text` helper.
news_data = (
    pd.read_csv('news_data_processed.csv')
    .loc[
        :,
        [
            'EventId',
            'Date',
            'ArticleUrl',
            'NumMentions',
            'AvgTone',
            'GoldsteinScale',
            'news_title',
            'news_summary',
            'news_content',
        ],
    ]
    .drop_duplicates(subset=['ArticleUrl'], keep='first')
    .reset_index(drop=True)
    .assign(
        news_content=lambda frame: frame['news_content'].apply(clean_scraped_text)
    )
)

# Columns passed to the structure builder as per-document metadata.
metadatas_cols = ['Date', 'ArticleUrl', 'NumMentions', 'AvgTone', 'GoldsteinScale']

# 'EventId' is passed as the id column and 'news_content' as the document
# text column; the result is indexed later by 'datas', 'metadatas' and 'ids'.
vect_db_structure = build_vector_db_structure(
    news_data,
    metadatas_cols,
    'EventId',
    'news_content',
)

  text = BeautifulSoup(text, 'html.parser').get_text().encode('utf-8').decode('unicode_escape')


In [16]:
# The Hugging Face `tokenizers` library warns (and disables its own
# parallelism) when the process forks after the tokenizer has been used.
# Setting the variable up front, as that warning recommends, avoids the
# message; `setdefault` keeps any value already chosen in the environment.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Sentence-transformer embedding model shared by indexing and querying.
embedding_model = SentenceTransformerEmbeddings(model_name="thenlper/gte-small")


  from .autonotebook import tqdm as notebook_tqdm
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [17]:
# Persistent Chroma store located at <parent of the working directory>/data/vector_db.
db_path = os.path.join(os.path.dirname(os.path.abspath("")), "data", "vector_db")
chroma_client = db.PersistentClient(path=db_path)
collection_name = "news"

# Fetch the collection, creating it with cosine distance if it does not exist.
# get_or_create_collection replaces the manual name scan over
# list_collections(), whose return type differs between chromadb releases.
collection_one = chroma_client.get_or_create_collection(
    name=collection_name,
    metadata={"hnsw:space": "cosine"},
)

# Populate only when the collection is empty. Checking the document count
# (rather than mere existence) also recovers from an earlier run that created
# the collection but failed before adding any documents.
# NOTE(review): a collection that was already populated by a previous run is
# left untouched; delete it (or the vector_db directory) to force a rebuild.
if collection_one.count() == 0:
    # list(...) so that both a plain list and a pandas Series are accepted
    # (the exact type returned by build_vector_db_structure is not visible here).
    documents = list(vect_db_structure['datas'])
    collection_one.add(
        documents=documents,
        # Embed explicitly with the same model that is used at query time.
        # The LangChain embeddings object does not follow Chroma's own
        # embedding-function call protocol, and a collection obtained without
        # an embedding function may otherwise embed documents with Chroma's
        # default model, producing vectors that do not match query vectors.
        embeddings=embedding_model.embed_documents(documents),
        metadatas=vect_db_structure['metadatas'],
        ids=vect_db_structure['ids'],
    )

In [18]:
from langchain_chroma import Chroma

# LangChain vector-store wrapper over the existing persistent client and the
# "news" collection; queries are embedded with `embedding_model`.
langchain_chroma = Chroma(
    collection_name="news",
    embedding_function=embedding_model,
    client=chroma_client,
)

# Sanity check: report how many entries the wrapped collection holds.
# NOTE(review): `_collection` is a private attribute of the wrapper.
print(
    "There are",
    langchain_chroma._collection.count(),
    "in the collection",
)

There are 68 in the collection


In [20]:
# Retrieve the five closest documents (each paired with a score) for a sample
# query and display their text, separated by blank lines, as the cell output.
query = "The US is sending military aid to Ukraine"
docs_chroma = langchain_chroma.similarity_search_with_score(query, k=5)
"\n\n".join(match.page_content for match, _ in docs_chroma)

'Ukrainian and Western leaders laud US aid package while the Kremlin warns of further ruin Ukrainian and Western leaders have welcomed the passing of a desperately needed aid package for Ukraine by the US House of Representatives\n\nPresident Joe Biden has signed a  billion aid package into law providing crucial military assistance to Ukraine and Israel The package which was passed by the Senate on Tuesday April  includes nearly  billion in aid to Ukraine  billion for Israel and  billion for the IndoPacific region The legislation also includes a provision that could lead to the banning of TikTok in the United States Speaking from the White House after signing the bill Biden described it as a good day for America a good day for Ukraine and a good day for world peace He added that the aid package is going to make America safer Its going to make the world safer And it continues Americas leadership in the world The signing of the aid package follows months of negotiations and personal lobb