In [2]:
import os
from google.cloud import storage
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

In [3]:
MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'
BUCKET_NAME = "arxiv-researcher-bucket"
GCS_PERSIST_PATH = "chroma_db/"
LOCAL_PERSIST_PATH = "./local_chromadb/"

# Embedding model 
embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)

def get_vectorstore(gcs_directory=GCS_PERSIST_PATH, local_directory=LOCAL_PERSIST_PATH, bucket_name=BUCKET_NAME):
    # Initialize GCS client
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blobs = bucket.list_blobs(prefix=gcs_directory)

    # Download Chroma persisted data from GCS to local directory
    for blob in blobs:
        if not blob.name.endswith("/"):  # Avoid directory blobs
            relative_path = os.path.relpath(blob.name, gcs_directory)
            local_file_path = os.path.join(local_directory, relative_path)
            os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
            blob.download_to_filename(local_file_path)

    # Load the stored vector database
    vectorstore = Chroma(persist_directory=LOCAL_PERSIST_PATH, embedding_function=embeddings)

    # Retrieve all stored documents
    return vectorstore

vectorstore = get_vectorstore()

  embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)
  from .autonotebook import tqdm as notebook_tqdm
  vectorstore = Chroma(persist_directory=LOCAL_PERSIST_PATH, embedding_function=embeddings)


In [4]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})
query = "Neural networks for image recognition"
retrieved_docs = retriever.get_relevant_documents(query)

i = 1
for doc in retrieved_docs:
    print(i)
    print(doc.page_content)
    print(doc.metadata)
    print("\n")
    i += 1


  retrieved_docs = retriever.get_relevant_documents(query)


1
comparing robustness of pairwise and multiclass neuralnetwork systems
  for face recognition   noise corruptions and variations in face images can seriously hurt the
performance of face recognition systems to make such systems robust
multiclass neuralnetwork classifiers capable of learning from noisy data have
been suggested however on large face data sets such systems cannot provide the
robustness at a high level in this paper we explore a pairwise neuralnetwork
system as an alternative approach to improving the robustness of face
recognition in our experiments this approach is shown to outperform the
multiclass neuralnetwork system in terms of the predictive accuracy on the
face images corrupted by noise

{'id': '0704.3515', 'year': '2007'}


2
the parameterless selforganizing map algorithm   the parameterless selforganizing map plsom is a new neural network
algorithm based on the selforganizing map som it eliminates the need for a
learning rate and annealing schemes for learning r

In [5]:
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFaceEndpoint
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

In [6]:
repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

llm = HuggingFaceEndpoint(
    repo_id=repo_id,
    max_length=128,
    temperature=0.5,
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
    task="text-generation"
)

                    max_length was transferred to model_kwargs.
                    Please make sure that max_length is what you intended.


In [7]:
qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever)

query1 = "Which articles use LLM in Finance, and return back the id and metadata of this article"
response1 = qa_chain.invoke(query1)
print(response1.keys())



dict_keys(['query', 'result'])


In [9]:
response1['query']

'Which articles use LLM in Finance, and return back the id and metadata of this article'

In [10]:
response1['result']

' Based on the context provided, it seems that the articles discussed in the texts are related to the use of access data and digital libraries for paper recommendations. However, none of the articles mention LLM in Finance specifically. Therefore, I cannot provide you with the id and metadata of articles that use LLM in Finance based on this context.'