In [22]:
import os
from google.cloud import storage
from langchain.vectorstores.chroma import Chroma
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain, RetrievalQA
from langchain.chat_models import ChatHuggingFace
from langchain_huggingface import HuggingFaceEmbeddings
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
storage_client = storage.Client()

In [3]:
def download_directory_from_gcs(gcs_directory, local_directory, bucket_name):
    """Download all files from a GCS directory to a local directory."""
    bucket = storage_client.bucket(bucket_name)
    blobs = bucket.list_blobs(prefix=gcs_directory)

    for blob in blobs:
        if not blob.name.endswith("/"):  # Avoid directory blobs
            relative_path = os.path.relpath(blob.name, gcs_directory)
            local_file_path = os.path.join(local_directory, relative_path)
            os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
            blob.download_to_filename(local_file_path)
            print(f"Downloaded {blob.name} to {local_file_path}")

In [4]:
BUCKET_NAME = os.getenv("BUCKET_NAME")
GCS_PERSIST_PATH = os.getenv("GCS_PERSIST_PATH")
LOCAL_PERSIST_PATH = os.getenv("LOCAL_PERSIST_PATH")

In [5]:
BUCKET_NAME

'arxiv-researcher-bucket'

In [6]:
# Download Chroma persisted data from GCS to local directory
download_directory_from_gcs(GCS_PERSIST_PATH, LOCAL_PERSIST_PATH, BUCKET_NAME)

Downloaded chroma_db/4d8e52f2-6027-41ef-b06b-77f96cb894fa/data_level0.bin to ../chroma_db/4d8e52f2-6027-41ef-b06b-77f96cb894fa/data_level0.bin
Downloaded chroma_db/4d8e52f2-6027-41ef-b06b-77f96cb894fa/header.bin to ../chroma_db/4d8e52f2-6027-41ef-b06b-77f96cb894fa/header.bin
Downloaded chroma_db/4d8e52f2-6027-41ef-b06b-77f96cb894fa/index_metadata.pickle to ../chroma_db/4d8e52f2-6027-41ef-b06b-77f96cb894fa/index_metadata.pickle
Downloaded chroma_db/4d8e52f2-6027-41ef-b06b-77f96cb894fa/length.bin to ../chroma_db/4d8e52f2-6027-41ef-b06b-77f96cb894fa/length.bin
Downloaded chroma_db/4d8e52f2-6027-41ef-b06b-77f96cb894fa/link_lists.bin to ../chroma_db/4d8e52f2-6027-41ef-b06b-77f96cb894fa/link_lists.bin
Downloaded chroma_db/chroma.sqlite3 to ../chroma_db/chroma.sqlite3


In [7]:
EMBEDDING_MODEL = 'sentence-transformers/all-mpnet-base-v2'
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# Load the stored vector database
db = Chroma(persist_directory=LOCAL_PERSIST_PATH, embedding_function=embeddings)

# Now use db for retrieval
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 3})

  from .autonotebook import tqdm as notebook_tqdm
  db = Chroma(persist_directory=LOCAL_PERSIST_PATH, embedding_function=embeddings)


In [19]:
query = "Neural networks for image recognition"
retrieved_docs = retriever.get_relevant_documents(query)

In [21]:
i = 1
for doc in retrieved_docs:
    print(i)
    print(doc.page_content)
    print(doc.metadata)
    print("\n")
    i += 1

1
comparing robustness of pairwise and multiclass neuralnetwork systems
  for face recognition   noise corruptions and variations in face images can seriously hurt the
performance of face recognition systems to make such systems robust
multiclass neuralnetwork classifiers capable of learning from noisy data have
been suggested however on large face data sets such systems cannot provide the
robustness at a high level in this paper we explore a pairwise neuralnetwork
system as an alternative approach to improving the robustness of face
recognition in our experiments this approach is shown to outperform the
multiclass neuralnetwork system in terms of the predictive accuracy on the
face images corrupted by noise

{'id': '0704.3515', 'year': '2007'}


2
the parameterless selforganizing map algorithm   the parameterless selforganizing map plsom is a new neural network
algorithm based on the selforganizing map som it eliminates the need for a
learning rate and annealing schemes for learning r

### langchain QA

In [None]:
from langchain.chat_models import ChatHuggingFace
from langchain.chains import RetrievalQA

In [28]:
from langchain_huggingface import HuggingFaceEndpoint
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate

In [33]:
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

In [None]:
repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

llm = HuggingFaceEndpoint(
    repo_id=repo_id,
    max_length=128,
    temperature=0.5,
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
    task="text-generation"
)
# llm_chain = prompt | llm
# print(llm_chain.invoke({"question": question}))

                    max_length was transferred to model_kwargs.
                    Please make sure that max_length is what you intended.


 The FIFA World Cup is a soccer (or football, depending on your preference) tournament that takes place every four years. The 1994 World Cup was indeed held in the United States. Now, let's find out who won it.

The 1994 FIFA World Cup final was played on July 17, 1994. The two teams that made it to the final were Brazil and Italy. Brazil won the match 0-0 after extra time, and then 3-2 in a penalty shootout.

So, the answer is Brazil. They won the FIFA World Cup in the year 1994.


In [42]:
qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever)

In [None]:
query1 = "Which articles use LLM in Finance, and return back the id and metadata of this article"
response1 = qa_chain.invoke(query1)
print(response1)



{'query': 'Which articles use LLM in Finance, and return back the id and metadata of this article', 'result': " To answer this question, we would need to use a combination of access log data and citation information. The article investigating the use of access logs for paper recommendations on arXiv mentions finance as one of the fields of physics it covers. However, we cannot directly determine which specific articles in finance are being referred to without further investigation. We could look for articles in the arXiv finance category that have high access frequencies or use coaccess measures to identify related papers. Additionally, we could check the citation information of papers in the finance category to see which ones have a high number of citations in the finance literature. An online recommendation system has been built based on this research to help scientists find further relevant literature, so we could potentially use this system to identify articles in finance that have

In [46]:
query2 = "les articles dans cette base de données concernent quels domains?"
response2 = qa_chain.invoke(query2)
print(response2)



{'query': 'les articles dans cette base de données concernent quels domains?', 'result': ' These articles in this database concern mathematics and the statistical properties of European universities in the context of the science system. Specifically, they discuss scaling rules, such as the sizedependent cumulative advantage for citations and the relation between journal impact and field citation density. They also compare typological profiles of languages.'}
