In [3]:
! pip install --upgrade google-cloud-storage --quiet
from google.cloud import storage

In [5]:
import os
from langchain.vectorstores.chroma import Chroma
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

# Name of the GCS bucket that holds the persisted Chroma data.
BUCKET_NAME = "bucket_arxiv_researcher"
# Object-name prefix (the "directory") inside the bucket that is downloaded.
GCS_PERSIST_PATH = "chroma_db/"
# Local directory the downloaded objects are written to; the same path is
# later handed to Chroma as its persist_directory.
LOCAL_PERSIST_PATH = "./local_chromadb/"

# Initialize GCS client
# NOTE(review): no project or credentials are passed, so this presumably
# relies on ambient application-default credentials — confirm for the runtime.
storage_client = storage.Client()

def download_directory_from_gcs(gcs_directory, local_directory, bucket_name, client=None):
    """Download every object under a GCS "directory" prefix into a local directory.

    The sub-path of each object below ``gcs_directory`` is recreated below
    ``local_directory``. Zero-byte "directory placeholder" objects (names
    ending in "/") are skipped.

    Args:
        gcs_directory: Object-name prefix to download. A trailing "/" is
            added when missing so that "chroma_db" does not also match
            sibling prefixes such as "chroma_db_backup/".
        local_directory: Destination directory; created on demand.
        bucket_name: Name of the GCS bucket to read from.
        client: Optional object exposing ``bucket(name)``; defaults to the
            module-level ``storage_client``.

    Raises:
        ValueError: If an object name would resolve to a path outside
            ``local_directory`` (for example a name containing "..").
    """
    gcs_client = storage_client if client is None else client
    bucket = gcs_client.bucket(bucket_name)

    # Treat the prefix as a directory: without the trailing slash the listing
    # would also return objects that merely share the leading characters.
    prefix = gcs_directory or ""
    if prefix and not prefix.endswith("/"):
        prefix += "/"

    local_root = os.path.abspath(local_directory)

    for blob in bucket.list_blobs(prefix=prefix or None):
        if blob.name.endswith("/"):  # Avoid directory blobs
            continue

        # Everything after the prefix is the path relative to the download
        # root; strip stray leading slashes so os.path.join cannot be
        # redirected to an absolute path.
        relative_path = blob.name[len(prefix):].lstrip("/")
        local_file_path = os.path.join(local_directory, relative_path)

        # Refuse to write outside the requested destination directory.
        resolved_path = os.path.abspath(local_file_path)
        if os.path.commonpath([local_root, resolved_path]) != local_root:
            raise ValueError(
                f"Blob {blob.name!r} resolves outside of {local_directory!r}"
            )

        # dirname is "" when local_directory is empty and the blob sits
        # directly under the prefix; os.makedirs("") would raise.
        parent_dir = os.path.dirname(local_file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        blob.download_to_filename(local_file_path)
        print(f"Downloaded {blob.name} to {local_file_path}")

# Download Chroma persisted data from GCS to local directory.
# Every run of this cell fetches all objects under the prefix again; the
# helper performs no check for files that already exist locally.
download_directory_from_gcs(GCS_PERSIST_PATH, LOCAL_PERSIST_PATH, BUCKET_NAME)

Downloaded chroma_db/5a42027a-353b-44cf-9370-6c19f5ea675f/data_level0.bin to ./local_chromadb/5a42027a-353b-44cf-9370-6c19f5ea675f/data_level0.bin
Downloaded chroma_db/5a42027a-353b-44cf-9370-6c19f5ea675f/header.bin to ./local_chromadb/5a42027a-353b-44cf-9370-6c19f5ea675f/header.bin
Downloaded chroma_db/5a42027a-353b-44cf-9370-6c19f5ea675f/length.bin to ./local_chromadb/5a42027a-353b-44cf-9370-6c19f5ea675f/length.bin
Downloaded chroma_db/5a42027a-353b-44cf-9370-6c19f5ea675f/link_lists.bin to ./local_chromadb/5a42027a-353b-44cf-9370-6c19f5ea675f/link_lists.bin
Downloaded chroma_db/chroma.sqlite3 to ./local_chromadb/chroma.sqlite3


In [8]:
!pip install langchain_huggingface --quiet
from langchain_huggingface import HuggingFaceEmbeddings
EMBEDDING_MODEL = 'sentence-transformers/all-mpnet-base-v2'
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# Load the stored vector database
db = Chroma(persist_directory=LOCAL_PERSIST_PATH, embedding_function=embeddings)

# Now use db for retrieval
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 3})