In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

sys.path.append('../../')

In [3]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import LocalFileStore
from langchain_text_splitters import RecursiveCharacterTextSplitter

from src.ingestion import load_pdf

import os
import uuid

import chromadb
import pickle

In [4]:
RAW_DOCS_PATH = "../../data/raw"
PROCESSED_DOCS_PATH = "../../data/processed"

In [5]:
docs = [load_pdf(os.path.join(RAW_DOCS_PATH, f)) for f in os.listdir(RAW_DOCS_PATH)]

In [23]:
persistent_client = chromadb.PersistentClient(path='../../data/chroma')

In [7]:
def save_to_pickle(obj, filename):
    with open(filename, "wb") as file:
        pickle.dump(obj, file, pickle.HIGHEST_PROTOCOL)

def load_from_pickle(filename):
    with open(filename, "rb") as file:
        return pickle.load(file)

# Parent Child Retriever

In [31]:
def get_parent_child_splits(docs, parent_chunk_size=1200, parent_chunk_overlap=400, child_chunk_size=300, child_chunk_overlap=0, id_key="parent_doc_id"):
    parent_splitter = RecursiveCharacterTextSplitter(
            chunk_size=parent_chunk_size,
            chunk_overlap=parent_chunk_overlap,
            separators=['\n\n\n', '\n\n', '\n', r'\.\s+', ' ', '']
    )

    child_splitter = RecursiveCharacterTextSplitter(
        chunk_size=child_chunk_size,
        chunk_overlap=child_chunk_overlap,
    )

    parent_docs = parent_splitter.split_documents(docs)
    parent_docs_ids = [str(uuid.uuid4()) for _ in parent_docs]

    child_docs = []
    for i, doc in enumerate(parent_docs):
        _id = parent_docs_ids[i]
        _child_docs = child_splitter.split_documents([doc])
        for _doc in _child_docs:
            _doc.metadata[id_key] = _id
        child_docs.extend(_child_docs)

    return parent_docs, parent_docs_ids, child_docs

In [44]:
def get_embedding_function(model_name):
    return HuggingFaceEmbeddings(model_name=model_name)

def get_multivector_retriever(chroma_client, embedding_model_name, collection_name, save_path, parent_docs=[], parent_docs_ids=[], child_docs=[], id_key="parent_doc_id"):
    
    # Create save directories
    os.makedirs(os.path.join(save_path), exist_ok=True)
    docstore_path = os.path.join(save_path, 'docstore', collection_name)
    vectorstore_path = os.path.join(save_path, 'chroma')

    # Get embedding_function
    embedding_function = get_embedding_function(embedding_model_name)

    vectorstore = Chroma(
        client=chroma_client,
        collection_name=collection_name,
        embedding_function=embedding_function,
        persist_directory=vectorstore_path
    )

    store = LocalFileStore(docstore_path)

    # The retriever (empty to start)
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        byte_store=store,
        id_key=id_key,
    )

    # If vectorstore isn't populated, populate and persist
    if not len(retriever.vectorstore.get()['documents']) == len(child_docs):

        if child_docs:
            retriever.vectorstore.add_documents(child_docs)
            retriever.docstore.mset(list(zip(parent_docs_ids, parent_docs)))

            # Save the vectorstore and docstore to disk
            retriever.vectorstore.persist()

    return retriever    

In [45]:
save_path = 'D:\Ahmed\saudi-rag-project\data'

  save_path = 'D:\Ahmed\saudi-rag-project\data'


In [48]:
%%time

PARENT_CHUNK_SIZE = 1200
PARENT_CHUNK_OVERLAP = 400

CHILD_CHUNK_SIZE = 300
CHILD_CHUNK_OVERLAP = 0

EMBEDDING_MODEL_NAME = 'intfloat/multilingual-e5-small'

COLLECTION_NAME = f"PARENT_{PARENT_CHUNK_SIZE}_{PARENT_CHUNK_OVERLAP}_CHILD_{CHILD_CHUNK_SIZE}_{CHILD_CHUNK_OVERLAP}_{EMBEDDING_MODEL_NAME}"
COLLECTION_NAME = COLLECTION_NAME.replace('/', '_').replace('-', '_')

parent_docs, parent_docs_ids, child_docs = get_parent_child_splits(docs)
retriever = get_multivector_retriever(persistent_client, EMBEDDING_MODEL_NAME, COLLECTION_NAME, save_path, parent_docs, parent_docs_ids, child_docs)

CPU times: total: 40.7 s
Wall time: 24.9 s
