In [1]:
import re
from pathlib import Path

import chromadb
from langchain.document_loaders import Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Use only the persistent client: collections are kept on disk under
# ./chromadb/. The in-memory chromadb.Client() that used to be created first
# was overwritten immediately and never used.
client = chromadb.PersistentClient(path='./chromadb/')

In [3]:
# Checkpoint name of the sentence-transformers model that produces the
# embeddings for every chunk stored in the collection.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [4]:
# Prefix that get_quoted_document_chunks prepends to every chunk it returns.
speaker_name_tag = "Sanjay Sarma: "

In [5]:
def get_document_chunks(file_path, chunk_size=500, chunk_overlap=100):
    """Load a document with ``Docx2txtLoader`` and split it into text chunks.

    Args:
        file_path: Path handed to ``Docx2txtLoader``.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to the splitter.

    Returns:
        A list with the ``page_content`` string of every chunk, in the order
        the splitter produced them.
    """
    documents = Docx2txtLoader(file_path).load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return [chunk.page_content for chunk in text_splitter.split_documents(documents)]


def get_quoted_document_chunks(file_path, chunk_size=300, chunk_overlap=50, speaker_tag=None):
    """Return the chunks of a document, each prefixed with a speaker tag.

    Loading and splitting are delegated to ``get_document_chunks`` so the two
    functions cannot drift apart; only the defaults and the prefix differ.

    Args:
        file_path: Path handed to ``get_document_chunks``.
        chunk_size: Chunk size forwarded to ``get_document_chunks``.
        chunk_overlap: Chunk overlap forwarded to ``get_document_chunks``.
        speaker_tag: String prepended to every chunk. When ``None`` (the
            default) the module-level ``speaker_name_tag`` is used, which is
            the behaviour existing callers rely on.

    Returns:
        A list of strings, one per chunk, each starting with the speaker tag.
    """
    if speaker_tag is None:
        # Resolved at call time so a later reassignment of the global is honoured.
        speaker_tag = speaker_name_tag
    plain_chunks = get_document_chunks(file_path, chunk_size, chunk_overlap)
    return [speaker_tag + text for text in plain_chunks]

In [6]:
# Open the collection that holds all document chunks, creating it on first run.
collection_name = "sanjay_sarma"
try:
    collection = client.get_or_create_collection(name=collection_name)
except Exception as e:
    print(f"Error creating or getting collection: {e}")
    # Re-raise: swallowing the error would leave `collection` undefined (or
    # stale from an earlier run) and the failure would only surface later,
    # in the ingestion cells, as an unrelated error.
    raise

In [7]:
# Ingest Doc1_Sanjay_Info.docx: chunk it, embed the chunks, store them.
doc_path = "./data/Doc1_Sanjay_Info.docx"
# File name without directory or extension; used as the title and ID prefix.
# Path.stem does not depend on the file living under ./data/.
doc_title = Path(doc_path).stem

chunks = get_document_chunks(doc_path)
# IDs are "<title><chunk index>", so the same chunk gets the same ID on every run.
ids = [f"{doc_title}{i}" for i in range(len(chunks))]
metadata = [{'title': doc_title, 'text': chunk} for chunk in chunks]
embeddings = model.encode(chunks)

# upsert instead of add: the client is persistent, so on a re-run these IDs
# already exist; upsert updates those records rather than adding them again.
collection.upsert(
    ids=ids,
    documents=chunks,
    metadatas=metadata,
    embeddings=embeddings,
)

In [8]:
# Ingest Doc2_Sanjay_Interview.docx: chunk it with the speaker tag prefixed
# to every chunk, embed the chunks, store them.
doc_path = "./data/Doc2_Sanjay_Interview.docx"
# File name without directory or extension; used as the title and ID prefix.
# Path.stem does not depend on the file living under ./data/.
doc_title = Path(doc_path).stem

chunks = get_quoted_document_chunks(doc_path)
# IDs are "<title><chunk index>", so the same chunk gets the same ID on every run.
ids = [f"{doc_title}{i}" for i in range(len(chunks))]
metadata = [{'title': doc_title, 'text': chunk} for chunk in chunks]
embeddings = model.encode(chunks)

# upsert instead of add: the client is persistent, so on a re-run these IDs
# already exist; upsert updates those records rather than adding them again.
collection.upsert(
    ids=ids,
    documents=chunks,
    metadatas=metadata,
    embeddings=embeddings,
)

In [9]:
# The ingested files are .docx documents (two of them), not a PDF.
print("DOCX documents have been loaded, chunked, embedded, and stored in the database successfully!")

PDF document has been loaded, chunked, embedded, and stored in the database successfully!


In [10]:
# NOTE(review): disabled alternative that would open the 'sanjay_sarma'
# collection in ./chromadb through LangChain's Chroma wrapper. `model` is the
# raw SentenceTransformer; presumably it would have to be wrapped in a
# LangChain embeddings class before being passed as `embedding_function` -
# confirm before re-enabling.
# from langchain.vectorstores import Chroma

# vectordb = Chroma(
#     collection_name='sanjay_sarma',
#     embedding_function=model,
#     persist_directory="./chromadb")