In [None]:
!pip install chromadb

In [2]:
import os
import shutil
import re
import json
import logging

from datetime import datetime
from dotenv import load_dotenv

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import AzureOpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFDirectoryLoader

# Name the logger after the running module. The previous code passed the
# literal string '__name__', which created a logger literally called "__name__".
# NOTE(review): no handler or level is configured in the visible cells, so
# INFO records emitted through this logger are dropped by default.
logger = logging.getLogger(__name__)

In [3]:
# --- Azure OpenAI credentials ------------------------------------------------
# Both values are read from the process environment and are None when the
# corresponding variable is unset.
# NOTE(review): `load_dotenv` is imported but never called in the visible
# cells; if the keys live in a .env file, call `load_dotenv()` before this.
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_VERSION = "2024-06-01"

# Mirror the Azure key into OPENAI_API_KEY when that variable is unset/empty.
# Only do so when the Azure key is actually present: assigning None into
# os.environ raises "TypeError: str expected, not NoneType".
if AZURE_OPENAI_API_KEY and not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = AZURE_OPENAI_API_KEY

# --- Indexing parameters -----------------------------------------------------
embedding_deployment_name = "azure-text-embedding-3-small"
chunk_size = 1000  # handed to the text splitter as `chunk_size`
chunk_overlap = 150  # handed to the text splitter as `chunk_overlap`
model_name = "azure-gpt-4o"
persist_directory = "./db"  # on-disk location of the Chroma collection
documents_directory = "doc_pool"  # directory the PDF loader reads from

# Create the persistence directory if needed; exist_ok makes re-running the
# cell a no-op and avoids the check-then-create race of exists() + mkdir().
os.makedirs(persist_directory, exist_ok=True)

In [3]:
%%time

loader = PyPDFDirectoryLoader(documents_directory)
docs = loader.load()

CPU times: user 19 s, sys: 140 ms, total: 19.1 s
Wall time: 19.1 s


In [12]:
# Sanity check: display the extracted text of the first loaded document.
docs[0].page_content

'1\nWHITE PAPER ON \nHEALTHIER SG\n'

In [4]:
%%time

try:
    # split documents
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    split_docs = text_splitter.split_documents(docs)
    logger.info(
        f"Number of documents after passing through text_splitter: {len(split_docs)}"
    )
    # define embedding
    # embeddings.embed_query('this is test')
    embedding = AzureOpenAIEmbeddings(
        model=embedding_deployment_name
    )

    # create vector database from data
    vectordb = Chroma.from_documents(
        collection_name='sg_health',
        documents=split_docs,
        embedding=embedding,
        persist_directory=persist_directory,
    )
    logger.info(f"VectorDB collection count: {vectordb._collection.count()}")
except Exception as e:
    logger.exception(f"Error indexing the data: {e}")

CPU times: user 3.04 s, sys: 161 ms, total: 3.2 s
Wall time: 10.7 s


## Test ChromaDB retrieval

In [None]:
# Re-open the persisted "sg_health" collection with the same embedding
# deployment and run a sample similarity query against it.
embedding = AzureOpenAIEmbeddings(model=embedding_deployment_name)
vector_store = Chroma(
    persist_directory=persist_directory,
    collection_name="sg_health",
    embedding_function=embedding,
)

sample_query = "What are the key features of healthier SG"
results = vector_store.similarity_search(sample_query, k=5)

# Print each hit's text followed by its metadata; `res` stays bound to the
# last hit once the loop finishes.
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

In [6]:
# Chroma >= 0.4 writes to `persist_directory` automatically; calling
# `persist()` there is deprecated and only emits a warning. Call it only when
# the underlying client still exposes a manual persist hook (older Chroma).
if hasattr(getattr(vector_store, "_client", None), "persist"):
    vector_store.persist()

  vector_store.persist()


In [7]:
# Inspect the last hit returned by the similarity search. Index `results`
# explicitly rather than relying on a loop variable leaking out of another
# cell, which breaks when cells are run out of order.
results[-1]

Document(metadata={'page': 3, 'source': 'doc_pool/healthiersg-whitepaper-pdf.pdf'}, page_content='6 7EXECUTIVE SUMMARY\nBUILDING UPON STRONG FOUNDATIONS\nWe are building Healthier SG on strong foundations. Over the years, we have expanded \nthe capacity of our healthcare system across primary, acute and long-term care; \nstrengthened the capabilities of our healthcare workforce by expanding education and \ntraining opportunities; and kept healthcare accessible and affordable by enhancing \nhealthcare subsidies and introducing MediShield Life and CareShield Life.\nWe have also achieved very good health outcomes. Life expectancy in Singapore is among \nthe highest in the world, and we have kept our healthcare expenditure as a percentage  \nof Gross Domestic Product \n(GDP) at a sustainable level. \nClinically, the performance  \nof the healthcare system \nis improving, as shown by \nthe reduction in hospital  \nre-admissions.Healthier SG is a major transformation of our healthcare system