In [None]:
from unstructured.partition.pdf import partition_pdf
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

# Directory holding the source PDFs, relative to the notebook's working
# directory. This mirrors the relative "../data/chroma_db" location used for
# the vector store, so the notebook no longer depends on one user's machine.
# NOTE(review): assumes the kernel's working directory is the notebook folder
# (a sibling of "data/") — confirm for your setup.
file_path = "../data/pdfs"
# File name of the PDF to ingest, without the ".pdf" extension; also used as
# the "source" metadata value on every generated document.
base_file_name = "tmjDoc1"


In [None]:
#Import env variables
import os
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Read the key once and fail fast with an actionable message. No cell in this
# notebook passes the key explicitly, so the OpenAI client is expected to pick
# it up from the environment; without this check a missing key only surfaces
# later, when the first embedding request is made.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )


In [None]:
# Split the source PDF into unstructured elements using the "hi_res"
# strategy, with table-structure inference switched on.
elements = partition_pdf(
    filename=f"{file_path}/{base_file_name}.pdf",
    infer_table_structure=True,
    strategy="hi_res",
)



In [None]:
# Preview every partitioned element: its index, category, the first 100
# characters of its text, and its metadata object.
for i, element in enumerate(elements):
    # One print call with a newline separator emits the same four lines as
    # four separate print calls would.
    print(
        f"\n--- Element {i} ---",
        f"Type: {element.category}",
        f"Text preview: {element.text[:100]}...",
        f"Metadata: {element.metadata}",
        sep="\n",
    )

In [None]:
# Convert the partitioned elements into LangChain Documents, then split them
# into overlapping chunks for embedding.
docs = []
for el in elements:
    text = getattr(el, "text", None)
    if not text:
        # Elements without text have nothing to embed.
        continue

    # Serialize the element once and reuse the dict for both lookups below.
    el_dict = el.to_dict()
    meta = el_dict.get("metadata") or {}

    candidate_metadata = {
        "source": base_file_name,
        "page_number": meta.get("page_number"),
        # The serialized element carries its "type" at the top level of the
        # dict, not inside "metadata", so the fallback reads it from el_dict.
        # NOTE(review): based on unstructured's documented to_dict() layout.
        "type": el.category if hasattr(el, "category") else el_dict.get("type"),
    }

    docs.append(
        Document(
            page_content=text,
            # Drop keys whose value is None: some Chroma versions only accept
            # str/int/float/bool metadata values and reject None on insert.
            metadata={
                key: value
                for key, value in candidate_metadata.items()
                if value is not None
            },
        )
    )

# chunk_size / chunk_overlap are measured with the splitter's default length
# function: chunks of up to 1000 with a 200 overlap between neighbours.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunked_docs = splitter.split_documents(docs)

In [None]:
# Print every chunk: a numbered banner, a one-line metadata header, the chunk
# text, and a blank separator line.
for i, doc in enumerate(chunked_docs, 1):
    meta = doc.metadata
    header = (
        f"source={meta.get('source')} "
        f"page={meta.get('page_number')} "
        f"type={meta.get('type')}"
    )
    # The conversion cell only sets source / page_number / type, so an
    # unconditional "section=" field would always read "None". Show it only
    # when a document actually carries that key.
    if "section" in meta:
        header += f" section={meta['section']}"
    print(f"--- Chunk {i} ---")
    print(header)
    print(doc.page_content)
    print()

In [None]:
# Embedding client for the "text-embedding-3-small" model; the same object is
# handed to the Chroma vector store as its embedding function.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Embed a short sample string. Presumably a smoke test of the model and
# credentials: `vector` is not read by any other cell visible here.
vector = embeddings.embed_query("Hello world")

In [None]:
# Vector store for the "tmj_rag_app" collection, kept on disk under
# ../data/chroma_db and embedding text through `embeddings`.
vector_store = Chroma(
    persist_directory="../data/chroma_db",
    collection_name="tmj_rag_app",
    embedding_function=embeddings,
)

In [None]:
# Add the chunks to the vector store (written to its persist directory).
#
# Each chunk gets a deterministic id of the form "<source>-<position>".
# Without explicit ids every run stores the chunks under fresh random ids, so
# re-running this cell (or Restart & Run All) duplicates the whole document in
# the persistent collection. With stable ids a re-run overwrites the same
# entries instead.
# NOTE(review): relies on the store writing with upsert semantics for known
# ids — confirm for the installed langchain_chroma version. If a later run
# yields fewer chunks, entries with higher positions from the earlier run stay
# in the collection.
chunk_ids = [
    f"{doc.metadata.get('source')}-{position}"
    for position, doc in enumerate(chunked_docs)
]
vector_store.add_documents(chunked_docs, ids=chunk_ids)