In [33]:
from unstructured.partition.pdf import partition_pdf
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

file_path = "C:/Users/User/Downloads/tmj_rag_app/data/pdfs"
base_file_name = "tmjDoc1"


In [10]:
#Import env variables
import os
from dotenv import load_dotenv

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


In [11]:
# Partition the PDF file
elements = partition_pdf(
    filename=f"{file_path}/{base_file_name}.pdf",
    strategy="hi_res",
    infer_table_structure=True
)





In [18]:
# Display the elements partitioned from the PDF file
for i, element in enumerate(elements):
    print(f"\n--- Element {i} ---")
    print(f"Type: {element.category}")
    print(f"Text preview: {element.text}...")
    print(f"Metadata: {element.metadata}")


--- Element 0 ---
Type: Header
Text preview: 12/6/25, 10:50 AM...
Metadata: <unstructured.documents.elements.ElementMetadata object at 0x000001A405C62690>

--- Element 1 ---
Type: Header
Text preview: TMD | NIDCR...
Metadata: <unstructured.documents.elements.ElementMetadata object at 0x000001A40C1170B0>

--- Element 2 ---
Type: NarrativeText
Text preview: = An official website of the United States government Here’s how you know...
Metadata: <unstructured.documents.elements.ElementMetadata object at 0x000001A40C81B530>

--- Element 3 ---
Type: Image
Text preview: ...
Metadata: <unstructured.documents.elements.ElementMetadata object at 0x000001A40C81B2C0>

--- Element 4 ---
Type: NarrativeText
Text preview: National Institute of Dental and Craniofacial Research...
Metadata: <unstructured.documents.elements.ElementMetadata object at 0x000001A40C81AAB0>

--- Element 5 ---
Type: UncategorizedText
Text preview: </>...
Metadata: <unstructured.documents.elements.ElementMetadata object at 0x00

In [20]:
# Convert to LangChain Documents
docs = []
for el in elements:
    #Skip headers, dont contain useful info
    if el.category == "Header":
        continue
    
    text = getattr(el, "text", None)
    if not text:
        continue
    meta = el.to_dict().get("metadata", {}) or {}
    docs.append(
        Document(
            page_content=text,
            metadata={
                "source": base_file_name,
                "page_number": meta.get("page_number"),
                "type": el.category if hasattr(el, "category") else meta.get("type"),
            },
        )
    )

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, 
    chunk_overlap=200,
    add_start_index=True
)
chunked_docs = splitter.split_documents(docs)

In [21]:
#Print the chunked documents
for i, doc in enumerate(chunked_docs, 1):
    meta = doc.metadata
    print(f"--- Chunk {i} ---")
    print(f"source={meta.get('source')} page={meta.get('page_number')} type={meta.get('type')} section={meta.get('section')}")
    print(doc.page_content)
    print()

--- Chunk 1 ---
source=tmjDoc1 page=1 type=NarrativeText section=None
= An official website of the United States government Here’s how you know

--- Chunk 2 ---
source=tmjDoc1 page=1 type=NarrativeText section=None
National Institute of Dental and Craniofacial Research

--- Chunk 3 ---
source=tmjDoc1 page=1 type=UncategorizedText section=None
</>

--- Chunk 4 ---
source=tmjDoc1 page=1 type=UncategorizedText section=None
</espanol/temas-de-salud/los-trastornos-temporomandibulares>

--- Chunk 5 ---
source=tmjDoc1 page=1 type=NarrativeText section=None
Espafiol

--- Chunk 6 ---
source=tmjDoc1 page=1 type=UncategorizedText section=None
MENU

--- Chunk 7 ---
source=tmjDoc1 page=1 type=UncategorizedText section=None
Search

--- Chunk 8 ---
source=tmjDoc1 page=1 type=UncategorizedText section=None
[a |

--- Chunk 9 ---
source=tmjDoc1 page=1 type=Title section=None
TMD (Temporomandibular Disorders)

--- Chunk 10 ---
source=tmjDoc1 page=1 type=UncategorizedText section=None
f

--- Chunk 11 ---


In [22]:
#Create embeddings
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
)

vector = embeddings.embed_query("Hello world")

In [23]:
#Create Chroma db
vector_store = Chroma(
    collection_name="tmj_rag_app",
    embedding_function=embeddings,
    persist_directory="../data/chroma_db"
)

In [24]:
#Add documents to the vector store and persist vector store
vector_store.add_documents(chunked_docs)

['b7ec8c95-71e7-4625-a1d1-ac1b6958bd7f',
 '938523e6-4eb8-4705-bd0c-d886b17d8184',
 'b1e55ff8-dbcc-41b1-b7f0-d6e194711d91',
 '0088ee51-ca4a-4a23-84f4-876463548148',
 'c48d2211-4600-4d65-8959-3c5d9373db00',
 '140e50e0-992d-4327-82d7-6cc22b4e1698',
 '1e7e3047-d4ec-454d-9906-7b7063289d58',
 '594c826a-01d7-4112-ad62-d081ea0a9492',
 '6fa4a6d6-f1e5-47a3-bf33-1b6e35574727',
 '5af451cf-6c95-4994-87c7-548c234f98e1',
 '5bbdace8-f7df-411a-bec6-6eadc67b9eed',
 '4305b0ee-2fbc-479e-a85f-90c2662b887d',
 'f48a4681-6787-4362-9618-0f05f6950603',
 '2053a62a-1527-47ed-a1ad-a2a0c59fc115',
 'b1c6299b-67a9-42b4-b717-a3e4f68f7778',
 'aa8dbaac-1fff-4988-8db7-3e49547a2328',
 '81b36dca-3b31-4ee0-83aa-b7cd21eea6df',
 '21b56692-6d16-42a9-bf8d-314312df04f7',
 'c024c352-3889-4c48-99f1-0604888d6d48',
 '42c57c14-b98e-4586-ad16-d5c1a96b6fd0',
 'ae402011-5866-411f-be94-a9cbb471637b',
 'd1ba47df-aa38-4f15-8371-76b4559c244a',
 '7a465747-9388-4ecf-8915-40c4410a3e3e',
 '5978965d-e07b-41c6-89bd-1b1bbb1dba36',
 '11a3ecf1-17b7-

In [25]:
#Similarity serach for related docs
similar_docs = vector_store.similarity_search("Explain arthroscopy for TMJ", k=3)

In [36]:
#Prompt template
medical_template = ChatPromptTemplate.from_messages([
    ("system",
     "You are a medical information assistant specializing in TMJ disorders. "
     "Answer ONLY using the provided context. "
     "If the context does not contain enough information, say so clearly. "
     "Use precise medical terminology. "
     "This is for informational purposes only and does not replace professional medical advice."),
    ("human",
     "Context:\n{context}\n\nQuestion:\n{question}\n\nAnswer:")
])

#Build retriever from vector store
retriever = vector_store.as_retriever(
    searchType="mmr",
    searchKwargs={"k": 3}
)


#Function to format docs and create RunnableLambda
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

format_docs_runnable = RunnableLambda(format_docs)

output_parser = StrOutputParser()

In [34]:
#Initialize LLM
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature = 0.0
)

In [37]:
#Create RAG chain
rag_chain = (
    {
        "context": retriever | format_docs_runnable,
        "question": RunnablePassthrough(),
    }
    | medical_template
    | llm
    | output_parser  
)

In [38]:
#Query the chain
answer = rag_chain.invoke("Explain arthroscopy for TMJ")
print(answer)

Arthroscopy for TMJ involves the insertion of an instrument equipped with a tiny video camera into the temporomandibular joint (TMJ). This allows the physician to visualize the joint, aiding in diagnosis. During the procedure, the doctor can also perform interventions such as removing adhesions or repositioning the joint's disc. Arthroscopy has been shown to provide moderate improvement in pain and function for patients with TMJ disorders.
