In [14]:
from unstructured.partition.pdf import partition_pdf
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_pinecone import PineconeVectorStore
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from dotenv import load_dotenv
import os
import glob

load_dotenv()

file_path = "C:/Users/User/Downloads/tmj_rag_app/Ingestion/data/pdfs"
index_name = os.getenv("PINECONE_INDEX")


In [15]:
# Get all PDF files in the folder
pdf_files = glob.glob(os.path.join(file_path, "*.pdf"))
print(f"Found {len(pdf_files)} PDF files to process")

# Partition all PDF files
all_elements = []
for pdf_file in pdf_files:
    base_file_name = os.path.splitext(os.path.basename(pdf_file))[0]
    print(f"Processing: {base_file_name}.pdf")
    
    elements = partition_pdf(
        filename=pdf_file,
        strategy="auto",
        infer_table_structure=True
    )
    
    # Store elements with their source file name
    for element in elements:
        element._source_file = base_file_name  # Store source file name
    
    all_elements.extend(elements)
    print(f"  Extracted {len(elements)} elements from {base_file_name}.pdf")

print(f"\nTotal elements extracted: {len(all_elements)}")


Found 11 PDF files to process
Processing: tmjDoc1.pdf


The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


  Extracted 251 elements from tmjDoc1.pdf
Processing: tmjDoc10.pdf
  Extracted 59 elements from tmjDoc10.pdf
Processing: tmjDoc11.pdf
  Extracted 19 elements from tmjDoc11.pdf
Processing: tmjDoc2.pdf
  Extracted 164 elements from tmjDoc2.pdf
Processing: tmjDoc3.pdf
  Extracted 33 elements from tmjDoc3.pdf
Processing: tmjDoc4.pdf
  Extracted 137 elements from tmjDoc4.pdf
Processing: tmjDoc5.pdf
  Extracted 143 elements from tmjDoc5.pdf
Processing: tmjDoc6.pdf
  Extracted 24 elements from tmjDoc6.pdf
Processing: tmjDoc7.pdf
  Extracted 63 elements from tmjDoc7.pdf
Processing: tmjDoc8.pdf
  Extracted 72 elements from tmjDoc8.pdf
Processing: tmjDoc9.pdf
  Extracted 37 elements from tmjDoc9.pdf

Total elements extracted: 1002


In [16]:
# Display the elements partitioned from the PDF file
for i, element in enumerate(elements):
    print(f"\n--- Element {i} ---")
    print(f"Type: {element.category}")
    print(f"Text: {element.text}")
    print(f"Metadata: {element.metadata}")


--- Element 0 ---
Type: Header
Text: 12/5/25, 6:33 PM
Metadata: <unstructured.documents.elements.ElementMetadata object at 0x000001BB96995460>

--- Element 1 ---
Type: Header
Text: Temporomandibular Disorders (TMD) Devices | FDA
Metadata: <unstructured.documents.elements.ElementMetadata object at 0x000001BB96997980>

--- Element 2 ---
Type: Title
Text: Temporomandibular Disorders (TMD) Devices
Metadata: <unstructured.documents.elements.ElementMetadata object at 0x000001BB96994860>

--- Element 3 ---
Type: NarrativeText
Text: Temporomandibular disorders (TMD) refer to an orofacial (face, head, or neck) pain condition in which pain and discomfort affect the temporomandibular joint (TMJ), the muscles, or the contiguous tissue components.
Metadata: <unstructured.documents.elements.ElementMetadata object at 0x000001BB562B96D0>

--- Element 4 ---
Type: Title
Text: Updates on TMD Devices
Metadata: <unstructured.documents.elements.ElementMetadata object at 0x000001BB959EDA90>

--- Element 5 -

In [19]:
# Convert to LangChain Documents from all PDFs
docs = []
MIN_CHAR_LENGTH = 50

for el in all_elements:
    #Skip headers, dont contain useful info
    if el.category == "Header":
        continue
    
    text = getattr(el, "text", None)
    if not text:
        continue

    if (len(text.strip()) < MIN_CHAR_LENGTH):
        continue

    # Get source file name from the element
    source_file = getattr(el, "_source_file", "unknown")
    
    meta = el.to_dict().get("metadata", {}) or {}
    docs.append(
        Document(
            page_content=text,
            metadata={
                "source": source_file,
                "page_number": meta.get("page_number"),
                "type": el.category if hasattr(el, "category") else meta.get("type"),
            },
        )
    )

print(f"Total documents created: {len(docs)}")
print(f"Documents from {len(set(doc.metadata.get('source') for doc in docs))} different PDFs")

#Print first few documents as sample
for i, doc in enumerate(docs[:5], 1):  # Show first 5 as sample
    meta = doc.metadata
    print(f"--- Document {i} ---")
    print(f"Source: {meta.get('source')} | Page: {meta.get('page_number')} | Type: {meta.get('type')}")
    print(f"Content length: {len(doc.page_content)} characters")
    print(f"Content:\n{doc.page_content}")
    print()

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, 
    chunk_overlap=200,
    add_start_index=True
)
chunked_docs = splitter.split_documents(docs)
print(f"Total chunks created: {len(chunked_docs)}")

Total documents created: 426
Documents from 11 different PDFs
--- Document 1 ---
Source: tmjDoc1 | Page: 1 | Type: NarrativeText
Content length: 73 characters
Content:
= An official website of the United States government Here’s how you know

--- Document 2 ---
Source: tmjDoc1 | Page: 1 | Type: NarrativeText
Content length: 54 characters
Content:
National Institute of Dental and Craniofacial Research

--- Document 3 ---
Source: tmjDoc1 | Page: 1 | Type: UncategorizedText
Content length: 60 characters
Content:
</espanol/temas-de-salud/los-trastornos-temporomandibulares>

--- Document 4 ---
Source: tmjDoc1 | Page: 2 | Type: NarrativeText
Content length: 221 characters
Content:
Temporomandibular disorders (TMDs) area group of more than 30 conditions that cause pain and dysfunction in the jaw joint and muscles that control jaw movement. “TMDs” refers to the disorders, and “TMJ” refers only to the

--- Document 5 ---
Source: tmjDoc1 | Page: 2 | Type: UncategorizedText
Content length: 81 cha

In [20]:
#Print the chunked documents
for i, doc in enumerate(chunked_docs, 1):
    meta = doc.metadata
    print(f"--- Chunk {i} ---")
    print(f"source={meta.get('source')} page={meta.get('page_number')} type={meta.get('type')} section={meta.get('section')}")
    print(doc.page_content)
    print()

--- Chunk 1 ---
source=tmjDoc1 page=1 type=NarrativeText section=None
= An official website of the United States government Here’s how you know

--- Chunk 2 ---
source=tmjDoc1 page=1 type=NarrativeText section=None
National Institute of Dental and Craniofacial Research

--- Chunk 3 ---
source=tmjDoc1 page=1 type=UncategorizedText section=None
</espanol/temas-de-salud/los-trastornos-temporomandibulares>

--- Chunk 4 ---
source=tmjDoc1 page=2 type=NarrativeText section=None
Temporomandibular disorders (TMDs) area group of more than 30 conditions that cause pain and dysfunction in the jaw joint and muscles that control jaw movement. “TMDs” refers to the disorders, and “TMJ” refers only to the

--- Chunk 5 ---
source=tmjDoc1 page=2 type=UncategorizedText section=None
Healthy temporomandibular joint during mouth opening & closing. temporomandibular

--- Chunk 6 ---
source=tmjDoc1 page=2 type=NarrativeText section=None
itself. People have two TMJs; one on each side of the jaw. You can feel t

In [21]:
#Create embeddings
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
)


In [22]:
#Create Pinecone vector store
vector_store = PineconeVectorStore(
    index_name=index_name,
    embedding=embeddings
)

In [23]:
#Add documents to the vector store and persist vector store
vector_store.add_documents(chunked_docs)

['31207198-5589-4927-ab41-8cd95c9b614a',
 'ba7f80a4-9ee1-44ec-bb2f-933725d71fb1',
 'b8580911-30f6-491a-86bb-76133eb72cb5',
 'c62d8c23-d596-48df-bc61-6b6ff985bfb2',
 '928f80e7-bf54-46b1-af5d-e4a5734ef432',
 '875d90a8-e909-4371-aae7-8e9640b14f3c',
 '6dfd9790-ded6-412b-917a-6f7ca02eafe3',
 '68f199a5-e249-45b4-b1c4-1ba08c0ebe0b',
 'bdd866df-7c4f-4493-b21d-0d153540b4cb',
 'a8e0432f-655a-419d-910d-e63f60f8470e',
 'e2393a95-d69d-48a1-8878-69af95cd911d',
 '7c59ee7f-e561-4002-bf76-3274d312a8ba',
 '964add7c-a0f8-47d7-9085-116291105f93',
 '68fd9ab8-9459-4b4d-afbf-35d5f0b28ec9',
 'abdc8b86-577c-421d-9bb4-c98f01fb91bf',
 '9768055c-9887-46e4-8f5d-41a3f31b3b75',
 '29d277f4-23eb-49dc-902f-b067bcfa9933',
 '30a49c6d-8b64-45ab-a670-56e189b70e2b',
 '5a821778-e188-40ac-9350-e80785efa48f',
 'be69f710-81be-4b93-bd34-6f80a59c2950',
 '47f44bf6-28e3-4363-b99b-88f717ccd481',
 'e46be0cf-8755-4d45-8e64-58a7c20fd54d',
 '3264544f-d17e-4d07-bf20-ec3f0908fb6f',
 '84e9697d-5c20-4bbe-9297-078ee6b24d35',
 'cbbd5d98-868e-

In [24]:
#Prompt template
medical_template = ChatPromptTemplate.from_messages([
    ("system",
     "You are a medical information assistant specializing in TMJ disorders. "
     "Answer ONLY using the provided context. "
     "If the context does not contain enough information, say so clearly. "
     "Use precise medical terminology. "
     "This is for informational purposes only and does not replace professional medical advice."),
    ("human",
     "Context:\n{context}\n\nQuestion:\n{question}\n\nAnswer:")
])

#Build retriever from vector store
retriever = vector_store.as_retriever(
    searchType="mmr",
    searchKwargs={"k": 5}
)


#Function to format docs and create RunnableLambda
def format_docs(docs):
    formatted = "\n\n".join(doc.page_content for doc in docs)
    print(formatted)
    return formatted

format_docs_runnable = RunnableLambda(format_docs)

output_parser = StrOutputParser()

In [25]:
#Initialize LLM
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature = 0.0
)

In [26]:
#Create RAG chain
rag_chain = (
    {
        "context": retriever | format_docs_runnable,
        "question": RunnablePassthrough(),
    }
    | medical_template
    | llm
    | output_parser  
)

In [27]:
#Query the chain
answer = rag_chain.invoke("What should I know about TMJ disorders?")
print(answer)

TMJ disorders [https://medlineplus.gov/ency/article/001227 htm]

TMJ disorders are not uncommon and have a variety of symptoms. Patients may complain of earaches, headaches and limited ability to open their mouth. They may also complain of clicking or grating sounds in the joint and feel pain when opening and closing their mouth.

e TMJ Disorders [https://www.mayoclinic.org/diseases-conditions/tmj/symptoms-causes/syc-20350941 ?p=1] (Mayo Foundation for Medical Education and Research)

Temporomandibular disorders, commonly called “TMD,” are a group of painful conditions that affect the jaw joint and the muscles that control jaw movement. “TMD” refers to the disorders, and “TMJ” refers to the temporomandibular joint itself. People have two TMJs; one on each side of the jaw. Injury to the jaw can sometimes lead to TMDs, but in most cases the cause is not clear. Recent research suggests a combination of genes, psychological and life stressors, and how someone perceives pain may play a part