In [71]:
# Import Langchain modules
from langchain.document_loaders import PyPDFLoader,TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from langchain.schema import BaseOutputParser

# Other modules and packages
import os
import tempfile
import pandas as pd
from dotenv import load_dotenv

from pdf2image import convert_from_path
import pytesseract as pt
from PIL import Image

### Extracting content from pdf

Content from the PDF is extracted with pytesseract (OCR) and saved in .txt format.

In [54]:


def extract_text_from_pdf(pdf_path):
    """Run OCR over every page of a PDF and return the combined text.

    Args:
        pdf_path: Path of the PDF file handed to ``convert_from_path``.

    Returns:
        A single string in which each page contributes a section of the form
        ``'Page <n>: \\n <ocr text> \\n'`` (pages numbered from 1), with the
        sections joined by newlines.
    """
    # One image per page; each image is passed to pytesseract for OCR.
    page_images = convert_from_path(pdf_path)
    return '\n'.join(
        f'Page {index}: \n {pt.image_to_string(image)} \n'
        for index, image in enumerate(page_images, 1)
    )





In [55]:
# cleaning data extracted from pdf 
# converting data to ascii format for recognition within LLM and embeddings
def clean_text(text):
    """Return ``text`` with every non-ASCII character removed.

    Args:
        text: The string to clean (e.g. raw OCR output).

    Returns:
        A string containing only the ASCII characters of ``text``, in their
        original order. Non-ASCII characters are dropped, not transliterated.
    """
    # The 'ignore' error handler silently skips characters that cannot be
    # encoded, which matches dropping them one by one but runs in a single
    # pass without repeated string concatenation.
    return text.encode('ascii', 'ignore').decode('ascii')

In [56]:
# saving data to txt file
def save_data_to_file(data, output_path):
    """Write ``data`` to ``output_path`` as ASCII text, replacing any existing file.

    Args:
        data: The text to write. It must be ASCII-only because the file is
            opened with ``encoding='ascii'``.
        output_path: Destination file path.

    Raises:
        UnicodeEncodeError: If ``data`` contains a non-ASCII character.
        OSError: If the file cannot be opened for writing.
    """
    print("writing data to file")
    # The context manager closes the file on exit, so no explicit close() is needed.
    with open(output_path, 'w', encoding='ascii') as file:
        file.write(data)

    print("file save! Operation complete")

In [57]:
# Input PDF and the text file that receives the cleaned OCR output.
PDF_PATH = "data/bill_of_items.pdf"
OUTPUT_PATH = "data/extracted_text.txt"

# OCR the PDF, strip non-ASCII characters, then persist the result so the
# loading step can read it back from disk.
data = extract_text_from_pdf(PDF_PATH)
cleaned_data = clean_text(data)
save_data_to_file(cleaned_data, OUTPUT_PATH)

writing data to file
file save! Operation complete


#### Loading API Key from Environment Variables


In [58]:
# Load variables from a .env file (if present) into the process environment,
# then read the key. os.environ.get returns None when the variable is not set.
load_dotenv()
OPENAI_API_KEY =os.environ.get("OPENAI_API_KEY")

### Creating an instance of LLM


In [59]:
# LLM instance used for text extraction
llm = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)

In [60]:
# Smoke test: confirm the API key and model respond before running the pipeline.
# Note that every run of this cell makes a real API call.
llm.invoke("Tell me a joke about cats")

AIMessage(content='Why was the cat sitting on the computer?\n\nBecause it wanted to keep an eye on the mouse!', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 20, 'prompt_tokens': 13, 'total_tokens': 33, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_0ba0d124f1', 'finish_reason': 'stop', 'logprobs': None}, id='run-419e48bb-6eff-40f0-822d-61875b4f352a-0', usage_metadata={'input_tokens': 13, 'output_tokens': 20, 'total_tokens': 33, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

### Data Preprocessing

Loading the data, creating the necessary embeddings, and storing them in a vector database.


In [None]:
# Load the OCR-extracted text file as LangChain documents.
# NOTE(review): this path repeats the OUTPUT_PATH literal from the extraction
# cell; keep the two in sync. This cell has no execution count in the saved
# notebook, yet `pages` is required by the splitting cell below.

loader = TextLoader("data/extracted_text.txt")
pages = loader.load()
pages

In [61]:
# Split the text into overlapping chunks so each one embeds well and stays
# under the LLM's input limit. Sizes are measured with len(), i.e. in
# characters: up to 1000 per chunk with 200 of overlap. Separators are tried
# in order: paragraph break, line break, then space.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,
                                               chunk_overlap=200,
                                               length_function=len,
                                               separators=["\n\n", "\n", " "])
chunks = text_splitter.split_documents(pages)

### Embedding Generation

In [62]:

# for embeddings creation
def get_embedding_function():
    """Build the OpenAI embeddings client used for indexing and querying.

    Returns:
        An ``OpenAIEmbeddings`` instance configured for the
        ``text-embedding-ada-002`` model and authenticated with the
        notebook-level ``OPENAI_API_KEY``.
    """
    return OpenAIEmbeddings(
        model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY
    )


# Shared embedding client reused by the vector store cells below.
embedding_function = get_embedding_function()


In [63]:
# saving embeddings to a vector store
import uuid

def create_vectorstore(chunks, embedding_function, vectorstore_path):
    """Create a persisted Chroma vector store from de-duplicated chunks.

    Each chunk gets a deterministic id: a UUIDv5 derived from its
    ``page_content``. Chunks with identical content therefore share an id,
    and only the first occurrence is kept.

    Args:
        chunks: Documents to index; each must expose ``page_content``.
        embedding_function: Embedding client passed to Chroma.
        vectorstore_path: Directory in which Chroma persists the collection.

    Returns:
        The ``Chroma`` vector store built from the unique chunks.
    """
    # A dict preserves insertion order, so ids and documents stay aligned
    # position by position. Taking the ids from a set, as before, yields an
    # arbitrary order and stores documents under the wrong content hashes.
    unique_docs = {}
    for chunk in chunks:
        chunk_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk.page_content))
        # setdefault keeps the first chunk seen for a given id.
        unique_docs.setdefault(chunk_id, chunk)

    # Create a new Chroma database from the documents
    vectorstore = Chroma.from_documents(documents=list(unique_docs.values()),
                                        ids=list(unique_docs.keys()),
                                        embedding=embedding_function,
                                        persist_directory=vectorstore_path)

    return vectorstore

In [64]:
# Create the vectorstore: embed the chunks and persist them under
# "vectorstore_test3".
vectorstore = create_vectorstore(chunks=chunks, 
                                 embedding_function=embedding_function, 
                                 vectorstore_path="vectorstore_test3")

In [65]:
# Load vectorstore: reopen the persisted collection from disk. This rebinds
# `vectorstore`, replacing the object returned by the creation cell.
vectorstore = Chroma(persist_directory="vectorstore_test3", embedding_function=embedding_function)

In [66]:
# Create retriever and get relevant chunks
# k is the number of chunks requested per query. The recorded run shows the
# index holding only 4 elements, so the request for 5 was reduced to 4.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5}  # Increase k for more documents
)
relevant_chunks = retriever.invoke("what are tests mentioned")
relevant_chunks

Number of requested results 5 is greater than number of elements in index 4, updating n_results = 4


[Document(metadata={'source': 'data/extracted_text.txt'}, page_content='Page 1: \n Final Bill\n\nName: Mrs, BHUVANESHWARI V i  y\nAge/Gender: 27 Y F & Chile\nAddress: NO,23, 2NDMAIN, 2ND STAGE, WOC ROAD, MAHALAKSHMIPURAM POST MR No: fen ie\n\nLocation: BANGALORE,KARNATAKA. Visit ID;\n\nDoctor: Dr. BHARATHI RAJANNA Admission Date: seeptl\n\nDepartment: Obstetrics & Gynaecology Ward/Bed DEL\n\nRate Plan: Cradle Rajaji Nagar Gen_25012023 Discharge Date:\n\nSponsor: Referred By: i} 08 | ATH] RAJANNA\n\nBill No: RRAJ-ICR-822(Bill Later) Bill Date:08-06-2024\n\nCharges Ord# Head Description Rate Qty Amount\n\nPackages\n\n08-06-2024 Package Charges Lscs 106,000.00 i 106,000.00\nSub Total: 106,000.00\n\nDiagnostics\n\n08-06-2024 16166221 Lab Tests HCV Tri Dot 495,00 1.00 495.00\n\n08-06-2024 16166221 Lab Tests HIV RAPID 1,731.00 1,00 1,731.00\n\n08-06-2024 16166221 Lab Tests (VDRL) RPR QUALITATIVE- SERUM 440.00 1.00 440.00\n\n08-06-2024 16166221  Lab Tests HBS AG SCREENING(RAPID) 1,327.00 1,00

### Prompt Template


In [67]:
# Extraction prompt. `{context}` is the only placeholder and is filled with
# the retrieved chunks.
# NOTE(review): the prompt first asks for date, code, name, rate, quantity and
# amount, but the final instruction restricts the JSON keys to name, rate,
# quantity and amount. Confirm which field set is intended.
PROMPT_TEMPLATE = """
You are a highly skilled information extraction model.

Extract all items under "Diagnostics," "items," and "Services & Procedures" sections from the document below.
Some item names may start with a number. Each item should include the following details, if available: date, code, name, rate, quantity, and amount.

If there are multiple items in a section, list each item separately in the output.

**If an item is not formatted as expected (e.g., if the date or "Inventory Item" appears later in the text), reorder it so that it follows this structure:**
- **[Date] Inventory Item [Item Code] [Item Name] [Details] [Rate] [Quantity] [Amount] ([Reference No.])**

Ensure that extracted information is provided consistently according to this ordering.
Return the results as a JSON array with each item having the keys: name, rate, quantity, and amount.

Document:
----------------
{context}
----------------

"""



#### Adding text from pdf to the prompt

In [68]:
# Concatenate context text: join the retrieved chunks with a visible "---"
# separator so chunk boundaries remain recognisable in the prompt.
context_text = "\n\n---\n\n".join([doc.page_content for doc in relevant_chunks])

# Create prompt
# NOTE(review): `prompt_template` is built but not used in the visible cells.
# The final prompt comes from plain str.format on the raw template string.
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = PROMPT_TEMPLATE.format(context=context_text, )
print(prompt)


You are a highly skilled information extraction model.

Extract all items under "Diagnostics," "items," and "Services & Procedures" sections from the document below.
Some item names may start with a number. Each item should include the following details, if available: date, code, name, rate, quantity, and amount.

If there are multiple items in a section, list each item separately in the output.

**If an item is not formatted as expected (e.g., if the date or "Inventory Item" appears later in the text), reorder it so that it follows this structure:**
- **[Date] Inventory Item [Item Code] [Item Name] [Details] [Rate] [Quantity] [Amount] ([Reference No.])**

Ensure that extracted information is provided consistently according to this ordering.
Return the results as a JSON array with each item having the keys: name, rate, quantity, and amount.

Document:
----------------
Page 1: 
 Final Bill

Name: Mrs, BHUVANESHWARI V i  y
Age/Gender: 27 Y F & Chile
Address: NO,23, 2NDMAIN, 2ND STAGE, W

### Output Schema

In [None]:
# Using pydantic to create the output schema for the LLM
from typing import Optional
from pydantic import BaseModel, Field

class Equipments(BaseModel):
    """
    Schema for one billed line item: a medical equipment, service, or test.

    Only ``name`` is required; the numeric fields default to ``None`` when
    the value is missing.

    NOTE(review): no visible cell passes this model to the LLM or validates
    the LLM output against it. Confirm whether it is meant to be wired in.
    """
    # Item name; the only required field.
    name: str = Field(description="Name of equipment/service/test")
    # NOTE(review): typed as int, but the recorded LLM output reports
    # quantities as strings such as "1.00". Confirm the intended type.
    quantity: Optional[int] = Field(default=None, description="Quantity of each product")
    # Unit price of the item.
    rate: Optional[float] = Field(default=None, description="Rate of a product")
    # Line total; described as rate multiplied by quantity.
    amount: Optional[float] = Field(default=None, description="Multiplication of rate and quantity")





## Invoking Model

In [69]:
# Send the fully rendered prompt to the LLM. `result.content` holds the reply
# text, which the next cell parses for the JSON array.
result = llm.invoke(input=prompt)
print(result)

content='```json\n[\n    {\n        "name": "HCV Tri Dot",\n        "rate": "495.00",\n        "quantity": "1.00",\n        "amount": "495.00"\n    },\n    {\n        "name": "HIV RAPID",\n        "rate": "1,731.00",\n        "quantity": "1.00",\n        "amount": "1,731.00"\n    },\n    {\n        "name": "(VDRL) RPR QUALITATIVE- SERUM",\n        "rate": "440.00",\n        "quantity": "1.00",\n        "amount": "440.00"\n    },\n    {\n        "name": "HBS AG SCREENING(RAPID)",\n        "rate": "1,327.00",\n        "quantity": "1.00",\n        "amount": "1,327.00"\n    },\n    {\n        "name": "LACTATION CONSULTATION CHARGES",\n        "rate": "1,100.00",\n        "quantity": "1.00",\n        "amount": "1,100.00"\n    },\n    {\n        "name": "PHYSIOTHERAPY CONSULTATION",\n        "rate": "1,100.00",\n        "quantity": "1.00",\n        "amount": "1,100.00"\n    },\n    {\n        "name": "DIET CHARGES",\n        "rate": "1,000.00",\n        "quantity": "3.00",\n        "amount":

In [70]:
# Saving results from the LLM to a file for classification through LightRAG
import re
# Grab the first "[ ... ]" span of the reply. The match is non-greedy, so it
# ends at the first closing bracket; an item name containing "]" would
# truncate the capture.
match = re.search(r'\[.*?\]', result.content, re.DOTALL)

if match:
    # Get the extracted data and clean it up (remove backticks and newlines)
    data_in_brackets = match.group(0)
    cleaned_data = data_in_brackets.strip().replace("`", "").replace("\n", "")

    # Normalise number formatting in two passes:
    #   1. A comma followed by exactly three digits is a thousands separator
    #      and is removed ("1,731.00" -> "1731.00"). Replacing it with a
    #      period produced invalid values such as "1.731.00".
    #   2. A remaining comma followed by one or two digits is treated as a
    #      mis-read decimal point ("495,00" -> "495.00").
    # Caveat: bare comma-separated short numbers such as [1,2] are also
    # rewritten by pass 2.
    cleaned_data = re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', cleaned_data)
    cleaned_data = re.sub(r'(?<=\d),(?=\d{1,2}(?!\d))', '.', cleaned_data)

    # Step 2: Write the cleaned data directly to a .txt file
    output_file = "structured_output.txt"
    with open(output_file, "w") as file:
        file.write(cleaned_data)

    # Report the file that was actually written.
    print(f"Data written to {output_file}.")
else:
    print("No content found between square brackets.")

Data written to output.txt.


### Using LightRAG for classification

In [None]:
from lightrag import LightRAG, QueryParam
from lightrag.llm import gpt_4o_complete

In [None]:
import os

from lightrag import LightRAG, QueryParam
from lightrag.llm import gpt_4o_mini_complete, gpt_4o_complete

#########
# Needed when running in a Jupyter notebook to handle the async nature of
# rag.insert(); the two lines below are already active.
import nest_asyncio
nest_asyncio.apply()
#########

# NOTE(review): "./dickens" looks like a leftover name for this working
# directory; confirm before renaming, since existing LightRAG state may
# already live there.
WORKING_DIR = "./dickens"

# exist_ok=True makes this safe to re-run and avoids a separate
# check-then-create step.
os.makedirs(WORKING_DIR, exist_ok=True)

rag = LightRAG(
    working_dir=WORKING_DIR,
    llm_model_func=gpt_4o_mini_complete  # Use gpt_4o_mini_complete LLM model
    # llm_model_func=gpt_4o_complete  # Optionally, use a stronger model
)

# NOTE(review): this absolute path does not match "structured_output.txt",
# the file written by the extraction cell. Confirm which file is intended.
with open("/content/new_output.txt") as f:
    rag.insert(f.read())

CLASSIFICATION_QUERY = "Classify the items in a table as medical or non-medical along with amount, quantity?"

# Run the same question in naive mode first, then hybrid, to compare answers.
for mode in ("naive", "hybrid"):
    print(rag.query(CLASSIFICATION_QUERY, param=QueryParam(mode=mode)))