In [None]:
# 

### PDF Conversion to Image Folder 

In [20]:
import os
import fitz  # PyMuPDF

def convert_pdf_to_img(pdf_path, drop_location):

    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    
    os.makedirs(drop_location, exist_ok=True)
    

    doc = fitz.open(pdf_path)
    
    image_paths = []
    for i, page in enumerate(doc, start=1):
      
        pix = page.get_pixmap()
        
        # Save the image as {pdf_name}_1.png, {pdf_name}_2.png, etc.
        img_path = os.path.join(drop_location, f"{i}.png")
        pix.save(img_path)
        image_paths.append(img_path)
    
    print(f"All images saved in: {drop_location}")


pdf_path = "3.pdf"           # Path to your PDF file
drop_location = "pdf3/"     
convert_pdf_to_img(pdf_path, drop_location)


All images saved in: pdf3/


In [1]:
!pip install google-generativeai

 ## Extraction using Gemini  

In [1]:
import google.generativeai as genai
import os
import base64

# os.environ['GOOGLE_API_KEY']
API_KEY = os.getenv('GOOGLE_API_KEY')

# api_key = os.getenv("UPSTAGE_DOCUMENT_AI_API_KEY")
genai.configure(api_key=API_KEY)

  from .autonotebook import tqdm as notebook_tqdm


 ### Code to Test full PDF Folder  

In [11]:
import os
import json
from pathlib import Path

def prep_image(image_path):
  
    sample_file = genai.upload_file(path=image_path, display_name=f"Diagram-{Path(image_path).stem}")
    print(f"Uploaded file '{sample_file.display_name}' as: {sample_file.uri}")
    file = genai.get_file(name=sample_file.name)
    return sample_file

def extract_text_from_image(image_path, prompt):
 
    model = genai.GenerativeModel(model_name="gemini-1.5-pro")
    generation_config = {
        "temperature": 0.9,
        "response_mime_type": "application/json"
    }
    response = model.generate_content([image_path, prompt], generation_config=generation_config)
    # Parse the response text as JSON
    try:
        return json.loads(response.text)
    except json.JSONDecodeError:
        print(f"Warning: Could not parse response as JSON. Raw response: {response.text}")
        return {"error": "Could not parse response as JSON", "raw_text": response.text}

def process_image_folder(folder_path, prompt_template):
 
    # Get all PNG files in the folder
    image_files = sorted(
        [f for f in os.listdir(folder_path) if f.lower().endswith('.png')],
        key=lambda x: int(''.join(filter(str.isdigit, x)))  # Sort numerically
    )
    
    if not image_files:
        raise Exception(f"No PNG files found in {folder_path}")
    
    # Initialize the consolidated JSON
    consolidated_json = {
        "total_images": len(image_files),
        "images": {}
    }
    
    for image_file in image_files:
        full_path = os.path.join(folder_path, image_file)
        print(f"\nProcessing {image_file}...")
        
        try:
            # Upload and process each image
            uploaded_file = prep_image(full_path)
            extracted_content = extract_text_from_image(
                uploaded_file,
                prompt_template
            )
            
            # Add to consolidated JSON
            image_number = ''.join(filter(str.isdigit, image_file))
            consolidated_json["images"][f"image_{image_number}"] = extracted_content
            
        except Exception as e:
            print(f"Error processing {image_file}: {str(e)}")
            consolidated_json["images"][f"image_{image_number}"] = {
                "error": str(e)
            }
            continue
    
    return consolidated_json


my_api_key = os.getenv('GOOGLE_API_KEY')
genai.configure(api_key=my_api_key)
    
    # Your existing template
template = """
    Convert the following text into a proper JSON format.
    Extract all relevant information as key-value pairs.
    Return ONLY a valid JSON object, nothing else.
    
    Text to convert:
    {text}
    
    Rules:
    1. Structure all information into logical key-value pairs
    2. Group related information together
    3. Use arrays for multiple related items
    4. Maintain proper JSON syntax
    5. Return only the JSON object with no additional text or explanation
    """
    
    # Process all images in the folder
folder_path = "pdf3"  # Replace with your folder path
consolidated_results = process_image_folder(folder_path, template)
    
    # Print the consolidated JSON in a readable format
print("\nConsolidated JSON Output:")
print(json.dumps(consolidated_results, indent=2, ensure_ascii=False))


print(consolidated_results)
    


Processing 1.png...
Uploaded file 'Diagram-1' as: https://generativelanguage.googleapis.com/v1beta/files/jgrd43i8tyu3

Processing 2.png...
Uploaded file 'Diagram-2' as: https://generativelanguage.googleapis.com/v1beta/files/1gqiqr2fom3o

Processing 3.png...
Uploaded file 'Diagram-3' as: https://generativelanguage.googleapis.com/v1beta/files/16nlqrgkzwew

Processing 4.png...
Uploaded file 'Diagram-4' as: https://generativelanguage.googleapis.com/v1beta/files/pilv6zhxj9a9

Consolidated JSON Output:
{
  "total_images": 4,
  "images": {
    "image_1": {
      "patient_information": {
        "patient_name": "VAISHALI VIJAY BELLUBBI",
        "patient_id": "FH.3991490",
        "client_patient_id": "UID: 3991490",
        "age": "68 Years",
        "sex": "Female",
        "date_of_birth": "26/02/1956"
      },
      "report_information": {
        "accession_no": "0081XD012004",
        "drawn": "16/04/2024 02:36",
        "received": "16/04/2024 05:55",
        "reported": "16/04/2024 07

In [13]:
# consolidated_results   
type(consolidated_results)

dict

In [14]:
import json
print(json.dumps(consolidated_results, indent=4))

{
    "total_images": 4,
    "images": {
        "image_1": {
            "patient_information": {
                "patient_name": "VAISHALI VIJAY BELLUBBI",
                "patient_id": "FH.3991490",
                "client_patient_id": "UID: 3991490",
                "age": "68 Years",
                "sex": "Female",
                "date_of_birth": "26/02/1956"
            },
            "report_information": {
                "accession_no": "0081XD012004",
                "drawn": "16/04/2024 02:36",
                "received": "16/04/2024 05:55",
                "reported": "16/04/2024 07:06",
                "client_name": "FHSL BG ROAD -IPD",
                "referring_doctor": "DR. Rajpal RL. Singh",
                "clinical_information": "UID: 3991490 REQNO-17006613\nIPD-L3 CCU I (ICU I)\nIPID-88910/24/1113",
                "test_report_status": "Final"
            },
            "haematology_cbc": {
                "blood_counts": {
                    "hemoglobin": {
  

<!-- 
## Extraction using Amazon Textract -->

## Amazon Textract 

https://python.langchain.com/docs/integrations/document_loaders/amazon_textract/

In [6]:
# # !pip install amazon-textract-caller
# # !pip install amazon-textract-textractor
# # !pip install amazon-textract-response-parser
# !pip install amazon-textract-response-parser



In [15]:
from langchain_community.document_loaders import AmazonTextractPDFLoader


loader = AmazonTextractPDFLoader("1.pdf")
documents = loader.load()

# parseformKV=form_kv_from_JSON(response)
# parseformTables=get_tables_fromJSON(response)

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


In [16]:
documents

[Document(metadata={'source': '1.pdf', 'page': 1}, page_content='Fortis Hospitals, Limited\n\n\n154/9, Bannerghatta Roage 1 of 3\n\n\nFortis\n\n\nOpp. IIM-B, Bengaluru - 560 076\n\n\nTel. : +91-80-6621 4444, 2254 4444\n\n\nFax +91-80-6621 4242.\n\n\ncare.bng@fortishealthcare.com\n\n\nCIN No. U93000DL2009PLC222166\n\n\nDEPARTMENT OF CARDIOLOGY\n\n\nDate : 18/Apr/2024\n\n\nDischarge Summary\n\n\nMrs. Vaishali Vijay\n\n\nPatient Name\n\n\nUHID I Old UHID\n\n\n3991490 | WHBG.0000576179\n\n\nBellubbi\n\n\nAge / Gender\n\n\n68 Years / Female\n\n\nEpisode No\n\n\n88910/24/1113\n\n\nContact No\n\n\n9686571334\n\n\nDate of Admission\n\n\n14 Apr 2024\n\n\nDischarge Type\n\n\nROUTINE\n\n\nDate of Discharge\n\n\n17 Apr 2024\n\n\n201, GURU PRIYA NEAR ROYAL RESIDENCY APT BTM 4TH\n\n\nAddress\n\n\nTAGE,Bangalore.,Karnataka,India,560074\n\n\nName of Consultant Dr.Rajpal RL Singh\n\n\nDoctor Team\n\n\nDiagnosis\n\n\nACUTE INFERIOR WALL MYOCARDIAL INFARCTION\n\n\nISCHEMIC HEART DISEASE, STATUS POST PCI 

In [17]:
from langchain_ollama.llms import OllamaLLM
from langchain_core.prompts import PromptTemplate
import json

def convert_text_to_json_with_llm(input_text):
  
    # Create a generic prompt template for JSON conversion
    template = """
    Convert the following text into a proper JSON format.
    Extract all relevant information as key-value pairs.
    Return ONLY a valid JSON object, nothing else.
    
    Text to convert:
    {text}
    
    Rules:
    1. Structure all information into logical key-value pairs
    2. Group related information together
    3. Use arrays for multiple related items
    4. Maintain proper JSON syntax
    5. Return only the JSON object with no additional text or explanation
    """
    
    prompt = PromptTemplate(
        template=template,
        input_variables=["text"]
    )
    
    # Initialize the LLM
    llm = OllamaLLM(model="llama3.1:latest")
    
    # Generate the formatted prompt
    formatted_prompt = prompt.format(text=input_text)
    
    # Get response from LLM
    response = llm(formatted_prompt)
    
    try:
        # Clean the response
        cleaned_response = response.strip()
        # Find JSON boundaries
        start_idx = cleaned_response.find('{')
        end_idx = cleaned_response.rfind('}')
        
        if start_idx != -1 and end_idx != -1:
            cleaned_response = cleaned_response[start_idx:end_idx + 1]
            
            # Parse and validate JSON
            parsed_json = json.loads(cleaned_response)
            
            # Return formatted JSON string
            return json.dumps(parsed_json, indent=2, ensure_ascii=False)
        else:
            raise ValueError("No valid JSON object found in response")
            
    except Exception as e:
        raise Exception(f"Failed to parse LLM output into JSON: {str(e)}")


In [None]:
try:
        json_output_amazon_textract = convert_text_to_json_with_llm(documents)
        json_output_amazon_textract = json.loads(json_output_amazon_textract)
        print(json_output_amazon_textract)
except Exception as e:
        print(f"Error: {str(e)}")

  response = llm(formatted_prompt)


In [10]:
json_output_amazon_textract

# type(json_output_amazon_textract)

{'metadata': {'source': '1.pdf', 'page': 1},
 'page_content': ['Fortis Hospitals, Limited\n\n\n154/9, Bannerghatta Roage 1 of 3\n\n\nFortis\n\n\nOpp. IIM-B, Bengaluru - 560 076\n\n\nTel. : +91-80-6621 4444, 2254 4444\n\n\nFax +91-80-6621 4242.\n\n\ncare.bng@fortishealthcare.com\n\n\nCIN No. U93000DL2009PLC222166\n\n\nDEPARTMENT OF CARDIOLOGY\n\n\nDate : 18/Apr/2024\n\n\nDischarge Summary\n\n\nMrs. Vaishali Vijay\n\n\nPatient Name\n\n\nUHID I Old UHID\n\n\n3991490 | WHBG.0000576179\n\n\nBellubbi\n\n\nAge / Gender\n\n\n68 Years / Female\n\n\nEpisode No\n\n\n88910/24/1113\n\n\nContact No\n\n\n9686571334\n\n\nDate of Admission\n\n\n14 Apr 2024\n\n\nDischarge Type\n\n\nROUTINE\n\n\nDate of Discharge\n\n\n17 Apr 2024\n\n\n201, GURU PRIYA NEAR ROYAL RESIDENCY APT BTM 4TH\n\n\nAddress\n\n\nTAGE,Bangalore.,Karnataka,India,560074\n\n\nName of Consultant Dr.Rajpal RL Singh\n\n\nDoctor Team\n\n\nDiagnosis',
  ['ACUTE INFERIOR WALL MYOCARDIAL INFARCTION',
   'ISCHEMIC HEART DISEASE, STATUS POST PCI

<!-- ## Upstage Extraction  -->

### Langchain Upstage or Upstage. AI    

https://www.upstage.ai/
https://python.langchain.com/docs/integrations/providers/upstage/

In [13]:
pip install langchain_upstage
!pip install python-dotenv

In [11]:
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Now you can access the environment variable
api_key = os.getenv("UPSTAGE_DOCUMENT_AI_API_KEY")

from langchain_upstage import UpstageLayoutAnalysisLoader

file_path = "3.pdf"

loader = UpstageLayoutAnalysisLoader(file_path,use_ocr=True,output_type="text")
data = loader.load()




In [12]:
data

[Document(metadata={'total_pages': 4}, page_content='Diagnostics ReportDIAGNOSTIC REPORTTOMSagilus>> agirus\'s C S\nH O S P I T ALPATIENT NAME : VAISHALI VIJAY BELLUBBIPATIENT ID : FH.3991490 CLIENT PATIENT ID : UID:3991490 \n ACCESSION NO : 0081XD012004 AGE : 68 Years SEX : Female DATE RECEIVED : 16/04/2024 05:55 OF BIRTH : 26/02/1956 REPORTED \n DRAWN : 16/04/2024 02:36  : 16/04/2024 07:06 \n CLIENT NAME : FHSL BG ROAD - IPD REFERRING DOCTOR : DR. Rajpal RL Singh  \n CLINICAL INFORMATION :   \n UID:3991490 REQNO-17006613   \n IPD-L3 CCU I (ICU I) IPID-88910/24/1113   \n Test Report Status Final Results Biological Reference Interval UnitsHAEMATOLOGY - CBCCBC-5, EDTA WHOLE BLOOD\n * BLOOD COUNTS, EDTA WHOLE BLOOD\n HEMOGLOBIN (HB) 13.3  12.0 - 15.0 g/dL\n METHOD : SLS METHOD    mil/�L\n RED BLOOD CELL (RBC) COUNT METHOD : AUTOMATED CELL COUNTER:HYDRO DYNAMIC FOUSING (DC DETECTION) 4.73  3.8 - 4.8 \n WHITE BLOOD CELL (WBC) COUNT 8.96  4.0 - 10.0 thou/uL\n     thou/�L\n METHOD : FLOW CYT

In [13]:
try:
        json_output_upstage = convert_text_to_json_with_llm(data)
        json_output_upstage = json.loads(json_output_upstage)
        print(json_output_upstage)
except Exception as e:
        print(f"Error: {str(e)}")

{'Patient Information': {'Name': 'Vaishali Vijay Bellubbi', 'ID': 'FH.3991490', 'Client ID': 'UID:3991490', 'Age': 68, 'Sex': 'Female', 'Date of Birth': '26/02/1956'}, 'Sample Information': {'Received Time': '16/04/2024 05:55', 'Drawn Time': '16/04/2024 02:36', 'Reported Time': '16/04/2024 07:06'}, 'Clinical Information': {'Patient Location': 'IPD-L3 CCU I (ICU I)', 'Reference ID': 'IPID-88910/24/1113'}, 'Test Results': [{'Name': 'Condition of Laboratory Testing & Reporting', 'Description': 'Conditions for laboratory testing and reporting'}], 'Laboratory Information': {'Address': '154/9, Bannerghatta Road, Opp. Iim-B, Bangalore, 560076, Karnataka, India', 'Name': 'Agilus Diagnostics Ltd', 'Phone Number': '91115 91115'}}


In [14]:
json_output_upstage

{'Patient Information': {'Name': 'Vaishali Vijay Bellubbi',
  'ID': 'FH.3991490',
  'Client ID': 'UID:3991490',
  'Age': 68,
  'Sex': 'Female',
  'Date of Birth': '26/02/1956'},
 'Sample Information': {'Received Time': '16/04/2024 05:55',
  'Drawn Time': '16/04/2024 02:36',
  'Reported Time': '16/04/2024 07:06'},
 'Clinical Information': {'Patient Location': 'IPD-L3 CCU I (ICU I)',
  'Reference ID': 'IPID-88910/24/1113'},
 'Test Results': [{'Name': 'Condition of Laboratory Testing & Reporting',
   'Description': 'Conditions for laboratory testing and reporting'}],
 'Laboratory Information': {'Address': '154/9, Bannerghatta Road, Opp. Iim-B, Bangalore, 560076, Karnataka, India',
  'Name': 'Agilus Diagnostics Ltd',
  'Phone Number': '91115 91115'}}

<!-- ## LLAMA-OCR  -->

<!--  -->