In [2]:
!pip install rapidocr_onnxruntime

Collecting rapidocr_onnxruntime
  Downloading rapidocr_onnxruntime-1.4.3-py3-none-any.whl.metadata (1.3 kB)
Downloading rapidocr_onnxruntime-1.4.3-py3-none-any.whl (14.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.9/14.9 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: rapidocr_onnxruntime
Successfully installed rapidocr_onnxruntime-1.4.3


#  Convertor (text ot JSON) using LLM 

In [6]:
from langchain_ollama.llms import OllamaLLM
from langchain_core.prompts import PromptTemplate
import json

def convert_text_to_json_with_llm(input_text):
    """
    Converts any input text into a structured JSON format.
    
    Parameters:
    - input_text (str): Any text input to be converted to JSON
    
    Returns:
    - str: A JSON string containing structured data
    """
    # Create a generic prompt template for JSON conversion
    template = """
    Convert the following text into a proper JSON format.
    Extract all relevant information as key-value pairs.
    Return ONLY a valid JSON object, nothing else.
    
    Text to convert:
    {text}
    
    Rules:
    1. Structure all information into logical key-value pairs
    2. Group related information together
    3. Use arrays for multiple related items
    4. Maintain proper JSON syntax
    5. Return only the JSON object with no additional text or explanation
    """
    
    prompt = PromptTemplate(
        template=template,
        input_variables=["text"]
    )
    
    # Initialize the LLM
    llm = OllamaLLM(model="llama3.1:latest")
    
    # Generate the formatted prompt
    formatted_prompt = prompt.format(text=input_text)
    
    # Get response from LLM
    response = llm(formatted_prompt)
    
    try:
        # Clean the response
        cleaned_response = response.strip()
        # Find JSON boundaries
        start_idx = cleaned_response.find('{')
        end_idx = cleaned_response.rfind('}')
        
        if start_idx != -1 and end_idx != -1:
            cleaned_response = cleaned_response[start_idx:end_idx + 1]
            
            # Parse and validate JSON
            parsed_json = json.loads(cleaned_response)
            
            # Return formatted JSON string
            return json.dumps(parsed_json, indent=2, ensure_ascii=False)
        else:
            raise ValueError("No valid JSON object found in response")
            
    except Exception as e:
        raise Exception(f"Failed to parse LLM output into JSON: {str(e)}")


## Rapid OCR

In [8]:
from rapidocr_onnxruntime import RapidOCR

engine = RapidOCR()

img_path = '1.png'
result_RapidOCR, elapse = engine(img_path)
print(result)
print(elapse)

[[[[427.0, 32.0], [551.0, 31.0], [551.0, 47.0], [427.0, 48.0]], 'Fortis Hospitals Limited', 0.928359697262446], [[[64.0, 47.0], [170.0, 51.0], [169.0, 88.0], [63.0, 84.0]], 'Fortis', 0.9813376267751058], [[[426.0, 45.0], [564.0, 40.0], [564.0, 59.0], [427.0, 64.0]], '154/9.Baeagof3', 0.7397212577717645], [[[428.0, 55.0], [547.0, 55.0], [547.0, 71.0], [428.0, 71.0]], 'Opp.M-B.Bengaluru-560076', 0.878893873343865], [[[429.0, 67.0], [558.0, 67.0], [558.0, 80.0], [429.0, 81.0]], 'Tel.:+91-80-66214444,2254 4444', 0.9282006283601125], [[[428.0, 78.0], [517.0, 78.0], [517.0, 91.0], [428.0, 91.0]], 'Fax: +91-80-6621 4242.', 0.9088793694972992], [[[429.0, 89.0], [540.0, 89.0], [540.0, 102.0], [429.0, 102.0]], 'care.bng@fortishealthcare.com', 0.929301203324877], [[[429.0, 99.0], [559.0, 99.0], [559.0, 112.0], [429.0, 112.0]], 'CIN No.U93000DL2009PLC222166', 0.954205005296639], [[[241.0, 143.0], [399.0, 143.0], [399.0, 156.0], [241.0, 156.0]], 'DEPARTMENTOFCARDIOLOGY', 0.9958724189888347], [[[78.

In [9]:
json_output_rapidOCR = convert_text_to_json_with_llm(result_RapidOCR)
json_output_rapidOCR = json.loads(json_output_rapidOCR)
print(json_output_rapidOCR)

  response = llm(formatted_prompt)


{'PatientInformation': {'Name': '', 'Age': None, 'Gender': '', 'ContactNumber': [''], 'Address': ''}, 'MedicalHistory': {'Diseases': [{'name': '', 'date_diagnosed': ''}], 'Medications': [{'name': '', 'dosage': '', 'frequency': ''}]}, 'CurrentIllness': {'NameOfIllness': '', 'Duration': None, 'Symptoms': [], 'Diagnosis': []}, 'VitalSigns': {'BloodPressure': [{'systolic': '', 'diastolic': ''}], 'RespiratoryRate': None, 'Temperature': None}, 'Allergies': {'NameOfSubstance': [], 'Severity': []}, 'Immunizations': {'VaccineType': [], 'DateAdministered': []}, 'LaboratoryResults': {'TestType': [], 'ResultValue': [], 'Unit': []}, 'TreatmentPlan': {'Medications': [{'name': '', 'dosage': '', 'frequency': ''}], 'Surgery': None, 'FollowUpVisit': []}, 'DischargeInstructions': {'MedicationsToTake': [], 'ActivityLevel': [], 'Diet': []}, 'ContactInformation': {'NameOfContactPerson': '', 'RelationshipWithPatient': '', 'PhoneNumbers': [''], 'Addresses': [{'streetAddress': '', 'city': '', 'state': '', 'zip

In [12]:
result_RapidOCR

[[[[427.0, 32.0], [551.0, 31.0], [551.0, 47.0], [427.0, 48.0]],
  'Fortis Hospitals Limited',
  0.928359697262446],
 [[[64.0, 47.0], [170.0, 51.0], [169.0, 88.0], [63.0, 84.0]],
  'Fortis',
  0.9813376267751058],
 [[[426.0, 45.0], [564.0, 40.0], [564.0, 59.0], [427.0, 64.0]],
  '154/9.Baeagof3',
  0.7397212577717645],
 [[[428.0, 55.0], [547.0, 55.0], [547.0, 71.0], [428.0, 71.0]],
  'Opp.M-B.Bengaluru-560076',
  0.878893873343865],
 [[[429.0, 67.0], [558.0, 67.0], [558.0, 80.0], [429.0, 81.0]],
  'Tel.:+91-80-66214444,2254 4444',
  0.9282006283601125],
 [[[428.0, 78.0], [517.0, 78.0], [517.0, 91.0], [428.0, 91.0]],
  'Fax: +91-80-6621 4242.',
  0.9088793694972992],
 [[[429.0, 89.0], [540.0, 89.0], [540.0, 102.0], [429.0, 102.0]],
  'care.bng@fortishealthcare.com',
  0.929301203324877],
 [[[429.0, 99.0], [559.0, 99.0], [559.0, 112.0], [429.0, 112.0]],
  'CIN No.U93000DL2009PLC222166',
  0.954205005296639],
 [[[241.0, 143.0], [399.0, 143.0], [399.0, 156.0], [241.0, 156.0]],
  'DEPARTMENT

In [10]:
json_output_rapidOCR

{'PatientInformation': {'Name': '',
  'Age': None,
  'Gender': '',
  'ContactNumber': [''],
  'Address': ''},
 'MedicalHistory': {'Diseases': [{'name': '', 'date_diagnosed': ''}],
  'Medications': [{'name': '', 'dosage': '', 'frequency': ''}]},
 'CurrentIllness': {'NameOfIllness': '',
  'Duration': None,
  'Symptoms': [],
  'Diagnosis': []},
 'VitalSigns': {'BloodPressure': [{'systolic': '', 'diastolic': ''}],
  'RespiratoryRate': None,
  'Temperature': None},
 'Allergies': {'NameOfSubstance': [], 'Severity': []},
 'Immunizations': {'VaccineType': [], 'DateAdministered': []},
 'LaboratoryResults': {'TestType': [], 'ResultValue': [], 'Unit': []},
 'TreatmentPlan': {'Medications': [{'name': '',
    'dosage': '',
    'frequency': ''}],
  'Surgery': None,
  'FollowUpVisit': []},
 'DischargeInstructions': {'MedicationsToTake': [],
  'ActivityLevel': [],
  'Diet': []},
 'ContactInformation': {'NameOfContactPerson': '',
  'RelationshipWithPatient': '',
  'PhoneNumbers': [''],
  'Addresses': [

#### RapidOCR not giving good results 

In [13]:
pip install marker-pdf

Collecting marker-pdf
  Downloading marker_pdf-1.1.0-py3-none-any.whl.metadata (18 kB)
Collecting ftfy<7.0.0,>=6.1.1 (from marker-pdf)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting markdownify<0.14.0,>=0.13.1 (from marker-pdf)
  Downloading markdownify-0.13.1-py3-none-any.whl.metadata (8.5 kB)
Collecting pdftext<0.5.0,>=0.4.0 (from marker-pdf)
  Downloading pdftext-0.4.0-py3-none-any.whl.metadata (8.1 kB)
Collecting surya-ocr<0.9.0,>=0.8.0 (from marker-pdf)
  Downloading surya_ocr-0.8.1-py3-none-any.whl.metadata (29 kB)
Collecting tabled-pdf<0.3.0,>=0.2.0 (from marker-pdf)
  Downloading tabled_pdf-0.2.0-py3-none-any.whl.metadata (8.4 kB)
Collecting texify<0.3.0,>=0.2.1 (from marker-pdf)
  Downloading texify-0.2.1-py3-none-any.whl.metadata (10 kB)
Downloading marker_pdf-1.1.0-py3-none-any.whl (86 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
Downloading markdownify-0.13.1-py3-none-any.whl (10 kB)
Downloading pdftext-0.4.0-py3-none-any.whl (16 kB)
Downloadi

In [31]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [20]:
# https://noedgeai.com/

##  work with API 