In [1]:
pip install pytesseract

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [5]:
import cv2
import pytesseract
from PIL import Image
import spacy
import numpy as np
import matplotlib.pyplot as plt # Import matplotlib for visualization

# (Optional for Windows users) Point to your Tesseract executable
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Step 1: Load and preprocess image for handwritten text on lined paper
image_path = '/content/sample .jpg'
image = cv2.imread(image_path)

# Check if the image was loaded successfully
if image is None:
    print(f"Error: Could not load image from {image_path}. Please ensure the file exists and the path is correct.")
else:
    # Convert to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply a light Gaussian Blur to reduce noise while preserving character edges
    # (3,3) is a good starting point, try (5,5) if text is thick or (1,1) if very thin
    blurred = cv2.GaussianBlur(gray, (3, 3), 0)

    # --- Advanced Line Removal for Lined Paper ---
    # 1. Apply Adaptive Threshold to get a binary image that highlights lines and text
    # This helps emphasize the lines so they can be detected morphologically
    # Adjust blockSize (odd number, larger for thicker lines) and C (offset)
    thresh_for_lines = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 25, 10)
    # Invert the thresholded image so text/lines are white on black (easier for morphology)
    thresh_for_lines = cv2.bitwise_not(thresh_for_lines)


    # 2. Create a horizontal structuring element (kernel)
    # The length of the kernel should be roughly the width of the document to cover horizontal lines
    # The height is 1 to target only horizontal features
    kernel_len = np.array(gray).shape[1] // 40 # Adjust divisor to change kernel length
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_len, 1))

    # 3. Apply morphological operations to detect and remove horizontal lines
    # Erode to make lines thinner, then Dilate to make them stand out
    # This process helps isolate the strong horizontal features (the lines)
    horizontal_lines_detected = cv2.erode(thresh_for_lines, horizontal_kernel, iterations=1)
    horizontal_lines_detected = cv2.dilate(horizontal_lines_detected, horizontal_kernel, iterations=1)

    # 4. Subtract the detected lines from the original *inverted* thresholded image
    # This aims to 'erase' the lines from the image, leaving mostly the text
    # cv2.subtract removes pixels that are common between the two images
    no_lines_image = cv2.subtract(thresh_for_lines, horizontal_lines_detected)

    # 5. Re-invert the image to get black text on a white background (Tesseract prefers this)
    no_lines_image = cv2.bitwise_not(no_lines_image)

    # --- Final Thresholding on the image with removed lines ---
    # Use Otsu's thresholding here as the background should now be more uniform
    # after line removal. This will give a clean black/white image.
    ret, final_thresh = cv2.threshold(no_lines_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Use the preprocessed image for Tesseract
    pil_image = Image.fromarray(final_thresh)

    # Step 2: OCR to extract text
    # --psm 6 (Assume a single uniform block of text) is a good starting point.
    # --oem 3 (Use the latest OCR Engine Mode, LSTM-based)
    # Tesseract might still struggle significantly with handwriting.
    text = pytesseract.image_to_string(pil_image, config='--psm 6 --oem 3')
    print("Extracted Text:\n", text)

    # Step 3: NLP setup (your existing code is good here)
    nlp = spacy.load("en_core_web_sm")

    # Symptom keyword list
    SYMPTOMS = ['chest pain', 'fever', 'vomiting', 'shortness of breath', 'headache']

    # Step 4: Function to extract symptoms
    def extract_symptoms(text):
        doc = nlp(text.lower())
        found = [s for s in SYMPTOMS if s in doc.text]
        return found

    # Step 5: Function to determine triage level
    def triage_level(symptoms):
        if 'chest pain' in symptoms or 'shortness of breath' in symptoms:
            return "🚨 Red – Immediate"
        elif symptoms:
            return "🟡 Yellow – Urgent"
        else:
            return "🟢 Green – Non-urgent"

    # Step 6: Run the logic
    symptoms = extract_symptoms(text)
    level = triage_level(symptoms)

    print("Symptoms:", symptoms)
    print("Triage Level:", level)

    # (Optional) Calculate OCR accuracy if ground truth is known
    ground_truth = "Patient feels chest pain and dizziness\nShortness of breath started 2 hours ago"
    accuracy = calculate_accuracy(ground_truth, recognized_text)
    print(f"\nOCR Accuracy: {accuracy:.2f}%")

Extracted Text:
 Patient tecls chest Pain’ and Stezivess —
Shortness Of lotath Stavted 2 Wows ago. ee

Symptoms: ['chest pain']
Triage Level: 🚨 Red – Immediate

OCR Accuracy: 80.00%
