In [10]:
import os # To import test image files
import cv2 # To work with opencv images
from PIL import Image # Image submodule to work with pillow images
import pytesseract  # pytesseract module
import pandas as pd
from datetime import datetime
import re

In [13]:
# Location of the Tesseract executable used by pytesseract.
# Set the TESSERACT_CMD environment variable to point at a different install;
# when it is unset, the default Windows install path below is used.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)


In [14]:
# Directory (relative path) that is listed for cheque images; only files ending
# in .jpg, .jpeg or .png are processed by the image loop.
test_img_path = "../CHEQUE_OCR/Images/"


In [15]:
# Store extracted data here: one [bank name, IFSC code, date, amount] list
# per processed image, later turned into a DataFrame.
results = []

In [25]:
# Field extraction logic

# IFSC code: four capital letters, a literal zero, then six alphanumerics.
_IFSC_PATTERN = re.compile(r"[A-Z]{4}0[0-9A-Z]{6}")

# Date: dd-mm-yyyy where the separator may be '-', '/' or whitespace.
_DATE_PATTERN = re.compile(r"\b\d{2}[-/\s]\d{2}[-/\s]\d{4}\b")

# Amount introduced by an explicit currency marker. The numeric part must start
# with a digit so that a stray comma is never reported as an amount.
_CURRENCY_AMOUNT_PATTERN = re.compile(r"(?:₹|Rs\.?|INR)\s?\d[\d,]*(?:\.\d+)?")

# Unmarked number that still looks like money: comma-grouped digits
# (1,500 / 1,50,000.00) or digits directly followed by the "/-" suffix.
# The look-arounds stop it from matching inside IFSC codes, dates or
# hyphenated address numbers.
_BARE_AMOUNT_PATTERN = re.compile(
    r"(?<![\w,./-])"
    r"(?:\d{1,3}(?:,\d{2,3})+(?:\.\d+)?|\d+(?:\.\d+)?(?=\s?/-))"
    r"(?![\w,])"
)


def extract_cheque_info(text):
    """Pull the bank name, IFSC code, date and amount out of OCR text.

    The text is scanned line by line and the first hit for each field wins.

    Args:
        text: Multi-line string, e.g. the output of
            ``pytesseract.image_to_string``.

    Returns:
        ``[bank_name, ifsc_code, date, amount]`` as a list of strings; a field
        that could not be found is the empty string.

    Field rules:
        * Bank name: the first line containing "bank" (case-insensitive).
          Only when no such line exists is the first all-upper-case line used,
          so an upper-case header printed above the bank name no longer wins.
        * IFSC code: first match of four capitals, ``0``, six alphanumerics.
        * Date: first ``dd-mm-yyyy`` style match (``-``, ``/`` or space
          separated). The value is not validated as a calendar date.
        * Amount: the first number carrying a currency marker (``₹``, ``Rs``,
          ``Rs.``, ``INR``). If no line has one, the first number that is
          comma-grouped or followed by ``/-`` is used instead. Plain digit
          runs (IFSC digits, PIN codes, years) are never taken as the amount.
    """
    bank_name = ""
    uppercase_fallback = ""   # first all-caps line, used only without a "bank" line
    ifsc_code = ""
    date = ""
    amount = ""               # amount with an explicit currency marker
    bare_amount = ""          # money-looking number without a marker

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            # Blank lines cannot contribute to any field.
            continue

        # Bank Name
        if not bank_name and "bank" in line.lower():
            bank_name = line
        elif not uppercase_fallback and line.isupper():
            uppercase_fallback = line

        # IFSC Code
        if not ifsc_code:
            match = _IFSC_PATTERN.search(line)
            if match:
                ifsc_code = match.group()

        # Date: allow space instead of /
        if not date:
            match = _DATE_PATTERN.search(line)
            if match:
                date = match.group()

        # Amount: a currency-marked value beats any unmarked number, even one
        # that appears on an earlier line.
        if not amount:
            match = _CURRENCY_AMOUNT_PATTERN.search(line)
            if match:
                amount = match.group()
            elif not bare_amount:
                match = _BARE_AMOUNT_PATTERN.search(line)
                if match:
                    bare_amount = match.group()

    return [bank_name or uppercase_fallback, ifsc_code, date, amount or bare_amount]


In [26]:

# Loop through images
# Start from an empty list so that re-running this cell rebuilds the table
# instead of appending a second copy of every row.
results = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# stable between runs.
for filename in sorted(os.listdir(test_img_path)):
    if not filename.lower().endswith((".jpg", ".jpeg", ".png")):
        continue

    image_path = os.path.join(test_img_path, filename)
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals an unreadable/corrupt file by returning None
        # rather than raising; skip it instead of crashing in cvtColor.
        print(f"⚠️ Skipping unreadable image: {image_path}")
        continue

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gray = cv2.bilateralFilter(gray, 11, 17, 17)  # reduces noise
    # Try adaptive threshold
    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY, 11, 2)

    text = pytesseract.image_to_string(thresh)
    results.append(extract_cheque_info(text))

# Build the results table (one row per successfully read image).
df = pd.DataFrame(results, columns=["Bank Name", "IFSC Code", "Date", "Amount"])
# Excel export is currently disabled; uncomment the next line to write the workbook.
# df.to_excel("cheque_data_extracted.xlsx", index=False)

# Report only what actually happened: nothing is written to disk here.
print(f"✅ Cheque data extracted from {len(df)} image(s).")


✅ Cheque data extracted and saved to Excel.


In [27]:
# Display the extracted fields (Bank Name, IFSC Code, Date, Amount), one row per image.
df

Unnamed: 0,Bank Name,IFSC Code,Date,Amount
0,AXIS BANK trp DATE,UTIB0000426,,
1,KLHL-TECH SECURE PRINT \TO- HYD/CTS- 2010,,93-09-2019,
2,SESHAASAI (M)/ CTS - 2070,,,
3,"14-4-820, AGAPURA, SITARAMBAGH, HYDERABAD - 50...",,30-01-2019,
4,lig AXIS BANK up,,,
5,. bare ID LA canara Bank termsiyesoroe wa {ss ...,,03-09-2019,
6,ICUBanke,,,
7,". IFSC : SYNBOO03011 (CBS) ""=D DTM MY Y¥ ¥: ¥",,,
8,lig AXIS BANK up,,,
9,. bare ID LA canara Bank termsiyesoroe wa {ss ...,,03-09-2019,
