In [1]:
import os # To import test image files
import cv2 # To work with opencv images
from PIL import Image # Image submodule to work with pillow images
import pytesseract  # pytesseract module
import pandas as pd
from datetime import datetime
import re

In [2]:
# Location of the Tesseract executable used by pytesseract.
# Set the TESSERACT_CMD environment variable to point at a different install;
# when it is unset the default Windows install location below is used.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD",
    r"C:\Program Files\Tesseract-OCR\tesseract.exe",
)


In [3]:
# Folder that holds the cheque images to process.
# NOTE(review): the configuration cell further down assigns folder_path again
# (same folder, without the trailing slash) before the value is first read.
folder_path = "../CHEQUE_OCR/Images/"


In [4]:
# Bank names to look for when matching low-quality OCR output.
# NOTE: the configuration cell further down rebuilds this list from
# IFSC_PREFIX_MAP, so that later definition is the one the functions see.
KNOWN_BANKS = [
    "AXIS BANK",
    "CANARA BANK",
    "ICICI BANK",
    "SYNDICATE BANK",
    "HDFC BANK",
    "STATE BANK",
    "BANK OF BARODA",
]


In [7]:

# ==== CONFIGURATION ====
# Directory containing the cheque images to run OCR on
folder_path = "../CHEQUE_OCR/Images"

# Maps the first four characters of an IFSC code to the bank's name
IFSC_PREFIX_MAP = dict(
    ICIC="ICICI BANK",
    UTIB="AXIS BANK",
    SYNB="SYNDICATE BANK",
    CNRB="CANARA BANK",
    SBIN="STATE BANK OF INDIA",
    BARB="BANK OF BARODA",
    HDFC="HDFC BANK",
    UBIN="UNION BANK",
)

# Bank names searched for in OCR text, in the same order as IFSC_PREFIX_MAP
KNOWN_BANKS = [*IFSC_PREFIX_MAP.values()]

# ==== UTILITIES ====

def clean_bank_name(text):
    """Tidy a line of OCR text that is expected to contain a bank name.

    The text is upper-cased, every character other than A-Z and whitespace is
    dropped, and letter-spaced words ("A X I S") are joined back together.
    If the result contains "<3+ letters/spaces>BANK" (optionally followed by
    one more word), that portion is returned; otherwise the whole cleaned
    text is returned.

    Args:
        text: A string, typically one line of OCR output.

    Returns:
        The cleaned bank-name string with surrounding whitespace removed.
    """
    text = text.upper().strip()
    text = re.sub(r"[^A-Z\s]", "", text)
    # Join runs of single letters separated by whitespace. The trailing \b on
    # each letter keeps the run from swallowing the first letter of the next
    # real word ("A X I S BANK" must become "AXIS BANK", not "AXISBANK").
    text = re.sub(
        r"\b[A-Z](?:\s+[A-Z]\b)+",
        lambda m: m.group(0).replace(" ", ""),
        text,
    )
    match = re.search(r"[A-Z\s]{3,}BANK(?:\s+[A-Z]+)?", text)
    if match:
        return match.group().strip()
    return text.strip()

def extract_ifsc(text):
    """Return the first IFSC-looking code found in OCR text, or "" if none.

    A candidate is four letters, a zero, then six letters/digits. The letter
    "O" is treated as a misread zero in the fifth position and in the
    six-character tail.

    Args:
        text: OCR text (any case).

    Returns:
        The 11-character code in upper case with misread zeros corrected, or
        an empty string when nothing matches.
    """
    upper = text.upper()

    # Pass 1: treat every letter O as a zero, then look for a code directly
    # after an "IFSC" label, and failing that anywhere in the text.
    zeroed = upper.replace("O", "0")
    for pattern in (
        r'IFSC[\s:]*([A-Z]{4}0[A-Z0-9]{6})',
        r'([A-Z]{4}0[0-9A-Z]{6})',
    ):
        match = re.search(pattern, zeroed)
        if match:
            return match.group(1)

    # Pass 2: the blanket O->0 replacement above destroys bank prefixes that
    # genuinely contain the letter O (e.g. "IOBA", "CORP"), so such codes can
    # never match in pass 1. Look at the un-replaced text for a standalone
    # 11-character token and only correct O->0 outside the 4-letter prefix.
    for match in re.finditer(r'\b([A-Z]{4})[0O]([A-Z0-9]{6})\b', upper):
        prefix, tail = match.groups()
        # Require a real digit in the tail so ordinary 11-letter words such as
        # "CORPORATION" are not mistaken for a code.
        if re.search(r'\d', tail):
            return f"{prefix}0{tail.replace('O', '0')}"

    return ""

def extract_amount(text):
    """Return the largest two-decimal number found in OCR text, as a string.

    Thousands separators (commas) are removed and the letter "O" is treated
    as a misread zero before searching. Every "<digits>.<two digits>" token
    is a candidate; the numerically largest one is returned.

    Args:
        text: OCR text (any case).

    Returns:
        ``str(float)`` of the largest candidate (e.g. "130354.7"), or "" when
        the text contains no candidate.
    """
    # Upper-case first so a lowercase "o" misread is also turned into "0";
    # replacing before upper-casing would leave it behind as the letter "O".
    text = text.upper().replace(",", "").replace("O", "0")
    candidates = re.findall(r'(?:₹|INR|RS|R\$)?\s?([\d]+\.\d{2})', text)
    if not candidates:
        return ""
    try:
        return str(max(float(value) for value in candidates))
    except (ValueError, OverflowError):
        # Best effort: fall back to the first candidate as it appeared.
        return candidates[0]

def extract_date(text):
    """Return the first date-looking token found in OCR text, or "" if none.

    Three layouts are tried in this order, each searched over the whole text:
      1. DD-MM-YYYY / DD.MM.YYYY
      2. D-Mon-YYYY or DD-Mon-YYYY (three-letter month, "-" separators)
      3. the same with "/" or "-" separators

    The letter "O" is accepted as a misread zero in the day, numeric month
    and year positions and is corrected in the returned value. Month
    abbreviations are left untouched, so "OCT" and "NOV" still match.

    Args:
        text: OCR text.

    Returns:
        The matched date with misread zeros corrected, or "".
    """
    # All-numeric layout: every O inside the match stands for a zero.
    numeric = re.search(r'\b[\dO]{2}[-.][\dO]{2}[-.][\dO]{4}\b', text)
    if numeric:
        return numeric.group().replace("O", "0")

    # Month-name layouts: correct O->0 only in the day and year, never in the
    # month, otherwise "OCT"/"NOV" would be mangled and fail to match.
    month_name_patterns = (
        r'\b([\dO]{1,2})(-)([A-Za-z]{3})(-)([\dO]{4})\b',
        r'\b([\dO]{1,2})([/-])([A-Za-z]{3})([/-])([\dO]{4})\b',
    )
    for pattern in month_name_patterns:
        match = re.search(pattern, text)
        if match:
            day, sep1, month, sep2, year = match.groups()
            return f"{day.replace('O', '0')}{sep1}{month}{sep2}{year.replace('O', '0')}"

    return ""

def normalize_date(date_str):
    """Convert a recognised date string to DD-MM-YYYY.

    Two layouts are recognised at the start of the string:
      * day + three-letter month + four-digit year, optionally separated by
        "-", "/" or whitespace (case-insensitive), e.g. "6-Sep-2019"
      * DD-MM-YYYY or DD.MM.YYYY

    Anything that cannot be normalised is returned unchanged: non-string
    input, unrecognised layouts, and unknown month abbreviations.

    Args:
        date_str: The raw date text.

    Returns:
        The date as "DD-MM-YYYY", or ``date_str`` itself when it cannot be
        normalised.
    """
    # Explicit guard instead of a blanket try/except: non-strings pass through.
    if not isinstance(date_str, str):
        return date_str

    month_map = {
        "JAN": "01", "FEB": "02", "MAR": "03", "APR": "04",
        "MAY": "05", "JUN": "06", "JUL": "07", "AUG": "08",
        "SEP": "09", "OCT": "10", "NOV": "11", "DEC": "12"
    }

    match = re.match(r'(\d{1,2})[-/\s]?([A-Z]{3})[-/\s]?(\d{4})', date_str.upper())
    if match:
        day, month_abbr, year = match.groups()
        month = month_map.get(month_abbr)
        if month is None:
            # Unknown abbreviation: keep the raw text rather than emit an
            # invalid month such as "00".
            return date_str
        return f"{int(day):02d}-{month}-{year}"

    match = re.match(r'(\d{2})[.-](\d{2})[.-](\d{4})', date_str)
    if match:
        day, month, year = match.groups()
        return f"{day}-{month}-{year}"

    return date_str


def extract_bank_name(full_text, cropped_text, fallback_ifsc=None):
    """Work out the bank name for a cheque from its OCR text.

    Sources are tried in order and the first hit wins:
      1. a KNOWN_BANKS entry appearing anywhere in the upper-cased full text;
      2. walking the cropped text line by line: a KNOWN_BANKS entry on the
         line, else, if the line contains "BANK", the line passed through
         clean_bank_name;
      3. the first four characters of ``fallback_ifsc`` looked up in
         IFSC_PREFIX_MAP.

    Args:
        full_text: OCR text of the whole cheque.
        cropped_text: OCR text of the cropped region, newline-separated.
        fallback_ifsc: Optional IFSC code used as a last resort.

    Returns:
        The bank name, or "Not found" when no source yields one.
    """
    haystack = full_text.upper()

    # 1) Known bank anywhere in the full text
    full_hit = next((name for name in KNOWN_BANKS if name in haystack), None)
    if full_hit is not None:
        return full_hit

    # 2) Cropped region, one line at a time
    for raw_line in cropped_text.split('\n'):
        candidate = raw_line.strip().upper()
        line_hit = next((name for name in KNOWN_BANKS if name in candidate), None)
        if line_hit is not None:
            return line_hit
        if "BANK" in candidate:
            return clean_bank_name(candidate)

    # 3) IFSC prefix lookup
    if not fallback_ifsc:
        return "Not found"
    return IFSC_PREFIX_MAP.get(fallback_ifsc[:4], "Not found")
# ==== MAIN EXTRACTION ====

def extract_cheque_data(image_path, filename=""):
    """OCR one cheque image and pull out bank name, IFSC code, amount and date.

    The whole image is OCR'd for the IFSC code, amount and date. The top 40%
    of the image is additionally converted to grayscale, bilateral-filtered,
    binarised at a threshold of 150 and OCR'd separately; that text is used
    only for the bank-name lookup.

    Args:
        image_path: Path of the image file handed to ``cv2.imread``.
        filename: Accepted for the caller's convenience; not used here.

    Returns:
        A dict with the keys "Bank Name", "IFSC Code", "Amount" and "Date",
        or None when ``cv2.imread`` returns None for the path.
    """
    image = cv2.imread(image_path)
    if image is None:
        return None

    # Top 40% of the rows, all columns
    top_rows = int(0.4 * image.shape[0])
    header_region = image[0:top_rows, :]

    header_gray = cv2.cvtColor(header_region, cv2.COLOR_BGR2GRAY)
    header_filtered = cv2.bilateralFilter(header_gray, 11, 17, 17)
    _, header_binary = cv2.threshold(header_filtered, 150, 255, cv2.THRESH_BINARY)

    text_crop = pytesseract.image_to_string(header_binary)
    full_text = pytesseract.image_to_string(image).upper()

    ifsc_code = extract_ifsc(full_text)
    return {
        "Bank Name": extract_bank_name(full_text, text_crop, fallback_ifsc=ifsc_code),
        "IFSC Code": ifsc_code,
        "Amount": extract_amount(full_text),
        "Date": normalize_date(extract_date(full_text)),
    }

# ==== RUN ON FOLDER ====

results = []
print("🔍 Checking files in:", folder_path)

# os.listdir returns entries in arbitrary order; sort them so the rows of the
# output come out in the same order on every run and every machine.
for filename in sorted(os.listdir(folder_path)):
    # Only image files are processed; everything else in the folder is ignored.
    if not filename.lower().endswith((".jpg", ".jpeg", ".png")):
        continue

    print(f"📄 Processing: {filename}")
    data = extract_cheque_data(os.path.join(folder_path, filename), filename)
    if data:
        results.append(data)
    else:
        # extract_cheque_data returned nothing for this file; say so instead
        # of silently dropping the row.
        print(f"⚠️ Skipped (no data extracted): {filename}")

# Save to Excel
df = pd.DataFrame(results)
df.to_excel("cheque_data_output.xlsx", index=False)
print("✅ Data saved to cheque_data_output.xlsx")


🔍 Checking files in: ../CHEQUE_OCR/Images
📄 Processing: Axis Bank.jpg
📄 Processing: Canara Bank.jpg
📄 Processing: ICICI bank.jpg
📄 Processing: Syndicate bank.jpg
✅ Data saved to cheque_data_output.xlsx


In [8]:
# Display the extracted records (one row per image that yielded data)
df

Unnamed: 0,Bank Name,IFSC Code,Amount,Date
0,AXIS BANK,UTIB0000426,130354.7,06-09-2019
1,CANARA BANK,CNRB0002854,126888.0,03-09-2019
2,ICICI BANK,ICIC0006306,56476.0,05-09-2019
3,SYNDICATE BANK,SYNB0003011,5407.5,30-01-2019
