In [4]:
%%writefile app.py
import numpy as np
import pytesseract
from PIL import Image
import streamlit as st
import fitz  # PyMuPDF
import re
import io
from streamlit_pdf_viewer import pdf_viewer
import spacy
from nltk.corpus import words
import string
import cv2

@st.cache_resource
def _load_nlp_model():
    """Load the spaCy English pipeline once and reuse it across script reruns.

    Streamlit re-executes this script on every widget interaction; caching the
    loaded pipeline avoids re-reading the model from disk each time.
    """
    return spacy.load("en_core_web_sm")


nlp_model = _load_nlp_model()

# Point pytesseract at the Tesseract executable.
# NOTE(review): this is a machine-specific Windows path; adjust when deploying elsewhere.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'



with st.sidebar:
    st.title("Document Reader Application")
    # Widgets created inside this `with` block are rendered in the sidebar,
    # so plain `st.selectbox` is sufficient here.
    selected_country = st.selectbox("Country", ("India", "Mexico"))
    selected_product = st.selectbox("Product", ("Cards", "Loans"))
    # Backward-compatible alias: this name previously ended up holding the
    # Product selection because the Country value was overwritten.
    add_selectbox = selected_product
    

uploaded_file = st.file_uploader("Upload Image (PDF or Image)", type=["pdf", "jpg", "jpeg", "png"])

# OpenCV-decoded copy of the upload. Stays None when nothing has been uploaded
# or when the bytes cannot be decoded as an image (e.g. a PDF).
image = None
if uploaded_file is not None:
    # getvalue() returns the whole payload without moving the stream position,
    # so later readers of `uploaded_file` still see the file from the start.
    file_bytes = np.frombuffer(uploaded_file.getvalue(), dtype=np.uint8)
    if file_bytes.size:  # imdecode raises on an empty buffer
        image = cv2.imdecode(file_bytes, cv2.IMREAD_COLOR)

def remove_devanagari(text):
    """Return *text* with every run of Devanagari characters (U+0900-U+097F) deleted."""
    devanagari_run = re.compile(r'[\u0900-\u097F]+')
    return devanagari_run.sub('', text)

def clean_ocr_text(text):
    """Normalise raw OCR output into a single line of text.

    Devanagari characters are removed first and whitespace is collapsed
    afterwards, so deleting a Devanagari word that sat between two spaces does
    not leave a double space behind.

    Args:
        text: Raw string returned by the OCR engine.

    Returns:
        The text with Devanagari characters removed, every run of whitespace
        (including newlines) replaced by one space, and both ends stripped.
    """
    text = remove_devanagari(text)
    # `\s+` already covers newlines, so no separate newline replacement is needed.
    return re.sub(r'\s+', ' ', text).strip()

def extract_text_from_image(image, lang='eng+hin'):
    """Run Tesseract OCR on *image* and return the cleaned text.

    Args:
        image: Image object accepted by ``pytesseract.image_to_string``.
        lang: Tesseract language string. Tesseract identifies Hindi as ``hin``
            (the previous value ``hi`` is not a valid language code and made
            every call fail). NOTE(review): the ``hin`` traineddata file must be
            installed alongside ``eng`` for the default to work.

    Returns:
        The OCR text after ``clean_ocr_text``. If OCR fails, a string starting
        with ``"Error extracting text:"`` is returned instead of raising.
    """
    try:
        text = pytesseract.image_to_string(image, lang=lang)
        return clean_ocr_text(text)
    except Exception as e:
        # Best-effort contract kept for callers: report the failure as text.
        return f"Error extracting text: {e}"

def _crop_to_largest_contour(img):
    """Return *img* cropped to the bounding box of its largest external contour.

    The image is returned unchanged when no contour is found.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return img
    x, y, w, h = cv2.boundingRect(max(contours, key=cv2.contourArea))
    return img[y:y + h, x:x + w]


def _extract_aadhaar_fields(text):
    """Collect name, Aadhaar number and date/year of birth from Aadhaar OCR text."""
    fields = {}

    # The name is taken from the second NORP entity found by the NLP model.
    # NOTE(review): this is a heuristic; it is skipped when fewer than two
    # NORP entities are detected instead of raising KeyError/IndexError.
    doc = nlp_model(text)
    norp_entities = [ent.text for ent in doc.ents if ent.label_.upper() == 'NORP']
    if len(norp_entities) > 1:
        fields['Name'] = norp_entities[1]

    # Aadhaar number (format: 4 digits x3)
    aadhaar_match = re.search(r'\b\d{4}\s\d{4}\s\d{4}\b', text)
    if aadhaar_match:
        fields['Aadhaar Number'] = aadhaar_match.group()

    # Date or Year of Birth
    dob_match = re.search(r'\b(?:DOB|D.O.B|YOB|Year of Birth)[\s:]*([\d/]{4,10})', text, re.IGNORECASE)
    if dob_match:
        fields['DOB/YOB'] = dob_match.group(1)
    return fields


def _extract_pan_fields(text, card_image):
    """Collect name, PAN number(s) and date of birth from a PAN card.

    When *card_image* is available, it is cropped to its largest contour and
    OCR'd again, and that text replaces *text*; otherwise *text* is used as is.
    """
    if card_image is not None:
        text = pytesseract.image_to_string(_crop_to_largest_contour(card_image))

    fields = {}
    name_match = re.search(r'Name\s*[:\-]?\s*([A-Z]{2,}(?:\s[A-Z]{2,}){1,2})\b', text, re.IGNORECASE)
    if name_match:
        fields['Name'] = name_match.group(1).strip()

    # PAN number (format: 5 letters, 4 digits, 1 letter); every match is kept,
    # so the value is a list.
    pan_match = re.findall(r'\b([A-Z]{5}[0-9]{4}[A-Z])\b', text)
    if pan_match:
        fields['PAN Number'] = pan_match

    # Date or Year of Birth
    dob_match = re.search(r'\b(?:DOB|D.O.B|YOB|Date of Birth)[\s:]*([\d/]{4,10})', text, re.IGNORECASE)
    if dob_match:
        fields['DOB/YOB'] = dob_match.group(1)
    return fields


def extract_fields(text):
    """Detect the document type from OCR text and extract its key fields.

    Args:
        text: Cleaned OCR text of the uploaded document.

    Returns:
        A dict of extracted fields. Possible keys are ``'Name'``,
        ``'Aadhaar Number'``, ``'PAN Number'`` (a list of matches) and
        ``'DOB/YOB'``. An empty dict is returned when the text contains neither
        ``'Aadhaar'`` nor ``'Permanent Account Number'``.
    """
    if 'Aadhaar' in text:
        st.write('File Uploaded:Aadhaar')
        return _extract_aadhaar_fields(text)

    if 'Permanent Account Number' in text:
        st.write('File Uploaded:PAN')
        # The PAN branch re-OCRs the module-level decoded image; it may be
        # missing or None (e.g. for PDF uploads), in which case `text` is used.
        return _extract_pan_fields(text, globals().get('image'))

    # Unknown document type: report "nothing found" rather than failing.
    return {}

def _ocr_pdf_pages(pdf_bytes):
    """Render each page of a PDF to an image, OCR it, and join the page texts."""
    page_texts = []
    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_doc:
        for page in pdf_doc:
            # Rasterise at 300 dpi so small card text stays legible for OCR.
            pixmap = page.get_pixmap(dpi=300)
            page_image = Image.open(io.BytesIO(pixmap.tobytes("png")))
            page_texts.append(extract_text_from_image(page_image))
    return " ".join(page_texts)


def main():
    """Show the uploaded document, OCR it, and display the extracted fields.

    Does nothing until a file has been uploaded. PDFs are OCR'd page by page
    and shown in the PDF viewer; other uploads are opened as images. An OCR
    failure is shown as an error instead of being parsed as document text.
    """
    if uploaded_file is None:
        return

    if uploaded_file.type == "application/pdf":
        binary_data = uploaded_file.getvalue()
        raw_text = _ocr_pdf_pages(binary_data)
        pdf_viewer(input=binary_data, width=150)
    else:
        image = Image.open(uploaded_file)
        st.image(image, caption="Uploaded Document", use_container_width=False, width=200)
        raw_text = extract_text_from_image(image)

    # extract_text_from_image reports failures as text with this prefix.
    if raw_text.startswith("Error extracting text:"):
        st.error(raw_text)
        return

    fields = extract_fields(raw_text)

    st.subheader("Extracted Fields:")
    if fields:
        for key, value in fields.items():
            st.write(f"**{key}:** {value}")
    else:
        st.warning("Could not detect English fields reliably.")

    # Debug aid: uncomment to inspect the raw OCR output.
    # st.subheader("Raw OCR Text:")
    # st.text_area("Text", value=raw_text, height=150)


if __name__ == "__main__":
    main()
    

Overwriting app.py
