In [7]:
# Install the OCR, fuzzy-matching and summarization dependencies used below.
# %pip (instead of !pip) installs into the environment of the running kernel,
# and -q keeps the install log from flooding the cell output.
%pip install -q easyocr
%pip install -q fuzzywuzzy
%pip install -q python-Levenshtein
%pip install -q torch torchvision
%pip install -q transformers
%pip install -q sentencepiece



In [8]:
import easyocr
import pandas as pd
import re
from fuzzywuzzy import fuzz
from PIL import Image
import os
import io
import ipywidgets as widgets
from IPython.display import display, clear_output
from google.colab import files
from datetime import datetime
from IPython.display import Image as IPImage, display as ipy_display
from transformers import pipeline

# 🔍 Load summarizer model
# Built once at import time so every OCR run reuses the same pipeline object.
summarizer = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
)

# ✅ Normalize image filenames for accurate comparison
def normalize_filename(name):
    """Return ``name`` lower-cased with every " (N)" counter removed.

    A counter is one whitespace character followed by a parenthesised run of
    digits, e.g. the " (1)" in "12_09_24 (1).png". All occurrences are
    stripped, wherever they appear in the name.
    """
    lowered = name.lower()
    return re.sub(r'\s\(\d+\)', '', lowered)

# 📂 Global variables
# Result files written by run_ocr_clicked and removed by reset_csv_clicked.
csv_file = 'ocr_extracted_results.csv'
xlsx_file = 'ocr_extracted_results.xlsx'

# Mapping returned by the most recent upload; empty until upload_clicked runs.
uploaded_files = dict()

# English-only EasyOCR reader, created once and shared by every OCR run.
reader = easyocr.Reader(['en'])

# One outer capture group with three alternatives, each using "/", "-",
# whitespace or "." as separator:
#   1. <1-2 digits> <3-letter English month abbreviation> <2-4 digits>
#   2. <1-2 digits> <1-2 digits> <2-4 digits>
#   3. <4 digits> <1-2 digits> <1-2 digits>
date_pattern = (
    r'(\b\d{1,2}[\/\-\s.](?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[\/\-\s.]\d{2,4}\b|'
    r'\b\d{1,2}[\/\-\s.]\d{1,2}[\/\-\s.]\d{2,4}\b|\b\d{4}[\/\-\s.]\d{1,2}[\/\-\s.]\d{1,2}\b)'
)

# 📌 Status and Output
# status_label carries one-line feedback; output_area holds previews and the result table.
status_label = widgets.Label(value="")
output_area = widgets.Output()

# ✅ Upload Handler
def upload_clicked(b):
    """Button callback: open the Colab upload dialog and remember the result.

    Replaces the module-level ``uploaded_files`` mapping with whatever
    ``files.upload()`` returns and reports the number of entries in the
    status label.

    Args:
        b: The clicked button (unused).
    """
    global uploaded_files
    uploaded_files = files.upload()
    message = f"✅ Uploaded {len(uploaded_files)} files."
    status_label.value = message

# Column order shared by the result DataFrame, the CSV and the Excel export.
_RESULT_COLUMNS = ['Image', 'Notifier Date', 'Event Date', 'Subject', 'Description', 'Summary', 'Processed On']


def _extract_fields(extracted_text):
    """Pull the two dates, the subject and the description out of OCR text.

    Args:
        extracted_text: The OCR output for one image as a single string.

    Returns:
        Tuple ``(notifier_date, event_date, subject, description)``. The
        dates are the first and second matches of ``date_pattern`` ("" when
        missing); subject and description are "N/A" when nothing matches.
    """
    # date_pattern has exactly one capture group, so findall yields plain strings.
    dates = re.findall(date_pattern, extracted_text)
    # Pad with blanks so the unpacking works with zero or one date found.
    notifier_date, event_date = (dates + ["", ""])[:2]

    subject_match = re.search(r'Subject[:\- ]*(.*?)(?:[.!?\n]|$)', extracted_text)
    subject = subject_match.group(1).strip() if subject_match else "N/A"
    # Keep at most the first ten words of the subject.
    subject = ' '.join(subject.split()[:10])

    # Description is the text up to the first ". " (or newline).
    description_match = re.search(r'(.*?)(?:\.\s|\n)', extracted_text)
    description = description_match.group(1).strip() if description_match else "N/A"

    return notifier_date, event_date, subject, description


def _summarize(extracted_text, fallback, min_words=30, max_length=30, min_length=10):
    """Summarize OCR text with the shared ``summarizer`` pipeline.

    Args:
        extracted_text: The OCR output for one image.
        fallback: Value returned unchanged when the text has ``min_words``
            words or fewer.
        min_words: Word count the text must exceed before the model is used.
        max_length: ``max_length`` forwarded to the summarizer.
        min_length: ``min_length`` forwarded to the summarizer.

    Returns:
        The model's summary, ``fallback`` for short text, or "N/A" when the
        summarizer raises.
    """
    if len(extracted_text.split()) <= min_words:
        return fallback
    try:
        # truncation=True clips over-long OCR text to the model's input limit
        # instead of letting the call fail and degrade to "N/A".
        outputs = summarizer(
            extracted_text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True,
        )
        return outputs[0]['summary_text']
    except Exception:
        # Best effort: one failed summary must not abort the whole batch.
        # Unlike a bare ``except:``, KeyboardInterrupt/SystemExit still propagate.
        return "N/A"


# ✅ Run OCR Handler
def run_ocr_clicked(b):
    """Button callback: OCR every uploaded image and update the result files.

    For each key in ``uploaded_files`` the image is previewed in
    ``output_area``, read from disk with EasyOCR, and reduced to one row
    (normalized name, two dates, subject, description, summary, timestamp).
    Rows already present in ``csv_file`` for the same normalized image name
    are replaced by the new ones. The combined table is written to
    ``csv_file`` and ``xlsx_file`` and displayed in place of the previews.

    Args:
        b: The clicked button (unused).
    """
    if not uploaded_files:
        status_label.value = "⚠️ No files uploaded!"
        return

    extracted_data = []

    # Only the keys are needed: each image is read from disk by that name.
    for original_name in uploaded_files:
        image_name = normalize_filename(original_name)

        # Preview image
        with output_area:
            print(f"🖼️ Preview: {original_name}")
            ipy_display(IPImage(filename=original_name, width=300))

        # detail=0 makes readtext return just the recognised strings.
        result = reader.readtext(original_name, detail=0)
        extracted_text = ' '.join(result)

        notifier_date, event_date, subject, description = _extract_fields(extracted_text)
        summary = _summarize(extracted_text, fallback=description)
        processed_on = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        extracted_data.append([image_name, notifier_date, event_date, subject, description, summary, processed_on])

    df = pd.DataFrame(extracted_data, columns=_RESULT_COLUMNS)

    # Both frames go through the same normalization before being compared.
    df['Image'] = df['Image'].apply(normalize_filename)
    if os.path.exists(csv_file):
        # keep_default_na=False: otherwise the "N/A" and "" placeholders saved
        # by an earlier run are parsed back as NaN and re-saved as blanks.
        existing_df = pd.read_csv(csv_file, keep_default_na=False)
        existing_df['Image'] = existing_df['Image'].apply(normalize_filename)
        # Drop stale rows for images that were just re-processed.
        existing_df = existing_df[~existing_df['Image'].isin(df['Image'])]
        combined_df = pd.concat([existing_df, df], ignore_index=True)
    else:
        combined_df = df

    # Save updated CSV & Excel
    combined_df.to_csv(csv_file, index=False)
    combined_df.to_excel(xlsx_file, index=False)

    # Replace the per-image previews with the final table.
    with output_area:
        clear_output()
        display(combined_df)

    status_label.value = f"✅ OCR complete. CSV updated with {len(df)} new/updated entries."

# ✅ Download CSV
def download_csv_clicked(b):
    """Button callback: send the results CSV to the browser if it exists.

    When ``csv_file`` is not on disk, a warning is shown in the status label
    instead.

    Args:
        b: The clicked button (unused).
    """
    if not os.path.exists(csv_file):
        status_label.value = "⚠️ No CSV found to download."
        return
    files.download(csv_file)

# ✅ Download Excel
def download_excel_clicked(b):
    """Button callback: send the results workbook to the browser if it exists.

    When ``xlsx_file`` is not on disk, a warning is shown in the status label
    instead.

    Args:
        b: The clicked button (unused).
    """
    if not os.path.exists(xlsx_file):
        status_label.value = "⚠️ No Excel file found to download."
        return
    files.download(xlsx_file)

# 🧹 Reset CSV
def reset_csv_clicked(b):
    """Button callback: delete both result files and clear the output area.

    Missing files are skipped. The status label is updated in every case.

    Args:
        b: The clicked button (unused).
    """
    for result_path in (csv_file, xlsx_file):
        if os.path.exists(result_path):
            os.remove(result_path)
    output_area.clear_output()
    status_label.value = "🧹 CSV and Excel files have been cleared."

# 🖱️ Buttons
def _make_button(description, button_style, handler):
    """Create a Button with the given label and style and attach its click handler."""
    button = widgets.Button(description=description, button_style=button_style)
    button.on_click(handler)
    return button


upload_button = _make_button('📂 Upload Images', 'primary', upload_clicked)
run_button = _make_button('🧠 Run OCR', 'success', run_ocr_clicked)
download_button = _make_button('📥 Download CSV', 'info', download_csv_clicked)
download_xlsx_button = _make_button('📊 Download Excel', 'info', download_excel_clicked)
reset_button = _make_button('🧹 Reset CSV', 'danger', reset_csv_clicked)

# 🧩 UI Layout
# Stacked top to bottom: title, the five action buttons, the output area, the status line.
ui = widgets.VBox(children=[
    widgets.HTML(value="<h3>🧾 OCR Document Extractor</h3>"),
    upload_button,
    run_button,
    download_button,
    download_xlsx_button,
    reset_button,
    output_area,
    status_label,
])

display(ui)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


VBox(children=(HTML(value='<h3>🧾 OCR Document Extractor</h3>'), Button(button_style='primary', description='📂 …

Saving 12_09_24.png to 12_09_24 (1).png
Saving 16_09_24.png to 16_09_24 (2).png
Saving 25_09_24.jpg to 25_09_24 (1).jpg
