<a href="https://colab.research.google.com/github/aithentic-dev/newspaper-clip-extraction/blob/main/telugu_news_clip_processing_az_fileshare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Telugu News Clip Processing with Gemini + Azure File Share
This Colab notebook processes Telugu newspaper clips from an **Azure File Share**. It classifies images as news or non-news, extracts clean Telugu text using Google Gemini, and stores results back into the file share.

- Source: `raw/source`
- Success: `raw/success`
- Fail: `raw/fail`

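Images are read from each newspaper subfolder of `source/` (for example `source/Eenadu`). Files placed directly in `source/` and directories nested deeper than one level are skipped.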

In [None]:
# --- Step 1. Install SDKs ---
!pip install -q azure-storage-file-share google-generativeai

import os
import json
import google.generativeai as genai
import PIL.Image
from io import BytesIO
from azure.storage.fileshare import ShareServiceClient
from google.colab import userdata


# --- Configure Gemini API ---
# The API key is read through `google.colab.userdata` and mirrored into the
# process environment before the client library is configured with it.
#os.environ["GOOGLE_API_KEY"] = "api key here"
os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
# `model` is the shared Gemini handle used by classify_image_bytes() and
# extract_telugu_text_bytes() below.
model = genai.GenerativeModel("gemini-2.0-flash-001")
#model = genai.GenerativeModel("gemini-1.5-flash")

# --- Configure Azure File Share ---
# The connection string is read the same way as the Gemini key;
# FILE_SHARE_NAME is the share that holds the directories named below.
AZURE_CONNECTION_STRING = userdata.get("AZURE_CONNECTION_STRING")
FILE_SHARE_NAME = "lqr"

# `share_client` is the module-level entry point every processing function
# uses to obtain directory and file clients.
service_client = ShareServiceClient.from_connection_string(AZURE_CONNECTION_STRING)
share_client = service_client.get_share_client(FILE_SHARE_NAME)

# Directories inside the file share
# (paths are relative to the share root; they are passed as-is to
# share_client.get_directory_client by the functions below)
SOURCE_DIR = "source"
SUCCESS_DIR = "success"
FAIL_DIR = "fail"


In [None]:
def classify_image_bytes(image_bytes: bytes) -> str:
    """Classify an image as News Article / Advertisement / Meme / Cartoon / Other.

    The image is sent to the module-level Gemini ``model`` together with a
    prompt that asks for the category name only.

    Callers compare the result with ``== "News Article"``, so the reply is
    normalised before it is returned: list numbering ("1."), Markdown
    emphasis, quotes, trailing punctuation and letter-case differences are
    ignored when matching against the four known categories.

    Args:
        image_bytes: Raw bytes of a JPG/PNG image.

    Returns:
        One of "News Article", "Advertisement", "Meme / Cartoon" or "Other"
        when the reply is recognised; otherwise the stripped reply text,
        unchanged.
    """
    img = PIL.Image.open(BytesIO(image_bytes))

    prompt = """
    Classify this image into one of these categories:
    1. News Article (contains Telugu news text with a headline and body)
    2. Advertisement (commercial ads, posters, offers)
    3. Meme / Cartoon (jokes, funny posts, non-news)
    4. Other

    Return only the category name.
    """
    response = model.generate_content([prompt, img])
    raw_label = response.text.strip()

    # Lookup keys are the category names with whitespace removed and
    # case-folded, so "news article" and "Meme/Cartoon" still match.
    canonical = {
        "newsarticle": "News Article",
        "advertisement": "Advertisement",
        "meme/cartoon": "Meme / Cartoon",
        "other": "Other",
    }

    # Peel off decoration the model sometimes adds around the bare name:
    # "**News Article**", "1. News Article", "News Article." and so on.
    decoration = " \t\r\n*_`\"'.:"
    label = raw_label.strip(decoration)
    label = label.lstrip("0123456789").lstrip(" .):-").rstrip(decoration)
    key = "".join(label.split()).casefold()

    # Unknown replies are passed through so callers still see what came back.
    return canonical.get(key, raw_label)


In [None]:
def extract_telugu_text_bytes(image_bytes: bytes) -> dict:
    """Extract Telugu text from a news article image and return it as a dict.

    The image is sent to the module-level Gemini ``model`` with a prompt
    asking for a JSON object holding "headline" and "content".

    Args:
        image_bytes: Raw bytes of a JPG/PNG image.

    Returns:
        The parsed JSON object when the reply is a valid JSON object
        (optionally wrapped in a Markdown code fence). Otherwise a fallback
        ``{"headline": "", "content": <stripped reply text>}``, so callers
        always receive a dict they can add keys to.
    """
    img = PIL.Image.open(BytesIO(image_bytes))

    prompt = """
    You are given a Telugu newspaper clipping image.
    - Extract the Telugu text clearly.
    - Correct OCR mistakes and remove noise.
    - Keep context and meaning intact.
    - Maintain Telugu script (do not translate).
    Return JSON format:
    {
      "headline": "...",
      "content": "..."
    }
    """
    response = model.generate_content([prompt, img])
    raw_text = response.text.strip()

    # The reply may be wrapped in a Markdown fence (``` or ```json). Drop the
    # opening fence line and the closing fence before parsing.
    payload = raw_text
    if payload.startswith("```"):
        _, _, payload = payload.partition("\n")
        payload = payload.rsplit("```", 1)[0].strip()

    try:
        parsed = json.loads(payload)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass: the reply is not JSON.
        parsed = None

    # Valid JSON that is not an object (list, string, number) is treated like
    # unparseable text, because callers assign keys on the result.
    if not isinstance(parsed, dict):
        return {"headline": "", "content": raw_text}
    return parsed


In [None]:
def process_files_in_root():
    """
    Process only JPG/PNG files directly in the root of the file share.
    News → image and extracted JSON are saved to the 'success' dir.
    Non-news → image is copied to the 'fail' dir.

    The original file is left in place in the share root. The 'success' and
    'fail' directories are created when missing, matching what
    process_all_sources() does for its own target directories.
    """
    image_suffixes = (".jpg", ".jpeg", ".png")

    def _open_dir(path):
        """Return a directory client for *path*, creating the directory if needed."""
        dir_client = share_client.get_directory_client(path)
        try:
            dir_client.create_directory()
        except Exception:
            # Best effort: usually the directory already exists. A genuine
            # failure surfaces on the upload that follows.
            pass
        return dir_client

    def _overwrite(dir_client, name, data):
        """Upload *data* as *name* in *dir_client*, replacing any earlier copy."""
        target = dir_client.get_file_client(name)
        try:
            target.delete_file()
        except Exception:
            # Best effort: there is nothing to delete on the first write.
            pass
        target.upload_file(data)

    for item in share_client.list_directories_and_files():
        if item['is_directory']:
            continue  # skip subdirectories (fail/, success/, etc.)
        name = item['name']
        if not name.lower().endswith(image_suffixes):
            continue

        print(f"Processing: {name}")
        image_bytes = share_client.get_file_client(name).download_file().readall()

        # Step 1: Classify
        category = classify_image_bytes(image_bytes)
        print(f"  Category: {category}")

        if category == "News Article":
            extracted = extract_telugu_text_bytes(image_bytes)
            success_dir_client = _open_dir(SUCCESS_DIR)

            # Save original image to success
            _overwrite(success_dir_client, name, image_bytes)

            # Save extracted JSON next to it, same base name.
            # Encoded explicitly so the Telugu text is stored as UTF-8.
            json_name = name.rsplit(".", 1)[0] + ".json"
            json_text = json.dumps(extracted, ensure_ascii=False, indent=2)
            _overwrite(success_dir_client, json_name, json_text.encode("utf-8"))

            print(f"  ✅ Saved {name} and {json_name} to {SUCCESS_DIR}")
        else:
            # Save image to fail
            _overwrite(_open_dir(FAIL_DIR), name, image_bytes)
            print(f"  ⏭️ Skipped, moved {name} to {FAIL_DIR}")


# **This version records the source folder name as metadata in each JSON file it writes**

In [None]:
def process_all_sources():
    """
    Walk every subfolder of SOURCE_DIR (e.g., Eenadu, Sakshi) and handle the
    JPG/PNG files found directly inside it.

    News → image plus extracted JSON (tagged with the source folder name)
    are written to 'success/{source}/'.
    Non-news → image is written to 'fail/{source}/'.

    NOTE: a later cell defines process_all_sources again; when the cells are
    run in order, that later definition is the one in effect.
    """

    def _open_dir(path):
        # Fetch the directory client and try to create the directory;
        # any error (typically "already exists") is ignored.
        client = share_client.get_directory_client(path)
        try:
            client.create_directory()
        except Exception:
            pass
        return client

    def _put(dir_client, name, payload):
        # Delete a previous file of the same name if possible, then upload.
        target = dir_client.get_file_client(name)
        try:
            target.delete_file()
        except Exception:
            pass
        target.upload_file(payload)

    source_root = share_client.get_directory_client(SOURCE_DIR)

    for entry in source_root.list_directories_and_files():
        if not entry['is_directory']:
            continue

        source_name = entry['name']  # e.g., Eenadu, Sakshi
        print(f"📂 Processing source folder: {source_name}")

        src_dir = source_root.get_subdirectory_client(source_name)

        for candidate in src_dir.list_directories_and_files():
            if candidate['is_directory']:
                continue
            image_name = candidate['name']
            if not image_name.lower().endswith((".jpg", ".jpeg", ".png")):
                continue

            print(f"  🖼️ Processing: {image_name}")
            image_bytes = src_dir.get_file_client(image_name).download_file().readall()

            # Step 1: Classify
            category = classify_image_bytes(image_bytes)
            print(f"    Category: {category}")

            is_news = category == "News Article"
            if is_news:
                extracted = extract_telugu_text_bytes(image_bytes)
                extracted["source"] = source_name  # add source metadata

            # Make sure '<bucket>/' and '<bucket>/<source>/' exist, in that order.
            bucket = SUCCESS_DIR if is_news else FAIL_DIR
            _open_dir(bucket)
            destination = _open_dir(f"{bucket}/{source_name}")

            # The image itself is stored in both outcomes.
            _put(destination, image_name, image_bytes)

            if not is_news:
                print(f"    ⏭️ Skipped, moved {image_name} to fail/{source_name}")
                continue

            # News only: store the extracted JSON under the same base name.
            json_name = image_name.rsplit(".", 1)[0] + ".json"
            _put(destination, json_name, json.dumps(extracted, ensure_ascii=False, indent=2))

            print(f"    ✅ Saved {image_name} and {json_name} to success/{source_name}")


# **Adds a processed-file log that is checked before each file, so files that were already processed are skipped.**

In [None]:
from datetime import datetime, timezone

def process_all_sources():
    """
    Process all JPG/PNG files inside each subfolder of SOURCE_DIR (e.g., Eenadu, Sakshi).
    News → save to 'success/{source}/' with JSON (including metadata).
    Non-news → save to 'fail/{source}/'.
    Maintains a processed.log file in success/ to avoid reprocessing same files.
    After processing or skipping, removes the original from source/.
    Each log entry includes date, source, filename, and status, written as
    "YYYY-MM-DD | source | filename | status" with the date in UTC.

    A file counts as already processed only when the log holds an entry with
    the same source AND filename, so equally named clips from different
    source folders are each processed.
    """
    processed_log_name = "processed.log"
    processed_records = []       # raw log lines, rewritten in full after each file
    processed_lookup = set()     # (source, filename) pairs already handled
    ensured_dirs = set()         # directory paths already created/checked this run

    def _ensure_dir(path):
        """Return a directory client for *path*, creating it once per run."""
        dir_client = share_client.get_directory_client(path)
        if path not in ensured_dirs:
            try:
                dir_client.create_directory()
            except Exception:
                # Best effort: usually the directory already exists. A genuine
                # failure surfaces on the upload that follows.
                pass
            ensured_dirs.add(path)
        return dir_client

    def _overwrite(target_client, data):
        """Upload *data* through *target_client*, replacing any earlier copy."""
        try:
            target_client.delete_file()
        except Exception:
            # Best effort: there is nothing to delete on the first write.
            pass
        target_client.upload_file(data)

    source_root = share_client.get_directory_client(SOURCE_DIR)
    success_root = share_client.get_directory_client(SUCCESS_DIR)
    log_file_client = success_root.get_file_client(processed_log_name)

    # --- Load processed log ---
    try:
        log_data = log_file_client.download_file().readall().decode("utf-8")
    except Exception:
        # No readable log (e.g. first run): start with an empty history.
        log_data = ""

    for line in log_data.splitlines():
        parts = line.split(" | ")
        if len(parts) >= 3:
            # parts: date | source | filename | status
            processed_lookup.add((parts[1].strip(), parts[2].strip()))
        processed_records.append(line)

    for src in source_root.list_directories_and_files():
        if not src['is_directory']:
            continue

        source_name = src['name']  # e.g., Eenadu
        print(f"📂 Processing source folder: {source_name}")

        src_dir = source_root.get_subdirectory_client(source_name)

        for f in src_dir.list_directories_and_files():
            if f['is_directory']:
                continue
            if not f['name'].lower().endswith((".jpg", ".jpeg", ".png")):
                continue

            filename = f['name']
            file_client = src_dir.get_file_client(filename)
            record_key = (source_name, filename)

            # Skip if already processed (same source and same filename)
            if record_key in processed_lookup:
                print(f"  ⏭️ Skipping {filename} (already processed)")
                try:
                    file_client.delete_file()
                    print(f"    🗑️ Removed duplicate {filename} from source/{source_name}")
                except Exception as e:
                    print(f"    ⚠️ Could not delete duplicate: {e}")
                continue

            print(f"  🖼️ Processing: {filename}")
            image_bytes = file_client.download_file().readall()

            # Step 1: Classify
            category = classify_image_bytes(image_bytes)
            print(f"    Category: {category}")

            is_news = category == "News Article"
            status = "success" if is_news else "fail"
            # Timezone-aware replacement for the deprecated datetime.utcnow().
            today = datetime.now(timezone.utc).strftime("%Y-%m-%d")

            if is_news:
                extracted = extract_telugu_text_bytes(image_bytes)
                extracted["source"] = source_name

                # Ensure success/ and success/{source}/ exist
                _ensure_dir(SUCCESS_DIR)
                success_dir = _ensure_dir(f"{SUCCESS_DIR}/{source_name}")

                # Save image
                _overwrite(success_dir.get_file_client(filename), image_bytes)

                # Save JSON (encoded explicitly so Telugu text is stored as UTF-8)
                json_name = filename.rsplit(".", 1)[0] + ".json"
                json_text = json.dumps(extracted, ensure_ascii=False, indent=2)
                _overwrite(success_dir.get_file_client(json_name), json_text.encode("utf-8"))

                print(f"    ✅ Saved {filename} and {json_name} to success/{source_name}")
            else:
                # Ensure fail/ and fail/{source}/ exist
                _ensure_dir(FAIL_DIR)
                fail_dir = _ensure_dir(f"{FAIL_DIR}/{source_name}")

                _overwrite(fail_dir.get_file_client(filename), image_bytes)

                print(f"    ⏭️ Skipped, moved {filename} to fail/{source_name}")

            # --- Update log with date, source, status ---
            processed_records.append(f"{today} | {source_name} | {filename} | {status}")
            processed_lookup.add(record_key)

            # The log lives in success/, which the non-news branch never
            # creates; make sure it exists before uploading the log.
            _ensure_dir(SUCCESS_DIR)
            # The whole log is rewritten after every file so an interrupted
            # run still records everything handled so far.
            _overwrite(log_file_client, "\n".join(processed_records).encode("utf-8"))

            # Remove original from source after processing
            try:
                file_client.delete_file()
                print(f"    🗑️ Removed original {filename} from source/{source_name}")
            except Exception as e:
                print(f"    ⚠️ Could not delete original: {e}")


In [None]:
# --- Run the processing pipeline ---
# process_files_in_root() handles images placed directly in the share root;
# it is left disabled here in favour of the per-source-folder pipeline.
#process_files_in_root()
# Runs whichever process_all_sources definition was executed most recently.
process_all_sources()


📂 Processing source folder: Eenadu
  🖼️ Processing: babu Cabineat Meeting Comments ee21052025.jpeg
    Category: News Article


  today = datetime.utcnow().strftime("%Y-%m-%d")


    ✅ Saved babu Cabineat Meeting Comments ee21052025.jpeg and babu Cabineat Meeting Comments ee21052025.json to success/Eenadu
    🗑️ Removed original babu Cabineat Meeting Comments ee21052025.jpeg from source/Eenadu
📂 Processing source folder: sakshi
