<a href="https://colab.research.google.com/github/aithentic-dev/newspaper-clip-extraction/blob/main/json_enrichment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# --- Step 1: Install dependencies ---
!pip install azure-storage-file-share openai

# --- Step 2: Imports ---
import os, json
from azure.storage.fileshare import ShareDirectoryClient, ShareFileClient
from datetime import datetime
from openai import OpenAI

# --- Step 3: Configurations (using userdata) ---
# Secrets are read through `userdata.get(...)` rather than hard-coded.
# NOTE(review): `userdata` is not imported anywhere in the visible cell --
# presumably `from google.colab import userdata` was executed earlier in the
# session; confirm, otherwise this line raises NameError on a fresh runtime.
AZURE_CONN_STR = userdata.get("AZURE_CONNECTION_STRING")
# Azure file share name passed as `share_name` to every Share*Client below.
SHARE_NAME = "lqr"
# Directory inside the share whose sub-folders (one per outlet) are walked.
BASE_DIR = "success"

# Shared OpenAI client used by classify_stance().
client = OpenAI(api_key=userdata.get("OPENAI_API_KEY"))

# --- Step 4: OpenAI stance classifier ---
def classify_stance(headline, content):
    """Classify an article's stance toward YSRCP using an OpenAI chat model.

    The headline and article text are truncated, embedded in a prompt and sent
    to ``gpt-4o-mini`` through the module-level ``client``. The reply is
    expected to be a JSON object, which is parsed and validated.

    Args:
        headline: Article headline. Only the first 500 characters are sent;
            ``None`` is treated as an empty string.
        content: Article body. Only the first 2000 characters are sent;
            ``None`` is treated as an empty string.

    Returns:
        The parsed reply: a dict containing ``"stance_label"`` (one of
        ``"pro"``, ``"neutral"``, ``"anti"``) and ``"stance_score"`` (a dict
        with a numeric value for each of those three labels).

        If the request fails, or the reply cannot be parsed or does not have
        that shape, the error is printed and a fixed neutral fallback is
        returned instead, so callers can always index both keys.
    """
    labels = ("pro", "neutral", "anti")

    # Coerce up front: slicing None would raise before the try block below
    # gets a chance to turn the failure into the neutral fallback.
    headline = str(headline or "")
    content = str(content or "")

    prompt = f"""
    You are analyzing a Telugu news article about YSRCP.
    Decide the stance of the article toward YSRCP.

    Rules:
    - "pro" → favorable to YSRCP or its leaders
    - "neutral" → factual reporting without visible bias
    - "anti" → critical or negative toward YSRCP or its leaders

    Return ONLY a valid JSON object with:
    - "stance_label": one of ["pro", "neutral", "anti"] (the highest probability class)
    - "stance_score": object with keys "pro", "neutral", "anti" (each value is a float 0–1, all three must sum to 1)

    Example output:
    {{
      "stance_label": "anti",
      "stance_score": {{
        "pro": 0.1,
        "neutral": 0.2,
        "anti": 0.7
      }}
    }}

    Headline: {headline[:500]}
    Article: {content[:2000]}
    """

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a political news stance classifier. Return only valid JSON."},
                {"role": "user", "content": prompt}
            ],
            temperature=0,
            max_tokens=200
        )

        result_text = (response.choices[0].message.content or "").strip()

        # The reply may be wrapped in a Markdown fence (with or without a
        # language tag) or carry stray prose; keep only the outermost {...}.
        obj_start = result_text.find("{")
        obj_end = result_text.rfind("}")
        if obj_start != -1 and obj_end > obj_start:
            result_text = result_text[obj_start:obj_end + 1]

        stance_data = json.loads(result_text)

        # Validate structure
        if not isinstance(stance_data, dict):
            raise ValueError("Response is not a JSON object")
        if "stance_label" not in stance_data or "stance_score" not in stance_data:
            raise ValueError("Missing required fields in response")
        if stance_data["stance_label"] not in labels:
            raise ValueError(f"Unexpected stance_label: {stance_data['stance_label']!r}")

        # Callers take max() over the scores and format it as a float, so
        # reject anything that is not a number for each of the three labels.
        scores = stance_data["stance_score"]
        if not isinstance(scores, dict) or not all(
            isinstance(scores.get(label), (int, float)) for label in labels
        ):
            raise ValueError("stance_score must map pro/neutral/anti to numbers")

        return stance_data

    except Exception as e:
        # Deliberate best-effort: one bad reply must not abort the batch.
        print(f"⚠️ Error in stance classification: {e}")
        return {
            "stance_label": "neutral",
            "stance_score": {"pro": 0.33, "neutral": 0.34, "anti": 0.33}
        }

# --- Step 5: Walk through all outlet folders ---
# Flow: for every outlet folder under BASE_DIR, read the folder's log of
# already-processed file names, enrich each remaining *.json article with a
# stance classification, write it back as processed_<name>, and append the
# name to the log so a re-run skips it.

# Outlets with a known editorial leaning; anything else is tagged "neutral".
_SOURCE_BIAS = {"sakshi": "pro-YSRCP", "eenadu": "anti-YSRCP"}


def _load_processed_log(log_file_client, log_file_name):
    """Return the list of file names recorded in a folder's log file.

    Any failure to download or parse the log yields an empty list (a new log
    is started), and so does a log whose top-level JSON value is not a list.
    """
    try:
        log_data = log_file_client.download_file().readall().decode("utf-8")
        processed_log = json.loads(log_data)
    except Exception as log_error:
        # Best-effort: a missing log is the normal first-run case. The reason
        # is printed so that an unexpected failure is not silently mistaken
        # for "nothing processed yet".
        print(f"ℹ️ No usable log {log_file_name} ({type(log_error).__name__}); starting a new one")
        return []
    return processed_log if isinstance(processed_log, list) else []


def _extract_article(data):
    """Return ``(headline, content)`` from a downloaded article record.

    When the record's ``content`` holds a fenced ```` ```json ```` block
    (embedded JSON from Gemini), the headline and content are taken from that
    embedded object; otherwise, or if the embedded block cannot be parsed,
    they come from the record itself. Missing or null values become ``""``.
    """
    raw_content = data.get("content") or ""
    headline = data.get("headline") or ""
    content = raw_content

    fence = "```json"
    if fence in raw_content:
        try:
            json_start = raw_content.find(fence) + len(fence)
            json_end = raw_content.find("```", json_start)
            if json_end == -1:
                # No closing fence: take everything to the end rather than
                # letting -1 silently drop the final character.
                json_end = len(raw_content)
            embedded_data = json.loads(raw_content[json_start:json_end].strip())
            headline = embedded_data.get("headline") or ""
            content = embedded_data.get("content") or ""
        except Exception as e:
            print(f"⚠️ Error parsing embedded JSON: {e}")
            headline = data.get("headline") or ""
            content = raw_content

    return headline, content


try:
    parent_dir_client = ShareDirectoryClient.from_connection_string(
        conn_str=AZURE_CONN_STR,
        share_name=SHARE_NAME,
        directory_path=BASE_DIR
    )

    subdirs = list(parent_dir_client.list_directories_and_files())
    print(f"📁 Found {len(subdirs)} subdirectories")

    for subdir in subdirs:
        if not subdir.get("is_directory", True):  # Only process directories
            continue

        folder_name = subdir["name"]   # Sakshi / Eenadu / future outlets
        folder_path = f"{BASE_DIR}/{folder_name}"
        print(f"\n🔎 Processing folder: {folder_name}")

        dir_client = ShareDirectoryClient.from_connection_string(
            conn_str=AZURE_CONN_STR,
            share_name=SHARE_NAME,
            directory_path=folder_path
        )

        files = list(dir_client.list_directories_and_files())
        # Skip sub-directories, already-enriched files and log files
        json_files = [
            f for f in files
            if not f.get("is_directory", False)
            and f["name"].lower().endswith(".json")
            and not f["name"].startswith("processed_")
            and not f["name"].lower().endswith("_log.json")
        ]

        # --- Load or create log file for this folder ---
        log_file_name = f"{folder_name.lower()}_log.json"
        log_file_client = ShareFileClient.from_connection_string(
            conn_str=AZURE_CONN_STR,
            share_name=SHARE_NAME,
            file_path=f"{folder_path}/{log_file_name}"
        )
        processed_log = _load_processed_log(log_file_client, log_file_name)

        print(f"📄 Found {len(json_files)} JSON files to check. {len(processed_log)} already processed.")

        for f in json_files:
            if f["name"] in processed_log:
                print(f"⏭️ Skipping {f['name']} (already in log)")
                continue

            print(f"\n📄 Processing file: {f['name']}")

            try:
                # --- Download the JSON file ---
                file_client = ShareFileClient.from_connection_string(
                    conn_str=AZURE_CONN_STR,
                    share_name=SHARE_NAME,
                    file_path=f"{folder_path}/{f['name']}"
                )
                downloaded = file_client.download_file().readall()
                data = json.loads(downloaded.decode("utf-8"))

                # --- Extract content (handle embedded JSON from Gemini) ---
                headline, content = _extract_article(data)

                source = folder_name   # e.g. Sakshi / Eenadu

                print(f"📰 Headline: {headline[:100]}...")

                # --- Run stance classification ---
                stance_result = classify_stance(headline, content)
                # Computed before anything is written so that a malformed
                # score fails this file cleanly instead of after the upload.
                confidence = max(stance_result["stance_score"].values())

                # --- Build enriched JSON ---
                # One timestamp for both fields so they cannot disagree
                # across midnight. "date" is the enrichment run date, not a
                # value read from the article.
                # NOTE(review): the id has one-second resolution; two files
                # of the same source enriched within the same second would
                # share an id -- confirm whether ids must be unique.
                now = datetime.now()
                enriched = {
                    "id": f"{source}-{now.strftime('%Y%m%d%H%M%S')}",
                    "source": source,
                    "source_bias": _SOURCE_BIAS.get(source.lower(), "neutral"),
                    "type": "news_article",
                    "date": now.strftime("%Y-%m-%d"),
                    "headline": headline,
                    "content": content,
                    "stance_label": stance_result["stance_label"],
                    "stance_score": stance_result["stance_score"]
                }

                # --- Save enriched file with overwrite ---
                new_name = f"processed_{f['name']}"
                new_file_client = ShareFileClient.from_connection_string(
                    conn_str=AZURE_CONN_STR,
                    share_name=SHARE_NAME,
                    file_path=f"{folder_path}/{new_name}"
                )
                new_content = json.dumps(enriched, ensure_ascii=False, indent=2).encode("utf-8")

                # upload_file creates the file or replaces an existing one.
                # The earlier create -> delete -> create sequence could delete
                # the existing file after a transient create failure.
                new_file_client.upload_file(new_content)

                print(f"✅ Saved enriched file: {new_name}")
                print(f"🏷️ Stance: {stance_result['stance_label']} (confidence: {confidence:.2f})")

                # --- Update log file ---
                # Rewritten after every file so an interrupted run can resume.
                processed_log.append(f["name"])
                updated_log = json.dumps(processed_log, indent=2, ensure_ascii=False).encode("utf-8")
                log_file_client.upload_file(updated_log)
                print(f"📝 Log updated: {log_file_name}")

            except Exception as file_error:
                # One bad file must not stop the rest of the folder.
                print(f"❌ Error processing file {f['name']}: {file_error}")
                continue

except Exception as main_error:
    print(f"❌ Main execution error: {main_error}")

print("\n🎉 Processing complete!")


📁 Found 3 subdirectories

🔎 Processing folder: Eenadu
📄 Found 9 JSON files to check. 9 already processed.
⏭️ Skipping ACB Court Madhyeam Scam Niditulaku Noticelu ee09092025.json (already in log)
⏭️ Skipping Annam Venkatramana reddy Madhyeam Scam Raj kesireddy Saireddy ee 07082025.json (already in log)
⏭️ Skipping babu Cabineat Meeting Comments ee21052025.json (already in log)
⏭️ Skipping babu Madhyeam Scam Charge Sheet Vesaka YSRCP Leaders ni Endagtadi ee 19072025.json (already in log)
⏭️ Skipping Bail Upasamrishukuna Vasudevareddy ee 03072024.json (already in log)
⏭️ Skipping Bevarejash Corporation EX MD Vasudevareddy ki Bail ee30112024.json (already in log)
⏭️ Skipping Bevarejesh Corporation EX Md Vasudevareddy CID Case ee 08062024.json (already in log)
⏭️ Skipping Bevarejus Corporation Vasudevareddy pi Vetu ee 17042024.json (already in log)
⏭️ Skipping BJP MP CM Ramesh YCP hayeam 30vela Cores Madhyeam Scam Liquar ee 12022025.json (already in log)

🔎 Processing folder: sakshi
📄 Found