In [None]:
import json
import time
import re
from tqdm import tqdm
import google.generativeai as genai

# Cell purpose: ask Gemini to assign a narrative role (Protagonist /
# Antagonist / Neutral) to every entity of the first N English articles and
# save the results to labeled_output_gemini.json.
#
# NOTE(review): "xxxxxxxx" looks like a redacted placeholder — supply the real
# key at run time instead of committing it with the notebook.
genai.configure(api_key="xxxxxxxx")

model = genai.GenerativeModel("gemini-2.0-flash")

with open("processed_news_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Only the first N articles are sent to the model.
N = 20
data = data[:N]

results = []

for item in tqdm(data):
    prompt = f"""
You are a narrative understanding assistant.

Given a news article and a list of entities, classify each entity into one of the following roles:

- Protagonist: the main positive force or initiator of constructive action
- Antagonist: the opposing force or entity causing conflict
- Neutral: an important entity that is not actively pushing or resisting the conflict

Respond ONLY in valid JSON format with no explanation.

Expected output format:
[
  {{ "entity": "Entity Name", "label": "Protagonist" }},
  {{ "entity": "Another Entity", "label": "Antagonist" }}
]

Text: {item['text']}
Entities: {item['entities']}
"""

    try:
        response = model.generate_content([{"text": prompt}])
        content = response.text.strip()

        # The reply may be wrapped in a Markdown fence, with or without the
        # "json" language tag; strip it before parsing.
        if content.startswith("```"):
            content = re.sub(r"^```(?:json)?\s*", "", content)
            content = re.sub(r"\s*```$", "", content)

        try:
            label_data = json.loads(content)
            # The prompt asks for a JSON array; any other JSON value would
            # break consumers that iterate over the labels.
            if not isinstance(label_data, list):
                raise ValueError("expected a JSON array")
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            print("Invalid JSON returned:\n", content[:150])
            label_data = []

        results.append({
            "text": item["text"],
            "entities": item["entities"],
            "labeled_entities": label_data
        })

    except Exception as e:
        # Best-effort batch: report the failure and move on. The failed
        # article gets no record in the output.
        print("⚠️ Error:", e)

    # Pause after every request, including failed ones, so that an error
    # (e.g. a rate-limit response) is not followed by an immediate retry.
    time.sleep(1.0)

with open("labeled_output_gemini.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4, ensure_ascii=False)

print("Done! Labeled results saved to labeled_output_gemini.json")


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 20/20 [01:58<00:00,  5.91s/it]

Done! Labeled results saved to labeled_output_gemini.json





In [6]:
# Cell purpose: for each labeled English article, ask Gemini for
# subject/action/object events, attach the previously assigned role of the
# subject and object, and save everything to narrative_output_gemini.json.
with open("labeled_output_gemini.json", "r", encoding="utf-8") as f:
    data = json.load(f)

narratives = []

for item in tqdm(data[:20]):
    text = item["text"]
    # Entity -> role lookup. Entries that are not {"entity": <str>, "label": ...}
    # are skipped so one malformed label cannot abort the whole run.
    labeled_entities = {
        entry["entity"]: entry["label"]
        for entry in item["labeled_entities"]
        if isinstance(entry, dict)
        and isinstance(entry.get("entity"), str)
        and "label" in entry
    }

    prompt = f"""
You are a narrative extraction assistant.

Given the following news article, extract all **complete** events. If no meaningful events are found, return an empty JSON array.

Each event must include:
- subject (who did it)
- action (verb)
- object (to whom or what)
- context (optional: time, place, reason)

Return ONLY a list of JSON objects. No explanation.

Text: {text}
"""

    try:
        response = model.generate_content([{"text": prompt}])
        content = response.text.strip()

        # Strip a surrounding Markdown fence, with or without the "json" tag.
        if content.startswith("```"):
            content = re.sub(r"^```(?:json)?\s*", "", content)
            content = re.sub(r"\s*```$", "", content)

        try:
            events = json.loads(content)
            # Anything other than a JSON array cannot be iterated as events.
            if not isinstance(events, list):
                raise ValueError("expected a JSON array")
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            print("⚠️ Invalid JSON for events:\n", content[:150])
            events = []

        for event in events:
            if not isinstance(event, dict):
                continue  # cannot annotate a non-object element
            subj = event.get("subject", "")
            obj = event.get("object", "")
            # Only strings can be dictionary keys here; the model may return
            # null or a list for either field.
            event["subject_role"] = (
                labeled_entities.get(subj, "Unknown") if isinstance(subj, str) else "Unknown"
            )
            event["object_role"] = (
                labeled_entities.get(obj, "Unknown") if isinstance(obj, str) else "Unknown"
            )

        narratives.append({
            "text": text,
            "entities": item["entities"],
            "labeled_entities": item["labeled_entities"],
            "events": events
        })

    except Exception as err:
        # Best-effort batch: report and continue; the article gets no record.
        print("Error:", err)

    # Pause after every request, including failed ones.
    time.sleep(1.0)

with open("narrative_output_gemini.json", "w", encoding="utf-8") as f:
    json.dump(narratives, f, indent=4, ensure_ascii=False)

print("Done! Narrative output saved to narrative_output_gemini.json")

100%|██████████| 20/20 [01:31<00:00,  4.55s/it]

Done! Narrative output saved to narrative_output_gemini.json





In [None]:
# Cell purpose: run role labelling and event extraction on the first 20 Urdu
# articles and save the combined result to urdu_narrative_output.json.
with open("urdu.json", "r", encoding="utf-8") as f:
    data = json.load(f)

results = []


def find_best_match(name, labeled_entities):
    """Return the role label for *name*, or "Unknown" when nothing matches.

    Tries an exact key match first, then a substring match in either
    direction. Blank or non-string names and blank entity keys never match:
    the empty string is a substring of every string, so letting them through
    would assign the first entity's role to every event without an object.
    """
    if not isinstance(name, str):
        return "Unknown"
    needle = name.strip()
    if not needle:
        return "Unknown"
    # Exact match
    if name in labeled_entities:
        return labeled_entities[name]
    if needle in labeled_entities:
        return labeled_entities[needle]
    # Fuzzy match: partial match
    for entity, label in labeled_entities.items():
        entity_clean = entity.strip()
        if not entity_clean:
            continue
        if needle in entity or entity_clean in name:
            return label
    return "Unknown"


for item in tqdm(data[:20]):
    text = item["text"]

    entities = item["entities"]

    label_prompt = f"""
آپ ایک بیانیہ فہمی اسسٹنٹ ہیں۔

مندرجہ ذیل خبر اور اداروں کی فہرست کی بنیاد پر، ہر ادارے کو درج ذیل کرداروں میں سے ایک تفویض کریں:

- Protagonist: مرکزی مثبت کردار یا عمل کو شروع کرنے والا
- Antagonist: تنازعہ پیدا کرنے والا مخالف کردار
- Neutral: اہم لیکن غیر جانب دار کردار

صرف نیچے دیے گئے JSON فارمیٹ میں جواب دیں:
[
  {{"entity": "ادارے کا نام", "label": "Protagonist"}},
  ...
]

خبر: {text}
ادارے: {entities}
"""

    try:
        response = model.generate_content([{"text": label_prompt}])
        content = response.text.strip()

        # Strip a surrounding Markdown fence, with or without the "json" tag.
        if content.startswith("```"):
            content = re.sub(r"^```(?:json)?\s*", "", content)
            content = re.sub(r"\s*```$", "", content)

        try:
            labeled = json.loads(content)
            if not isinstance(labeled, list):
                raise ValueError("expected a JSON array")
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            print("⚠️ Invalid JSON:", content[:100])
            labeled = []

        # Entity -> role lookup; malformed entries are skipped.
        labeled_entities = {
            entry["entity"]: entry["label"]
            for entry in labeled
            if isinstance(entry, dict)
            and isinstance(entry.get("entity"), str)
            and "label" in entry
        }

        narrative_prompt = f"""
        آپ ایک بیانیہ نکالنے والے اسسٹنٹ ہیں۔

        مندرجہ ذیل خبر سے تمام مکمل واقعات نکالیں:
        - subject (کون؟) → اداروں کی فہرست سے انتخاب کریں۔
        - action (کیا کیا؟)
        - object (کس پر یا کس کے ساتھ؟) → اداروں کی فہرست سے انتخاب کریں۔
        - context (اضافی معلومات)

        صرف اداروں میں سے subject اور object کا انتخاب کریں۔
        صرف JSON فہرست واپس کریں۔

        خبر: {text}
        ادارے: {entities}
        """

        response2 = model.generate_content([{"text": narrative_prompt}])
        events_content = response2.text.strip()

        if events_content.startswith("```"):
            events_content = re.sub(r"^```(?:json)?\s*", "", events_content)
            events_content = re.sub(r"\s*```$", "", events_content)

        try:
            events = json.loads(events_content)
            if not isinstance(events, list):
                raise ValueError("expected a JSON array")
        except ValueError:
            print("⚠️ Invalid JSON for events:", events_content[:100])
            events = []

        for event in events:
            if not isinstance(event, dict):
                continue  # cannot annotate a non-object element
            subj = event.get("subject", "")
            obj = event.get("object", "")
            event["subject_role"] = find_best_match(subj, labeled_entities)
            event["object_role"] = find_best_match(obj, labeled_entities)

        results.append({
            "text": text,
            "entities": entities,
            "labeled_entities": labeled,
            "events": events
        })

    except Exception as err:
        # Best-effort batch: report and continue; the article gets no record.
        print("⚠️ Error:", err)

    # Pause after every article, including failed ones.
    time.sleep(1.0)

with open("urdu_narrative_output.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4, ensure_ascii=False)

print("✅ Urdu narrative extraction complete. Saved to urdu_narrative_output.json")


100%|██████████| 20/20 [00:00<?, ?it/s]

✅ Arabic narrative extraction complete. Saved to arabic_narrative_output.json





In [30]:
from rapidfuzz import process, fuzz
import unicodedata
from difflib import get_close_matches


# Read the Arabic input records into memory.
with open("arabic.json", mode="r", encoding="utf-8") as src:
    data = json.loads(src.read())

# One output record is appended here per processed article.
results = list()

def trim_text(text, max_chars=2000):
    """Return at most the first *max_chars* characters of *text*."""
    truncated = text[:max_chars]
    return truncated

def normalize_arabic(s: str) -> str:
    """Return a comparison key for *s* with marks and whitespace removed.

    The string is NFKD-decomposed, every combining character (which includes
    the Arabic diacritics) is dropped, and all whitespace is deleted rather
    than collapsed. Non-string input yields an empty string.
    """
    if isinstance(s, str):
        decomposed = unicodedata.normalize("NFKD", s)
        base_chars = [c for c in decomposed if unicodedata.combining(c) == 0]
        return re.sub(r"\s+", "", "".join(base_chars))
    return ""


def find_best_match(name: str, labeled_entities: dict) -> str:
    """Look up the role label for *name*, tolerating spelling variations.

    Matching is attempted in three stages: the exact key, a key that is
    equal after ``normalize_arabic``, and finally the closest key according
    to ``difflib.get_close_matches`` with a 0.6 cutoff. Returns "Unknown"
    when *name* is not a non-blank string or no stage finds a match.
    """
    # Reject anything that is not a non-blank string.
    if not (isinstance(name, str) and name.strip()):
        return "Unknown"

    # Stage 1: exact key.
    if name in labeled_entities:
        return labeled_entities[name]

    # Stage 2: keys that are equal once both sides are normalized.
    wanted = normalize_arabic(name)
    for candidate in labeled_entities:
        if normalize_arabic(candidate) == wanted:
            return labeled_entities[candidate]

    # Stage 3: closest key by similarity ratio.
    closest = get_close_matches(name, list(labeled_entities), n=1, cutoff=0.6)
    return labeled_entities[closest[0]] if closest else "Unknown"

# Main Arabic pipeline: per article, request role labels and events from the
# model, attach roles to each event's subject/object, and collect the result.
for item in tqdm(data[:20], desc="Processing"):
    text = item.get("content", "").strip()
    if not text:
        continue
    text = trim_text(text)

    # "entities" is read as a mapping whose values are lists; flatten them
    # into a single list for the prompts.
    entities = []
    for ent_list in item.get("entities", {}).values():
        entities.extend(ent_list)

    label_prompt = f"""
أنت مساعد لفهم الأدوار السردية.

بناءً على النص التالي وقائمة الكيانات، صنف كل كيان إلى:
- Protagonist
- Antagonist
- Neutral

أعد فقط JSON بدون شرح، الصيغة:
[
  {{"entity": "اسم الكيان", "label": "Protagonist"}},
  ...
]

النص:
\"\"\"{text}\"\"\"

الكيانات: {entities}
"""

    narr_prompt = f"""
أنت مساعد لاستخراج الأحداث.

من النص التالي استخرج الأحداث الكاملة فقط كـJSON بدون شرح.
– subject: اختر من القائمة فقط
– action: الفعل
– object: اختر من القائمة فقط (إن وجد)
– context: معلومات إضافية (اختياري)

النص:
\"\"\"{text}\"\"\"

الكيانات: {entities}
"""

    try:
        response = model.generate_content([{"text": label_prompt}])
        lbl_txt = response.text.strip()
        # Strip a surrounding Markdown fence, with or without the "json" tag.
        if lbl_txt.startswith("```"):
            lbl_txt = re.sub(r"^```(?:json)?\s*", "", lbl_txt)
            lbl_txt = re.sub(r"\s*```$", "", lbl_txt)
        try:
            labeled = json.loads(lbl_txt)
            if not isinstance(labeled, list):
                raise ValueError("expected a JSON array")
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            print("⚠️ Role JSON failed, skipping:", lbl_txt[:100])
            continue  # the finally clause below still applies the pause

        # Entity -> role lookup; malformed entries are skipped.
        labeled_entities = {
            entry["entity"]: entry["label"]
            for entry in labeled
            if isinstance(entry, dict)
            and isinstance(entry.get("entity"), str)
            and "label" in entry
        }

        response2 = model.generate_content([{"text": narr_prompt}])
        evt_txt = response2.text.strip()
        if evt_txt.startswith("```"):
            evt_txt = re.sub(r"^```(?:json)?\s*", "", evt_txt)
            evt_txt = re.sub(r"\s*```$", "", evt_txt)
        try:
            events = json.loads(evt_txt)
            if not isinstance(events, list):
                raise ValueError("expected a JSON array")
        except ValueError:
            print("⚠️ Events JSON failed, skipping:", evt_txt[:100])
            events = []

        # Attach roles via fuzzy matching
        for event in events:
            if not isinstance(event, dict):
                continue  # cannot annotate a non-object element
            subj = event.get("subject", "")
            obj = event.get("object", "")
            event["subject_role"] = find_best_match(subj, labeled_entities)
            event["object_role"] = find_best_match(obj, labeled_entities)

        results.append({
            "text": text,
            "entities": item.get("entities", {}),
            "labeled_entities": labeled,
            "events": events
        })

    except Exception as err:
        # A failed request must not abort the batch and lose the results
        # gathered so far; report it and continue with the next article.
        print("⚠️ Error:", err)

    finally:
        # Pause after every article that reached the API, whether it
        # succeeded, was skipped, or failed.
        time.sleep(1.0)

# ✅ Save output
with open("arabic_narrative_output.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

print("✅ Done! Saved to arabic_narrative_output.json")

Processing:   0%|          | 0/20 [00:00<?, ?it/s]

Processing: 100%|██████████| 20/20 [03:35<00:00, 10.76s/it]

✅ Done! Saved to arabic_narrative_output.json





In [None]:
# Read the Hindi input records into memory.
with open("nep_temp_results/hindi.json", mode="r", encoding="utf-8") as src:
    data = json.loads(src.read())

# One output record is appended here per processed article.
results = list()

def trim_text(text, max_chars=2000):
    """Cut *text* down to its leading *max_chars* characters."""
    limit = slice(None, max_chars)
    return text[limit]

def find_best_match(name, labeled_entities):
    """Return the role label for *name*, or "Unknown" when nothing matches.

    Tries an exact key match first (raw, then whitespace-stripped), then a
    substring match in either direction.

    Blank or non-string names and blank or non-string entity keys never
    match. The empty string is a substring of every string, so without this
    guard an event with no subject/object would silently receive the role of
    the first labeled entity.
    """
    if not isinstance(name, str):
        return "Unknown"
    needle = name.strip()
    if not needle:
        return "Unknown"

    if name in labeled_entities:
        return labeled_entities[name]
    if needle in labeled_entities:
        return labeled_entities[needle]

    for ent, label in labeled_entities.items():
        if not isinstance(ent, str):
            continue
        ent_clean = ent.strip()
        if not ent_clean:
            continue
        if needle in ent or ent_clean in name:
            return label
    return "Unknown"

# Main Hindi pipeline: per article, request role labels and events from the
# model, attach roles to each event's subject/object, and collect the result.
for item in tqdm(data[:20]):
    text = item.get("content", "").strip()
    if not text:
        continue
    text = trim_text(text)

    # Each entity record is read through its "text" field.
    entities = [e["text"] for e in item.get("entities", [])]

    label_prompt = f"""
आप एक कथात्मक समझ सहायक हैं।

नीचे दिए गए समाचार और संस्थाओं को देखें, और प्रत्येक को निम्नलिखित भूमिकाओं में से एक असाइन करें:
- Protagonist: मुख्य सकारात्मक भूमिका या सक्रिय शक्ति
- Antagonist: विरोधी या प्रतिकूल शक्ति
- Neutral: महत्वपूर्ण लेकिन तटस्थ

केवल JSON में उत्तर दें, बिना किसी अतिरिक्त व्याख्या के:
[
  {{"entity": "संस्था का नाम", "label": "Protagonist"}},
  ...
]

टेक्स्ट: {text}

संस्थाएं: {entities}
"""
    try:
        resp = model.generate_content([{"text": label_prompt}])
        role_content = resp.text.strip()
        # Strip a surrounding Markdown fence, with or without the "json" tag.
        if role_content.startswith("```"):
            role_content = re.sub(r"^```(?:json)?\s*", "", role_content)
            role_content = re.sub(r"\s*```$", "", role_content)

        try:
            labeled = json.loads(role_content)
            if not isinstance(labeled, list):
                raise ValueError("expected a JSON array")
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            print("⚠️ Invalid JSON for roles, got:", role_content)
            labeled = []

        # Entity -> role lookup; malformed entries are skipped.
        labeled_entities = {
            entry["entity"]: entry["label"]
            for entry in labeled
            if isinstance(entry, dict)
            and isinstance(entry.get("entity"), str)
            and "label" in entry
        }

        narrative_prompt = f"""
आप एक कहानी निष्कर्षण सहायक हैं।

निम्नलिखित समाचार से सभी पूर्ण घटनाओं को निकालें:
- subject: किसने किया?
- action: क्या किया?
- object: किस पर/क्या किया?
- context: (वैकल्पिक) अतिरिक्त जानकारी

Subject और object केवल उपरोक्त संस्थाओं में से चुनें।
केवल JSON सूची वापस करें, बिना किसी अन्य व्याख्या के:

टेक्स्ट: {text}
संस्थाएं: {entities}
"""
        resp2 = model.generate_content([{"text": narrative_prompt}])
        ev_content = resp2.text.strip()
        if ev_content.startswith("```"):
            ev_content = re.sub(r"^```(?:json)?\s*", "", ev_content)
            ev_content = re.sub(r"\s*```$", "", ev_content)
        try:
            events = json.loads(ev_content)
            if not isinstance(events, list):
                raise ValueError("expected a JSON array")
        except ValueError:
            print("⚠️ Invalid JSON for events, got:", ev_content)
            events = []

        for ev in events:
            if not isinstance(ev, dict):
                continue  # cannot annotate a non-object element
            subj = ev.get("subject", "")
            obj = ev.get("object", "")
            ev["subject_role"] = find_best_match(subj, labeled_entities)
            ev["object_role"] = find_best_match(obj, labeled_entities)

        results.append({
            "text": text,
            "entities": item.get("entities", []),
            "labeled_entities": labeled,
            "events": events
        })

    except Exception as err:
        # Best-effort batch: report and continue; the article gets no record.
        print("⚠️ Error during processing:", err)

    # Pause after every article, including failed ones, so an error (e.g. a
    # rate-limit response) is not followed by an immediate retry.
    time.sleep(1.0)

with open("hindi_narrative_output.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

print("✅ Hindi narrative extraction complete. Saved to hindi_narrative_output.json")