In [None]:
!pip install --quiet tqdm pandas requests python-dateutil

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import requests
import time
import re
import json
import pandas as pd
from urllib.parse import urlparse
from datetime import datetime
from difflib import SequenceMatcher
import os

# API credentials. They are read from the environment so that secrets do not
# have to be pasted into (and saved with) the notebook. The empty-string
# fallback means "no key configured": tavily_search() then returns no results
# and the LLM path is disabled.
TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY", "")
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
TAVILY_ENDPOINT = "https://api.tavily.com/search"
DEEPSEEK_ENDPOINT = "https://api.deepseek.com/chat/completions"

# The LLM is only usable when a DeepSeek key is available.
USE_LLM = bool(DEEPSEEK_API_KEY)

# Input spreadsheet (must contain a "Nama" column) and output folder.
EXCEL_PATH = "/content/drive/MyDrive/KG/UAS/Data_Anggota.xlsx"
SAVE_FOLDER = "/content/drive/MyDrive/KG/UAS"
os.makedirs(SAVE_FOLDER, exist_ok=True)

# =================== MEMBER DATA ===================
# Load the member roster; rows without a value in the "Nama" column are dropped.
df_anggota = pd.read_excel(EXCEL_PATH)
df_anggota = df_anggota.dropna(subset=["Nama"])

# Normalize names: strip whitespace, skip cells that are blank after stripping
# (they would otherwise produce queries such as " kasus" and match every
# article), and drop duplicates while keeping the spreadsheet order so each
# member is searched only once.
_nama_bersih = (str(nama).strip() for nama in df_anggota["Nama"])
anggota_dpr = [{"nama": nama} for nama in dict.fromkeys(_nama_bersih) if nama]

# Name lookups used by the matching helpers (original and lower-cased forms).
KNOWN_NAMES = [p["nama"] for p in anggota_dpr]
KNOWN_NAMES_LC = [n.lower() for n in KNOWN_NAMES]

# News outlets whose articles are accepted as sources.
TRUSTED_NEWS = [
    "kompas.com",
    "detik.com",
    "tempo.co",
    "antaranews.com",
    "cnnindonesia.com",
    "tribunnews.com",
    "liputan6.com",
    "hukumonline.com",
    "jpnn.com",
    "tvonenews.com",
    "metrotvnews.com",
    "suara.com",
    "vivanews.com",
    "merdeka.com",
    "republika.co.id",
]

# Social-media and blogging hosts that are never accepted as sources.
# Entries ending in "." are platform stems rather than complete domains.
BLOCKED_DOMAINS = [
    "instagram.com",
    "facebook.com",
    "fb.com",
    "tiktok.com",
    "youtube.com",
    "youtu.be",
    "twitter.com",
    "x.com",
    "blogspot.",
    "wordpress.",
    "medium.com",
]

# Lower-case terms that mark an article as being about a legal case.
LEGAL_KEYWORDS = [
    # kinds of offence
    "kasus",
    "korupsi",
    "suap",
    "gratifikasi",
    "pencucian uang",
    "tppu",
    "pidana",
    # procedural status
    "tersangka",
    "terdakwa",
    "dakwa",
    "penyidikan",
    "penyelidikan",
    "ditetapkan",
    "diperiksa",
    "ditahan",
    # institutions and proceedings
    "kpk",
    "kejaksaan",
    "polri",
    "ott",
    "pengadilan",
    "vonis",
]

# Lower-case terms that suggest an article is a hoax report or a fact check.
HOAX_KEYWORDS = [
    "hoaks",
    "hoax",
    "cek fakta",
    "fact check",
    "tidak benar",
    "dibantah",
    "klaim palsu",
]

# Prompt template for LLM-based extraction. Fill it with
# PROMPT_EKSTRAK.format(nama=..., text=...); the doubled braces are literal
# braces of the JSON skeleton. The hoax rule is written like its sibling rules
# (status = "tidak diambil") so every rule names an explicit, well-quoted status.
PROMPT_EKSTRAK = """
Anda adalah sistem ekstraksi fakta hukum yang sangat ketat.
Bekerjalah HANYA berdasarkan teks yang diberikan. Jangan menambah informasi.

TARGET = "{nama}"

ATURAN:
- Jika berita menyatakan TARGET sebagai 'tersangka' -> status = "diduga".
- Jika berita menyatakan TARGET sebagai 'terdakwa' -> status = "diduga".
- Jika berita menyatakan TARGET 'saksi' atau 'diperiksa sebagai saksi' -> status = "saksi".
- Jika berita adalah hoaks -> status = "tidak diambil".
- Jika hanya komentar/opini -> status = "tidak diambil".
- Jika ada indikasi belum resmi -> status = "tidak diambil".

OUTPUT JSON EXACT (tanpa pasal):
{{
  "status": "...",
  "jenis_kasus": "...",
  "ringkasan": "...",
  "peran": "...",
  "related_members": ["..."],
  "triple": [
    ["{nama}", "status", "..."],
    ["{nama}", "terkait_kasus", "..."],
    ["{nama}", "peran", "..."]
  ]
}}

TEKS BERITA:
{text}
"""

# ======================================================
def domain_of(url):
    """Return the lower-cased network location (host[:port]) of *url*.

    Returns an empty string when *url* is not a string or cannot be parsed,
    so callers can always treat the result as ``str``.
    """
    # urlparse() accepts None/bytes and would then yield a bytes netloc, which
    # breaks the later ``"kompas.com" in domain`` style checks.
    if not isinstance(url, str):
        return ""
    try:
        return urlparse(url).netloc.lower()
    except ValueError:
        # Raised for malformed URLs, e.g. an unbalanced IPv6 bracket.
        return ""

def is_blocked_domain(url):
    """Return True when *url* is hosted on one of ``BLOCKED_DOMAINS``.

    Matching is done on host-name labels rather than raw substrings, so
    "x.com" blocks "x.com" and "mobile.x.com" but not "netflix.com".
    Entries ending in "." (e.g. "blogspot.") are platform stems and match any
    host that has a label starting there ("foo.blogspot.co.id").
    """
    host = domain_of(url)
    if not isinstance(host, str) or not host:
        return False
    # Drop optional "user@" and ":port" parts plus a trailing root dot.
    host = host.rsplit("@", 1)[-1].split(":", 1)[0].rstrip(".")

    for pattern in BLOCKED_DOMAINS:
        if pattern.endswith("."):
            if host.startswith(pattern) or ("." + pattern) in host:
                return True
        elif host == pattern or host.endswith("." + pattern):
            return True
    return False

def is_trusted_domain(url):
    """Return True when *url* is hosted on an outlet listed in ``TRUSTED_NEWS``.

    The host must equal a trusted domain or be one of its subdomains. A plain
    substring test would also trust look-alike hosts such as "fakedetik.com"
    or "kompas.com.evil.io".
    """
    host = domain_of(url)
    if not isinstance(host, str) or not host:
        return False
    # Drop optional "user@" and ":port" parts plus a trailing root dot.
    host = host.rsplit("@", 1)[-1].split(":", 1)[0].rstrip(".")
    return any(host == t or host.endswith("." + t) for t in TRUSTED_NEWS)

def name_in_text_adaptive(name_full, title, content):
    """Decide whether an article (title + content) refers to *name_full*.

    Returns True when the full name occurs in the text (case-insensitive).
    Otherwise falls back to the first name: it must appear as a whole word in
    a sentence that also mentions a parliamentary context (dpr, anggota dewan,
    komisi, fraksi), and that sentence must not contain the full name of
    another known member sharing the same first name.

    A blank *name_full* never matches; missing (None) title/content are
    treated as empty text.
    """
    txt = f"{title or ''} {content or ''}".lower()
    name_lc = (name_full or "").lower().strip()
    if not name_lc:
        # "" is a substring of everything; a blank name must not match.
        return False
    if name_lc in txt:
        return True

    first = name_lc.split()[0]
    first_re = re.compile(r"\b" + re.escape(first) + r"\b")
    if not first_re.search(txt):
        return False

    context_re = re.compile(r"dpr|anggota dewan|komisi|fraksi")
    same_first = None  # built lazily, once, only if a candidate sentence exists
    for sentence in re.split(r'(?<=[\.\?\!])\s+', txt):
        if not (first_re.search(sentence) and context_re.search(sentence)):
            continue
        if same_first is None:
            # ``split()[:1]`` tolerates blank roster entries (no IndexError).
            same_first = [
                n for n in KNOWN_NAMES_LC
                if n != name_lc and n.split()[:1] == [first]
            ]
        if not any(other in sentence for other in same_first):
            return True
    return False

def detect_hoax(text):
    """Return True if *text* contains any ``HOAX_KEYWORDS`` entry (case-insensitive)."""
    lowered = text.lower()
    for keyword in HOAX_KEYWORDS:
        if keyword in lowered:
            return True
    return False

def detect_legal_keyword(text):
    """Return True if *text* contains any ``LEGAL_KEYWORDS`` entry (case-insensitive)."""
    haystack = text.lower()
    return next((True for term in LEGAL_KEYWORDS if term in haystack), False)

def extract_status_and_role(text, target_name):
    """Derive a coarse ``(status, role)`` pair from article text.

    Returns ``("saksi", "saksi")`` when the text mentions an examination or
    witness testimony, ``("diduga", "pelaku/diduga")`` when it mentions an
    allegation, suspect or defendant status, and ``(None, None)`` otherwise.
    *target_name* is accepted for interface compatibility but not used: the
    heuristics look at the whole text, not at who the phrase refers to.
    """
    t = (text or "").lower()

    # Checked first, so any mention of "diperiksa" yields the witness status
    # even if suspect wording also occurs in the text.
    if re.search(r"(diperiksa|diperiksa sebagai saksi|memberikan keterangan sebagai saksi)\b", t):
        return "saksi", "saksi"

    # Each alternative is a separate branch. The previous pattern had a stray
    # "\d" (digit class) in front of "itetapkan" and an escaped "\|", which
    # made the suspect/defendant phrases impossible to match.
    suspect_pattern = (
        r"\bdiduga\b|\bdugaan\b"
        r"|ditetapkan sebagai tersangka|menjadi tersangka|sebagai tersangka"
        r"|\b(?:sebagai terdakwa|didakwa|terdakwa)\b"
    )
    if re.search(suspect_pattern, t):
        return "diduga", "pelaku/diduga"

    return None, None

def detect_related_members(text, target_name):
    """List other known members whose full name appears in *text*.

    Names are compared in lower case, the target itself is excluded, and the
    result is a sorted list of unique title-cased names.
    """
    haystack = text.lower()
    mentioned = {
        candidate.title()
        for candidate in KNOWN_NAMES_LC
        if candidate != target_name.lower() and candidate in haystack
    }
    return sorted(mentioned)

def call_deepseek(prompt):
    """Send *prompt* to the DeepSeek chat-completions endpoint.

    Returns the decoded JSON response, or None when the LLM is disabled
    (``USE_LLM`` is false), the request fails, or the body is not valid JSON.
    The call is best-effort: failures are reported on stdout, never raised.
    """
    if not USE_LLM:
        return None

    payload = {
        "model": "deepseek-chat",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.0,
    }
    headers = {"Authorization": f"Bearer {DEEPSEEK_API_KEY}"}
    try:
        r = requests.post(DEEPSEEK_ENDPOINT, json=payload, headers=headers, timeout=20)
        r.raise_for_status()
        return r.json()
    except (requests.RequestException, ValueError) as exc:
        # Network/HTTP errors and undecodable bodies only; a bare ``except``
        # would also swallow KeyboardInterrupt and genuine programming errors.
        print(f"[call_deepseek] request failed: {exc}")
        return None

def analyze_article(title, url, content, target_name):
    """Turn one search hit into a structured record about *target_name*.

    Returns None when the URL is blocked or not from a trusted outlet, when
    the article does not appear to mention the target, or when the content
    has no legal keyword. Otherwise returns a dict with the title, URL,
    heuristic status and role (both may be None if no status phrase was
    found), the detected case types, a 250-character summary, other members
    mentioned, and the three knowledge-graph triples.
    """
    # Source and relevance gates.
    if is_blocked_domain(url) or not is_trusted_domain(url):
        return None
    if not name_in_text_adaptive(target_name, title, content):
        return None
    if not detect_legal_keyword(content):
        return None

    status, role = extract_status_and_role(content, target_name)
    summary = content[:250]

    # Case types are reported in this fixed order, joined by ", ".
    case_terms = (
        "korupsi", "suap", "gratifikasi", "narkoba",
        "penggelapan", "penipuan", "pemerasan",
        "pencucian uang",
    )
    body_lc = content.lower()
    jenis_kasus = ", ".join(term for term in case_terms if term in body_lc)

    # Role normalization table; unknown roles (including None) pass through.
    normalized_roles = {"pelaku/diduga": "pelaku/diduga", "saksi": "saksi"}
    peran_norm = normalized_roles.get(role, role)

    record = {
        "title": title,
        "url": url,
        "status": status,
        "jenis_kasus": jenis_kasus,
        "ringkasan": summary,
        "peran": peran_norm,
        "related_members": detect_related_members(content, target_name),
    }
    record["triple"] = [
        [target_name, "status", status],
        [target_name, "terkait_kasus", jenis_kasus],
        [target_name, "peran", peran_norm],
    ]
    return record

def tavily_search(query, max_results=8):
    """Query the Tavily search API and return its list of result dicts.

    Always returns a list: it is empty when no API key is configured, when
    the request fails, or when the response has no usable "results" array.
    The call is best-effort: failures are reported on stdout, never raised.
    """
    if not TAVILY_API_KEY:
        return []

    payload = {"api_key": TAVILY_API_KEY, "query": query, "max_results": max_results}
    try:
        r = requests.post(TAVILY_ENDPOINT, json=payload, timeout=15)
        r.raise_for_status()
        data = r.json()
    except (requests.RequestException, ValueError) as exc:
        # Network/HTTP errors and undecodable bodies only; a bare ``except``
        # would also swallow KeyboardInterrupt and genuine programming errors.
        print(f"[tavily_search] request failed for {query!r}: {exc}")
        return []

    # Guard against unexpected payload shapes (non-object body, "results": null)
    # so callers can iterate the return value unconditionally.
    if not isinstance(data, dict):
        return []
    results = data.get("results")
    return results if isinstance(results, list) else []

def similar(a, b):
    """Return the case-insensitive difflib similarity ratio of *a* and *b* (0.0-1.0)."""
    left, right = a.lower(), b.lower()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()

def dedupe_articles(articles):
    """Collapse articles whose titles are near-duplicates.

    Each article is a list indexed as ``[title, url, content, status]``. Two
    articles are duplicates when their titles have a similarity above 0.85.
    The first one seen keeps its position; its contents are overwritten in
    place by a later duplicate only if that one has a higher-priority status.
    """
    if not articles:
        return []

    rank = {
        "tersangka": 5,
        "terdakwa": 4,
        "saksi": 3,
        "diperiksa": 3,
        "diduga": 2,
        "bukan terlibat": 1,
        "tidak terlibat (hoaks)": 0,
    }

    kept = []
    for candidate in articles:
        for existing in kept:
            if similar(candidate[0], existing[0]) > 0.85:
                if rank.get(candidate[3], 0) > rank.get(existing[3], 0):
                    existing[:] = candidate
                break
        else:
            # No near-duplicate title among the kept articles.
            kept.append(candidate)
    return kept

def is_valid_name(text, target_name):
    """Return True if *target_name* occurs in *text* as a standalone phrase.

    The match is case-insensitive and must not be embedded in a longer word
    ("Budi" does not match "Budiman"). A blank *target_name* never matches.
    """
    needle = (target_name or "").strip()
    if not needle:
        return False
    # Look-arounds instead of ``\b``: ``\b`` needs a word character on one
    # side, so names that end in punctuation (academic titles such as
    # "..., S.H.") could never match when followed by a space or end of text.
    pattern = r"(?<!\w){}(?!\w)".format(re.escape(needle))
    return re.search(pattern, text or "", flags=re.IGNORECASE) is not None

def filter_news(results, target_name):
    """Keep search hits that come from a trusted outlet and name the target.

    A hit is kept when its URL belongs to a trusted news domain, its title or
    summary contains *target_name* as a standalone phrase, and neither its URL
    nor its (non-empty) title has been seen before. Hits are returned
    unchanged, in their original order.
    """
    seen_urls = set()
    seen_titles = set()
    filtered = []

    for news in results or []:
        # The API may return explicit nulls; treat them as empty strings.
        title = news.get('title') or ''
        url = news.get('url') or ''
        summary = news.get('summary') or ''

        # Shared helper: tolerates malformed URLs instead of raising from
        # urlparse() and keeps the trust rule in a single place.
        if not url or not is_trusted_domain(url):
            continue
        if not is_valid_name(title + " " + summary, target_name):
            continue
        # Only non-empty titles take part in title de-duplication; otherwise
        # all untitled hits would collapse into the first one.
        if url in seen_urls or (title and title in seen_titles):
            continue

        seen_urls.add(url)
        if title:
            seen_titles.add(title)
        filtered.append(news)

    return filtered

def _collect_search_hits(name):
    """Run all search queries for *name* and return normalized hit dicts."""
    queries = [
        f"{name} kasus",
        f"{name} diperiksa KPK",
        f"{name} tersangka",
        f"{name} dugaan korupsi"
    ]
    hits = []
    for q in queries:
        # ``or []`` / ``or ""`` guard against null payload fields.
        for h in tavily_search(q, max_results=6) or []:
            hits.append({
                "title": h.get("title") or "",
                "url": h.get("url") or "",
                "summary": h.get("content") or ""
            })
        time.sleep(0.2)  # small pause between API calls
    return hits


def _print_article(art):
    """Print a human-readable report for one analyzed article."""
    print()
    print(f"- {art['title']}")
    print(f"  {art['url']}")
    print(f"  Status      : {art['status']}")
    print(f"  Jenis Kasus : {art['jenis_kasus']}")
    print(f"  Ringkasan   : {art['ringkasan'][:300]}")
    print(f"  Peran       : {art['peran']}")
    if art["related_members"]:
        print(f"  Related DPR : {', '.join(art['related_members'])}")
    print("  Triple:")
    for tr in art["triple"]:
        print("    ", tr)
    print()


def _save_outputs(outputs, save_path, save_prefix):
    """Write *outputs* as timestamped JSON and CSV files under *save_path*."""
    from datetime import timezone  # local import: only needed here

    # Timezone-aware replacement for the deprecated datetime.utcnow();
    # the formatted UTC timestamp is unchanged.
    ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    json_file = os.path.join(save_path, f"{save_prefix}_{ts}.json")
    csv_file = os.path.join(save_path, f"{save_prefix}_{ts}.csv")
    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(outputs, f, ensure_ascii=False, indent=2)
    # utf-8-sig adds a BOM so spreadsheet tools detect the encoding.
    pd.DataFrame(outputs).to_csv(csv_file, index=False, encoding="utf-8-sig")
    print(f"Saved {len(outputs)} rows → {json_file}, {csv_file}")


def run_all(save_prefix="results"):
    """Search, filter and analyze news for every member, then save the results.

    For each entry in ``anggota_dpr`` the function runs the search queries,
    keeps trusted hits that name the member, removes near-duplicate titles,
    analyzes each remaining article and prints a report. All accepted rows are
    written to ``<save_prefix>_<UTC timestamp>.json`` and ``.csv`` in
    ``SAVE_FOLDER``; nothing is written when no article was accepted.
    """
    save_path = SAVE_FOLDER
    os.makedirs(save_path, exist_ok=True)

    outputs = []
    for person in anggota_dpr:
        name = person["nama"]
        print("="*70)
        print(f"Mengecek kasus hukum untuk: {name}")

        valid_results = filter_news(_collect_search_hits(name), name)

        # dedupe_articles() expects [title, url, content, status]; the status
        # is not known yet, so it is left empty and all hits rank equally.
        deduped = dedupe_articles(
            [[item["title"], item["url"], item["summary"], ""] for item in valid_results]
        )

        valid_count = 0
        for t, u, c, _ in deduped:
            art = analyze_article(t, u, c, name)
            if not art:
                continue
            valid_count += 1
            outputs.append({
                "target": name,
                "title": art["title"],
                "url": art["url"],
                "status": art["status"],
                "jenis_kasus": art["jenis_kasus"],
                "ringkasan": art["ringkasan"],
                "peran": art["peran"],
                "related_members": ";".join(art["related_members"]),
                "triple": json.dumps(art["triple"], ensure_ascii=False)
            })
            _print_article(art)

        print(f"Ditemukan {valid_count} berita valid untuk {name}.\n")

    if outputs:
        _save_outputs(outputs, save_path, save_prefix)
    else:
        print("No validated outputs to save.")

# Entry point: run the whole pipeline with the default output prefix
# ("results") when this module is executed as the main program.
if __name__ == "__main__":
    run_all()
