1. Tahap Scraping

In [None]:
import re
import sys
import time
import pandas as pd
import pytz
import requests
from bs4 import BeautifulSoup
from types import SimpleNamespace
from datetime import datetime, timedelta
from urllib.parse import urljoin, urlparse, urlencode
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Default timezone name.
# NOTE(review): not referenced by the visible code in this cell; run() reads
# args.timezone instead -- confirm before removing.
DEFAULT_TZ = "Asia/Jakarta"
# Base URL of the article index; collect_links_for_date() appends the query string.
INDEX_BASE = "https://indeks.kompas.com"

# Request headers sent with every GET issued through get().
HEADERS = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"),
    "Accept-Language": "id,en-US;q=0.9,en;q=0.8",
    "Referer": "https://indeks.kompas.com/"
}

# Captures year, month, day and a 6-9 digit run from a "/read/YYYY/MM/DD/<digits>"
# URL path; dt_from_url() reads the first six digits of that run as HHMMSS.
URL_DATE_RE = re.compile(r"/read/(\d{4})/(\d{2})/(\d{2})/(\d{6,9})(?:/|$)")

# Lower-case keywords; is_disaster_row() keeps an article when any of them
# occurs as a substring of its lower-cased title + summary.
DISASTER_KEYWORDS = [
    "gempa", "banjir", "longsor", "tsunami", "angin puting beliung", "angin kencang",
    "cuaca ekstrem", "erupsi", "letusan", "gunung", "kebakaran hutan", "karhutla",
    "kekeringan", "badai", "topan", "gelombang tinggi", "abrasi", "lahar", "banjir bandang",
    "bmkg", "pvmbg", "bnpb", "basarnas",
]

def make_session():
    """Build a requests session whose HTTP and HTTPS adapters retry failed GETs.

    The retry policy allows up to 3 retries with a backoff factor of 0.7 and
    is limited to GET requests answered with status 429, 500, 502, 503 or 504.

    Returns:
        The configured ``requests.Session``.
    """
    retry_policy = Retry(
        total=3,
        backoff_factor=0.7,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    http = requests.Session()
    # One shared adapter serves both URL schemes.
    for scheme in ("http://", "https://"):
        http.mount(scheme, retrying_adapter)
    return http

def get(session, url, timeout=25):
    """GET ``url`` through ``session`` with the module-level HEADERS.

    Args:
        session: Object exposing ``get(url, headers=..., timeout=...)``.
        url: Address to fetch.
        timeout: Timeout forwarded to ``session.get``.

    Returns:
        The response object, after ``raise_for_status()`` has been called on
        it (so an error status propagates as an exception).
    """
    response = session.get(url, timeout=timeout, headers=HEADERS)
    response.raise_for_status()
    return response

def dt_from_url(tz, link):
    """Derive a localized publication datetime from an article URL.

    The URL path must match URL_DATE_RE (``/read/YYYY/MM/DD/<digits>``); the
    first six digits of the trailing run are read as HHMMSS.

    Args:
        tz: Timezone object exposing ``localize(naive_datetime)``.
        link: Article URL.

    Returns:
        The value of ``tz.localize(...)`` for the parsed timestamp, or None
        when the path does not match or anything raises along the way.
    """
    try:
        match = URL_DATE_RE.search(urlparse(link).path)
        if match is None:
            return None
        year, month, day = (int(part) for part in match.group(1, 2, 3))
        clock = match.group(4)
        # Only the leading HHMMSS digits are used; any extra digits are ignored.
        hour, minute, second = (int(clock[i:i + 2]) for i in (0, 2, 4))
        naive = datetime(year, month, day, hour, minute, second)
        return tz.localize(naive)
    except Exception:
        return None

def in_range(start_dt, end_dt, dt):
    """Return True when ``dt`` is not None and start_dt <= dt <= end_dt (inclusive)."""
    if dt is None:
        return False
    return start_dt <= dt <= end_dt

def collect_links_for_date(session, tz, day_dt, max_pages, sleep_sec, timeout):
    """Collect article URLs listed on the "nasional" index for one day.

    Index pages 1..max_pages are requested in order. Paging stops early when
    a request fails or when a page yields no matching article link.

    Args:
        session: Session object handed to get().
        tz: Unused here; kept so existing callers keep working.
        day_dt: Object exposing ``year``, ``month`` and ``day`` of the date to crawl.
        max_pages: Upper bound on the number of index pages requested.
        sleep_sec: Seconds to sleep before each request.
        timeout: Timeout forwarded to get().

    Returns:
        Set of absolute article URLs (query string and fragment removed) on a
        kompas.com host whose path contains ``/read/YYYY/MM/DD/<6-9 digits>/``
        for the requested day.
    """
    y, m, d = day_dt.year, day_dt.month, day_dt.day
    day_str = f"{y}-{m:02d}-{d:02d}"
    daily_links = set()
    pattern = re.compile(fr"/read/{y}/{m:02d}/{d:02d}/\d{{6,9}}/")

    site = "nasional"
    for page in range(1, max_pages + 1):
        qs = urlencode({"site": site, "date": day_str, "page": page})
        url = f"{INDEX_BASE}/?{qs}"

        try:
            time.sleep(sleep_sec)
            resp = get(session, url, timeout=timeout)
        except Exception as ex:
            # Best effort: a failed index page ends paging for this day.
            print(f"[INDEKS] Gagal {url} -> {ex}")
            break

        soup = BeautifulSoup(resp.text, "html.parser")

        found = 0
        for a in soup.find_all("a", href=True):
            # Drop fragment and query so the same article is stored only once.
            href = a["href"].strip().split("#")[0].split("?")[0]
            if href.startswith("//"):
                href = "https:" + href
            if href.startswith("/"):
                href = urljoin("https://www.kompas.com", href)

            try:
                p = urlparse(href)
                host = (p.hostname or "").lower()
            except ValueError:
                # urlparse rejects some malformed hrefs (e.g. a broken IPv6 literal).
                continue

            # Exact domain or a real subdomain only; a plain substring test
            # would also accept hosts such as "kompas.com.example.org".
            on_site = host == "kompas.com" or host.endswith(".kompas.com")
            if on_site and pattern.search(p.path):
                daily_links.add(href)
                found += 1

        print(f"[INDEKS] nasional {day_str} page={page} -> {found} link")
        if found == 0:
            break

    return daily_links

def extract_article_detail(html):
    """Pull title, author, section and body text out of an article page.

    Args:
        html: HTML source of the article page.

    Returns:
        Tuple ``(title, author, section, summary)``. ``title`` and ``author``
        are stripped strings ("" when nothing matched). ``section`` comes from
        a section meta tag, else the last breadcrumb label, else "Nasional".
        ``summary`` is the joined paragraph text of the first matching content
        container, cut to 5000 characters ("" when nothing matched).
    """
    page = BeautifulSoup(html, "html.parser")

    def first_nonempty(selectors, joiner=None):
        # Text of the first selector whose element has non-blank text, else None.
        for css in selectors:
            node = page.select_one(css)
            if node and node.get_text(strip=True):
                if joiner is None:
                    return node.get_text(strip=True)
                return node.get_text(joiner, strip=True)
        return None

    title = first_nonempty(
        ["h1.read__title", "h1.detail__title", "h1.headline__title", "h1"]
    )
    author = first_nonempty(
        [".read__author", ".author", '[rel="author"]', ".byline"], joiner=" "
    )

    # Section: meta tags first.
    section = None
    for css in ['meta[property="article:section"]', 'meta[name="section"]']:
        meta = page.select_one(css)
        if meta and meta.get("content"):
            section = meta.get("content").strip()
            break

    # Then the last breadcrumb that is not the site name itself.
    if not section:
        crumbs = []
        for link in page.select(".breadcrumb a, .breadcrumb__link, a.breadcrumb__link"):
            if not link.get_text(strip=True):
                continue
            label = link.get_text(" ", strip=True)
            if label.lower() in ("kompas.com",):
                continue
            crumbs.append(label)
        if crumbs:
            section = crumbs[-1]
    if not section:
        section = "Nasional"

    # Body: the first container selector that yields any non-blank paragraph.
    summary = ""
    for css in [".read__content p", ".article__content p", "article p"]:
        paragraphs = [
            p.get_text(" ", strip=True)
            for p in page.select(css)
            if p.get_text(strip=True)
        ]
        if paragraphs:
            summary = " ".join(paragraphs)[:5000]
            break

    return (title or "").strip(), (author or "").strip(), section, summary

def is_disaster_row(title, summary):
    """Return True when any DISASTER_KEYWORDS entry occurs in title + summary.

    Matching is a case-insensitive substring test on the two texts joined by
    one space; falsy inputs are treated as empty text.
    """
    haystack = "{} {}".format(title or "", summary or "").lower()
    for keyword in DISASTER_KEYWORDS:
        if keyword in haystack:
            return True
    return False

def run(args):
    """Scrape the index day by day, fetch each article and keep disaster-related ones.

    Args:
        args: Namespace with ``start`` and ``end`` (``YYYY-MM-DD`` strings,
            both days inclusive), ``timezone`` (name given to
            ``pytz.timezone``), ``sleep`` (seconds before each request),
            ``timeout`` and ``max_pages``.

    Returns:
        DataFrame with one row per kept article and the columns title, url,
        author, section, published_local, summary, published_dt and bulan;
        None when no article passes is_disaster_row().
    """
    tz = pytz.timezone(args.timezone)
    start_dt = tz.localize(datetime.strptime(args.start, "%Y-%m-%d"))
    end_dt = tz.localize(datetime.strptime(args.end, "%Y-%m-%d")) + timedelta(hours=23, minutes=59, seconds=59)

    print(f"\n=== Scraping NASIONAL bencana alam {args.start} s/d {args.end} ===")

    session = make_session()

    # Step 1: gather candidate links from the daily index pages.
    all_links = set()
    cur = start_dt
    while cur <= end_dt:
        daily = collect_links_for_date(session, tz, cur, args.max_pages, args.sleep, args.timeout)
        all_links |= daily
        cur += timedelta(days=1)

    print(f"Total link unik: {len(all_links)}")

    # Step 2: keep links whose URL timestamp falls inside the requested range.
    links_in_range = []
    for u in sorted(all_links):
        dt_url = dt_from_url(tz, u)
        if in_range(start_dt, end_dt, dt_url):
            links_in_range.append((u, dt_url))
    print(f"Link dalam rentang tanggal: {len(links_in_range)}")

    # Step 3: download every article and keep the ones matching a keyword.
    rows = []
    for i, (link, dt_url) in enumerate(links_in_range, 1):
        try:
            time.sleep(args.sleep)
            r = get(session, link, timeout=args.timeout)
            title, author, section, summary = extract_article_detail(r.text)
        except Exception as ex:
            # Best effort: a failed article is logged; its empty text then
            # fails the keyword test below, so it is left out of the result.
            print(f"[ERR] {link} -> {ex}")
            title = author = summary = ""
            section = "Nasional"

        if is_disaster_row(title, summary):
            rows.append({
                "title": title,
                "url": link,
                "author": author,
                "section": section,
                "published_local": dt_url.strftime("%Y-%m-%d %H:%M:%S %Z"),
                "summary": summary
            })

        if i % 25 == 0:
            print(f"[PROGRESS] {i}/{len(links_in_range)} (disaster: {len(rows)})")

    if not rows:
        print("TIDAK ada artikel bencana di rentang waktu ini.")
        return None

    df = pd.DataFrame(rows).drop_duplicates(subset=["url"]).reset_index(drop=True)

    # published_local ends with the timezone abbreviation produced by %Z.
    # Drop that last token whatever it is, so parsing also works when
    # args.timezone is not Asia/Jakarta ("WIB").
    local_wall_time = df["published_local"].str.rsplit(" ", n=1).str[0]
    df["published_dt"] = pd.to_datetime(local_wall_time, errors="coerce")
    df["bulan"] = df["published_dt"].dt.to_period("M").astype(str)

    return df


# ======================
#  SCRAPE JAN–JUN 2025
# ======================
args = SimpleNamespace(
    start="2025-01-01",
    end="2025-06-30",
    timezone="Asia/Jakarta",
    sleep=1.5,
    timeout=25,
    max_pages=6
)
df_janjun = run(args)

# ======================
#  SCRAPE JUL–SEP 2025
# ======================
args = SimpleNamespace(
    start="2025-07-01",
    end="2025-09-30",
    timezone="Asia/Jakarta",
    sleep=1.5,
    timeout=25,
    max_pages=6
)
df_julsep = run(args)

# ======================
#  MERGE & SAVE TO EXCEL
# ======================
# run() returns None for a period without any matching article, and pd.concat
# raises ValueError when every input is None, so drop the None results first.
frames = [part for part in (df_janjun, df_julsep) if part is not None]
if frames:
    df_total = pd.concat(frames, ignore_index=True)
else:
    # Nothing scraped at all: keep the column layout that run() produces.
    df_total = pd.DataFrame(columns=[
        "title", "url", "author", "section",
        "published_local", "summary", "published_dt", "bulan",
    ])
df_total = df_total.drop_duplicates(subset="url").reset_index(drop=True)

output_path = "kompas_bencana_nasional_2025.xlsx"
df_total.to_excel(output_path, index=False)

print("\n=== FILE BERHASIL DISIMPAN ===")
print(" ->", output_path)
print("Total artikel:", len(df_total))

df_total.head()


2. Preprocessing

Import & Konfigurasi Dasar

In [1]:
import re
import unicodedata
from pathlib import Path
from html import unescape
import pandas as pd

# Whether clean_text() strips emoji/symbol characters
REMOVE_EMOJI = True

# Abbreviation normalisation: regex pattern -> replacement. clean_text()
# applies the entries in this order with re.IGNORECASE.
# For Kab/Kec/Prov/Pres the word boundary sits BEFORE the optional dot. With
# the boundary after it (r"\bKab\.?\b") the dot can never be consumed when a
# space follows, which turned "Kab. Bogor" into "Kabupaten. Bogor".
NORMALIZE_MAP = {
    r"\bKab\b\.?": "Kabupaten",
    r"\bKec\b\.?": "Kecamatan",
    # "Kota" is a full word, so a following dot is sentence punctuation and
    # is deliberately left in place.
    r"\bKota\.?\b": "Kota",
    r"\bProv\b\.?": "Provinsi",
    r"\bWali Kota\b": "Walikota",
    r"\bWakil Presiden\b": "Wakil Presiden",
    r"\bDr\.\b": "Dr.",
    r"\bPres\b\.*": "Presiden"
}

# Regex helpers
TAG_RE = re.compile(r"<[^>]+>")
URL_RE = re.compile(r"https?://\S+|www\.\S+")
MULTI_SPACE_RE = re.compile(r"\s{2,}")
REPEAT_PUNCT_RE = re.compile(r"([!?.,])\1{2,}")
CONTROL_CHARS = re.compile(r"[\r\n\t]+")
SPACE_PUNCT_RE = re.compile(r"\s+([,.;:!?])")
LEADING_TRAILING_SPACE_RE = re.compile(r"^\s+|\s+$")


Fungsi Preprocessing

In [2]:
def remove_emoji_and_symbols(text: str) -> str:
    """Return ``text`` NFKC-normalised with every "C*" and "S*" character removed.

    Unicode general categories starting with "C" and "S" are the
    control/format/other and the symbol classes, so emoji, currency signs and
    math operators are dropped along with control characters.
    """
    return "".join(
        ch
        for ch in unicodedata.normalize("NFKC", text)
        if unicodedata.category(ch)[0] not in ("C", "S")
    )


def detect_text_column(df: pd.DataFrame):
    """Pick the column that most likely holds the article text.

    Known column names are tried first, in priority order and ignoring case.
    If none is present, the object-dtype column with the largest mean string
    length wins; a frame without object columns yields its first column.

    Returns:
        The matching column label of ``df``.
    """
    preferred = ("cleaned_text", "clean_text", "text", "summary", "body",
                 "isi", "berita", "content", "headline")
    for wanted in preferred:
        hit = next((col for col in df.columns if col.lower() == wanted.lower()), None)
        if hit is not None:
            return hit

    # No known name found: fall back to the longest object column.
    text_cols = [col for col in df.columns if df[col].dtype == object]
    if not text_cols:
        return df.columns[0]

    def mean_length(col):
        # An all-missing column scores 0 instead of NaN.
        if len(df[col].dropna()) > 0:
            return df[col].dropna().astype(str).map(len).mean()
        return 0

    return max(text_cols, key=mean_length)


def clean_text(s: str) -> str:
    """Normalise one raw text cell and return the cleaned string.

    Steps, in order: unescape HTML entities, drop leading BOM characters,
    blank out tags, URLs and CR/LF/TAB runs, optionally strip emoji/symbols
    (REMOVE_EMOJI), collapse punctuation repeated three or more times, expand
    the abbreviations in NORMALIZE_MAP (case-insensitive), collapse multiple
    spaces, remove whitespace before punctuation and trim both ends.

    None yields "".
    """
    if s is None:
        return ""

    text = unescape(str(s)).lstrip("\ufeff")
    for blanker in (TAG_RE, URL_RE, CONTROL_CHARS):
        text = blanker.sub(" ", text)

    if REMOVE_EMOJI:
        text = remove_emoji_and_symbols(text)

    text = REPEAT_PUNCT_RE.sub(r"\1", text)

    for pattern in NORMALIZE_MAP:
        text = re.sub(pattern, NORMALIZE_MAP[pattern], text, flags=re.IGNORECASE)

    text = MULTI_SPACE_RE.sub(" ", text)
    text = SPACE_PUNCT_RE.sub(r"\1", text)
    return LEADING_TRAILING_SPACE_RE.sub("", text)

def run_preprocessing(input_path, out_dir):
    """Load a scraped Excel/CSV file, clean its text column and save the result as CSV.

    Args:
        input_path: Path of the input file; ``.xls``/``.xlsx`` is read with
            ``pd.read_excel`` (first sheet), anything else with ``pd.read_csv``.
        out_dir: Directory for ``<input stem>_preprocessed.csv``; created
            when missing.

    Returns:
        The loaded DataFrame with the added columns ``original_text``,
        ``cleaned_text`` and ``cleaned_text_preview`` (first 200 characters).

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
    """
    input_path = Path(input_path)
    out_dir = Path(out_dir)

    if not input_path.exists():
        raise FileNotFoundError(f"File tidak ditemukan: {input_path}")

    # Load Excel/CSV
    if input_path.suffix.lower() in [".xls", ".xlsx"]:
        # sheet_name=0 selects the first sheet directly, so no ExcelFile
        # handle has to be opened (and left open) just to look up its name.
        df = pd.read_excel(input_path, sheet_name=0)
    else:
        df = pd.read_csv(input_path)

    print(f"Loaded file: {input_path} | shape = {df.shape}")

    text_col = detect_text_column(df)
    print(f"Detected text column: {text_col}")

    # Fill missing values BEFORE the string cast: astype(str) first would turn
    # NaN into the literal text "nan", which fillna can no longer catch.
    df["original_text"] = df[text_col].fillna("").astype(str)
    df["cleaned_text"] = df["original_text"].apply(clean_text)
    df["cleaned_text_preview"] = df["cleaned_text"].str.slice(0, 200)

    # Create output folder
    out_dir.mkdir(parents=True, exist_ok=True)
    out_csv = out_dir / (input_path.stem + "_preprocessed.csv")

    df.to_csv(out_csv, index=False, encoding="utf-8")
    print(f"Preprocessed CSV saved to: {out_csv}")

    print("\nRingkasan:")
    print(f" - total rows: {len(df)}")
    print(f" - original_text kosong: {(df['original_text'].str.strip() == '').sum()}")
    print(f" - cleaned_text kosong: {(df['cleaned_text'].str.strip() == '').sum()}")

    print("\nContoh 10 baris (original → cleaned):\n")
    for i, row in df[['original_text', 'cleaned_text_preview']].head(10).iterrows():
        print(f"[{i}] ORIGINAL: {row['original_text'][:200]!r}")
        print(f"     CLEANED: {row['cleaned_text_preview']!r}")
        print("-" * 60)

    return df


In [3]:
# Clean the scraped Excel file; the CSV is written into the same folder.
# NOTE(review): both paths are absolute Windows paths -- adjust per machine.
df_pre = run_preprocessing(
    input_path=r"D:\SEMESTER 7\Equivalensi jurnal P Setio\coding\kompas_bencana_nasional_2025.xlsx",
    out_dir=r"D:\SEMESTER 7\Equivalensi jurnal P Setio\coding"
)

df_pre.head()


Loaded file: D:\SEMESTER 7\Equivalensi jurnal P Setio\coding\kompas_bencana_nasional_2025.xlsx | shape = (1429, 6)
Detected text column: summary
Preprocessed CSV saved to: D:\SEMESTER 7\Equivalensi jurnal P Setio\coding\kompas_bencana_nasional_2025_preprocessed.csv

Ringkasan:
 - total rows: 1429
 - original_text kosong: 0
 - cleaned_text kosong: 0

Contoh 10 baris (original → cleaned):

[0] ORIGINAL: 'JAKARTA, KOMPAS.com - Presiden Prabowo Subianto mendoakan agar Indonesia diberikan kesejahteraan, perdamaian, dan kebaikan di tahun 2025. Harapan dan doa Prabowo tersebut disampaikan oleh Menteri Koor'
     CLEANED: 'JAKARTA, KOMPAS.com - Presiden Prabowo Subianto mendoakan agar Indonesia diberikan kesejahteraan, perdamaian, dan kebaikan di tahun 2025. Harapan dan doa Prabowo tersebut disampaikan oleh Menteri Koor'
------------------------------------------------------------
[1] ORIGINAL: 'JAKARTA, KOMPAS.com - Satgas Operasi Damai Cartenz 2024 mencatatkan sejumlah capaian selama menggel

Unnamed: 0,title,url,author,section,published_local,summary,original_text,cleaned_text,cleaned_text_preview
0,Menko Budi Ungkap Harapan Prabowo untuk 2025: ...,https://nasional.kompas.com/read/2025/01/01/00...,(KOMPAS.com/BAHARUDIN AL FARISI),Nasional,2025-01-01 00:27:07 WIB,"JAKARTA, KOMPAS.com - Presiden Prabowo Subiant...","JAKARTA, KOMPAS.com - Presiden Prabowo Subiant...","JAKARTA, KOMPAS.com - Presiden Prabowo Subiant...","JAKARTA, KOMPAS.com - Presiden Prabowo Subiant..."
1,Capaian Satgas Ops Damai Cartenz 2024: Duduki ...,https://nasional.kompas.com/read/2025/01/01/17...,(Satgas Damai Cartenz Polri),Nasional,2025-01-01 17:43:47 WIB,"JAKARTA, KOMPAS.com - Satgas Operasi Damai Car...","JAKARTA, KOMPAS.com - Satgas Operasi Damai Car...","JAKARTA, KOMPAS.com - Satgas Operasi Damai Car...","JAKARTA, KOMPAS.com - Satgas Operasi Damai Car..."
2,"Prabowo ""Video Call"" Ribuan Prajurit TNI di Pa...",https://nasional.kompas.com/read/2025/01/01/18...,(Dok. Kementerian Pertahanan),Nasional,2025-01-01 18:56:01 WIB,"JAKARTA, KOMPAS.com - Presiden Prabowo Subiant...","JAKARTA, KOMPAS.com - Presiden Prabowo Subiant...","JAKARTA, KOMPAS.com - Presiden Prabowo Subiant...","JAKARTA, KOMPAS.com - Presiden Prabowo Subiant..."
3,"Sidang Korupsi di Basarnas, Hakim Singgung Ket...",https://nasional.kompas.com/read/2025/01/02/13...,(KOMPAS.com/Syakirun Ni'am),Nasional,2025-01-02 13:27:06 WIB,"JAKARTA, KOMPAS.com - Hakim Pengadilan Tipikor...","JAKARTA, KOMPAS.com - Hakim Pengadilan Tipikor...","JAKARTA, KOMPAS.com - Hakim Pengadilan Tipikor...","JAKARTA, KOMPAS.com - Hakim Pengadilan Tipikor..."
4,Menko Polkam hingga Jaksa Agung Gelar Rakor Pe...,https://nasional.kompas.com/read/2025/01/02/13...,(KOMPAS.COM/ KIKI SAFITRI),Nasional,2025-01-02 13:48:23 WIB,"JAKARTA, KOMPAS.com - Jaksa Agung RI ST Burhan...","JAKARTA, KOMPAS.com - Jaksa Agung RI ST Burhan...","JAKARTA, KOMPAS.com - Jaksa Agung RI ST Burhan...","JAKARTA, KOMPAS.com - Jaksa Agung RI ST Burhan..."


3. Tahap Ekstraksi Event Type

In [4]:
!pip install sentence-transformers
!pip install xlsxwriter


Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import os
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import torch

# ====== DISASTER-TYPE LABEL LIST ======
# Candidate labels; classify_event_type() returns the one whose embedding has
# the highest cosine similarity to the text embedding.
EVENT_LABELS = [
    "banjir",
    "banjir bandang",
    "gempa bumi",
    "guncangan",
    "tanah longsor",
    "pergerakan tanah",
    "letusan gunung berapi",
    "kebakaran hutan",
    "kekeringan",
    "badai",
    "topan",
    "tsunami",
    "gelombang tinggi",
    "abrasi"
]

# ====== LOAD SBERT MODEL ======
def load_classifier():
    """Announce the load on stdout and return the multilingual Sentence-BERT model."""
    print("[INFO] Memuat model Sentence-BERT (Multilingual)...")
    return SentenceTransformer("distiluse-base-multilingual-cased-v2")

# ====== EVENT CLASSIFICATION ======
# Label embeddings per model, keyed by id(model). Each entry also stores the
# model object and the label tuple it was built from, so a recycled id() or an
# edited EVENT_LABELS list can never serve a stale tensor. The stored
# reference keeps the model alive for as long as the cache exists.
_LABEL_EMBEDDING_CACHE = {}


def _label_embeddings(model):
    """Return the embeddings of EVENT_LABELS for ``model``, encoding them only once."""
    labels = tuple(EVENT_LABELS)
    entry = _LABEL_EMBEDDING_CACHE.get(id(model))
    if entry is None or entry[0] is not model or entry[1] != labels:
        entry = (model, labels, model.encode(list(labels), convert_to_tensor=True))
        _LABEL_EMBEDDING_CACHE[id(model)] = entry
    return entry[2]


def classify_event_type(text, model, threshold=0.20):
    """Assign the EVENT_LABELS entry most similar to ``text``.

    Args:
        text: Article text; only its first 500 characters are embedded.
        model: Object exposing ``encode(..., convert_to_tensor=True)``.
        threshold: Cosine-similarity score below which the result is marked
            as low confidence.

    Returns:
        "tidak terdeteksi" for a non-string or blank ``text``; otherwise the
        best-scoring label, suffixed with " (low confidence)" when its score
        is below ``threshold``.
    """
    if not isinstance(text, str) or not text.strip():
        return "tidak terdeteksi"

    text = text[:500]  # use the beginning of the text only

    emb_text = model.encode(text, convert_to_tensor=True)
    # The labels do not change between calls, so their embeddings are reused
    # instead of being re-encoded for every single text.
    emb_labels = _label_embeddings(model)

    scores = util.cos_sim(emb_text, emb_labels)[0]
    best_idx = torch.argmax(scores).item()
    best_score = scores[best_idx].item()

    label = EVENT_LABELS[best_idx]

    if best_score < threshold:
        return f"{label} (low confidence)"

    return label


# ====== MAIN PIPELINE ======
def run_event_extraction(input_path, summary_col="summary", sheet=None, output="event_type_fixed.xlsx"):
    """Add an ``event_type`` column to a scraped file and save the result.

    Args:
        input_path: Input file; ``.xlsx``/``.xls`` is read with
            ``pd.read_excel``, anything else with ``pd.read_csv``.
        summary_col: Name of the column holding the text to classify.
        sheet: Excel sheet name or index; None selects the first sheet.
        output: Output path; a name ending in ``.csv`` is written as CSV,
            anything else as Excel through the xlsxwriter engine.

    Returns:
        The loaded DataFrame with the added ``event_type`` column.

    Raises:
        ValueError: If ``summary_col`` is not a column of the input.
    """
    ext = os.path.splitext(input_path)[1].lower()
    if ext in (".xlsx", ".xls"):
        # read_excel(sheet_name=None) returns a dict with every sheet rather
        # than one DataFrame, so the default has to map to the first sheet.
        df = pd.read_excel(input_path, sheet_name=0 if sheet is None else sheet)
    else:
        df = pd.read_csv(input_path)

    if summary_col not in df.columns:
        raise ValueError(f"Kolom '{summary_col}' tidak ditemukan. Kolom tersedia: {list(df.columns)}")

    summaries = df[summary_col].fillna("").astype(str)

    # load SBERT
    model = load_classifier()

    print("[INFO] Mengekstrak event_type...")
    df["event_type"] = [
        classify_event_type(text, model) for text in tqdm(summaries)
    ]

    # save the result
    if output.lower().endswith(".csv"):
        df.to_csv(output, index=False, encoding="utf-8")
    else:
        df.to_excel(output, index=False, engine="xlsxwriter")

    print(f"\n[SUKSES] Hasil disimpan di: {output}")
    return df


# ==== RUN ====
# Classify the "summary" column of sheet "Disaster_Only" and write the result
# to event_type_fixed.xlsx.
df_event = run_event_extraction(
    input_path="kompas_bencana_nasional_2025.xlsx",
    summary_col="summary",
    sheet="Disaster_Only",
    output="event_type_fixed.xlsx"
)

df_event.head()



[INFO] Memuat model Sentence-BERT (Multilingual)...


modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

[INFO] Mengekstrak event_type...


100%|██████████| 1429/1429 [00:19<00:00, 74.12it/s]



[SUKSES] Hasil disimpan di: event_type_fixed.xlsx


Unnamed: 0,title,url,author,section,published_local,summary,event_type
0,Menko Budi Ungkap Harapan Prabowo untuk 2025: ...,https://nasional.kompas.com/read/2025/01/01/00...,(KOMPAS.com/BAHARUDIN AL FARISI),Nasional,2025-01-01 00:27:07 WIB,"JAKARTA, KOMPAS.com - Presiden Prabowo Subiant...",badai (low confidence)
1,Capaian Satgas Ops Damai Cartenz 2024: Duduki ...,https://nasional.kompas.com/read/2025/01/01/17...,(Satgas Damai Cartenz Polri),Nasional,2025-01-01 17:43:47 WIB,"JAKARTA, KOMPAS.com - Satgas Operasi Damai Car...",banjir bandang (low confidence)
2,"Prabowo ""Video Call"" Ribuan Prajurit TNI di Pa...",https://nasional.kompas.com/read/2025/01/01/18...,(Dok. Kementerian Pertahanan),Nasional,2025-01-01 18:56:01 WIB,"JAKARTA, KOMPAS.com - Presiden Prabowo Subiant...",banjir bandang (low confidence)
3,"Sidang Korupsi di Basarnas, Hakim Singgung Ket...",https://nasional.kompas.com/read/2025/01/02/13...,(KOMPAS.com/Syakirun Ni'am),Nasional,2025-01-02 13:27:06 WIB,"JAKARTA, KOMPAS.com - Hakim Pengadilan Tipikor...",banjir bandang (low confidence)
4,Menko Polkam hingga Jaksa Agung Gelar Rakor Pe...,https://nasional.kompas.com/read/2025/01/02/13...,(KOMPAS.COM/ KIKI SAFITRI),Nasional,2025-01-02 13:48:23 WIB,"JAKARTA, KOMPAS.com - Jaksa Agung RI ST Burhan...",banjir bandang (low confidence)


Fungsi Prediksi Event Type

4. Tahap FEATURE SELECTION (TF-IDF + FastText)

In [18]:
!pip install gensim


Defaulting to user installation because normal site-packages is not writeable
Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-win_amd64.whl.metadata (8.6 kB)
Collecting smart_open>=1.8.1 (from gensim)
  Downloading smart_open-7.5.0-py3-none-any.whl.metadata (24 kB)
Downloading gensim-4.4.0-cp312-cp312-win_amd64.whl (24.4 MB)
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   -------------------------------


[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
# ================================================================
#   TAHAP 4 — FEATURE SELECTION (TF-IDF + FASTTEXT — FILTERED)
# ================================================================

from pathlib import Path
import ast
import multiprocessing
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import FastText
import matplotlib.pyplot as plt

# ------------------------------------------------
# Utility Functions
# ------------------------------------------------
def ensure_dir(p: Path):
    """Create directory ``p`` with any missing parents; an existing directory is left as is."""
    p.mkdir(exist_ok=True, parents=True)

def parse_tokens_cell(x):
    """Turn one cell of a token column into a list of string tokens.

    Args:
        x: None/NaN, a list, or a string. A string may be the repr of a
            Python list (as written by ``to_csv``) or a comma/space separated
            run of tokens.

    Returns:
        ``[]`` for None/NaN; the stringified items for a list; the parsed and
        stripped items when the string is a valid list literal; otherwise the
        string split on commas and whitespace.
    """
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return []
    if isinstance(x, list):
        return [str(t) for t in x]

    s = str(x).strip()
    if s.startswith('[') and s.endswith(']'):
        try:
            lst = ast.literal_eval(s)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Not a valid literal: fall through to plain splitting below.
            # Only the errors literal_eval documents are caught, so
            # KeyboardInterrupt and SystemExit still propagate.
            lst = None
        if isinstance(lst, list):
            return [str(t).strip() for t in lst]

    # str.split() without arguments never yields empty tokens.
    return s.replace(",", " ").split()

def build_tokens(df):
    """Make sure ``df`` has a parsed token column and a matching ``processed_summary``.

    The first existing column among ``stemmed_tokens``, ``tokens_nostop`` and
    ``tokens_summary`` is used. When none exists, ``processed_summary`` is
    split on whitespace into the new column ``__tokens_tmp``.

    Note: ``df`` is modified in place. The token column is replaced by parsed
    lists and ``processed_summary`` is rebuilt from those tokens.

    Returns:
        Tuple ``(df, token_col)`` with the same frame and the name of the
        token column used.
    """
    token_col = None
    for cand in ['stemmed_tokens', 'tokens_nostop', 'tokens_summary']:
        if cand in df.columns:
            token_col = cand
            break

    if token_col is None:
        # Fill missing values BEFORE the string cast; astype(str) first would
        # turn NaN into the text "nan" and create a bogus "nan" token.
        df['__tokens_tmp'] = df['processed_summary'].fillna("").astype(str).apply(lambda t: t.split())
        token_col = '__tokens_tmp'

    df[token_col] = df[token_col].apply(parse_tokens_cell)
    df['processed_summary'] = df[token_col].apply(lambda toks: " ".join(toks))

    return df, token_col

# ------------------------------------------------
# TF-IDF
# ------------------------------------------------
def run_tfidf(texts, max_features=2000, min_df=5, stop_words=None):
    """Rank vocabulary terms by their mean TF-IDF weight over ``texts``.

    Args:
        texts: Iterable of documents given to ``TfidfVectorizer.fit_transform``.
        max_features, min_df, stop_words: Forwarded to ``TfidfVectorizer``.

    Returns:
        DataFrame with the columns ``word`` and ``mean_tfidf``, sorted by
        ``mean_tfidf`` in descending order with a fresh index.
    """
    vectorizer = TfidfVectorizer(
        stop_words=stop_words,
        min_df=min_df,
        max_features=max_features,
    )
    matrix = vectorizer.fit_transform(texts)

    ranking = pd.DataFrame({
        'word': vectorizer.get_feature_names_out(),
        # Column-wise mean, flattened to a 1-D array.
        'mean_tfidf': np.asarray(matrix.mean(axis=0)).ravel(),
    })
    return ranking.sort_values('mean_tfidf', ascending=False).reset_index(drop=True)

def plot_top_words(df, value_col, topn=30, out_png=None, title=None):
    """Draw a horizontal bar chart of the first ``topn`` rows of ``df``.

    Args:
        df: Frame with a ``word`` column and the column named by ``value_col``.
        value_col: Column supplying the bar lengths; also used as the x label.
        topn: Number of leading rows to plot.
        out_png: When truthy, the figure is saved there (dpi 150) and closed;
            otherwise it is shown.
        title: Optional figure title.

    Nothing is drawn when ``df`` has no rows.
    """
    top = df.head(topn)
    if top.empty:
        return

    # Reversed so that the first row of df gets the highest y position.
    labels = list(reversed(top['word'].tolist()))
    values = list(reversed(top[value_col].tolist()))
    positions = range(len(labels))

    plt.figure(figsize=(8,6))
    plt.barh(positions, values)
    plt.yticks(positions, labels)
    plt.xlabel(value_col)
    if title:
        plt.title(title)
    plt.tight_layout()

    if not out_png:
        plt.show()
        return
    plt.savefig(out_png, dpi=150)
    plt.close()

# ------------------------------------------------
# FASTTEXT
# ------------------------------------------------
def train_fasttext(sentences, vector_size=150, window=5, min_count=1, epochs=20, workers=0):
    """Train a gensim FastText model on tokenised sentences.

    Args:
        sentences: Iterable of token lists; entries that are not a non-empty
            list are ignored.
        vector_size, window, min_count: Forwarded to ``FastText``.
        epochs: Number of epochs passed to ``model.train``.
        workers: Worker count; a value <= 0 means "CPU count minus one, at
            least 1".

    Returns:
        The trained ``FastText`` model.
    """
    corpus = [doc for doc in sentences if isinstance(doc, list) and len(doc) > 0]
    if not workers > 0:
        workers = max(1, multiprocessing.cpu_count() - 1)

    ft_model = FastText(
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
    )
    ft_model.build_vocab(corpus)
    ft_model.train(corpus, total_examples=len(corpus), epochs=epochs)
    return ft_model

def most_similar_per_seed(model, seeds, topk=20):
    """List the ``topk`` nearest words of every seed word.

    Args:
        model: Object whose ``wv`` supports ``in`` and
            ``most_similar(seed, topn=...)``.
        seeds: Iterable of seed words.
        topk: Number of neighbours requested per seed.

    Returns:
        DataFrame with the columns ``seed``, ``similar_word`` and ``score``.
        A seed that is not in ``model.wv``, or whose lookup fails, gets one
        placeholder row with empty ``similar_word`` and ``score``. The three
        columns are present even when ``seeds`` is empty.
    """
    rows = []
    for seed in seeds:
        neighbours = None
        if seed in model.wv:
            try:
                neighbours = [
                    (w, float(score))
                    for w, score in model.wv.most_similar(seed, topn=topk)
                ]
            except Exception:
                # Best effort: a failed lookup becomes a placeholder row.
                # Unlike a bare except, this lets KeyboardInterrupt through.
                neighbours = None

        if neighbours is None:
            rows.append({'seed': seed, 'similar_word': None, 'score': None})
        else:
            rows.extend(
                {'seed': seed, 'similar_word': w, 'score': score}
                for w, score in neighbours
            )
    return pd.DataFrame(rows, columns=['seed', 'similar_word', 'score'])

def global_similarity(model, seeds, topn=50):
    """Rank the whole vocabulary by its mean similarity to the seed words.

    Args:
        model: Object whose ``wv`` supports ``in``, ``index_to_key`` and
            ``similarity(word, seed)``.
        seeds: Seed words; those not contained in ``model.wv`` are ignored.
        topn: Number of top-ranked words to return.

    Returns:
        DataFrame with the columns ``word`` and ``avg_similarity`` in
        descending score order; empty (same columns) when no seed is known.
    """
    vectors = model.wv
    known_seeds = [seed for seed in seeds if seed in vectors]
    if len(known_seeds) == 0:
        return pd.DataFrame(columns=['word', 'avg_similarity'])

    ranked = sorted(
        (
            (word, float(np.mean([vectors.similarity(word, seed) for seed in known_seeds])))
            for word in list(vectors.index_to_key)
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return pd.DataFrame(ranked[:topn], columns=['word','avg_similarity'])

# ------------------------------------------------
# MAIN PIPELINE
# ------------------------------------------------
def run_feature_selection_filtered(
    input_path,
    outdir,
    topn=50,
    max_features=2000,
    min_df=5,
    ft_dim=150,
    ft_window=5,
    ft_min_count=1,
    ft_epochs=20,
    workers=0,
    seeds="banjir,gempa,longsor,kebakaran,tsunami",
    topk_per_seed=20,
    filter_on=True
):
    """Run TF-IDF and FastText feature selection on a preprocessed CSV.

    Files written below ``outdir``: ``tfidf_top{topn}.csv``,
    ``fasttext.model``, ``fasttext_top{topk_per_seed}_per_seed.csv``,
    ``fasttext_global_top{topn}.csv`` and the plots ``plots/top30_tfidf.png``
    and ``plots/top30_fasttext.png`` (the latter only when the global ranking
    is not empty).

    Args:
        input_path: CSV file containing a ``processed_summary`` column.
        outdir: Output directory; created when missing.
        topn: Number of rows kept in the TF-IDF and global FastText CSVs.
        max_features, min_df: TF-IDF vectoriser settings.
        ft_dim, ft_window, ft_min_count, ft_epochs, workers: FastText settings
            handed to train_fasttext().
        seeds: Comma-separated seed words for the similarity queries.
        topk_per_seed: Neighbours requested per seed.
        filter_on: Keep only documents mentioning a disaster keyword; when the
            filter leaves nothing, the whole dataset is used instead.

    Returns:
        Dict with the keys ``tfidf``, ``fasttext_global``,
        ``fasttext_per_seed`` and ``filtered_docs``.

    Raises:
        ValueError: If the input has no ``processed_summary`` column.
    """
    inpath = Path(input_path)
    outdir = Path(outdir)
    ensure_dir(outdir)
    ensure_dir(outdir / "plots")

    print(f"[LOAD] {inpath}")
    df = pd.read_csv(inpath)

    if 'processed_summary' not in df.columns:
        raise ValueError("Kolom 'processed_summary' tidak ditemukan.")

    df, token_col = build_tokens(df)
    print(f"[TOKENS] menggunakan kolom: {token_col}")

    # --------------------------------------
    # FILTERING (optional)
    # --------------------------------------
    disaster_keywords = ["banjir","gempa","longsor","kebakaran","tsunami","erupsi","angin"]

    if filter_on:
        # Non-capturing group: a capturing one makes str.contains emit a
        # UserWarning about match groups. na=False keeps the mask boolean
        # even if a value is missing.
        pattern = r'\b(?:' + '|'.join(disaster_keywords) + r')\b'
        mask = df['processed_summary'].str.contains(pattern, regex=True, na=False)
        df_filtered = df[mask].copy()
        print(f"[FILTER] {len(df_filtered)} dokumen tersisa dari total {len(df)}")
        if len(df_filtered) == 0:
            df_filtered = df.copy()
            print("[WARNING] Filter kosong → memakai seluruh dataset")
    else:
        df_filtered = df.copy()
        print(f"[FILTER OFF] memakai seluruh dokumen: {len(df_filtered)}")

    # --------------------------------------
    # Extra stopwords
    # --------------------------------------
    extra_stopwords = {
        "jadi","baca","kata","sebut","tidak","ini","itu","akan","yang","dengan",
        "untuk","pada","oleh","adalah","sudah","tahun","presiden","menteri",
        "prabowo","tni","indonesia","via","foto","terkait","lainnya","lagi"
    }
    stop_words = list(extra_stopwords)

    # ---------------- TF-IDF ----------------
    print("[TF-IDF] extracting...")
    texts = df_filtered['processed_summary'].astype(str)

    tfidf_df = run_tfidf(texts, max_features=max_features, min_df=min_df, stop_words=stop_words)
    tfidf_df.head(topn).to_csv(outdir / f"tfidf_top{topn}.csv", index=False)

    plot_top_words(
        tfidf_df, 'mean_tfidf', 30,
        out_png=outdir / "plots" / "top30_tfidf.png",
        title="Top 30 Kata (TF-IDF Filtered)"
    )

    # ---------------- FASTTEXT ---------------
    print("[FASTTEXT] training model...")
    sentences = df_filtered[token_col].tolist()

    # train_fasttext() itself maps workers <= 0 to "CPU count - 1", so the
    # value is passed through unchanged.
    model = train_fasttext(sentences, ft_dim, ft_window, ft_min_count, ft_epochs, workers)
    model.save(str(outdir / "fasttext.model"))

    seeds_list = [s.strip() for s in seeds.split(",") if s.strip()]

    per_seed_df = most_similar_per_seed(model, seeds_list, topk_per_seed)
    per_seed_df.to_csv(outdir / f"fasttext_top{topk_per_seed}_per_seed.csv", index=False)

    global_df = global_similarity(model, seeds_list, topn)
    global_df.to_csv(outdir / f"fasttext_global_top{topn}.csv", index=False)

    if not global_df.empty:
        # Plot the similarity column under its own name; renaming it to
        # 'mean_tfidf' labelled the x axis of this chart as TF-IDF.
        plot_top_words(
            global_df, 'avg_similarity', 30,
            out_png=outdir / "plots" / "top30_fasttext.png",
            title="Top 30 Kata (FastText Similarity)"
        )

    print("\n== TF-IDF Top 10 ==")
    print(tfidf_df.head(10).to_string(index=False))

    print("\n== FastText Top 10 ==")
    print(global_df.head(10).to_string(index=False))

    print("\n[SELESAI] Semua file disimpan ke:", outdir)

    return {
        "tfidf": tfidf_df,
        "fasttext_global": global_df,
        "fasttext_per_seed": per_seed_df,
        "filtered_docs": df_filtered
    }


# ================================================================
#        RUN THE PIPELINE
# ================================================================
# Reads kompas_preprocessed.csv and writes all outputs below
# results_feature_selection_filtered/.
results = run_feature_selection_filtered(
    input_path="kompas_preprocessed.csv",
    outdir="results_feature_selection_filtered",
    topn=50,
    filter_on=True
)

results["tfidf"].head()


[LOAD] kompas_preprocessed.csv
[TOKENS] menggunakan kolom: stemmed_tokens
[FILTER] 601 dokumen tersisa dari total 1429
[TF-IDF] extracting...


  mask = df['processed_summary'].str.contains(pattern, regex=True)


[FASTTEXT] training model...

== TF-IDF Top 10 ==
   word  mean_tfidf
 banjir    0.046262
  gempa    0.038048
  bantu    0.034777
bencana    0.033437
 korban    0.032736
  hujan    0.029564
  bekas    0.027083
 dampak    0.025475
   kita    0.024593
myanmar    0.024488

== FastText Top 10 ==
       word  avg_similarity
pascabanjir        0.505284
    longsor        0.501656
    cempaka        0.482032
 darussalam        0.478277
    gerebek        0.476036
     banjir        0.472050
      rusuh        0.470048
       usir        0.469911
     sortir        0.463817
   longwave        0.462980

[SELESAI] Semua file disimpan ke: results_feature_selection_filtered


Unnamed: 0,word,mean_tfidf
0,banjir,0.046262
1,gempa,0.038048
2,bantu,0.034777
3,bencana,0.033437
4,korban,0.032736


5. Tahap Eksperimen Model

In [20]:
# =========================================
#   Logistic Regression & Linear SVM
# =========================================

import os, re, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report,
    confusion_matrix, precision_recall_fscore_support
)

# ====================================================
# CONFIG
# ====================================================
# Excel workbook read by load_data(); its 'title' and 'summary' columns are
# combined into the text used for classification.
DATA_PATH = "kompas_bencana_nasional_2025.xlsx"
# Seed shared by the train/test split, the CV folds and both classifiers.
RANDOM_STATE = 42
# Pseudo-labels with fewer than this many documents are merged into 'lainnya'.
MIN_COUNT_THRESHOLD = 10
# Passed as max_features to the TF-IDF vectorizers.
TFIDF_MAX_FEATURES = 5000
# CSV file that receives the per-model accuracy / macro-F1 summary.
METRICS_CSV = "experiment_metrics.csv"
# Directory that receives the PNG figures.
FIG_DIR = "figures"

# Create the figure directory up front; exist_ok=True makes re-running safe.
os.makedirs(FIG_DIR, exist_ok=True)


# ====================================================
# PREPROCESSING
# ====================================================
def preprocess_text(text):
    """Normalize a raw text value into a lower-case, space-separated string.

    Missing values (as detected by ``pd.isna``) yield ``""``. Any other value
    is converted to ``str`` and lower-cased. URLs, parenthesized segments and
    every character outside digits, ``a-z``, the ``à-ž`` range and whitespace
    are then replaced by a space. Finally, runs of whitespace are collapsed
    and the result is stripped.
    """
    if pd.isna(text):
        return ""

    # Applied in this order: URLs first, so their punctuation is removed as a
    # unit before the character filter runs.
    blank_out = (
        r'http\S+|www\.\S+',
        r'\([^)]*\)',
        r'[^0-9a-zà-ž\s]',
    )
    cleaned = str(text).lower()
    for pattern in blank_out:
        cleaned = re.sub(pattern, ' ', cleaned)

    return re.sub(r'\s+', ' ', cleaned).strip()

def load_data(path):
    """Load the Excel dataset and derive the raw and cleaned text columns.

    Args:
        path: Path of the Excel file, passed to ``pd.read_excel``.

    Returns:
        The loaded DataFrame with two added columns: ``text_raw`` (title and
        summary joined by a single space) and ``text_clean`` (``text_raw``
        passed through ``preprocess_text``).
    """
    df = pd.read_excel(path)

    def _text_column(name):
        # Return the column as strings with NaN replaced by "".
        # A missing column becomes an all-empty Series on the same index;
        # DataFrame.get(name, '') would hand back the plain str default,
        # which has no .fillna and would raise AttributeError.
        if name in df.columns:
            return df[name].fillna("").astype(str)
        return pd.Series("", index=df.index, dtype=object)

    df['text_raw'] = _text_column('title') + " " + _text_column('summary')
    df['text_clean'] = df['text_raw'].apply(preprocess_text)
    return df


# ====================================================
# PSEUDO-LABELING (Heuristik)
# ====================================================
# Keyword lists per event type. Insertion order is the matching priority:
# the first label with any keyword found in the text is used.
event_keywords = {
    'banjir': [
        'banjir',
        'pascabanjir',
    ],
    'gempa': [
        'gempa',
        'gempabumi',
        'guncang',
    ],
    'longsor': [
        'longsor',
        'tanah longsor',
    ],
    'kebakaran': [
        'kebakaran',
        'bakar',
    ],
    'tsunami': [
        'tsunami',
    ],
    'kekeringan': [
        'kekeringan',
        'kering',
    ],
    'angin_puting_beliung': [
        'puting beliung',
        'angin kencang',
    ],
    'erupsi': [
        'erupsi',
        'letusan',
        'gunung meletus',
    ],
}


def assign_event_label(text):
    """Return the first event label whose keyword occurs in ``text``.

    Matching is a plain substring test, done in the order the labels appear
    in ``event_keywords``. If no keyword is found, ``'lainnya'`` is returned.
    """
    hits = (
        label
        for label, keys in event_keywords.items()
        if any(k in text for k in keys)
    )
    return next(hits, 'lainnya')


# ====================================================
# LOAD DATA
# ====================================================
# Read the dataset and report how many rows were loaded.
print(" Loading dataset:", DATA_PATH)
df = load_data(DATA_PATH)
print("Total records:", df.shape[0])

# Heuristic pseudo-label: one event type per document, taken from the
# keywords found in its cleaned text.
df['event_label'] = df['text_clean'].map(assign_event_label)
print(
    "\nInitial label counts:\n",
    df['event_label'].value_counts(),
)


# ====================================================
# MERGE RARE LABELS → 'lainnya'
# ====================================================
# Labels seen fewer than MIN_COUNT_THRESHOLD times are treated as rare.
counts = df['event_label'].value_counts()
rare = [label for label, n_docs in counts.items() if n_docs < MIN_COUNT_THRESHOLD]

print("\nMerging rare labels to 'lainnya':", rare)

# Keep every non-rare label as is; replace rare ones with 'lainnya'.
df['event_label_collapsed'] = df['event_label'].where(
    ~df['event_label'].isin(rare), 'lainnya'
)

print(
    "\nLabel distribution after collapse:\n",
    df['event_label_collapsed'].value_counts(),
)


# ====================================================
# SPLIT TRAIN-TEST
# ====================================================
# Features are the cleaned texts; targets are the collapsed pseudo-labels.
X = df['text_clean'].values
y = df['event_label_collapsed'].values

# Encode the string labels as integers; le.classes_ keeps the label names.
le = LabelEncoder()
y_enc = le.fit_transform(y)

# A classifier needs at least two distinct classes to learn from.
if np.unique(y_enc).size < 2:
    raise ValueError(" Error: hanya 1 kelas tersedia setelah pseudo-labeling.")

# Hold out 20% of the documents, stratified on the encoded label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y_enc,
)

# ====================================================
# TF-IDF
# ====================================================
# Unigram + bigram TF-IDF features; terms must occur in at least 2 documents.
tfidf = TfidfVectorizer(
    ngram_range=(1,2),
    max_features=TFIDF_MAX_FEATURES,
    min_df=2,
    sublinear_tf=True,
)

# 3-fold stratified CV, shared by both grid searches.
cv = StratifiedKFold(
    n_splits=3,
    shuffle=True,
    random_state=RANDOM_STATE,
)


# ====================================================
# SKENARIO A — Logistic Regression
# ====================================================
print("\n Training Logistic Regression...")
# TF-IDF features feeding a class-weighted logistic regression.
pipe_lr = Pipeline([
    ('tfidf', tfidf),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced',
                              random_state=RANDOM_STATE))
])
# Only the inverse regularization strength C is tuned.
param_lr = {'clf__C': [1, 10]}

# Select C by macro-F1 over the shared CV folds, then refit on the train set.
grid_lr = GridSearchCV(pipe_lr, param_lr, cv=cv, scoring='f1_macro', n_jobs=-1)
grid_lr.fit(X_train, y_train)

y_pred_lr = grid_lr.predict(X_test)

print("\nLR Best Params:", grid_lr.best_params_)
print("LR Accuracy:", accuracy_score(y_test, y_pred_lr))
# labels= pins the report to every encoded class, so target_names always lines
# up even if a class is absent from the test split. zero_division=0 reports
# undefined scores (classes that were never predicted) as 0 without warnings.
print(classification_report(
    y_test, y_pred_lr,
    labels=list(range(len(le.classes_))),
    target_names=le.classes_,
    zero_division=0,
))


# ====================================================
# SKENARIO B — Linear SVM
# ====================================================
print("\n Training Linear SVM...")
# Same TF-IDF settings as the logistic-regression scenario, with a
# class-weighted linear SVM on top.
pipe_svm = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), max_features=TFIDF_MAX_FEATURES,
                              min_df=2, sublinear_tf=True)),
    ('clf', LinearSVC(class_weight='balanced', max_iter=5000,
                      random_state=RANDOM_STATE))
])
# Only the regularization parameter C is tuned.
param_svm = {'clf__C': [1, 10]}

# Select C by macro-F1 over the shared CV folds, then refit on the train set.
grid_svm = GridSearchCV(pipe_svm, param_svm, cv=cv, scoring='f1_macro', n_jobs=-1)
grid_svm.fit(X_train, y_train)

y_pred_svm = grid_svm.predict(X_test)

print("\nSVM Best Params:", grid_svm.best_params_)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
# labels= pins the report to every encoded class, so target_names always lines
# up even if a class is absent from the test split. zero_division=0 reports
# undefined scores (classes that were never predicted) as 0 without warnings.
print(classification_report(
    y_test, y_pred_svm,
    labels=list(range(len(le.classes_))),
    target_names=le.classes_,
    zero_division=0,
))


# ====================================================
# SAVE MODELS
# ====================================================
# Persist the refit best pipeline (vectorizer + classifier) of each search.
for fitted_pipeline, model_file in (
    (grid_lr.best_estimator_, "model_lr_best.pkl"),
    (grid_svm.best_estimator_, "model_svm_best.pkl"),
):
    joblib.dump(fitted_pipeline, model_file)
print("\n Saved models: model_lr_best.pkl, model_svm_best.pkl")


# ====================================================
# SAVE METRICS
# ====================================================
# One summary row per model: test accuracy, macro-F1 and the chosen params.
metrics = [
    {
        'model': model_name,
        'accuracy': accuracy_score(y_test, predictions),
        'f1_macro': f1_score(y_test, predictions, average='macro'),
        'best_params': str(search.best_params_),
    }
    for model_name, search, predictions in (
        ('LogisticRegression', grid_lr, y_pred_lr),
        ('LinearSVC', grid_svm, y_pred_svm),
    )
]
metrics_table = pd.DataFrame(metrics)
metrics_table.to_csv(METRICS_CSV, index=False)
print(" Saved metrics to:", METRICS_CSV)


# ====================================================
# VISUALISATIONS
# ====================================================
# 1) Label distribution
# Count each collapsed label once and reuse the result for both axes.
label_counts = df['event_label_collapsed'].value_counts()
plt.figure(figsize=(10,5))
sns.barplot(x=label_counts.index, y=label_counts.values)
plt.xticks(rotation=45)
plt.title("Label Distribution")
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, "label_distribution.png"), dpi=300)
plt.close()

# Passing every encoded class id as labels= keeps each confusion matrix
# len(le.classes_) square, so the tick labels below stay aligned even when a
# class is missing from both y_test and the predictions.
class_ids = np.arange(len(le.classes_))

# 2) Confusion Matrix Logistic Regression
cm_lr = confusion_matrix(y_test, y_pred_lr, labels=class_ids)
plt.figure(figsize=(10,8))
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_, yticklabels=le.classes_)
# confusion_matrix puts true labels on rows and predictions on columns.
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion Matrix — Logistic Regression")
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, "confusion_matrix_lr.png"), dpi=300)
plt.close()

# 3) Confusion Matrix SVM
cm_svm = confusion_matrix(y_test, y_pred_svm, labels=class_ids)
plt.figure(figsize=(10,8))
sns.heatmap(cm_svm, annot=True, fmt='d', cmap='Greens',
            xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion Matrix — Linear SVM")
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, "confusion_matrix_svm.png"), dpi=300)
plt.close()

print("\n EXPERIMENT FINISHED!")


 Loading dataset: kompas_bencana_nasional_2025.xlsx
Total records: 1429

Initial label counts:
 event_label
lainnya                 715
banjir                  370
gempa                   140
kebakaran                85
erupsi                   43
longsor                  33
kekeringan               18
tsunami                  14
angin_puting_beliung     11
Name: count, dtype: int64

Merging rare labels to 'lainnya': []

Label distribution after collapse:
 event_label_collapsed
lainnya                 715
banjir                  370
gempa                   140
kebakaran                85
erupsi                   43
longsor                  33
kekeringan               18
tsunami                  14
angin_puting_beliung     11
Name: count, dtype: int64

 Training Logistic Regression...

LR Best Params: {'clf__C': 10}
LR Accuracy: 0.8286713286713286
                      precision    recall  f1-score   support

angin_puting_beliung       0.50      0.50      0.50         2
              ba

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



SVM Best Params: {'clf__C': 1}
SVM Accuracy: 0.8216783216783217
                      precision    recall  f1-score   support

angin_puting_beliung       0.50      0.50      0.50         2
              banjir       0.85      0.76      0.80        74
              erupsi       0.83      0.56      0.67         9
               gempa       0.92      0.86      0.89        28
           kebakaran       0.71      0.59      0.65        17
          kekeringan       0.50      0.33      0.40         3
             lainnya       0.81      0.94      0.87       143
             longsor       1.00      0.43      0.60         7
             tsunami       0.00      0.00      0.00         3

            accuracy                           0.82       286
           macro avg       0.68      0.55      0.60       286
        weighted avg       0.82      0.82      0.81       286


 Saved models: model_lr_best.pkl, model_svm_best.pkl
 Saved metrics to: experiment_metrics.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



 EXPERIMENT FINISHED!
