## Evaluasi Kinerja Model TF-IDF dan RNN pada Sistem Klasifikasi Berita Hoaks Politik Berbahasa Indonesia

## 1. Data Preprocessing

### 1.1 Import the Libraries

In [1]:
import pandas as pd
import pickle
import os
import re
from pathlib import Path
from sklearn.model_selection import train_test_split

### 1.2 Define Political Keyword [untuk filter baris]

In [2]:
POLITIK_KEYWORDS = [
    "politik", "pemilu", "pemilihan umum", "pilkada", "pileg", "pilpres", "kampanye",
    "partai", "partai politik", "caleg", "capres", "cawapres", "calon presiden",
    "calon wakil presiden", "calon legislatif", "suara", "TPS", "DPT", "KPU", "Bawaslu",
    "politikus", "politisi", "anggota DPR", "DPR", "DPRD", "MPR", "lembaga legislatif",
    "parlemen", "ormas", "LSM", "koalisi", "oposisi", "kubu", "kabinet", "menteri",
    "reshuffle", "pemerintahan", "kekuasaan", "pemangku kebijakan", "kepala daerah",
    "gubernur", "bupati", "wali kota", "presiden", "wakil presiden", "sidang paripurna",
    "perppu", "peraturan", "undang-undang", "RUU", "RUU KUHP", "UU ITE", "konstitusi",
    "amandemen", "demokrasi", "otoriter", "otoritarian", "diktator", "sistem politik",
    "ideologi", "pancasila", "reformasi", "orba", "orde baru", "orde lama", "kebijakan",
    "anggaran", "APBN", "APBD", "korupsi", "nepotisme", "kolusi", "KKN", "KPK", "MK",
    "MA", "hukum tata negara", "pelanggaran HAM", "demonstrasi", "aksi", "unjuk rasa",
    "aktivis", "suara rakyat", "politik identitas", "politik uang", "black campaign",
    "hoaks politik", "buzzer", "opini publik"
]

POLITIK_SET = set(POLITIK_KEYWORDS)

### 1.3 Utility Functions

In [3]:
# Fungsi cleaning HANYA untuk keperluan filter topik (supaya keyword match)
# Teks asli yang disimpan tetap MENTAH.
def clean_for_filter_only(text):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text) 
    return text

def is_politik(text):
    if pd.isna(text) or not text:
        return 0
    # Bersihkan sebentar cuma buat ngecek keyword, aslinya tdk diubah
    text_check = clean_for_filter_only(text) 
    tokens = set(text_check.split())
    return int(len(tokens & POLITIK_SET) > 0)

def pick_col(df, candidates):
    for c in candidates:
        if c in df.columns:
            return c
    return None

#### 1.4 Load Dataset

In [None]:
base_path = Path("../dataset/raw") 

dataset_files = [
    "dataset_cnn_10k_cleaned.csv",
    "dataset_kompas_4k_cleaned.csv",
    "dataset_tempo_6k_cleaned.csv",
    "dataset_turnbackhoax_10_cleaned.csv"
]

paths = [base_path / f for f in dataset_files]
all_dfs = []

TEXT_CANDIDATES = ["text_new", "Clean Narasi", "FullText", "Narasi"]
LABEL_CANDIDATES = ["hoax", "label"]

for path in paths:
    if not path.exists():
        print(f"WARNING: File tidak ditemukan: {path}")
        continue

    df = pd.read_csv(str(path))
    
    text_col = pick_col(df, TEXT_CANDIDATES)
    label_col = pick_col(df, LABEL_CANDIDATES)

    if text_col is None or label_col is None:
        print(f"SKIP: Kolom target tidak ditemukan di {path.name}")
        continue

    # Rename kolom standar
    df = df[[text_col, label_col]].rename(columns={text_col: 'text_raw', label_col: 'label'})
    
    # SKIP CLEANING
    # langsung pakai text_raw sebagai text_clean.
    # Tidak ada penghapusan angka, tanda baca, atau stopwords.
    df['text_clean'] = df['text_raw'].astype(str)
    
    print(f"Loaded (RAW): {path.name} | Rows: {len(df)}")
    all_dfs.append(df)

Loaded (RAW): dataset_cnn_10k_cleaned.csv | Rows: 9630
Loaded (RAW): dataset_kompas_4k_cleaned.csv | Rows: 4750
Loaded (RAW): dataset_tempo_6k_cleaned.csv | Rows: 6592
Loaded (RAW): dataset_turnbackhoax_10_cleaned.csv | Rows: 10381


#### 1.5 Dataset Consolidation and Filter Topic

In [5]:
if not all_dfs:
    raise ValueError("Tidak ada data yang berhasil di-load!")

df_gabungan = pd.concat(all_dfs, ignore_index=True)

# Tetap filter baris supaya cuma ambil berita politik
# Tapi teks di dalamnya tetap mentah
print("Memfilter topik politik...")
df_gabungan['is_politik'] = df_gabungan['text_raw'].apply(is_politik)
df_politik = df_gabungan[df_gabungan['is_politik'] == 1].copy()

if df_politik.empty:
    raise ValueError("Tidak ada data politik tersisa.")

df_final = df_politik.copy()
print(f"Total data politik (RAW): {len(df_final)}")
print(df_final['label'].value_counts())

Memfilter topik politik...
Total data politik (RAW): 20882
label
0    19254
1     1628
Name: count, dtype: int64


#### 1.6 Train Val Test Split

In [6]:
X_all = df_final['text_clean'].tolist() # Isinya teks mentah
y_all = df_final['label'].tolist()

# Tahap 1: Pisahkan Train (80%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X_all, y_all,
    test_size=0.2, 
    random_state=42,
    stratify=y_all
)

In [7]:
# Tahap 2: Pisahkan Sisa (20%) jadi Val (10%) dan Test (10%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5, 
    random_state=42,
    stratify=y_temp
)

In [8]:
print("-" * 30)
print(f"Train Size : {len(X_train)} (80%)")
print(f"Val Size   : {len(X_val)} (10%)")
print(f"Test Size  : {len(X_test)} (10%)")
print("-" * 30)

------------------------------
Train Size : 16705 (80%)
Val Size   : 2088 (10%)
Test Size  : 2089 (10%)
------------------------------


### 1.7 Save Processed the Dataset

In [None]:
# Simpan ke folder baru biar data 'Clean' kemarin aman
SAVE_DIR = Path("../dataset/processed/01_raw_no_preprocessing") 
os.makedirs(SAVE_DIR, exist_ok=True)

pickle.dump(X_train, open(SAVE_DIR / "X_train.pkl", "wb"))
pickle.dump(X_val,   open(SAVE_DIR / "X_val.pkl", "wb"))
pickle.dump(X_test,  open(SAVE_DIR / "X_test.pkl", "wb"))
pickle.dump(y_train, open(SAVE_DIR / "y_train.pkl", "wb"))
pickle.dump(y_val,   open(SAVE_DIR / "y_val.pkl", "wb"))
pickle.dump(y_test,  open(SAVE_DIR / "y_test.pkl", "wb"))

df_final.to_csv(SAVE_DIR / "df_final_raw.csv", index=False)

print(f"SUKSES! Data RAW tersimpan di: {SAVE_DIR.resolve()}")

SUKSES! Data RAW tersimpan di: /code/Github/Project/Ml/without-preprocessing/dataset/processed/01_raw_no_preprocessing
