## Evaluasi Kinerja Model TF-IDF dan RNN pada Sistem Klasifikasi Berita Hoaks Politik Berbahasa Indonesia

## 1. Data Preprocessing

### 1.1 Import the Libraries

In [12]:
import pandas as pd
import pickle
import os
import re

from pathlib import Path
from sklearn.model_selection import train_test_split

### 1.1 Define Political Keyword

In [13]:
POLITIK_KEYWORDS = [
    "politik", "pemilu", "pemilihan umum", "pilkada", "pileg", "pilpres", "kampanye",
    "partai", "partai politik", "caleg", "capres", "cawapres", "calon presiden",
    "calon wakil presiden", "calon legislatif", "suara", "TPS", "DPT", "KPU", "Bawaslu",
    "politikus", "politisi", "anggota DPR", "DPR", "DPRD", "MPR", "lembaga legislatif",
    "parlemen", "ormas", "LSM", "koalisi", "oposisi", "kubu", "kabinet", "menteri",
    "reshuffle", "pemerintahan", "kekuasaan", "pemangku kebijakan", "kepala daerah",
    "gubernur", "bupati", "wali kota", "presiden", "wakil presiden", "sidang paripurna",
    "perppu", "peraturan", "undang-undang", "RUU", "RUU KUHP", "UU ITE", "konstitusi",
    "amandemen", "demokrasi", "otoriter", "otoritarian", "diktator", "sistem politik",
    "ideologi", "pancasila", "reformasi", "orba", "orde baru", "orde lama", "kebijakan",
    "anggaran", "APBN", "APBD", "korupsi", "nepotisme", "kolusi", "KKN", "KPK", "MK",
    "MA", "hukum tata negara", "pelanggaran HAM", "demonstrasi", "aksi", "unjuk rasa",
    "aktivis", "suara rakyat", "politik identitas", "politik uang", "black campaign",
    "hoaks politik", "buzzer", "opini publik"
]

POLITIK_SET = set(POLITIK_KEYWORDS)

### 1.2 Utility Functions (Clean, Stopwords, Detector)

In [14]:
def load_stopwords(path):
    with open(path, "r", encoding="utf-8") as f:
        return set([w.strip() for w in f.readlines() if w.strip()])

def clean_text(text):
    text = str(text).lower()
        
    text = re.sub(r'\d+', '', text)
    
    text = re.sub(r'[^\w\s]', '', text) 
    
    return text

def is_politik(text):
    if pd.isna(text) or not text:
        return 0
    text = clean_text(text) 
    tokens = set(text.split())
    return int(len(tokens & POLITIK_SET) > 0)

#### 1.4 Load and Clean Each Dataset

In [15]:
base_path = Path.home() / "Work/GitHub/Personal/Project/Training/dataset/raw"

dataset_files = [
    "dataset_cnn_10k_cleaned.csv",
    "dataset_kompas_4k_cleaned.csv",
    "dataset_tempo_6k_cleaned.csv",
    "dataset_turnbackhoax_10_cleaned.csv"
]

paths = [base_path / f for f in dataset_files]

stopwords_path = base_path / "stopwords_id.txt"
stopwords = load_stopwords(stopwords_path) 
all_dfs = []

TEXT_CANDIDATES = ["text_new", "Clean Narasi", "FullText", "Narasi"]
LABEL_CANDIDATES = ["hoax", "label"]

def pick_col(df, candidates):
    for c in candidates:
        if c in df.columns:
            return c
    return None

for path in paths:
    df = pd.read_csv(str(path))
    
    text_col = pick_col(df, TEXT_CANDIDATES)
    label_col = pick_col(df, LABEL_CANDIDATES)

    print(f"Loaded: {path} | Rows: {len(df)}")
    print("Detected text column:", text_col)
    print("Detected label column:", label_col)

    df = df[[text_col, label_col]].rename(columns={text_col: 'text_raw', label_col: 'label'})
    
    # Cleaning
    df['text_clean'] = df['text_raw'].astype(str).apply(clean_text)
    df['text_clean'] = df['text_clean'].apply(
        lambda x: " ".join([w for w in x.split() if w not in stopwords])
    )

    all_dfs.append(df)

Loaded: /home/calista/Work/GitHub/Personal/Project/Training/dataset/raw/dataset_cnn_10k_cleaned.csv | Rows: 9630
Detected text column: text_new
Detected label column: hoax
Loaded: /home/calista/Work/GitHub/Personal/Project/Training/dataset/raw/dataset_kompas_4k_cleaned.csv | Rows: 4750
Detected text column: text_new
Detected label column: hoax
Loaded: /home/calista/Work/GitHub/Personal/Project/Training/dataset/raw/dataset_tempo_6k_cleaned.csv | Rows: 6592
Detected text column: text_new
Detected label column: hoax
Loaded: /home/calista/Work/GitHub/Personal/Project/Training/dataset/raw/dataset_turnbackhoax_10_cleaned.csv | Rows: 10381
Detected text column: Clean Narasi
Detected label column: hoax


#### 1.5 Dataset Consolidation

In [16]:
df_gabungan = pd.concat(all_dfs, ignore_index=True)
print("Total setelah concat:", len(df_gabungan))

Total setelah concat: 31353


#### 1.6 Filter Data Politic

In [None]:
df_gabungan['is_politik'] = df_gabungan['text_clean'].apply(is_politik)

df_politik = df_gabungan[df_gabungan['is_politik'] == 1].copy()
print("Total politik only:", len(df_politik))

if df_politik.empty:
    raise ValueError("Tidak ada data politik.")

Total politik only: 20890


#### 1.7 Dataset Final

In [18]:
df_final = df_politik.copy()

print(df_final['label'].value_counts())
print("Total data (imbalanced):", len(df_final))

label
0    19260
1     1630
Name: count, dtype: int64
Total data (imbalanced): 20890


### 1.8 Train Test Split

#### 1.8.1 Separate Train (80%) and Remainder (20%)

In [19]:
X_all = df_final['text_clean'].tolist()
y_all = df_final['label'].tolist()

X_train, X_temp, y_train, y_temp = train_test_split(
    X_all, y_all,
    test_size=0.2, 
    random_state=42,
    stratify=y_all
)

#### 1.8.2 Separate the Remainder (20%) into Validation (10%) and Test (10%)

In [20]:
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5, 
    random_state=42,
    stratify=y_temp
)

In [21]:
print(f"Train Size : {len(X_train)} (80%)")
print(f"Val Size   : {len(X_val)} (10%)")
print(f"Test Size  : {len(X_test)} (10%)")

Train Size : 16712 (80%)
Val Size   : 2089 (10%)
Test Size  : 2089 (10%)


### 1.9 Save Processed the Dataset

In [22]:
SAVE_DIR = Path("../dataset/processed/01_after_preprocessing")
os.makedirs(SAVE_DIR, exist_ok=True)

pickle.dump(X_train, open(SAVE_DIR / "X_train.pkl", "wb"))
pickle.dump(X_val,   open(SAVE_DIR / "X_val.pkl", "wb"))   
pickle.dump(X_test,  open(SAVE_DIR / "X_test.pkl", "wb"))
pickle.dump(y_train, open(SAVE_DIR / "y_train.pkl", "wb"))
pickle.dump(y_val,   open(SAVE_DIR / "y_val.pkl", "wb"))   
pickle.dump(y_test,  open(SAVE_DIR / "y_test.pkl", "wb"))

df_final.to_csv(SAVE_DIR / "df_final.csv", index=False)
print(f"saved in: {SAVE_DIR.resolve()}")

saved in: /home/calista/Work/GitHub/Personal/Project/Training/dataset/processed/01_after_preprocessing
