In [1]:
import pandas as pd
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
import pickle
import json

import torch

# Report the compute device and pick it for the rest of the notebook.
# The GPU name is only queried when CUDA is actually present:
# torch.cuda.get_device_name(0) raises on a CPU-only machine, which would
# abort this cell before the CPU fallback below is ever reached.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
print("GPU name:", torch.cuda.get_device_name(0) if cuda_available else "n/a (CPU only)")
device = torch.device("cuda" if cuda_available else "cpu")

CUDA available: True
GPU name: NVIDIA GeForce GTX 1080 Ti


- Pada bagian ini, dataset hasil preprocessing (X dan y) akan dibagi menjadi beberapa fold menggunakan Stratified K-Fold, sehingga proporsi kelas tetap terjaga di setiap fold.  
- Setiap fold akan dilatih dan dievaluasi menggunakan LSTM dan GRU.  
- Tujuannya: memperoleh performa rata-rata yang stabil dan memilih model terbaik.

In [2]:
# Read the raw news-title CSV; os.path.join keeps the relative path portable
# across operating systems.
df = pd.read_csv(
    filepath_or_buffer=os.path.join("dataset", "indonesian-news-title.csv")
)

### Step 1 — Load artefak preprocessing
Kita ambil vocab, label encoder, config, dan dataset encoded yang dibuat di fase 1.

In [3]:
# Load vocab, label encoder, config.
# NOTE(security): pickle.load can execute arbitrary code contained in the
# file. Only load artefacts that were produced by your own preprocessing run.
with open("artifacts/vocab/word2idx.pkl", "rb") as f:
    word2idx = pickle.load(f)

with open("artifacts/labels/label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)

# JSON is UTF-8 by specification; be explicit so the platform default
# encoding (e.g. cp1252 on Windows) cannot break decoding.
with open("artifacts/config/config.json", encoding="utf-8") as f:
    config = json.load(f)

MAX_LEN = config["max_len"]
VOCAB_SIZE = config["vocab_size"]
NUM_CLASSES = config["num_classes"]
EMBED_DIM = config["embedding_dim"]
HIDDEN_SIZE = config["hidden_size"]

# Load balanced & encoded dataset
X = np.load("artifacts/dataset/X.npy")
y = np.load("artifacts/dataset/y.npy")

# Fail fast with a readable message instead of an opaque embedding / CUDA
# device-side error in the middle of training.
if len(X) != len(y):
    raise ValueError(f"X and y have different sample counts: {len(X)} vs {len(y)}")
if X.size and not (0 <= X.min() and X.max() < VOCAB_SIZE):
    raise ValueError(
        f"token ids in X must lie in [0, {VOCAB_SIZE}); "
        f"found range [{X.min()}, {X.max()}]"
    )
if y.size and not (0 <= y.min() and y.max() < NUM_CLASSES):
    raise ValueError(
        f"labels in y must lie in [0, {NUM_CLASSES}); "
        f"found range [{y.min()}, {y.max()}]"
    )

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


### Step 2 — Siapkan Dataset dan DataLoader
Dataset ini dipakai PyTorch untuk menyediakan data dalam bentuk batch.


In [4]:
class NewsDataset(Dataset):
    """In-memory dataset pairing encoded inputs with their class labels.

    Args:
        X: array-like of integer token ids; the first dimension indexes samples.
        y: array-like of integer class labels, one per sample in ``X``.

    Raises:
        ValueError: if ``X`` and ``y`` do not hold the same number of samples.
    """

    def __init__(self, X, y):
        # Both tensors are int64: embedding lookups need integer indices and
        # cross-entropy needs integer class targets.
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)
        # Without this check a mismatch would only surface as a late
        # IndexError (y shorter) or silently ignored labels (y longer).
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same length, got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(input, label)`` tensor pair at position ``idx``."""
        return self.X[idx], self.y[idx]

# Whole-corpus dataset and a shuffled loader over it (batches of 32).
# NOTE(review): neither name is referenced by the K-fold cells visible below,
# which build their own per-fold loaders -- confirm whether this is still needed.
dataset = NewsDataset(X, y)
dataloader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

### Step 3 — Bikin arsitektur model LSTM dan GRU
Kedua model ini mirip, cuma beda di jenis recurrent layer-nya.

In [5]:
class LSTMClassifier(nn.Module):
    """Embedding -> single-layer LSTM -> linear head.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each embedding vector (LSTM input size).
        hidden_size: size of the LSTM hidden state.
        num_classes: number of output logits.

    NOTE(review): the embedding has no ``padding_idx``; if the inputs are
    padded, pad tokens are embedded and fed to the LSTM like any other
    token -- confirm this is intended.
    """

    def __init__(self, vocab_size, embed_dim, hidden_size, num_classes):
        super().__init__()
        # Attribute names determine the state_dict keys, so they are kept.
        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.lstm = nn.LSTM(
            input_size=embed_dim, hidden_size=hidden_size, batch_first=True
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Map a batch of token-id sequences to class logits."""
        embedded = self.emb(x)
        # The LSTM returns (output, (h_n, c_n)); only the final hidden state is used.
        _, (hidden, _) = self.lstm(embedded)
        last_layer_hidden = hidden[-1]
        return self.fc(last_layer_hidden)


class GRUClassifier(nn.Module):
    """Embedding -> single-layer GRU -> linear head.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each embedding vector (GRU input size).
        hidden_size: size of the GRU hidden state.
        num_classes: number of output logits.

    NOTE(review): the embedding has no ``padding_idx``; if the inputs are
    padded, pad tokens are embedded and fed to the GRU like any other
    token -- confirm this is intended.
    """

    def __init__(self, vocab_size, embed_dim, hidden_size, num_classes):
        super().__init__()
        # Attribute names determine the state_dict keys, so they are kept.
        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.gru = nn.GRU(
            input_size=embed_dim, hidden_size=hidden_size, batch_first=True
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Map a batch of token-id sequences to class logits."""
        embedded = self.emb(x)
        # The GRU returns (output, h_n); only the final hidden state is used.
        _, hidden = self.gru(embedded)
        last_layer_hidden = hidden[-1]
        return self.fc(last_layer_hidden)

### Step 4 — Setup K-Fold untuk bagi dataset

In [6]:
# Five stratified folds; shuffling with a fixed random_state makes the
# fold assignment identical on every run.
k = 5
kf = StratifiedKFold(k, shuffle=True, random_state=42)

In [7]:
# Sanity check: print "<train size> <test size>" for every fold.
for train_idx, test_idx in kf.split(X, y):
    print(*map(len, (train_idx, test_idx)))

17539 4385
17539 4385
17539 4385
17539 4385
17540 4384


### Step 5 — Loop K-Fold Training + Evaluasi

In [8]:
EPOCHS = 5
BATCH_SIZE = 32
SEED = 42  # base seed for torch (weight init and DataLoader shuffling)
lstm_scores = []
gru_scores = []


def train_model(model, loader, epochs=EPOCHS, lr=1e-3):
    """Train ``model`` in place with Adam and cross-entropy loss.

    Args:
        model: an ``nn.Module`` already moved to ``device``.
        loader: DataLoader yielding ``(inputs, labels)`` batches.
        epochs: number of full passes over ``loader``.
        lr: Adam learning rate.

    Returns:
        The same ``model`` instance, for convenience.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    model.train()
    for epoch in range(epochs):
        for Xb, yb in loader:
            Xb, yb = Xb.to(device), yb.to(device)
            optimizer.zero_grad()
            loss = criterion(model(Xb), yb)
            loss.backward()
            optimizer.step()
    return model


def evaluate(model, loader=None):
    """Score ``model`` on ``loader``.

    Args:
        model: an ``nn.Module`` already moved to ``device``.
        loader: DataLoader yielding ``(inputs, labels)`` batches. When omitted,
            the global ``test_loader`` of the current (or most recent) fold is
            used, so a one-argument ``evaluate(model)`` call keeps working.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; the last three are
        weighted averages over classes.
    """
    if loader is None:
        loader = test_loader

    model.eval()
    preds = []
    true = []
    with torch.no_grad():
        for Xb, yb in loader:
            out = model(Xb.to(device))
            preds.extend(torch.argmax(out, dim=1).cpu().tolist())
            true.extend(yb.tolist())

    acc = accuracy_score(true, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(
        true, preds, average="weighted"
    )
    return acc, prec, rec, f1


for fold_no, (train_idx, test_idx) in enumerate(kf.split(X, y), start=1):
    print(f"\n===== FOLD {fold_no} =====")

    # Seed per fold so weight initialisation and batch order are repeatable
    # across runs while still differing from fold to fold.
    torch.manual_seed(SEED + fold_no)

    # Split data
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Dataset & Loader
    train_loader = DataLoader(
        NewsDataset(X_train, y_train), batch_size=BATCH_SIZE, shuffle=True
    )
    test_loader = DataLoader(
        NewsDataset(X_test, y_test), batch_size=BATCH_SIZE
    )

    # Fresh models for every fold so no weights carry over between folds.
    lstm_model = LSTMClassifier(VOCAB_SIZE, EMBED_DIM, HIDDEN_SIZE, NUM_CLASSES).to(device)
    gru_model = GRUClassifier(VOCAB_SIZE, EMBED_DIM, HIDDEN_SIZE, NUM_CLASSES).to(device)

    train_model(lstm_model, train_loader, EPOCHS)
    train_model(gru_model, train_loader, EPOCHS)

    # Store this fold's (acc, prec, rec, f1) for each architecture.
    lstm_scores.append(evaluate(lstm_model, test_loader))
    gru_scores.append(evaluate(gru_model, test_loader))

    print(f"LSTM fold {fold_no}:", lstm_scores[-1])
    print(f"GRU  fold {fold_no}:", gru_scores[-1])



===== FOLD 1 =====
LSTM fold 1: (0.7074116305587229, 0.7125586937186793, 0.7074116305587229, 0.7076944044132734)
GRU  fold 1: (0.7131128848346636, 0.7212991453486725, 0.7131128848346636, 0.7139403622822386)

===== FOLD 2 =====
LSTM fold 2: (0.6919042189281642, 0.6947329845770048, 0.6919042189281642, 0.6909226516534106)
GRU  fold 2: (0.70672748004561, 0.7125030334177592, 0.70672748004561, 0.7077259839824257)

===== FOLD 3 =====
LSTM fold 3: (0.7179019384264538, 0.7232106265737465, 0.7179019384264538, 0.7195276759155843)
GRU  fold 3: (0.732497149372862, 0.7382582737979372, 0.732497149372862, 0.7335533055378547)

===== FOLD 4 =====
LSTM fold 4: (0.7005701254275941, 0.7025844577439528, 0.7005701254275941, 0.7009818832354554)
GRU  fold 4: (0.720866590649943, 0.7226886664663911, 0.720866590649943, 0.7213545436544628)

===== FOLD 5 =====
LSTM fold 5: (0.698220802919708, 0.7083391326444863, 0.698220802919708, 0.7002311126300002)
GRU  fold 5: (0.7052919708029197, 0.7063151106428104, 0.70529197

### Step 6 — Rata-rata skor

In [9]:
import numpy as np

print("\n===== Rata-rata Hasil K-Fold =====")

# Each scores list holds one (acc, prec, rec, f1) tuple per fold; averaging
# over axis 0 yields one mean per metric.
lstm_mean = np.asarray(lstm_scores).mean(axis=0)
gru_mean = np.asarray(gru_scores).mean(axis=0)

print("LSTM (acc, prec, rec, f1):", lstm_mean)
print("GRU  (acc, prec, rec, f1):", gru_mean)


===== Rata-rata Hasil K-Fold =====
LSTM (acc, prec, rec, f1): [0.70320174 0.70828518 0.70320174 0.70387155]
GRU  (acc, prec, rec, f1): [0.71569922 0.72021285 0.71569922 0.71627726]


### Step 7 — Simpan model terlatih (F1 tertinggi)
Biar nanti bisa dipakai di fase inference.

In [10]:
# Pick the architecture with the higher mean weighted F1 (index 3 of the
# (acc, prec, rec, f1) mean vector); a tie goes to the GRU.
best_model_name = "lstm" if lstm_mean[3] > gru_mean[3] else "gru"
print("Best model:", best_model_name)

os.makedirs("artifacts/model_final", exist_ok=True)

# Cross-validation only selects the architecture. The models left over from
# the K-fold loop are the ones trained during the LAST fold, on a subset of
# the data, so the selected architecture is retrained here on the full
# dataset with the same hyper-parameters before it is saved.
best_model_cls = LSTMClassifier if best_model_name == "lstm" else GRUClassifier

torch.manual_seed(42)  # repeatable weight init and batch order
final_model = best_model_cls(VOCAB_SIZE, EMBED_DIM, HIDDEN_SIZE, NUM_CLASSES).to(device)
final_loader = DataLoader(NewsDataset(X, y), batch_size=BATCH_SIZE, shuffle=True)
final_criterion = nn.CrossEntropyLoss()
final_opt = torch.optim.Adam(final_model.parameters(), lr=1e-3)

final_model.train()
for epoch in range(EPOCHS):
    for Xb, yb in final_loader:
        Xb, yb = Xb.to(device), yb.to(device)
        final_opt.zero_grad()
        loss = final_criterion(final_model(Xb), yb)
        loss.backward()
        final_opt.step()

# Same path and same state_dict keys (emb.*, lstm.*/gru.*, fc.*) as before.
torch.save(final_model.state_dict(), "artifacts/model_final/final_model.pth")

Best model: gru
