In [1]:
import pandas as pd
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
import pickle
import json

import torch
print("CUDA available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

CUDA available: True
GPU name: NVIDIA GeForce GTX 1080 Ti


- Pada bagian ini, dataset hasil preprocessing (X dan y) akan dibagi menjadi beberapa fold menggunakan K-Fold.  
- Setiap fold akan dilatih dan dievaluasi menggunakan LSTM dan GRU.  
- Tujuannya: memperoleh performa rata-rata yang stabil dan memilih model terbaik.

In [2]:
df = pd.read_csv(os.path.join("dataset", "indonesian-news-title.csv"))

### Step 1 — Load artefak preprocessing
Kita ambil vocab, label encoder, config, dan dataset encoded yang dibuat di fase 1.

In [3]:
# Load vocab, label encoder, config
with open("artifacts/vocab/word2idx.pkl", "rb") as f:
    word2idx = pickle.load(f)

with open("artifacts/labels/label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)

with open("artifacts/config/config.json") as f:
    config = json.load(f)

MAX_LEN = config["max_len"]
VOCAB_SIZE = config["vocab_size"]
NUM_CLASSES = config["num_classes"]
EMBED_DIM = config["embedding_dim"]
HIDDEN_SIZE = config["hidden_size"]

# Load balanced & encoded dataset
X = np.load("artifacts/dataset/X.npy")
y = np.load("artifacts/dataset/y.npy")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


### Step 2 — Siapkan Dataset dan DataLoader
Dataset ini dipake PyTorch buat ngasih data dalam bentuk batch.


In [4]:
class NewsDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

dataset = NewsDataset(X, y)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

### Step 3 — Bikin arsitektur model LSTM dan GRU
Kedua model ini mirip, cuma beda di jenis recurrent layer-nya.  
Tambahin Word Embedding juga

In [5]:
embedding_matrix = np.load("artifacts/embedding/embedding_matrix.npy")
EMBED_DIM = embedding_matrix.shape[1]

In [6]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, num_classes):
        super().__init__()
        self.emb = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float32),
            freeze=False   # kalau mau fine-tuning, True kalau mau tetap
        )
        self.lstm = nn.LSTM(embed_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = self.emb(x)
        _, (h, _) = self.lstm(x)
        return self.fc(h[-1])


class GRUClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, num_classes):
        super().__init__()
        self.emb = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float32),
            freeze=False   # kalau mau fine-tuning, True kalau mau tetap
        )
        self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = self.emb(x)
        _, h = self.gru(x)
        return self.fc(h[-1])

### Step 4 — Setup K-Fold untuk bagi dataset

In [7]:
k = 5
kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

In [8]:
for train_idx, test_idx in kf.split(X, y):
    print(len(train_idx), len(test_idx))

17539 4385
17539 4385
17539 4385
17539 4385
17540 4384


### Step 5 — Loop K-Fold Training + Evaluasi

In [9]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def encode_text(text):
    tokens = clean_text(text).split()
    seq = [word2idx.get(tok, word2idx["<UNK>"]) for tok in tokens]

    if len(seq) < MAX_LEN:
        seq = seq + [word2idx["<PAD>"]] * (MAX_LEN - len(seq))
    else:
        seq = seq[:MAX_LEN]

    return torch.tensor([seq], dtype=torch.long).to(device)

In [10]:
lstm_model = LSTMClassifier(VOCAB_SIZE, EMBED_DIM, HIDDEN_SIZE, NUM_CLASSES).to(device)
print(lstm_model.emb.weight[10][:10])

test = encode_text("jokowi resmikan proyek tol baru")

# pastikan list int
if isinstance(test, torch.Tensor):
    test = test.tolist()

inp = torch.tensor([test], dtype=torch.long).to(device)

vecs = lstm_model.emb(inp).shape
vecs

tensor([-0.7456, -0.1253, -1.0992,  0.1143,  0.0916,  0.1585, -0.1856, -0.0210,
         0.5227, -0.2509], device='cuda:0', grad_fn=<SliceBackward0>)


torch.Size([1, 1, 20, 300])

In [11]:
EPOCHS = 5
BATCH_SIZE = 32
lstm_scores = []
gru_scores = []
fold_no = 1

for train_idx, test_idx in kf.split(X, y):
    print(f"\n===== FOLD {fold_no} =====")

    # Split data
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Dataset & Loader
    train_loader = DataLoader(
        NewsDataset(X_train, y_train), batch_size=BATCH_SIZE, shuffle=True
    )
    test_loader = DataLoader(
        NewsDataset(X_test, y_test), batch_size=BATCH_SIZE
    )

    # Init models
    lstm_model = LSTMClassifier(VOCAB_SIZE, EMBED_DIM, HIDDEN_SIZE, NUM_CLASSES).to(device)
    gru_model  = GRUClassifier(VOCAB_SIZE, EMBED_DIM, HIDDEN_SIZE, NUM_CLASSES).to(device)

    criterion = nn.CrossEntropyLoss()
    lstm_opt = torch.optim.Adam(lstm_model.parameters(), lr=1e-3)
    gru_opt  = torch.optim.Adam(gru_model.parameters(), lr=1e-3)

    # === TRAIN LSTM ===
    for epoch in range(EPOCHS):
        lstm_model.train()
        for Xb, yb in train_loader:
            Xb, yb = Xb.to(device), yb.to(device)
            lstm_opt.zero_grad()
            out = lstm_model(Xb)
            loss = criterion(out, yb)
            loss.backward()
            lstm_opt.step()

    # === TRAIN GRU ===
    for epoch in range(EPOCHS):
        gru_model.train()
        for Xb, yb in train_loader:
            Xb, yb = Xb.to(device), yb.to(device)
            gru_opt.zero_grad()
            out = gru_model(Xb)
            loss = criterion(out, yb)
            loss.backward()
            gru_opt.step()

    # === EVALUASI ===
    def evaluate(model):

        model.eval()
        preds = []
        true = []

        with torch.no_grad():
            for Xb, yb in test_loader:
                Xb = Xb.to(device)
                out = model(Xb)
                pred = torch.argmax(out, dim=1).cpu().tolist()
                true.extend(yb.tolist())
                preds.extend(pred)

        acc = accuracy_score(true, preds)
        prec, rec, f1, _ = precision_recall_fscore_support(
            true, preds, average="weighted"
        )

        return acc, prec, rec, f1

    # simpan skor
    lstm_scores.append(evaluate(lstm_model))
    gru_scores.append(evaluate(gru_model))

    print(f"LSTM fold {fold_no}:", lstm_scores[-1])
    print(f"GRU  fold {fold_no}:", gru_scores[-1])

    fold_no += 1



===== FOLD 1 =====
LSTM fold 1: (0.809806157354618, 0.8129945646324171, 0.809806157354618, 0.8097650282788484)
GRU  fold 1: (0.8102622576966932, 0.8111068985416936, 0.8102622576966932, 0.8101215034083125)

===== FOLD 2 =====
LSTM fold 2: (0.8189281641961231, 0.8227149492533088, 0.8189281641961231, 0.8187607476643184)
GRU  fold 2: (0.8088939566704675, 0.8135529814737781, 0.8088939566704675, 0.810075701514098)

===== FOLD 3 =====
LSTM fold 3: (0.8326111744583808, 0.8337795208723884, 0.8326111744583808, 0.8326548798458193)
GRU  fold 3: (0.8198403648802737, 0.8233190972701442, 0.8198403648802737, 0.8198339482165907)

===== FOLD 4 =====
LSTM fold 4: (0.8084378563283923, 0.8119052386901777, 0.8084378563283923, 0.8085174090956248)
GRU  fold 4: (0.8136830102622576, 0.8156997209369249, 0.8136830102622576, 0.8134235515701538)

===== FOLD 5 =====
LSTM fold 5: (0.8186587591240876, 0.8218755345368186, 0.8186587591240876, 0.8184989922793983)
GRU  fold 5: (0.8204835766423357, 0.8234414446843314, 0.8

### Step 6 — Rata-rata skor

In [12]:
import numpy as np

print("\n===== Rata-rata Hasil K-Fold =====")

lstm_mean = np.mean(lstm_scores, axis=0)
gru_mean  = np.mean(gru_scores, axis=0)

print("LSTM (acc, prec, rec, f1):", lstm_mean)
print("GRU  (acc, prec, rec, f1):", gru_mean)


===== Rata-rata Hasil K-Fold =====
LSTM (acc, prec, rec, f1): [0.81768842 0.82065396 0.81768842 0.81763941]
GRU  (acc, prec, rec, f1): [0.81463263 0.81742403 0.81463263 0.81490889]


### Step 7 — Simpan model terlatih (F1 tertinggi)
Biar nanti bisa dipakai di fase inference.

In [14]:
best_model_name = "lstm" if lstm_mean[3] > gru_mean[3] else "gru"
config["best_model_type"] = best_model_name

with open("artifacts/config/config.json", "w") as f:
    json.dump(config, f, indent=4)

print("Best model:", best_model_name)

os.makedirs("artifacts/model_final", exist_ok=True)

if best_model_name == "lstm":
    torch.save(lstm_model.state_dict(), "artifacts/model_final/final_model.pth")
else:
    torch.save(gru_model.state_dict(), "artifacts/model_final/final_model.pth")

Best model: lstm
