In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install transformers datasets accelerate scikit-learn

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModel,  
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score, 
    precision_score, 
    recall_score, 
    accuracy_score,
    classification_report,
    hamming_loss
)
import json
import os
import warnings
import matplotlib.pyplot as plt
import math

# Suppress every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# Seed NumPy's global RNG and PyTorch's default generator with the same value (42).
np.random.seed(42)
torch.manual_seed(42)

print("Library berhasil diimport!")

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Ordered label vocabulary; a label's position in this list is its class index.
LABEL_NAMES = [
    'Analgesik', 'Antasida', 'Antidiare',
    'Antipiretik', 'Antiseptik', 'Dekongestan',
    'Ekspektoran', 'Herbal', 'Multivitamin',
]
NUM_LABELS = len(LABEL_NAMES)
print(f"Jumlah label yang akan digunakan: {NUM_LABELS}")
print("Nama Label:", LABEL_NAMES)

# Reverse (index -> name) and forward (name -> index) lookups over the same ordering.
id2label = dict(enumerate(LABEL_NAMES))
label2id = {name: index for index, name in id2label.items()}

print("\nLabel to ID mapping:")
for label, idx in label2id.items():
    print(f"  '{label}': {idx}")

# Load the semicolon-separated training file; the 'Jenis' column holds comma-separated label names.
df = pd.read_csv('/kaggle/input/preprocessed-indobert-train/preprocessed_indobert_data_train_label.csv',
                 sep=';'
                )


def _split_labels(raw):
    """Split a comma-separated label string into stripped, non-empty names.

    Non-string cells (e.g. NaN from an empty CSV field) yield an empty list
    instead of raising AttributeError on `.split`.
    """
    if not isinstance(raw, str):
        return []
    return [part.strip() for part in raw.split(',') if part.strip()]


df['Jenis_list'] = df['Jenis'].apply(_split_labels)

# With an explicit `classes` list, MultiLabelBinarizer drops names outside that list and only
# emits a warning. Warnings are silenced globally in this notebook, so report such names here.
unknown_labels = sorted(
    {name for names in df['Jenis_list'] for name in names} - set(LABEL_NAMES)
)
if unknown_labels:
    print(f"Peringatan: label di luar LABEL_NAMES diabaikan saat encoding: {unknown_labels}")

mlb = MultiLabelBinarizer(classes=LABEL_NAMES)
encoded_labels = mlb.fit_transform(df['Jenis_list'])

# Rows whose label vector is all zeros carry no recognised label at all.
rows_without_labels = int((encoded_labels.sum(axis=1) == 0).sum())
if rows_without_labels:
    print(f"Peringatan: {rows_without_labels} baris tidak memiliki label yang dikenali.")

# Store one multi-hot vector per row.
df['labels_list'] = list(encoded_labels)

print("\nContoh data setelah One-Hot Encoding dengan MultiLabelBinarizer:")
print(df[['text', 'Jenis', 'Jenis_list', 'labels_list']].head())
print("\nLabel yang dikenali oleh MultiLabelBinarizer:")
print(mlb.classes_)

PEMBAGIAN DATASET

In [None]:
# Random 80/20 split with a fixed seed; no stratification is applied.
X_train, X_val, y_train, y_val = train_test_split(
    df['text'].values, df['labels_list'].values,
    test_size=0.2, random_state=42, stratify=None,
)

print(f"Train size: {len(X_train)}")
print(f"Validation size: {len(X_val)}")

# Show the first rows of each split as small tables.
train_preview = pd.DataFrame({"text": X_train, "labels": y_train})
print("\nPreview Training Data (5 baris):")
print(train_preview.head(5))

val_preview = pd.DataFrame({"text": X_val, "labels": y_val})
print("\nPreview Validation Data (3 baris):")
print(val_preview.head(3))

TOKENIZATION

In [None]:
class MedicineDataset(Dataset):
    """Dataset that tokenizes one text per access and pairs it with its label vector.

    Each item is a dict with 1-D 'input_ids' and 'attention_mask' tensors
    (padded/truncated to `max_length`) and a float 'labels' tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # Only references are stored; tokenization happens in __getitem__.
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text, target = str(self.texts[idx]), self.labels[idx]

        encoded = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # The tokenizer output carries a leading batch axis; collapse it to 1-D.
        sample = {
            key: encoded[key].reshape(-1)
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.float)
        return sample

In [None]:
# Identifier of the pretrained checkpoint whose tokenizer is loaded here.
MODEL_NAME = 'indobenchmark/indobert-base-p1'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Wrap both splits with the same tokenizer, using MedicineDataset's default max_length.
train_dataset, val_dataset = (
    MedicineDataset(texts, targets, tokenizer)
    for texts, targets in ((X_train, y_train), (X_val, y_val))
)

print("Dataset berhasil dibuat!")

In [None]:
def to_text_list(seq):
    """Convert a sequence of heterogeneous values into a list of strings.

    Args:
        seq: A list, tuple, pandas Series or NumPy array of values.

    Returns:
        A list with one string per element:
        - missing scalars (None, float NaN, NumPy NaN of any width, pd.NA, pd.NaT) -> ""
        - lists/tuples -> their items joined with single spaces
        - anything else -> str(value)
    """
    if isinstance(seq, (pd.Series, np.ndarray)):
        seq = seq.tolist()

    texts = []
    for item in seq:
        if isinstance(item, (list, tuple)):
            texts.append(" ".join(map(str, item)))
        elif item is None or (pd.api.types.is_scalar(item) and pd.isna(item)):
            # is_scalar guards pd.isna, which returns an array for array-like input.
            texts.append("")
        else:
            texts.append(str(item))
    return texts

# Convert both splits to plain Python lists of strings.
X_train_txt, X_val_txt = (to_text_list(split) for split in (X_train, X_val))

def total_tokens(texts, tok):
    """Return the summed length of the 'input_ids' produced by `tok` for `texts`.

    `tok` is called once on the whole list with truncation on, padding off and
    special tokens included.
    """
    encoded = tok(texts, truncation=True, padding=False, add_special_tokens=True)
    return sum(map(len, encoded["input_ids"]))

# Report the token total of each split; the labels are padded so the colons line up.
print("Total Tokens:")
for split_label, split_texts in (("Train", X_train_txt), ("Val  ", X_val_txt)):
    print(f"- {split_label} : {total_tokens(split_texts, tokenizer)}")

In [None]:
sample_text = X_train_txt[0]

# Use the tokenizer's model_max_length only when it is an int no larger than 100000;
# otherwise fall back to 512.
model_max = getattr(tokenizer, "model_max_length", 512)
model_max = model_max if isinstance(model_max, int) and model_max <= 100000 else 512

# Pad/truncate to the sample's own token count, capped at model_max.
sample_token_count = len(tokenizer.encode(sample_text, add_special_tokens=True))
encoded = tokenizer(
    sample_text,
    truncation=True,
    padding='max_length',
    max_length=min(sample_token_count, model_max),
    return_attention_mask=True,
)

tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"])

# Shorten long texts to 120 characters plus an ellipsis for display.
preview_text = sample_text if len(sample_text) <= 120 else sample_text[:120] + "…"

print("[Preview Contoh dari Train Set]")
print("Text         :", preview_text)
print("Tokens       :", tokens[:15])
print("Input IDs    :", encoded["input_ids"][:15])
print("AttentionMask:", encoded["attention_mask"][:15])

HYPERPARAMETER TUNING

In [None]:
BATCH_SIZE = 16

# Training batches are reshuffled on every pass.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Validation keeps a fixed order: evaluate() averages per-batch losses, so reshuffling
# would change batch composition and make the reported validation loss vary between calls.
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

print("DataLoader berhasil dibuat!")

In [None]:
class IndoBERTMultiLabel(nn.Module):
    """Pretrained encoder followed by dropout and a linear layer emitting one raw logit per label."""

    def __init__(self, model_name, num_labels, dropout_rate=0.1):
        super().__init__()
        encoder = AutoModel.from_pretrained(model_name)
        self.bert = encoder
        self.dropout = nn.Dropout(dropout_rate)
        # The classifier input width follows the encoder's configured hidden size.
        self.classifier = nn.Linear(encoder.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return logits (no sigmoid applied) computed from the encoder's pooler_output."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(self.dropout(encoder_out.pooler_output))

In [None]:
# Build the classifier and move it to the selected device (Module.to returns the module itself).
model = IndoBERTMultiLabel(MODEL_NAME, NUM_LABELS).to(device)

# The loss takes raw logits and applies the sigmoid internally.
criterion = nn.BCEWithLogitsLoss()
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Linear schedule over all planned optimizer steps, with the first 10% used for warm-up.
num_epochs = 25
num_training_steps = num_epochs * len(train_loader)
num_warmup_steps = int(0.10 * num_training_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

print(f"Model berhasil diinisialisasi!")
print(f"Total parameters: {sum(param.numel() for param in model.parameters()):,}")

FINE-TUNING MODEL

In [None]:
def train_epoch(model, data_loader, optimizer, scheduler, criterion, device):
    """Run one training pass over `data_loader` and return the mean per-batch loss.

    For every batch: forward pass, loss, backward pass, gradient-norm clipping
    at 1.0, then one optimizer step and one scheduler step.
    """
    model.train()
    running_loss = 0

    for batch in data_loader:
        ids, mask, targets = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )

        optimizer.zero_grad()

        batch_loss = criterion(model(ids, mask), targets)
        batch_loss.backward()

        # Clip the total gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    return running_loss / len(data_loader)

def evaluate(model, data_loader, criterion, device, threshold=0.5):
    """Evaluate `model` on `data_loader` without gradient tracking.

    Sigmoid probabilities strictly above `threshold` count as positive predictions.

    Returns:
        dict with the mean per-batch 'loss', micro-averaged 'f1_micro',
        'precision_micro' and 'recall_micro', and 'hamming_loss'.
    """
    model.eval()
    loss_sum = 0
    pred_rows, true_rows = [], []

    with torch.no_grad():
        for batch in data_loader:
            ids, mask, targets = (
                batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'labels')
            )

            logits = model(ids, mask)
            loss_sum += criterion(logits, targets).item()

            hard_preds = (torch.sigmoid(logits) > threshold).float()

            # Extending with a 2-D array appends one row per sample.
            pred_rows.extend(hard_preds.cpu().numpy())
            true_rows.extend(targets.cpu().numpy())

    y_pred = np.array(pred_rows)
    y_true = np.array(true_rows)

    micro_scores = {
        key: scorer(y_true, y_pred, average='micro')
        for key, scorer in (
            ('f1_micro', f1_score),
            ('precision_micro', precision_score),
            ('recall_micro', recall_score),
        )
    }
    hamming = hamming_loss(y_true, y_pred)

    return {
        'loss': loss_sum / len(data_loader),
        **micro_scores,
        'hamming_loss': hamming,
    }

TRAINING & PERFORMANCE EVALUATION

In [None]:
print("Memulai training...")


def _snapshot_state(module):
    """Return an independent CPU copy of every tensor in the module's state_dict.

    state_dict() hands back references to the live parameters, and dict.copy() is
    shallow, so without cloning the "best" snapshot would keep changing as training
    continues and would end up equal to the final weights.
    """
    return {
        name: tensor.detach().to('cpu', copy=True)
        for name, tensor in module.state_dict().items()
    }


# Early stopping: stop after `patience` consecutive epochs without a lower validation loss.
patience = 3
epochs_no_improve = 0
best_val_loss = float('inf')

# Metrics recorded at the epoch with the lowest validation loss.
best_val_f1 = 0.0
best_val_precision = 0.0
best_val_recall = 0.0
best_val_hamming = float('inf')
best_model_state = None

# Per-epoch loss curves.
train_loss_history = []
val_loss_history = []

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    print("-" * 50)

    # Training
    train_loss = train_epoch(model, train_loader, optimizer, scheduler, criterion, device)
    train_loss_history.append(train_loss)
    print(f"Train Loss: {train_loss:.4f}")

    # Validation
    val_metrics = evaluate(model, val_loader, criterion, device)
    val_loss_history.append(val_metrics['loss'])
    print(f"Val Loss: {val_metrics['loss']:.4f}")
    print(f"Val F1 (micro): {val_metrics['f1_micro']:.4f}")
    print(f"Val Precision (micro): {val_metrics['precision_micro']:.4f}")
    print(f"Val Recall (micro): {val_metrics['recall_micro']:.4f}")
    print(f"Val Hamming Loss: {val_metrics['hamming_loss']:.4f}")

    if val_metrics['loss'] < best_val_loss:
        best_val_loss = val_metrics['loss']
        best_val_f1 = val_metrics['f1_micro']
        best_val_precision = val_metrics['precision_micro']
        best_val_recall = val_metrics['recall_micro']
        best_val_hamming = val_metrics['hamming_loss']
        # Freeze the weights of this epoch so later updates cannot overwrite them.
        best_model_state = _snapshot_state(model)
        epochs_no_improve = 0
        print(f"Validation loss membaik! Menyimpan model...")
    else:
        epochs_no_improve += 1
        print(f"Validation loss tidak membaik. Counter: {epochs_no_improve}/{patience}")

    if epochs_no_improve >= patience:
        print(f"\nEarly stopping dipicu pada epoch {epoch + 1}!")
        break

print("\nTraining selesai!")
print("\n" + "="*50)
print("Hasil Terbaik pada Data Validasi:")
print(f"  -> F1-Score (micro): {best_val_f1:.4f}")
print(f"  -> Precision (micro): {best_val_precision:.4f}")
print(f"  -> Recall (micro)   : {best_val_recall:.4f}")
print(f"  -> Hamming Loss: {best_val_hamming:.4f}")
print("="*50)

# No epoch ever improved on the initial best loss: fall back to the current weights.
if best_model_state is None:
    best_model_state = _snapshot_state(model)

In [None]:
# Plot both loss histories on one axes; the x-axis is the 0-based position in each list.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(train_loss_history, label='Training Loss')
ax.plot(val_loss_history, label='Validation Loss')
ax.set_title('Training vs Validation Loss per Epoch')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Load the tensors stored in best_model_state into the model before the detailed evaluation.
model.load_state_dict(best_model_state)

def get_detailed_metrics_validation(model, data_loader, device, threshold=0.5):
    """Collect true labels and thresholded predictions for every sample in `data_loader`.

    Sigmoid probabilities strictly above `threshold` become 1.0, the rest 0.0.

    Returns:
        (labels, predictions) as two NumPy arrays with one row per sample.
    """
    model.eval()
    true_rows, pred_rows = [], []

    with torch.no_grad():
        for batch in data_loader:
            ids, mask, targets = (
                batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'labels')
            )

            hard_preds = (torch.sigmoid(model(ids, mask)) > threshold).float()

            # Extending with a 2-D array appends one row per sample.
            pred_rows.extend(hard_preds.cpu().numpy())
            true_rows.extend(targets.cpu().numpy())

    return np.array(true_rows), np.array(pred_rows)

y_true_val, y_pred_val = get_detailed_metrics_validation(model, val_loader, device)

print("\nMetrik F1-Score per Kelas (Data Validasi - Model Terbaik):")
print("=" * 65)
for i, label_name in enumerate(LABEL_NAMES):
    # Column i holds the binary targets/predictions of one label.
    true_col, pred_col = y_true_val[:, i], y_pred_val[:, i]

    f1 = f1_score(true_col, pred_col, zero_division=0)
    precision = precision_score(true_col, pred_col, zero_division=0)
    recall = recall_score(true_col, pred_col, zero_division=0)
    hamming = hamming_loss(true_col, pred_col)

    print(f"{label_name:15} - F1: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, Hamming: {hamming:.4f}")

In [None]:
model_save_path = "/kaggle/working/indobert_medicine_classifier_fixed"
model.eval()

# Label metadata stored alongside the weights.
current_label_mapping = dict(
    label_names=LABEL_NAMES,
    label2id=label2id,
    id2label=id2label,
)

checkpoint = {
    'model_state_dict': model.state_dict(),
    'model_config': {
        'model_name': MODEL_NAME,
        'num_labels': NUM_LABELS,
        'label_mapping': current_label_mapping,
    },
}

# Weights go to "<path>_model.pth"; the tokenizer files go into the "<path>" directory.
torch.save(checkpoint, model_save_path + "_model.pth")
tokenizer.save_pretrained(model_save_path)

print(f"Model berhasil disimpan di: {model_save_path}")

In [None]:
def predict_medicine_labels(text, model, tokenizer, device, label_names, threshold=0.5, max_length=512):
    """
    Predict medicine labels for a new piece of text.

    Args:
        text: Input text passed to the tokenizer.
        model: Callable as model(input_ids, attention_mask) returning logits; row 0 is used.
        tokenizer: Callable returning a mapping with 'input_ids' and 'attention_mask' tensors.
        device: Device the encoded tensors are moved to.
        label_names: Label names indexed by logit position.
        threshold: A label is predicted when its sigmoid probability is strictly above this value.
        max_length: Length the text is padded/truncated to (default 512).

    Returns:
        A list with one dict per logit: {'label': str, 'probability': float, 'predicted': bool}.
    """
    model.eval()

    encoding = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids, attention_mask)
        probabilities = torch.sigmoid(logits)
        predictions = probabilities > threshold

    # Convert row 0 to Python values with one tolist() each instead of a per-label .item() call.
    row_probabilities = probabilities[0].tolist()
    row_flags = predictions[0].tolist()

    return [
        {
            'label': label_names[i],
            'probability': prob,
            'predicted': bool(flag)
        }
        for i, (prob, flag) in enumerate(zip(row_probabilities, row_flags))
    ]

sample_text = "promag obat bahan alami dengan ekstrak kayu manis untuk asam lambung"
predictions = predict_medicine_labels(sample_text, model, tokenizer, device, LABEL_NAMES)

print(f"\nContoh prediksi untuk: '{sample_text}'")
print("-" * 60)
for result in predictions:
    # Predicted labels get a check mark; the others get a blank of the same width.
    marker = "✓" if result['predicted'] else " "
    print(f"{marker} {result['label']}: {result['probability']:.4f}")

print("\nScript selesai! Model berhasil di-train dan siap digunakan.")