# 🇮🇩 IndoBERT Emotion Detection Training

Notebook ini untuk melatih model deteksi emosi bahasa Indonesia menggunakan IndoBERT.

**Langkah:**
1. Upload dataset (train.txt, val.txt, test.txt)
2. Jalankan semua cell
3. Download model yang sudah dilatih

**Pastikan GPU aktif:** Runtime → Change runtime type → GPU

In [None]:
# Install dependencies
# Notebook shell escape: installs the packages the cells below rely on.
# `-q` keeps pip's output quiet.
!pip install -q transformers torch accelerate sentencepiece scikit-learn

In [None]:
# Check GPU
# Report whether PyTorch can see a CUDA device and, if it can, print the
# name of device 0.
import torch
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Upload dataset files
# Ask the user for the three split files and warn right away if any of them
# is still missing, instead of failing later when the data is loaded.
import os

from google.colab import files

# File names that the data-loading cell reads from the working directory.
REQUIRED_FILES = ('train.txt', 'val.txt', 'test.txt')

print("Upload train.txt, val.txt, dan test.txt")
uploaded = files.upload()

# Check the filesystem rather than `uploaded` so files that were already
# present from an earlier run still count. This is a warning only: the user
# may re-run the cell to upload whatever is missing.
missing_files = [name for name in REQUIRED_FILES if not os.path.exists(name)]
if missing_files:
    print(f"Peringatan: file berikut belum ditemukan: {', '.join(missing_files)}")

In [None]:
import os
import re
import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)

# Config
# Hugging Face model id passed to the tokenizer/model `from_pretrained` calls.
MODEL_NAME = "indobenchmark/indobert-base-p1"
# Token length every sample is padded/truncated to by the tokenizer.
MAX_LEN = 128
# Per-device batch size, used for both training and evaluation.
BATCH_SIZE = 16
# Number of training epochs requested from the Trainer.
EPOCHS = 5
# Learning rate handed to TrainingArguments.
LEARNING_RATE = 2e-5

print("‚úÖ Imports done!")

In [None]:
# Ordered (pattern, replacement) pairs used by preprocess_text. They are
# applied top to bottom on the already lower-cased text.
_CLEANUP_RULES = (
    (r'https?://\S+|www\.\S+', ''),  # drop URLs
    (r'@\w+', ''),                   # drop @mentions
    (r'#(\w+)', r'\1'),              # keep the hashtag word, drop the '#'
    (r'\brt\b', ''),                 # drop a standalone "rt" token
    (r'(.)\1{2,}', r'\1\1'),         # squeeze 3+ repeats of a char down to 2
    (r'\s+', ' '),                   # collapse whitespace runs
)


def preprocess_text(text):
    """Normalise one raw text for the classifier.

    Lower-cases the input, then applies the substitutions in
    ``_CLEANUP_RULES`` in order and trims surrounding whitespace.

    Args:
        text: Raw text. Anything falsy or not a ``str`` is treated as empty.

    Returns:
        The cleaned string, or ``""`` for empty / non-string input.
    """
    if not (text and isinstance(text, str)):
        return ""

    cleaned = text.lower()
    for pattern, replacement in _CLEANUP_RULES:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def load_data(filepath):
    """Load a ``text;label`` file and return it with cleaned text.

    The file is read as a headerless, semicolon-separated table whose two
    columns are named ``text`` and ``label``. Rows with a missing value are
    dropped, ``text`` is normalised with ``preprocess_text``, and rows whose
    text is empty after cleaning are removed.

    Args:
        filepath: Location of the file, forwarded to ``pandas.read_csv``.

    Returns:
        pandas.DataFrame with columns ``text`` and ``label`` and a fresh
        ``0..n-1`` index.
    """
    df = pd.read_csv(filepath, sep=';', header=None, names=['text', 'label'])
    df = df.dropna()
    df['text'] = df['text'].apply(preprocess_text)
    df = df[df['text'].str.len() > 0]
    # Both filters above leave gaps in the index. Renumber it so label-based
    # and positional access agree (e.g. ``df['text'][i]`` for i < len(df)).
    return df.reset_index(drop=True)

# Load data
# Read the three splits from the working directory in train/val/test order.
print("üìÇ Loading dataset...")
train_df, val_df, test_df = (
    load_data(split_file)
    for split_file in ('train.txt', 'val.txt', 'test.txt')
)

# Split sizes, followed by the per-label counts of the training split.
print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")
print(f"\nDistribution:\n{train_df['label'].value_counts()}")

In [None]:
# Encode labels
# Fit the encoder on the training labels only, then map every split to
# integer class ids with that same fitted encoder.
label_encoder = LabelEncoder().fit(train_df['label'])
train_labels, val_labels, test_labels = (
    label_encoder.transform(frame['label'])
    for frame in (train_df, val_df, test_df)
)

num_classes = len(label_encoder.classes_)
print(f"Classes ({num_classes}): {list(label_encoder.classes_)}")

In [None]:
class EmotionDataset(Dataset):
    """Dataset that tokenizes one text per lookup.

    Args:
        texts: Indexable collection of texts; each item is passed through
            ``str()`` before tokenization.
        labels: Indexable collection of integer class ids, aligned with
            ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, **options)`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors.
        max_len: Length every sample is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx``.

        Returns:
            dict with flattened ``'input_ids'`` and ``'attention_mask'``
            tensors and a ``torch.long`` ``'labels'`` tensor.
        """
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        encoded = self.tokenizer(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer is asked for 'pt' tensors for a single text; flatten
        # them so each sample is one-dimensional.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

print("‚úÖ Dataset class defined!")

In [None]:
# Load tokenizer and model
print(f"üìù Loading tokenizer: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# The classification head gets one output per encoded class, and both
# id <-> label maps are passed along using the label encoder's class order.
print(f"üèóÔ∏è Loading model: {MODEL_NAME}")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_classes,
    id2label=dict(enumerate(label_encoder.classes_)),
    label2id={name: idx for idx, name in enumerate(label_encoder.classes_)},
)

# Move to GPU
# Falls back to the CPU when CUDA is unavailable.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
print(f"‚úÖ Model loaded on {device}!")

In [None]:
# Create datasets
# Pair each split's raw text array with its encoded labels; all three share
# the same tokenizer and maximum length.
train_dataset, val_dataset, test_dataset = (
    EmotionDataset(frame['text'].values, encoded_labels, tokenizer, MAX_LEN)
    for frame, encoded_labels in (
        (train_df, train_labels),
        (val_df, val_labels),
        (test_df, test_labels),
    )
)

print(f"‚úÖ Datasets created!")

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds the
            per-class scores, or a tuple whose first element does;
            ``labels`` holds the true class ids.

    Returns:
        dict with keys ``'accuracy'`` and ``'f1'`` (weighted average).
    """
    predictions, labels = eval_pred
    # Models that return extra outputs hand over a tuple; the class scores
    # are expected to be its first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # The class dimension is the last axis of the score array.
    predicted_ids = np.argmax(predictions, axis=-1)
    return {
        'accuracy': accuracy_score(labels, predicted_ids),
        'f1': f1_score(labels, predicted_ids, average='weighted'),
    }

# Training arguments
training_args = TrainingArguments(
    # Output locations and logging cadence.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=50,
    report_to='none',
    # Optimisation settings.
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    warmup_steps=500,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluate and checkpoint once per epoch, and reload the checkpoint with
    # the highest validation accuracy when training ends.
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    greater_is_better=True,
)

# Validation data drives the per-epoch metrics; early stopping is configured
# with a patience of 2.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

print("‚úÖ Trainer ready!")

In [None]:
# Train!
# Runs the fine-tuning loop configured on `trainer` (training arguments,
# datasets, metrics and early-stopping callback set up above).
print("üöÄ Training...")
trainer.train()

In [None]:
# Evaluate
# Score the model on the test split. The metrics from compute_metrics are
# read back under "eval_"-prefixed keys and printed as percentages.
print("üìà Evaluating on test set...")
test_results = trainer.evaluate(test_dataset)
print(f"\n‚úÖ Test Accuracy: {test_results['eval_accuracy']*100:.2f}%")
print(f"‚úÖ Test F1: {test_results['eval_f1']*100:.2f}%")

In [None]:
# Detailed classification report
# Get per-sample scores for the test split and reduce them to class ids.
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)
print("\nüìä Classification Report:")
# Pass every class id explicitly. Without `labels`, the report infers the
# classes from the data and raises a ValueError when a class is missing from
# both test_labels and preds while target_names still lists it.
print(classification_report(
    test_labels,
    preds,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
))

In [None]:
# Save model
# Write the model, the tokenizer and the fitted label encoder under
# indobert_emotion/, the directory the download cell zips.
print("üíæ Saving model...")
os.makedirs('indobert_emotion', exist_ok=True)
model.save_pretrained('indobert_emotion/model')
tokenizer.save_pretrained('indobert_emotion/tokenizer')

# The label encoder is pickled alongside the model files.
# NOTE(review): only unpickle this file from a trusted source, and presumably
# with a compatible scikit-learn version - confirm in the consuming code.
with open('indobert_emotion/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

print("‚úÖ Model saved!")

In [None]:
# Test prediction
def predict_emotion(text):
    """Predict the emotion label for a single raw text.

    The text is cleaned with ``preprocess_text``, tokenized to ``MAX_LEN``
    tokens and scored by the module-level ``model`` in eval mode on
    ``device``.

    Args:
        text: Raw input text.

    Returns:
        Tuple ``(emotion, conf)``: the label decoded by ``label_encoder``
        and the softmax probability of that class as a Python float.
    """
    model.eval()
    cleaned = preprocess_text(text)
    encoding = tokenizer(
        cleaned,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    # Only these two tensors are fed to the model, on the model's device.
    model_inputs = {
        field: encoding[field].to(device)
        for field in ('input_ids', 'attention_mask')
    }

    # Gradients are not needed for inference.
    with torch.no_grad():
        logits = model(**model_inputs).logits
        probs = torch.softmax(logits, dim=1)
        pred = torch.argmax(probs, dim=1).item()
        conf = probs[0][pred].item()

    return label_encoder.inverse_transform([pred])[0], conf

# Test samples
# A handful of hand-written Indonesian sentences used as a quick smoke test
# of predict_emotion.
test_texts = [
    "Senang banget hari ini!",
    "Sedih sekali dia pergi",
    "Marah aku sama kamu!",
    "Takut dengan keadaan ini",
    "Aku cinta kamu",
    "Hari ini biasa saja"
]

# Print each sentence followed by its predicted label and confidence (in %).
print("\nüß™ Test Predictions:")
print("="*50)
for text in test_texts:
    emotion, conf = predict_emotion(text)
    print(f'"{text}"')
    print(f"  ‚Üí {emotion} ({conf*100:.1f}%)")
    print()

In [None]:
# Download model
# Zip the saved artifacts with a notebook shell escape, then trigger a
# download through `files`, the google.colab helper imported in the upload
# cell.
!zip -r indobert_emotion.zip indobert_emotion/
files.download('indobert_emotion.zip')
print("\n‚úÖ Download started! Extract zip dan letakkan di folder saved_models_indobert")