In [1]:
#=======================================================================
# KLASIFIKASI DATA TRANSFORMER (BERT-based)
#=======================================================================

In [2]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Text preprocessing
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import nltk
from nltk.tokenize import word_tokenize

In [4]:
# Feature extraction (untuk perbandingan)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec
import gensim.downloader as api

In [8]:
# Transformer Libraries
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    Trainer, 
    TrainingArguments,
    BertTokenizer,
    BertForSequenceClassification
)
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder




In [9]:
# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

In [10]:
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from math import pi

In [11]:
# Download NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

In [12]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Using device: cpu


In [14]:
#=======================================================================
# BAGIAN 1: LOAD DATA
#=======================================================================

print("\n[1] Loading Data...")
df_positif = pd.read_csv('komentar_positif.csv')
df_negatif = pd.read_csv('komentar_negatif.csv')
df_netral = pd.read_csv('komentar_netral.csv')

# Tambahkan label
df_positif['label'] = 'positif'
df_negatif['label'] = 'negatif'
df_netral['label'] = 'netral'

# Gabungkan semua data
df = pd.concat([df_positif, df_negatif, df_netral], ignore_index=True)

print(f"  Total data: {len(df)}")
print(f"  - Positif: {len(df_positif)}")
print(f"  - Negatif: {len(df_negatif)}")
print(f"  - Netral: {len(df_netral)}")


[1] Loading Data...
  Total data: 350
  - Positif: 150
  - Negatif: 150
  - Netral: 50


In [15]:
#=======================================================================
# BAGIAN 2: TEXT PREPROCESSING
#=======================================================================

print("\n[2] Text Preprocessing (Cleaning, Stemming, Stopword Removal)...")

# Initialize Sastrawi
factory_stemmer = StemmerFactory()
stemmer = factory_stemmer.create_stemmer()

factory_stopword = StopWordRemoverFactory()
stopword_remover = factory_stopword.create_stop_word_remover()

def clean_text(text):
    """Cleaning text"""
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

def preprocess_text(text):
    """Full preprocessing: cleaning, stopword removal, stemming"""
    text = clean_text(text)
    text = stopword_remover.remove(text)
    text = stemmer.stem(text)
    return text

# Apply preprocessing
df['comment_clean'] = df['comment'].apply(clean_text)
df['comment_processed'] = df['comment'].apply(preprocess_text)

print("  ✓ Preprocessing selesai!")
print(f"\n  Contoh preprocessing:")
print(f"  Original : {df['comment'].iloc[0][:80]}...")
print(f"  Cleaned  : {df['comment_clean'].iloc[0][:80]}...")
print(f"  Processed: {df['comment_processed'].iloc[0][:80]}...")


[2] Text Preprocessing (Cleaning, Stemming, Stopword Removal)...
  ✓ Preprocessing selesai!

  Contoh preprocessing:
  Original : anak pimpin nama purnawan efendi anak laki mukin umur anak normal umumnyapas kel...
  Cleaned  : anak pimpin nama purnawan efendi anak laki mukin umur anak normal umumnyapas kel...
  Processed: anak pimpin nama purnawan efendi anak laki mukin umur anak normal umumnyapas kel...


In [16]:
#=======================================================================
# BAGIAN 3: PREPARE DATA FOR TRANSFORMER
#=======================================================================

print("\n[3] Preparing Data for Transformer...")


[3] Preparing Data for Transformer...


In [17]:
# Encode labels
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

In [18]:
# Split data
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['comment_processed'], 
    df['label_encoded'], 
    test_size=0.2, 
    random_state=42, 
    stratify=df['label_encoded']
)

In [19]:
# Also get original labels for reporting
_, _, y_train_labels, y_test_labels = train_test_split(
    df['comment_processed'], 
    df['label'], 
    test_size=0.2, 
    random_state=42, 
    stratify=df['label']
)

print(f"  Training data: {len(X_train_text)}")
print(f"  Testing data: {len(X_test_text)}")
print(f"  Label mapping: {dict(enumerate(label_encoder.classes_))}")


  Training data: 280
  Testing data: 70
  Label mapping: {0: 'negatif', 1: 'netral', 2: 'positif'}


In [20]:
#=======================================================================
# BAGIAN 4: CREATE DATASET CLASS FOR TRANSFORMER
#=======================================================================

class TextDataset(Dataset):
    """Custom Dataset for Transformer"""
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [21]:
#=======================================================================
# BAGIAN 5: TRANSFORMER CLASSIFICATION
#=======================================================================

results = []

print("\n" + "="*80)
print("TRANSFORMER-BASED CLASSIFICATION")
print("="*80)


TRANSFORMER-BASED CLASSIFICATION


In [23]:
#---------------------------------------------------------------------------
# 5.1 IndoBERT (Indonesian BERT)
#---------------------------------------------------------------------------

print("\n[5.1] IndoBERT (Indonesian BERT)")
print("-" * 50)

try:
    print("  Loading IndoBERT model...")
    model_name = "indobenchmark/indobert-base-p1"
    
    tokenizer_indobert = AutoTokenizer.from_pretrained(model_name)
    model_indobert = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=3,
        problem_type="single_label_classification"
    )
    model_indobert.to(device)
    
    # Create datasets
    train_dataset = TextDataset(X_train_text, y_train, tokenizer_indobert)
    test_dataset = TextDataset(X_test_text, y_test, tokenizer_indobert)
    
    # Training arguments
    training_args = TrainingArguments(
        output_dir='./results_indobert',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        disable_tqdm=False
    )
    
    # Trainer
    trainer = Trainer(
        model=model_indobert,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset
    )
    
    print("  Training IndoBERT model...")
    trainer.train()
    
    # Predictions
    print("  Making predictions...")
    predictions = trainer.predict(test_dataset)
    y_pred_indobert = np.argmax(predictions.predictions, axis=1)
    
    # Metrics
    acc_indobert = accuracy_score(y_test, y_pred_indobert)
    prec_indobert = precision_score(y_test, y_pred_indobert, average='weighted')
    rec_indobert = recall_score(y_test, y_pred_indobert, average='weighted')
    f1_indobert = f1_score(y_test, y_pred_indobert, average='weighted')
    
    print(f"  Accuracy : {acc_indobert:.4f}")
    print(f"  Precision: {prec_indobert:.4f}")
    print(f"  Recall   : {rec_indobert:.4f}")
    print(f"  F1-Score : {f1_indobert:.4f}")
    
    results.append({
        'Method': 'IndoBERT',
        'Accuracy': acc_indobert,
        'Precision': prec_indobert,
        'Recall': rec_indobert,
        'F1-Score': f1_indobert
    })
    
    # Convert predictions back to original labels
    y_pred_indobert_labels = label_encoder.inverse_transform(y_pred_indobert)
    
    print("\n  Classification Report:")
    print(classification_report(y_test_labels, y_pred_indobert_labels))
    
except Exception as e:
    print(f"  ⚠ Error with IndoBERT: {e}")
    print("  Skipping IndoBERT...")


[5.1] IndoBERT (Indonesian BERT)
--------------------------------------------------
  Loading IndoBERT model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  ⚠ Error with IndoBERT: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'
  Skipping IndoBERT...
