In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Notebook banner: title and stage name framed by two 70-character rules.
print(
    "=" * 70,
    "ANALISIS SENTIMEN: KASUS SALAH TANGKAP BJORKA",
    "Tahap 2: Pelabelan Sentimen dengan IndoBERT",
    "=" * 70,
    sep="\n",
)

ANALISIS SENTIMEN: KASUS SALAH TANGKAP BJORKA
Tahap 2: Pelabelan Sentimen dengan IndoBERT


In [3]:
# 1. LOAD DATA
# Read the comment table from CSV, then report its row count and column names.
print("\n[1] MEMUAT DATA...", "-" * 70, sep="\n")
df = pd.read_csv('youtube_comments_with_stats.csv')
print(
    f"✓ Data berhasil dimuat: {len(df)} komentar",
    f"✓ Kolom: {df.columns.tolist()}",
    sep="\n",
)


[1] MEMUAT DATA...
----------------------------------------------------------------------
✓ Data berhasil dimuat: 2177 komentar
✓ Kolom: ['clean_comment', 'char_length', 'word_count']


In [4]:
# 2. SETUP INDOBERT MODEL
print("\n[2] SETUP MODEL INDOBERT...")
print("-"*70)
print("⏳ Downloading model IndoBERT (ini mungkin butuh waktu di run pertama)...")

# Checkpoint used for labelling. This is the identifier passed to
# `from_pretrained`, so it is the model that actually produces the labels.
model_name = "mdhugol/indonesia-bert-sentiment-classification"

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
except Exception as e:
    print(f"✗ Error loading model: {e}")
    # NOTE(review): the alternatives printed below look like base checkpoints
    # without a sentiment head — confirm the IDs and that they are fine-tuned
    # for sentiment before switching to them.
    print("\n⚠️ Alternatif: Coba model lain jika error")
    print("   - IndoNLU: 'indonlu/indobert-base-p1'")
    print("   - mBERT multilingual: 'bert-base-multilingual-uncased'")
    # Re-raise rather than calling exit(): inside a notebook exit() shuts the
    # kernel down, whereas re-raising stops execution at this cell and keeps
    # the traceback visible.
    raise
else:
    print("✓ Model berhasil dimuat!")
    print(f"✓ Model: {model_name}")

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"✓ Device: {device}")



[2] SETUP MODEL INDOBERT...
----------------------------------------------------------------------
⏳ Downloading model IndoBERT (ini mungkin butuh waktu di run pertama)...
✓ Model berhasil dimuat!
✓ Model: mdhugol/indonesia-bert-sentiment-classification
✓ Device: cpu


In [5]:
# 3. PREPROCESSING FOR BATCH PREDICTION
print("\n[3] PREPROCESSING DATA...", "-" * 70, sep="\n")

class CommentDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Args:
        comments: Sequence of comments indexed by integer position (for
            example the ``.values`` array of a DataFrame column).
        tokenizer: Callable tokenizer; it is invoked as
            ``tokenizer(text, ...)`` and must return a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every encoded comment is padded or truncated to.
    """

    def __init__(self, comments, tokenizer, max_length=128):
        self.comments = comments
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of comments."""
        return len(self.comments)

    @staticmethod
    def _to_text(comment):
        """Return `comment` as text, mapping missing values to ''.

        Without this, ``str()`` would turn a missing value (NaN / None /
        pd.NA, e.g. an empty CSV cell) into the literal word "nan" or "None",
        which would then be classified as if it were a real comment.
        """
        if isinstance(comment, str):
            return comment
        if pd.api.types.is_scalar(comment) and pd.isna(comment):
            return ""
        return str(comment)

    def __getitem__(self, idx):
        """Tokenize the comment at position `idx`.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors.
        """
        comment = self._to_text(self.comments[idx])
        # Call the tokenizer directly; `encode_plus` is the deprecated
        # spelling of the same single-text encoding.
        encoding = self.tokenizer(
            comment,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        # return_tensors='pt' yields a leading batch axis of size 1; flatten
        # it away so the DataLoader can stack items into (batch, max_length).
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

# Build the dataset and a loader that walks it in order.
batch_size = 16  # tune to the available RAM / GPU memory
dataset = CommentDataset(
    df['clean_comment'].values,
    tokenizer,
)
# shuffle=False keeps batches in row order, so the predictions collected
# later line up positionally with the rows of `df`.
dataloader = DataLoader(dataset, shuffle=False, batch_size=batch_size)

print(
    f"✓ Dataset created: {len(dataset)} samples",
    f"✓ Batch size: {batch_size}",
    f"✓ Number of batches: {len(dataloader)}",
    sep="\n",
)


[3] PREPROCESSING DATA...
----------------------------------------------------------------------
✓ Dataset created: 2177 samples
✓ Batch size: 16
✓ Number of batches: 137


In [6]:
# 4. SENTIMENT PREDICTION
print(
    "\n[4] MELAKUKAN PREDIKSI SENTIMEN...",
    "-" * 70,
    "⏳ Memproses... (ini akan memakan waktu tergantung jumlah data)",
    sep="\n",
)

model.eval()
predictions, probabilities = [], []

# Gradients are not needed for inference, so disable autograd tracking.
with torch.no_grad():
    for batch in tqdm(dataloader, desc="Processing batches"):
        # Move both input tensors of the batch to the model's device.
        model_inputs = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask')
        }
        logits = model(**model_inputs).logits

        # Softmax over the class axis turns logits into probabilities.
        class_probs = logits.softmax(dim=1)

        # Predicted class = index of the highest probability.
        class_ids = class_probs.argmax(dim=1)

        # Collect one entry per comment, in loader order.
        predictions.extend(class_ids.cpu().numpy())
        probabilities.extend(class_probs.cpu().numpy())

print("\n✓ Prediksi selesai!")


[4] MELAKUKAN PREDIKSI SENTIMEN...
----------------------------------------------------------------------
⏳ Memproses... (ini akan memakan waktu tergantung jumlah data)


Processing batches: 100%|██████████| 137/137 [04:00<00:00,  1.75s/it]


✓ Prediksi selesai!





In [7]:
# 5. SENTIMENT LABEL MAPPING
print("\n[5] MAPPING LABEL SENTIMEN...")
print("-"*70)

# Class index -> label name (adjust to the model in use).
# For mdhugol/indonesia-bert-sentiment-classification:
# 0 = Positive, 1 = Neutral, 2 = Negative
sentiment_map = {
    0: 'positive',
    1: 'neutral',
    2: 'negative'
}

# Gather the per-comment results into arrays. float64 is an exact widening of
# the probabilities, so no value changes.
pred_ids = np.asarray(predictions, dtype=np.int64)
prob_matrix = np.asarray(probabilities, dtype=np.float64)
if prob_matrix.size == 0:
    # Keep a well-formed (0, n_classes) shape when there is nothing to label.
    prob_matrix = prob_matrix.reshape(0, len(sentiment_map))

# Fail loudly if the predictions do not match the table; otherwise a stale
# `predictions` list from an earlier run would be attached to the wrong rows.
expected_shape = (len(df), len(sentiment_map))
if pred_ids.shape != (len(df),) or prob_matrix.shape != expected_shape:
    raise ValueError(
        f"Prediction shapes {pred_ids.shape} / {prob_matrix.shape} do not match "
        f"the expected {(len(df),)} / {expected_shape}; re-run the prediction cell."
    )

df['sentiment_label'] = [sentiment_map[int(pred)] for pred in pred_ids]
df['sentiment_score'] = pred_ids

# Confidence = highest class probability of each comment.
df['confidence'] = prob_matrix.max(axis=1)

# One probability column per class, named after sentiment_map so the column
# names cannot drift from the label mapping. Assigning column by column
# (instead of pd.concat) is positional and idempotent: re-running this cell
# overwrites the columns rather than appending duplicates.
prob_columns = [f'prob_{sentiment_map[class_id]}' for class_id in sorted(sentiment_map)]
for col_pos, col_name in enumerate(prob_columns):
    df[col_name] = prob_matrix[:, col_pos]

print("✓ Label sentimen berhasil ditambahkan!")
print("✓ Kolom baru: sentiment_label, sentiment_score, confidence, prob_*")


[5] MAPPING LABEL SENTIMEN...
----------------------------------------------------------------------
✓ Label sentimen berhasil ditambahkan!
✓ Kolom baru: sentiment_label, sentiment_score, confidence, prob_*


In [8]:
# 6. RESULT ANALYSIS
print("\n[6] ANALISIS HASIL PELABELAN")
print("-" * 70)

# Label distribution: count and share of each predicted sentiment.
print("\nDistribusi Sentimen:")
sentiment_counts = df['sentiment_label'].value_counts()
for label, n_rows in sentiment_counts.items():
    share = (n_rows / len(df)) * 100
    print(f"  {label.capitalize():.<15} {n_rows:>6} ({share:>5.2f}%)")

# Overall confidence summary.
confidence = df['confidence']
print(f"\nRata-rata Confidence Score: {confidence.mean():.4f}")
print(f"Min Confidence Score: {confidence.min():.4f}")
print(f"Max Confidence Score: {confidence.max():.4f}")

# Mean confidence per sentiment, listed in sentiment_map order.
print("\nRata-rata Confidence per Sentimen:")
for label in sentiment_map.values():
    label_mean = df.loc[df['sentiment_label'] == label, 'confidence'].mean()
    print(f"  {label.capitalize():.<15} {label_mean:.4f}")


[6] ANALISIS HASIL PELABELAN
----------------------------------------------------------------------

Distribusi Sentimen:
  Negative.......   1280 (58.80%)
  Neutral........    512 (23.52%)
  Positive.......    385 (17.68%)

Rata-rata Confidence Score: 0.8984
Min Confidence Score: 0.3688
Max Confidence Score: 0.9983

Rata-rata Confidence per Sentimen:
  Positive....... 0.8616
  Neutral........ 0.8633
  Negative....... 0.9234


In [9]:
# 7. SAMPLE PREDICTIONS
print("\n[7] CONTOH HASIL PREDIKSI")
print("-"*70)


def _print_top_comments(frame, label, header, top_n=5, max_chars=150):
    """Print the `top_n` most confident comments predicted as `label`.

    Args:
        frame: Table with 'sentiment_label', 'confidence' and 'clean_comment'.
        label: Sentiment label to filter on.
        header: Heading line printed before the samples.
        top_n: Number of rows to show.
        max_chars: Maximum number of comment characters to print.

    Returns:
        The selected rows, ordered by descending confidence.
    """
    print(header)
    samples = frame[frame['sentiment_label'] == label].nlargest(top_n, 'confidence')
    for idx, row in samples.iterrows():
        raw = row['clean_comment']
        # A missing comment cannot be sliced; show it as empty text instead.
        text = "" if pd.isna(raw) else str(raw)
        # Append "..." only when the comment was really cut, so short
        # comments are not shown as if they had been truncated.
        preview = (text[:max_chars] + "...") if len(text) > max_chars else text
        print(f"\nKomentar #{idx+1} (Confidence: {row['confidence']:.4f})")
        print(f"  {preview}")
    return samples


positive_samples = _print_top_comments(
    df, 'positive', "\n🟢 CONTOH KOMENTAR POSITIVE (5 teratas):"
)
neutral_samples = _print_top_comments(
    df, 'neutral', "\n🟡 CONTOH KOMENTAR NEUTRAL (5 teratas):"
)
negative_samples = _print_top_comments(
    df, 'negative', "\n🔴 CONTOH KOMENTAR NEGATIVE (5 teratas):"
)


[7] CONTOH HASIL PREDIKSI
----------------------------------------------------------------------

🟢 CONTOH KOMENTAR POSITIVE (5 teratas):

Komentar #801 (Confidence: 0.9972)
  gaya bicara penyampaiann aa teguh enak banget...

Komentar #507 (Confidence: 0.9972)
  mantap...

Komentar #1435 (Confidence: 0.9972)
  422 sumpah estetik banget transisi halus warnawarnanya bikin adem 855dan musik latar cocok banget...

Komentar #2084 (Confidence: 0.9972)
  keren...

Komentar #865 (Confidence: 0.9969)
  istimewa...

🟡 CONTOH KOMENTAR NEUTRAL (5 teratas):

Komentar #941 (Confidence: 0.9983)
  nirvana090 cek mutasi uang kmna...

Komentar #1812 (Confidence: 0.9981)
  tau undang om dedy muncul ig...

Komentar #1922 (Confidence: 0.9980)
  bang ded coba undang pace komputer...

Komentar #1216 (Confidence: 0.9980)
  3621 deddy masuk kantor cyber tv pake video ala matrix...

Komentar #1854 (Confidence: 0.9976)
  ngk undang pakar telematika sang ungkap video gambar pnas...

🔴 CONTOH KOMENTAR NEGATIVE (5

In [10]:
# 8. SAVE RESULTS
print("\n[8] MENYIMPAN HASIL...")
print("-" * 70)

# Full table, including the sentiment columns.
output_file = 'youtube_comments_labeled.csv'
df.to_csv(output_file, index=False, encoding='utf-8')
print(f"✓ Data dengan label sentimen disimpan: '{output_file}'")

# Plain-text statistics report: label distribution, total, mean confidences.
stats_file = 'sentiment_statistics.txt'
heavy_rule = "=" * 70 + "\n"
light_rule = "-" * 70 + "\n"
with open(stats_file, 'w', encoding='utf-8') as f:
    report = [
        heavy_rule,
        "STATISTIK PELABELAN SENTIMEN - INDOBERT\n",
        heavy_rule + "\n",
        "DISTRIBUSI SENTIMEN:\n",
        light_rule,
    ]
    # One line per label, in the order stored in sentiment_counts.
    report += [
        f"{label.capitalize():.<20} {n_rows:>8} ({(n_rows / len(df)) * 100:>6.2f}%)\n"
        for label, n_rows in sentiment_counts.items()
    ]
    report += [
        f"\n\nTOTAL KOMENTAR: {len(df)}\n",
        f"Rata-rata Confidence: {df['confidence'].mean():.4f}\n",
        "\n\nCONFIDENCE SCORE PER SENTIMEN:\n",
        light_rule,
    ]
    # Mean confidence per label, in sentiment_map order.
    report += [
        f"{label.capitalize():.<20} "
        f"{df.loc[df['sentiment_label'] == label, 'confidence'].mean():.4f}\n"
        for label in sentiment_map.values()
    ]
    f.writelines(report)

print(f"✓ Statistik sentimen disimpan: '{stats_file}'")

# One CSV per sentiment label.
for label in sentiment_map.values():
    sentiment_file = f'comments_{label}.csv'
    df[df['sentiment_label'] == label].to_csv(sentiment_file, index=False, encoding='utf-8')
    print(f"✓ Komentar {label} disimpan: '{sentiment_file}'")

print("\n" + "=" * 70, "✓✓✓ TAHAP 2 SELESAI! ✓✓✓", "=" * 70, sep="\n")


[8] MENYIMPAN HASIL...
----------------------------------------------------------------------
✓ Data dengan label sentimen disimpan: 'youtube_comments_labeled.csv'
✓ Statistik sentimen disimpan: 'sentiment_statistics.txt'
✓ Komentar positive disimpan: 'comments_positive.csv'
✓ Komentar neutral disimpan: 'comments_neutral.csv'
✓ Komentar negative disimpan: 'comments_negative.csv'

✓✓✓ TAHAP 2 SELESAI! ✓✓✓
