In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import re
import time

In [2]:
# ============================================================================
# BAGIAN 1: PERSIAPAN DATA
# ============================================================================

print("="*80)
print("SELF-TRAINING SENTIMENT ANALYSIS UNTUK KOMENTAR YOUTUBE")
print("="*80)

SELF-TRAINING SENTIMENT ANALYSIS UNTUK KOMENTAR YOUTUBE


In [3]:
# Load data labeled (training set)
print("\n[1] Loading data labeled...")
df_positif = pd.read_csv('komentar_positif.csv')
df_negatif = pd.read_csv('komentar_negatif.csv')
df_netral = pd.read_csv('komentar_netral.csv')


[1] Loading data labeled...


In [4]:
# Tambahkan label
df_positif['label'] = 'positif'
df_negatif['label'] = 'negatif'
df_netral['label'] = 'netral'

In [5]:
# Gabungkan data labeled
df_labeled = pd.concat([df_positif, df_negatif, df_netral], ignore_index=True)
print(f"   Total data labeled: {len(df_labeled)} baris")
print(f"   - Positif: {len(df_positif)}")
print(f"   - Negatif: {len(df_negatif)}")
print(f"   - Netral: {len(df_netral)}")

   Total data labeled: 350 baris
   - Positif: 150
   - Negatif: 150
   - Netral: 50


In [6]:
# Load data unlabeled (semua komentar YouTube)
print("\n[2] Loading data unlabeled...")
df_unlabeled = pd.read_csv('youtube_comments.csv')
print(f"   Total data unlabeled: {len(df_unlabeled)} baris")


[2] Loading data unlabeled...
   Total data unlabeled: 2157 baris


In [7]:
# ============================================================================
# BAGIAN 2: TEXT PREPROCESSING
# ============================================================================

def preprocess_text(text):
    """Preprocessing text sederhana"""
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

print("\n[3] Preprocessing text...")
df_labeled['comment_clean'] = df_labeled['comment'].apply(preprocess_text)
df_unlabeled['comment_clean'] = df_unlabeled['comment'].apply(preprocess_text)


[3] Preprocessing text...


In [8]:
# ============================================================================
# BAGIAN 3: INITIAL MODEL TRAINING
# ============================================================================

print("\n[4] Training initial model...")

# Vectorization
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_labeled = vectorizer.fit_transform(df_labeled['comment_clean'])
y_labeled = df_labeled['label']

# Split untuk evaluasi
X_train, X_test, y_train, y_test = train_test_split(
    X_labeled, y_labeled, test_size=0.2, random_state=42, stratify=y_labeled
)

# Train initial model
initial_model = MultinomialNB()
initial_model.fit(X_train, y_train)

# Evaluasi initial model
print("\n   Evaluasi Initial Model:")
y_pred = initial_model.predict(X_test)
print(classification_report(y_test, y_pred))


[4] Training initial model...

   Evaluasi Initial Model:
              precision    recall  f1-score   support

     negatif       0.62      0.77      0.69        30
      netral       0.00      0.00      0.00        10
     positif       0.70      0.77      0.73        30

    accuracy                           0.66        70
   macro avg       0.44      0.51      0.47        70
weighted avg       0.57      0.66      0.61        70



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
# ============================================================================
# BAGIAN 4: SELF-TRAINING PROCESS
# ============================================================================

print("\n[5] Self-Training Process...")
print("   Memprediksi label untuk data unlabeled...")

# Prediksi pada data unlabeled
X_unlabeled = vectorizer.transform(df_unlabeled['comment_clean'])
predictions = initial_model.predict(X_unlabeled)
prediction_proba = initial_model.predict_proba(X_unlabeled)

# Ambil confidence score (probabilitas maksimum)
confidence_scores = np.max(prediction_proba, axis=1)

# Tambahkan hasil prediksi ke dataframe
df_unlabeled['predicted_label'] = predictions
df_unlabeled['confidence'] = confidence_scores


[5] Self-Training Process...
   Memprediksi label untuk data unlabeled...


In [10]:
# ============================================================================
# BAGIAN 5: LABELING DENGAN LLM (SIMULASI)
# ============================================================================

print("\n[6] Labeling dengan LLM...")
print("   (Dalam praktik nyata, gunakan API Claude/GPT untuk labeling)")

# Simulasi LLM labeling - dalam praktik nyata, gunakan API
# Untuk demo, kita akan menggunakan keyword-based labeling sebagai simulasi
def simulate_llm_label(text):
    """Simulasi LLM labeling menggunakan keyword-based approach"""
    text = str(text).lower()
    
    # Keywords positif
    positive_keywords = ['bagus', 'mantap', 'keren', 'suka', 'hebat', 'top', 
                        'terbaik', 'sukses', 'senang', 'terima kasih', 'good',
                        'love', 'amazing', 'great', 'excellent', 'perfect']
    
    # Keywords negatif
    negative_keywords = ['buruk', 'jelek', 'tidak', 'gak', 'benci', 'kecewa',
                        'gagal', 'bad', 'worst', 'hate', 'terrible', 'boring',
                        'sampah', 'payah']
    
    # Hitung score
    pos_score = sum(1 for word in positive_keywords if word in text)
    neg_score = sum(1 for word in negative_keywords if word in text)
    
    if pos_score > neg_score:
        return 'positif'
    elif neg_score > pos_score:
        return 'negatif'
    else:
        return 'netral'

# Terapkan LLM labeling (simulasi)
df_unlabeled['llm_label'] = df_unlabeled['comment'].apply(simulate_llm_label)

print("   LLM labeling selesai!")


[6] Labeling dengan LLM...
   (Dalam praktik nyata, gunakan API Claude/GPT untuk labeling)
   LLM labeling selesai!


In [11]:
# ============================================================================
# BAGIAN 6: SELF-TRAINING DENGAN THRESHOLD
# ============================================================================

print("\n[7] Selecting high-confidence predictions untuk self-training...")

# Filter prediksi dengan confidence tinggi (threshold 0.8)
CONFIDENCE_THRESHOLD = 0.8
df_high_confidence = df_unlabeled[df_unlabeled['confidence'] >= CONFIDENCE_THRESHOLD].copy()

print(f"   Data dengan confidence >= {CONFIDENCE_THRESHOLD}: {len(df_high_confidence)} baris")

# Gabungkan dengan data labeled original untuk retraining
df_augmented = pd.concat([
    df_labeled[['comment', 'comment_clean', 'label']],
    df_high_confidence[['comment', 'comment_clean', 'predicted_label']].rename(
        columns={'predicted_label': 'label'}
    )
], ignore_index=True)

print(f"   Total data untuk retraining: {len(df_augmented)} baris")

# Retrain model dengan augmented data
print("\n[8] Retraining model dengan augmented data...")
X_augmented = vectorizer.transform(df_augmented['comment_clean'])
y_augmented = df_augmented['label']

retrained_model = MultinomialNB()
retrained_model.fit(X_augmented, y_augmented)

# Test pada test set original
y_pred_retrained = retrained_model.predict(X_test)
print("\n   Evaluasi Retrained Model:")
print(classification_report(y_test, y_pred_retrained))


[7] Selecting high-confidence predictions untuk self-training...
   Data dengan confidence >= 0.8: 10 baris
   Total data untuk retraining: 360 baris

[8] Retraining model dengan augmented data...

   Evaluasi Retrained Model:
              precision    recall  f1-score   support

     negatif       0.79      1.00      0.88        30
      netral       0.00      0.00      0.00        10
     positif       0.91      0.97      0.94        30

    accuracy                           0.84        70
   macro avg       0.57      0.66      0.61        70
weighted avg       0.73      0.84      0.78        70



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
# ============================================================================
# BAGIAN 7: FINAL PREDICTION & EXPORT
# ============================================================================

print("\n[9] Final prediction pada semua data unlabeled...")

# Prediksi final dengan retrained model
final_predictions = retrained_model.predict(X_unlabeled)
final_proba = retrained_model.predict_proba(X_unlabeled)
final_confidence = np.max(final_proba, axis=1)

# Update dataframe dengan final predictions
df_unlabeled['final_predicted_label'] = final_predictions
df_unlabeled['final_confidence'] = final_confidence


[9] Final prediction pada semua data unlabeled...


In [13]:
# ============================================================================
# BAGIAN 8: EXPORT RESULTS
# ============================================================================

print("\n[10] Exporting results...")

# Siapkan dataframe untuk export
df_export = df_unlabeled[[
    'comment',
    'predicted_label',      # Label dari initial model
    'confidence',           # Confidence dari initial model
    'llm_label',           # Label dari LLM (simulasi)
    'final_predicted_label', # Label dari retrained model
    'final_confidence'      # Confidence dari retrained model
]].copy()

# Rename kolom untuk clarity
df_export.columns = [
    'comment',
    'model_label_initial',
    'model_confidence_initial',
    'llm_label',
    'model_label_final',
    'model_confidence_final'
]

# Export ke CSV
df_export.to_csv('data_self_training.csv', index=False, encoding='utf-8-sig')
print("   ✓ Results exported to: data_self_training.csv")


[10] Exporting results...
   ✓ Results exported to: data_self_training.csv


In [14]:
# ============================================================================
# BAGIAN 9: SUMMARY & STATISTICS
# ============================================================================

print("\n" + "="*80)
print("SUMMARY & STATISTICS")
print("="*80)

print("\n[Initial Model Distribution]")
print(df_export['model_label_initial'].value_counts())
print(f"\nAverage Confidence: {df_export['model_confidence_initial'].mean():.4f}")

print("\n[LLM Label Distribution]")
print(df_export['llm_label'].value_counts())

print("\n[Final Model Distribution]")
print(df_export['model_label_final'].value_counts())
print(f"\nAverage Confidence: {df_export['model_confidence_final'].mean():.4f}")

# Agreement analysis
print("\n[Agreement Analysis]")
agreement = (df_export['model_label_initial'] == df_export['llm_label']).sum()
print(f"Model vs LLM Agreement: {agreement}/{len(df_export)} ({agreement/len(df_export)*100:.2f}%)")

print("\n[High Confidence Predictions]")
high_conf_count = (df_export['model_confidence_final'] >= 0.8).sum()
print(f"Predictions with confidence >= 0.8: {high_conf_count}/{len(df_export)} ({high_conf_count/len(df_export)*100:.2f}%)")

print("\n" + "="*80)
print("SELF-TRAINING PROCESS COMPLETED!")
print("="*80)
print("\nFile output: data_self_training.csv")
print("Kolom yang tersedia:")
print("  - comment: Komentar asli")
print("  - model_label_initial: Label dari initial model")
print("  - model_confidence_initial: Confidence dari initial model")
print("  - llm_label: Label dari LLM")
print("  - model_label_final: Label dari retrained model")
print("  - model_confidence_final: Confidence dari retrained model")


SUMMARY & STATISTICS

[Initial Model Distribution]
model_label_initial
positif    1106
negatif    1050
netral        1
Name: count, dtype: int64

Average Confidence: 0.5275

[LLM Label Distribution]
llm_label
netral     1973
positif     105
negatif      79
Name: count, dtype: int64

[Final Model Distribution]
model_label_final
positif    1433
negatif     722
netral        2
Name: count, dtype: int64

Average Confidence: 0.5461

[Agreement Analysis]
Model vs LLM Agreement: 110/2157 (5.10%)

[High Confidence Predictions]
Predictions with confidence >= 0.8: 23/2157 (1.07%)

SELF-TRAINING PROCESS COMPLETED!

File output: data_self_training.csv
Kolom yang tersedia:
  - comment: Komentar asli
  - model_label_initial: Label dari initial model
  - model_confidence_initial: Confidence dari initial model
  - llm_label: Label dari LLM
  - model_label_final: Label dari retrained model
  - model_confidence_final: Confidence dari retrained model
