In [3]:
import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModel, 
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, 
    precision_recall_fscore_support, 
    confusion_matrix, 
    classification_report,
    roc_auc_score,
    roc_curve
)
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

# Seed the global PyTorch and NumPy random number generators with a fixed
# value so that repeated runs start from the same random state.
torch.manual_seed(42)
np.random.seed(42)

In [4]:
# Tags (lower-case) that mark an entry as a non-hoax article / clarification.
NON_HOAX_TAGS = ['berita', 'fakta', 'klarifikasi', 'benar', 'cek fakta']


def contains_non_hoax_tag(text):
    """Tell whether ``text`` opens with a bracketed tag listed in NON_HOAX_TAGS.

    The tag must sit at the very start of the string and be wrapped in
    ``[]`` or ``()``; surrounding whitespace inside the brackets and letter
    case are ignored when comparing.

    Args:
        text: Title string, or a missing value (None / NaN).

    Returns:
        True when the leading tag is one of NON_HOAX_TAGS, otherwise False
        (including for missing values and for text without a leading tag).
    """
    if pd.isna(text):
        return False

    # re.match anchors at position 0, so only a tag at the start is considered.
    leading_tag = re.match(r'[\[\(]\s*([^\]\)]+?)\s*[\]\)]', text, flags=re.IGNORECASE)
    if leading_tag is None:
        return False

    return leading_tag.group(1).strip().lower() in NON_HOAX_TAGS

def clean_text(text):
    """Normalise raw article text into lower-case, letters-only form.

    Steps, applied in order on the lower-cased input:
      1. drop a leading bracketed tag such as ``[hoax]`` or ``(salah)``;
      2. drop URLs (anything starting with ``http`` or ``www``);
      3. replace every character that is not ``a-z`` or whitespace by a space;
      4. collapse whitespace runs and trim the ends.

    Args:
        text: Raw string, or a missing value (None / NaN).

    Returns:
        The cleaned string; ``""`` for missing values.
    """
    if pd.isna(text):
        return ""

    # (pattern, replacement) pairs; the order matters and mirrors the steps above.
    substitutions = (
        (r'^[\[\(]\s*[^)\]]+\s*[\)\]]', ''),
        (r'http\S+|www\S+', ''),
        (r'[^a-z\s]', ' '),
        (r'\s+', ' '),
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def preprocess_dataframe(df):
    """Filter and clean a raw article frame into model-ready text rows.

    Rows whose ``judul`` starts with a non-hoax tag are removed, ``judul`` and
    ``summary`` are joined (missing values treated as empty strings) and passed
    through ``clean_text``, and rows that end up with empty text or a missing
    ``hoax`` value are dropped. The input frame is not modified.

    Args:
        df: DataFrame with ``judul``, ``summary`` and ``hoax`` columns.

    Returns:
        DataFrame with exactly the columns ``clean_text`` and ``hoax``.
    """
    tagged_non_hoax = df['judul'].apply(contains_non_hoax_tag)
    frame = df.loc[~tagged_non_hoax].copy()

    title_part = frame['judul'].fillna('')
    summary_part = frame['summary'].fillna('')
    frame['text'] = title_part + ' ' + summary_part
    frame['clean_text'] = frame['text'].apply(clean_text)

    has_content = frame['clean_text'].str.strip().ne('')
    frame = frame.loc[has_content]
    frame = frame.dropna(subset=['clean_text', 'hoax'])
    return frame[['clean_text', 'hoax']]

def load_and_prepare_data(data_dir=None):
    """Load the four summarised news CSVs, label them and concatenate them.

    Each file is read with ``pd.read_csv``, passed through
    ``preprocess_dataframe`` and then labelled by source: CNN, Detik and
    Kompas rows get ``hoax = 0``; TurnBackHoax rows get ``hoax = 1``. Any
    ``hoax`` values present in the files are overwritten by this rule.

    Args:
        data_dir: Directory containing ``Summarized_CNN.csv``,
            ``Summarized_Detik.csv``, ``Summarized_Kompas.csv`` and
            ``Summarized_TurnBackHoax.csv``. When None, the original
            hard-coded project directory is used, so existing callers are
            unaffected.

    Returns:
        DataFrame with columns ``clean_text`` and ``hoax`` and a fresh
        0..n-1 index.
    """
    if data_dir is None:
        data_dir = 'D:/Semester 4/CCI/The Hack 2025/deteksi_berita_hoax-CCI_THE_HACK/Summarized'

    source_files = {
        "cnn": 'Summarized_CNN.csv',
        "detik": 'Summarized_Detik.csv',
        "kompas": 'Summarized_Kompas.csv',
        "tbh": 'Summarized_TurnBackHoax.csv',
    }
    hoax_source = 'tbh'

    all_data = []

    for source, filename in source_files.items():
        print(f"Loading {source} dataset...")
        df = pd.read_csv(f"{data_dir}/{filename}")

        # Label by origin: only the TurnBackHoax source counts as hoax (1).
        # .assign returns a new frame, so the label is never written into a
        # frame derived from slicing (no chained-assignment ambiguity).
        label = 1 if source == hoax_source else 0
        df_processed = preprocess_dataframe(df).assign(hoax=label)

        print(f"{source}: {len(df_processed)} samples")
        all_data.append(df_processed)

        if source == hoax_source:
            # Show a few rows before/after cleaning as a sanity check.
            print("\nContoh baris TBH sebelum preprocessing:")
            print(df.head(3)[['judul', 'summary']])
            print("\nContoh baris TBH setelah preprocessing:")
            print(df_processed.head(3))

    # Combine all sources into one frame with a fresh index.
    combined_df = pd.concat(all_data, ignore_index=True)
    print(f"\nTotal combined dataset: {len(combined_df)} samples")
    print(f"Class distribution:\n{combined_df['hoax'].value_counts()}")

    return combined_df

In [5]:
class HoaxDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of texts (each item is passed through ``str``).
        labels: Indexable collection of integer class labels, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')`` and returning a mapping
            with ``input_ids`` and ``attention_mask`` tensors.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of samples (length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with flattened ``input_ids`` / ``attention_mask`` and a long ``labels`` tensor."""
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        tokenizer_kwargs = dict(
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        encoded = self.tokenizer(raw_text, **tokenizer_kwargs)

        # The tokenizer output carries a leading batch axis; flatten removes it.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

In [6]:
class IndoBERTHoaxClassifier(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        num_classes: Number of output logits.
        freeze_layers: Encoder layers with index below this value have
            ``requires_grad`` set to False; the remaining encoder layers are
            set to True. Parameters outside ``bert.encoder.layer`` are not touched.
    """

    def __init__(self, model_name='indobenchmark/indobert-base-p1', num_classes=2, freeze_layers=8):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)

        # Freeze the lower encoder layers and keep the upper ones trainable.
        for layer_idx, encoder_layer in enumerate(self.bert.encoder.layer):
            trainable = layer_idx >= freeze_layers
            for weight in encoder_layer.parameters():
                weight.requires_grad = trainable

        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class logits computed from the encoder's ``pooler_output``."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(self.dropout(encoded.pooler_output))

In [7]:
def train_epoch(model, data_loader, optimizer, scheduler, device):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    For every batch the model is called with ``input_ids`` and
    ``attention_mask``, a cross-entropy loss against ``labels`` is
    back-propagated, and both the optimizer and the scheduler are stepped.

    Args:
        model: Module returning class logits.
        data_loader: Sized iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of per-batch losses divided by ``len(data_loader)``.
    """
    model.train()
    criterion = nn.CrossEntropyLoss()
    running_loss = 0

    batches = tqdm(data_loader, desc="Training", leave=False)
    for batch in batches:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids=ids, attention_mask=mask)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()

        optimizer.step()
        scheduler.step()

        step_loss = batch_loss.item()
        running_loss += step_loss

        # Show the current batch loss on the progress bar.
        batches.set_postfix(loss=f"{step_loss:.4f}")

    return running_loss / len(data_loader)

def eval_model(model, data_loader, device):
    """Run the model over ``data_loader`` without gradients and collect results.

    Args:
        model: Module returning class logits.
        data_loader: Iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(predictions, actual_labels, probabilities)`` of Python lists:
        the argmax class per sample, the true label per sample, and the
        softmax probability row per sample (NumPy values).
    """
    model.eval()
    predictions, actual_labels, probabilities = [], [], []

    batches = tqdm(data_loader, desc="Evaluating", leave=False)
    with torch.no_grad():
        for batch in batches:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            logits = model(input_ids=ids, attention_mask=mask)
            class_probs = torch.softmax(logits, dim=1)
            predicted = torch.argmax(logits, dim=1)

            predictions.extend(predicted.cpu().numpy())
            actual_labels.extend(targets.cpu().numpy())
            probabilities.extend(class_probs.cpu().numpy())

    return predictions, actual_labels, probabilities

In [8]:
def calculate_metrics(y_true, y_pred, y_probs):
    """Compute accuracy, weighted precision/recall/F1 and ROC AUC.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        y_probs: Per-sample probability rows; element ``[1]`` of each row is
            used as the score for class 1 (hoax) in the ROC AUC.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall``, ``f1_score``
        and ``roc_auc``.
    """
    acc = accuracy_score(y_true, y_pred)
    prec, rec, f_score, _support = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )

    # Score of the positive (hoax) class for each sample.
    hoax_scores = [row[1] for row in y_probs]
    auc = roc_auc_score(y_true, hoax_scores)

    return dict(
        accuracy=acc,
        precision=prec,
        recall=rec,
        f1_score=f_score,
        roc_auc=auc,
    )

def plot_confusion_matrix(y_true, y_pred):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as an annotated heatmap."""
    class_names = ['Non-Hoax', 'Hoax']
    matrix = confusion_matrix(y_true, y_pred)

    _, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_title('Confusion Matrix')
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    plt.show()

def plot_roc_curve(y_true, y_probs):
    """Draw the ROC curve for the class-1 (hoax) scores, with the AUC in the legend.

    Args:
        y_true: True class labels.
        y_probs: Per-sample probability rows; element ``[1]`` of each row is
            used as the positive-class score.
    """
    hoax_scores = [row[1] for row in y_probs]
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, hoax_scores)
    auc = roc_auc_score(y_true, hoax_scores)

    _, ax = plt.subplots(figsize=(8, 6))
    ax.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2,
            label=f'ROC curve (AUC = {auc:.3f})')
    # Diagonal reference line (chance level).
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax.legend(loc="lower right")
    plt.show()

In [9]:
def main_training():
    """End-to-end training pipeline.

    Loads the combined dataset, makes a stratified 70/15/15 train/val/test
    split, fine-tunes ``IndoBERTHoaxClassifier`` for 5 epochs with AdamW and
    a linear schedule, reports validation metrics per epoch and test metrics
    at the end, plots the confusion matrix and ROC curve, and writes the model
    weights, tokenizer and test metrics to the output directory.

    Returns:
        Tuple ``(model, tokenizer, test_metrics)``.
    """
    import os
    import pickle

    # --- Device -----------------------------------------------------------
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # --- Data -------------------------------------------------------------
    print("Loading and preparing data...")
    data = load_and_prepare_data()
    texts = data['clean_text'].values
    targets = data['hoax'].values

    # 70% train / 30% hold-out, then the hold-out is halved into val and test.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        texts, targets, test_size=0.3, random_state=42, stratify=targets
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=42, stratify=y_holdout
    )
    print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

    # --- Tokenizer and model ------------------------------------------------
    model_name = 'indobenchmark/indobert-base-p1'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = IndoBERTHoaxClassifier(model_name=model_name, freeze_layers=8).to(device)

    # --- Data loaders -------------------------------------------------------
    batch_size = 32

    def make_loader(split_texts, split_labels, shuffle):
        """Wrap one split in a HoaxDataset and a DataLoader."""
        dataset = HoaxDataset(split_texts, split_labels, tokenizer)
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    train_loader = make_loader(X_train, y_train, True)
    val_loader = make_loader(X_val, y_val, False)
    test_loader = make_loader(X_test, y_test, False)

    # --- Optimizer and schedule --------------------------------------------
    epochs = 5
    learning_rate = 1e-5

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )

    # --- Training loop ------------------------------------------------------
    print("Starting training...")
    loss_history = []

    for epoch_no in range(1, epochs + 1):
        print(f"\nEpoch {epoch_no}/{epochs}")

        epoch_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
        loss_history.append(epoch_loss)

        val_preds, val_true, val_probs = eval_model(model, val_loader, device)
        val_metrics = calculate_metrics(val_true, val_preds, val_probs)

        print(f"Train Loss: {epoch_loss:.4f}")
        print(f"Val Accuracy: {val_metrics['accuracy']:.4f}")
        print(f"Val F1-Score: {val_metrics['f1_score']:.4f}")

    # --- Final evaluation on the test split ---------------------------------
    separator = "=" * 50
    print("\n" + separator)
    print("FINAL EVALUATION ON TEST SET")
    print(separator)

    test_preds, test_true, test_probs = eval_model(model, test_loader, device)
    test_metrics = calculate_metrics(test_true, test_preds, test_probs)

    reported = (
        ('Accuracy', 'accuracy'),
        ('Precision', 'precision'),
        ('Recall', 'recall'),
        ('F1-Score', 'f1_score'),
        ('ROC-AUC', 'roc_auc'),
    )
    for label, key in reported:
        print(f"Test {label}: {test_metrics[key]:.4f}")

    print("\nClassification Report:")
    print(classification_report(test_true, test_preds, target_names=['Non-Hoax', 'Hoax']))

    plot_confusion_matrix(test_true, test_preds)
    plot_roc_curve(test_true, test_probs)

    # --- Persist model, tokenizer and metrics (Kaggle output directory) ------
    output_dir = '/kaggle/working/hoax_detector_model'
    os.makedirs(output_dir, exist_ok=True)

    torch.save(model.state_dict(), f'{output_dir}/hoax_detector_model.pth')
    tokenizer.save_pretrained(f'{output_dir}/tokenizer')

    with open(f'{output_dir}/training_metrics.pkl', 'wb') as metrics_file:
        pickle.dump(test_metrics, metrics_file)

    print(f"\nModel and tokenizer saved to {output_dir}")
    print("PENTING: Setelah training selesai, commit notebook dan create dataset dari output!")

    return model, tokenizer, test_metrics

In [15]:
class HoaxDetector:
    """Inference wrapper around a saved ``IndoBERTHoaxClassifier``.

    On construction a tokenizer and a model ``state_dict`` are loaded; a load
    failure is reported on stdout (not raised) and leaves the detector
    unloaded, in which case :meth:`predict` raises ``RuntimeError``.
    """

    def __init__(self, model_path=None, tokenizer_path=None):
        """
        Load the tokenizer and the model weights.

        Args:
            model_path: Path to the saved model ``state_dict``. When None,
                the file under the hard-coded project directory below is used.
            tokenizer_path: Path passed to ``AutoTokenizer.from_pretrained``.
                When ``model_path`` is None and this is None too, the
                ``tokenizer`` folder under the same project directory is used.
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Start in an explicit "not loaded" state so predict() can detect a
        # failed load instead of hitting a missing attribute.
        self.tokenizer = None
        self.model = None

        # No explicit model path: fall back to the project directory.
        if model_path is None:
            # Replace with your own dataset path
            base_path = 'D:/Semester 4/CCI/The Hack 2025/deteksi_berita_hoax-CCI_THE_HACK'  # <-- CHANGE THIS
            model_path = f'{base_path}/bert_model_state_dict.pth'
            # Keep a tokenizer path the caller passed explicitly.
            if tokenizer_path is None:
                tokenizer_path = f'{base_path}/tokenizer'

        try:
            tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

            model = IndoBERTHoaxClassifier()
            # NOTE(review): torch.load unpickles the file; only load
            # checkpoints from a trusted source.
            model.load_state_dict(torch.load(model_path, map_location=self.device))
            model.to(self.device)
            model.eval()

            # Publish both only after everything loaded, so the detector is
            # never left half-initialised.
            self.tokenizer = tokenizer
            self.model = model

            print(f"✅ Model loaded successfully from {model_path}")

        except Exception as e:
            # Best-effort load: report the problem and stay unloaded.
            print(f"❌ Error loading model: {e}")
            print("Pastikan path dataset sudah benar!")

    def predict(self, judul, isi_berita):
        """
        Predict whether news is hoax or not
        Args:
            judul: News title
            isi_berita: News content
        Returns:
            Dict with prediction, confidence, and probabilities
        Raises:
            RuntimeError: if the model or tokenizer was not loaded.
        """
        if getattr(self, 'model', None) is None or getattr(self, 'tokenizer', None) is None:
            raise RuntimeError(
                "HoaxDetector is not loaded: model/tokenizer loading failed in __init__"
            )

        # Combine title and content, then apply the same cleaning as training.
        # The result must NOT be bound to the name ``clean_text``: assigning to
        # that name inside this method would make it a local variable and the
        # call to the module-level function would raise UnboundLocalError.
        text = f"{judul} {isi_berita}"
        cleaned = clean_text(text)

        # Tokenize
        encoding = self.tokenizer(
            cleaned,
            truncation=True,
            padding='max_length',
            max_length=512,
            return_tensors='pt'
        )

        # Move to device
        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)

        # Predict
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            probabilities = torch.softmax(outputs, dim=1)
            predicted_class = torch.argmax(outputs, dim=1).item()
            confidence = probabilities[0][predicted_class].item()

        # Format results
        prob_non_hoax = probabilities[0][0].item()
        prob_hoax = probabilities[0][1].item()

        prediction = "HOAX" if predicted_class == 1 else "NON-HOAX"

        result = {
            'prediction': prediction,
            'confidence': f"{confidence*100:.1f}%",
            'probabilities': {
                'Non-Hoax': f"{prob_non_hoax:.3f}",
                'Hoax': f"{prob_hoax:.3f}"
            }
        }

        return result

In [22]:
def test_new_article():
    """Run the detector on two hard-coded example articles and print the results."""

    def show_result(title, outcome):
        """Print the title and the prediction details for one article."""
        scores = outcome['probabilities']
        print(f"Judul: {title}")
        print(f"Prediksi: {outcome['prediction']}")
        print(f"Confidence: {outcome['confidence']}")
        print(f"Probabilitas Non-Hoax: {scores['Non-Hoax']}")
        print(f"Probabilitas Hoax: {scores['Hoax']}")

    detector = HoaxDetector()

    # Example 1: hoax claim
    judul1 = "Kemenkes Wajibkan Penumpang Pesawat Tervaksinasi TBC"
    isi1 = """Kemenkes: Semua penumpang yang akan naik pesawat agar sudah divaksin TBC dan menunjukan surat vaksin. tujuannya untuk mencegah penyebaran lewat udara. Budi Gunadi Sadikin, Menkes"""
    result1 = detector.predict(judul1, isi1)

    banner = "=" * 60
    print(banner)
    print("CONTOH TESTING BERITA BARU")
    print(banner)
    show_result(judul1, result1)

    # Example 2: regular news article
    judul2 = "Tanpa Messi dan Mbappe, PSG Bisa Juara Liga Champions"
    isi2 = """Paris Saint-Germain (PSG) akhirnya berhasil meraih gelar Liga Champions pertama mereka di musim 2024/2025, meski tanpa kehadiran dua bintang besar yang sebelumnya membela klub itu yaitu Lionel Messi dan Kylian Mbappe. Musim ini menjadi titik tolak bagi klub ibu kota Prancis itu. Dengan hengkangnya Messi pada musim panas 2023 lalu ke Inter Miami dan Mbappe yang memilih pindah ke Real Madrid pada Juli 2024, banyak yang meragukan kemampuan PSG untuk bersaing di kompetisi level tertinggi Eropa. Namun, justru di luar dugaan, PSG tampil lebih solid dan kolektif, mengesampingkan ketergantungan pada satu atau dua pemain berlabel bintang besar."""
    result2 = detector.predict(judul2, isi2)

    print("\n" + "-" * 60)
    show_result(judul2, result2)

In [None]:
if __name__ == "__main__":
    # Run the full training pipeline; the returned model, tokenizer and test
    # metrics are bound to module-level names.
    model, tokenizer, metrics = main_training()
    
    # Optional check of the inference wrapper on example articles (disabled here)
    # test_new_article()
    
    print("\n" + "="*60)
    print("SCRIPT COMPLETED SUCCESSFULLY!")
    print("="*60)

Using device: cuda
Loading and preparing data...
Loading cnn dataset...
cnn: 4216 samples
Loading detik dataset...
detik: 4214 samples
Loading kompas dataset...
kompas: 4216 samples
Loading tbh dataset...
tbh: 12012 samples

Contoh baris TBH sebelum preprocessing:
                                               judul  \
0  [PENIPUAN] Tautan “New Gebyar Program Bank BCA...   
1  [SALAH] Video “Ada Bangkai Kereta di Tebing, A...   
2  [SALAH] Jokowi Pakai Rp38,5 Triliun Dana Haji ...   

                                             summary  
0  untuk seluruh Nasabah Bank BCA . unggahan ters...  
1  bangkai gerbong kereta bekas kecelakaan mangkr...  
2  pemeriksaan Fakta Tim Pemeriksa Fakta MAFINDO ...  

Contoh baris TBH setelah preprocessing:
                                          clean_text  hoax
0  tautan new gebyar program bank bca tahun untuk...     1
1  video ada bangkai kereta di tebing akibat jemb...     1
2  jokowi pakai rp triliun dana haji rakyat tak d...     1

Total combin

Training:   0%|          | 0/540 [00:00<?, ?it/s]

In [23]:
test_new_article()

✅ Model loaded successfully from D:/Semester 4/CCI/The Hack 2025/deteksi_berita_hoax-CCI_THE_HACK/bert_model_state_dict.pth
CONTOH TESTING BERITA BARU
Judul: Kemenkes Wajibkan Penumpang Pesawat Tervaksinasi TBC
Prediksi: HOAX
Confidence: 66.5%
Probabilitas Non-Hoax: 0.335
Probabilitas Hoax: 0.665

------------------------------------------------------------
Judul: Tanpa Messi dan Mbappe, PSG Bisa Juara Liga Champions
Prediksi: NON-HOAX
Confidence: 99.7%
Probabilitas Non-Hoax: 0.997
Probabilitas Hoax: 0.003
