# SBiGRU-MCNN: Hybrid Context-Aware Fake News Detection

Bu notebook, **"The Power of Context: A Novel Hybrid Context-Aware Fake News Detection Approach"** makalesindeki mimariyi LIAR dataset üzerinde implemente eder.

## Mimari
- **BERT + Multichannel CNN:** Metin içeriğini işler (statement + auxiliary text)
- **Stacked BiGRU:** Sayısal özellikleri işler (speaker credit history)
- **Concatenation + Sigmoid:** Binary classification

## Dataset
- **LIAR Dataset** (William Yang Wang, 2017)
- Train: ~10,240 samples
- Test: ~1,267 samples
- [Kaggle Link](https://www.kaggle.com/datasets/msudhan/liar-dataset)

## 1. Setup

In [None]:
# Install dependencies (uncomment if needed)
# !pip install transformers torch pandas scikit-learn matplotlib seaborn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau

from transformers import BertTokenizer, BertModel

# Run on CUDA when PyTorch reports it as available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Device: {device}")

# Log the name of GPU 0 so the run output records which accelerator was used.
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 2. Configuration

In [None]:
# Paths - Update these according to your setup
TRAIN_PATH = './data/train.tsv'  # training split, read as tab-separated text
TEST_PATH = './data/test.tsv'  # test split, read as tab-separated text
MODEL_SAVE_PATH = './models/sbigru_mcnn.pth'  # destination of the checkpoint written at the end of the notebook

# Hyperparameters
MAX_LENGTH = 256  # tokenizer max_length; every text is padded/truncated to this many tokens
BATCH_SIZE = 16  # batch size used by both DataLoaders
MAX_EPOCHS = 15  # upper bound on epochs; the training loop breaks earlier when early stopping triggers
LEARNING_RATE = 2e-5  # learning rate handed to AdamW
DROPOUT = 0.5  # dropout probability passed to the model constructor
PATIENCE = 3  # Early stopping patience

# Model config
BERT_MODEL = 'bert-base-uncased'  # name given to both BertTokenizer and BertModel .from_pretrained
MCNN_FILTERS = 128  # output channels of each convolution branch
MCNN_KERNEL_SIZES = [3, 4, 5]  # one Conv1d branch is created per kernel size
BIGRU_HIDDEN_DIM = 50  # GRU hidden size per direction
BIGRU_NUM_LAYERS = 2  # number of stacked GRU layers

## 3. Data Loading

In [None]:
# The files are read with header=None, so the 14 column names are supplied here.
column_names = [
    'id', 'label', 'statement', 'subject', 'speaker', 'job_title',
    'state_info', 'party_affiliation',
    'barely_true_counts', 'false_counts', 'half_true_counts',
    'mostly_true_counts', 'pants_on_fire_counts',
    'context',
]

# Both splits are parsed with exactly the same options.
# NOTE(review): pandas' default quote handling can merge rows when a statement
# contains an unbalanced double quote - verify the shapes printed below against
# the dataset's documented sizes (quoting=csv.QUOTE_NONE is the usual remedy).
read_options = {'sep': '\t', 'header': None, 'names': column_names}
df_train = pd.read_csv(TRAIN_PATH, **read_options)
df_test = pd.read_csv(TEST_PATH, **read_options)

for split_name, frame in (("Train", df_train), ("Test", df_test)):
    print(f"{split_name} size: {frame.shape}")

In [None]:
# Rows missing the statement, the speaker, or any of the five credit-history
# counts are discarded from both splits.
critical_columns = [
    'statement', 'speaker',
    'barely_true_counts', 'false_counts', 'half_true_counts',
    'mostly_true_counts', 'pants_on_fire_counts',
]

df_train, df_test = (
    frame.dropna(subset=critical_columns) for frame in (df_train, df_test)
)

train_rows, test_rows = len(df_train), len(df_test)
print(f"After cleaning - Train: {train_rows}, Test: {test_rows}")

In [None]:
# Binary label conversion
# Real (1): true, mostly-true, half-true
# Fake (0): barely-true, false, pants-fire

def convert_to_binary(label):
    """Collapse a six-way truthfulness label into a binary target.

    Args:
        label: Raw label value; compared by equality against the "real" labels.

    Returns:
        1 for 'true', 'mostly-true' or 'half-true'; 0 for every other value
        (including anything that is not one of those exact strings).
    """
    if label in ('true', 'mostly-true', 'half-true'):
        return 1
    return 0

# Derive the binary target on both splits from the raw label column.
for frame in (df_train, df_test):
    frame['binary_label'] = frame['label'].apply(convert_to_binary)

# Class balance of the training split; the mean of a 0/1 column is the share of 1s.
train_targets = df_train['binary_label']
print("Train Label Distribution:")
print(train_targets.value_counts())
print(f"\nReal ratio: {train_targets.mean():.2%}")

## 4. Data Preprocessing

In [None]:
# Text preparation - combine statement with auxiliary information
def prepare_text_input(row):
    """Build the single text string fed to the tokenizer for one record.

    The statement comes first, followed by the labelled auxiliary fields
    (speaker, job, party, state, subject, context), all joined by ``" [SEP] "``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexable by the
            column names 'statement', 'speaker', 'job_title',
            'party_affiliation', 'state_info', 'subject' and 'context'.

    Returns:
        The combined string; any field for which ``pd.notna`` is false
        contributes an empty string.
    """
    def as_text(column):
        # Missing values become "" rather than the literal text "nan"/"None".
        value = row[column]
        return str(value) if pd.notna(value) else ""

    segments = [
        as_text('statement'),
        f"Speaker: {as_text('speaker')}",
        f"Job: {as_text('job_title')}",
        f"Party: {as_text('party_affiliation')}",
        f"State: {as_text('state_info')}",
        f"Subject: {as_text('subject')}",
        f"Context: {as_text('context')}",
    ]
    return " [SEP] ".join(segments)

# Build the combined text column row by row on both splits.
for frame in (df_train, df_test):
    frame['text_input'] = frame.apply(prepare_text_input, axis=1)

# Preview the first 200 characters of the first training example.
first_example = df_train['text_input'].iloc[0]
print("Sample text input:")
print(f"{first_example[:200]}...")

In [None]:
# Speaker credit history: the five per-speaker count columns.
numerical_columns = [
    'barely_true_counts',
    'false_counts',
    'half_true_counts',
    'mostly_true_counts',
    'pants_on_fire_counts',
]

# Force the counts to numeric; values that cannot be parsed become 0.
for frame in (df_train, df_test):
    for col in numerical_columns:
        frame[col] = pd.to_numeric(frame[col], errors='coerce').fillna(0)

# The scaler is fitted on the training split only and then applied to the
# test split, so test statistics never influence the transformation.
scaler = StandardScaler()
numerical_features_train = scaler.fit_transform(df_train[numerical_columns])
numerical_features_test = scaler.transform(df_test[numerical_columns])

# Store each standardized column next to its raw counterpart as '<name>_scaled'.
scaled_by_frame = (
    (df_train, numerical_features_train),
    (df_test, numerical_features_test),
)
for frame, scaled in scaled_by_frame:
    for position, col in enumerate(numerical_columns):
        frame[f'{col}_scaled'] = scaled[:, position]

print(f"Numerical features shape: {numerical_features_train.shape}")

In [None]:
# Tokenizer
# Instantiate the BertTokenizer for the checkpoint named by BERT_MODEL; the
# same name is later passed to the model so tokenizer and encoder correspond.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)
print(f"Tokenizer loaded: {BERT_MODEL}")

## 5. Dataset & DataLoader

In [None]:
class FakeNewsDataset(Dataset):
    """Dataset yielding tokenized text, numeric features and a label per row.

    Args:
        dataframe: Frame providing a ``text_input`` column, a ``binary_label``
            column and every column listed in ``numerical_columns_scaled``.
        tokenizer: Callable applied to each text; it is asked for
            ``return_tensors='pt'`` and must return ``input_ids`` and
            ``attention_mask`` whose leading dimension (size 1) is squeezed away.
        max_length: Length every text is padded/truncated to by the tokenizer.
        numerical_columns_scaled: Names of the feature columns, in the order
            they should appear in the feature vector.
    """

    def __init__(self, dataframe, tokenizer, max_length, numerical_columns_scaled):
        # A fresh 0..n-1 index so rows can be addressed purely by position.
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.numerical_columns = numerical_columns_scaled

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask', 'numerical_features' and 'label'."""
        record = self.data.iloc[idx]

        tokenizer_options = {
            'add_special_tokens': True,
            'max_length': self.max_length,
            'padding': 'max_length',
            'truncation': True,
            'return_tensors': 'pt',
        }
        tokens = self.tokenizer(str(record['text_input']), **tokenizer_options)

        feature_values = [record[name] for name in self.numerical_columns]

        # The label is float32 because it is consumed by a BCE-with-logits loss.
        return {
            'input_ids': tokens['input_ids'].squeeze(0),
            'attention_mask': tokens['attention_mask'].squeeze(0),
            'numerical_features': torch.tensor(feature_values, dtype=torch.float32),
            'label': torch.tensor(record['binary_label'], dtype=torch.float32),
        }

In [None]:
# Names of the standardized feature columns, in the same order as the raw ones.
numerical_columns_scaled = [col + '_scaled' for col in numerical_columns]

train_dataset = FakeNewsDataset(df_train, tokenizer, MAX_LENGTH, numerical_columns_scaled)
test_dataset = FakeNewsDataset(df_test, tokenizer, MAX_LENGTH, numerical_columns_scaled)

# Only the training loader shuffles; the test loader keeps the row order.
loader_options = {'batch_size': BATCH_SIZE, 'num_workers': 2}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

for split_name, loader in (("Train", train_loader), ("Test", test_loader)):
    print(f"{split_name} batches: {len(loader)}")

## 6. Model Architecture

In [None]:
class MultichannelCNN(nn.Module):
    """Parallel 1-D convolutions with global max pooling over a token sequence.

    One ``Conv1d`` branch is created per kernel size. Each branch is passed
    through ReLU and max-pooled over the whole sequence axis; the pooled
    branches are concatenated and dropout is applied.

    Args:
        bert_hidden_size: Channel count of the incoming embeddings.
        num_filters: Output channels of every convolution branch.
        kernel_sizes: Iterable of kernel widths, one branch each. The default
            is an immutable tuple so it cannot be shared and mutated across
            instances.
        dropout: Dropout probability applied to the concatenated features.

    Attributes:
        output_dim: Size of the returned feature vector,
            ``num_filters * len(kernel_sizes)``.
    """

    def __init__(self, bert_hidden_size=768, num_filters=128, kernel_sizes=(3, 4, 5), dropout=0.5):
        super().__init__()

        # Materialize once so generators and other one-shot iterables work too.
        kernel_sizes = tuple(kernel_sizes)

        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=bert_hidden_size, out_channels=num_filters, kernel_size=ks)
            for ks in kernel_sizes
        ])

        self.dropout = nn.Dropout(dropout)
        self.output_dim = num_filters * len(kernel_sizes)

    def forward(self, x):
        """Map ``[batch_size, seq_len, hidden_size]`` to ``[batch_size, output_dim]``."""
        # Conv1d expects channels before the sequence axis.
        channels_first = x.permute(0, 2, 1)  # [batch_size, hidden_size, seq_len]

        pooled_branches = []
        for conv in self.convs:
            activated = F.relu(conv(channels_first))
            # Pool over the full remaining length, leaving one value per filter.
            pooled = F.max_pool1d(activated, activated.size(2)).squeeze(2)
            pooled_branches.append(pooled)

        return self.dropout(torch.cat(pooled_branches, dim=1))


class StackedBiGRU(nn.Module):
    """Encode a vector of scalar features with a stacked bidirectional GRU.

    Every scalar feature is treated as one timestep: it is projected from 1 to
    ``hidden_dim`` dimensions and the resulting sequence is fed to the GRU.
    The output is the concatenation of the last two entries of the GRU's final
    hidden state (the two directions of the top layer), followed by dropout.

    Args:
        input_dim: Accepted for interface compatibility; it is not read by
            this implementation (the sequence length comes from the input
            tensor at call time).
        hidden_dim: Projection size and GRU hidden size per direction.
        num_layers: Number of stacked GRU layers.
        dropout: Probability used for the output dropout and, when
            ``num_layers > 1``, for the GRU's own dropout argument.

    Attributes:
        output_dim: Size of the returned feature vector, ``2 * hidden_dim``.
    """

    def __init__(self, input_dim=5, hidden_dim=50, num_layers=2, dropout=0.5):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.input_projection = nn.Linear(1, hidden_dim)

        # The GRU's dropout argument is zeroed for a single-layer GRU.
        gru_dropout = dropout if num_layers > 1 else 0
        self.bigru = nn.GRU(
            input_size=hidden_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=gru_dropout,
        )

        self.dropout = nn.Dropout(dropout)
        self.output_dim = 2 * hidden_dim

    def forward(self, x):
        """Map ``[batch_size, num_features]`` to ``[batch_size, output_dim]``."""
        # [batch, features] -> [batch, features, 1] -> [batch, features, hidden_dim]
        steps = self.input_projection(x.unsqueeze(2))

        _, hidden = self.bigru(steps)
        forward_last, backward_last = hidden[-2], hidden[-1]
        return self.dropout(torch.cat((forward_last, backward_last), dim=1))


class SBiGRU_MCNN(nn.Module):
    """Hybrid classifier: BERT + multichannel CNN for text, stacked BiGRU for numbers.

    The text branch runs BERT and feeds its ``last_hidden_state`` to
    ``MultichannelCNN``. The numeric branch feeds the feature vector to
    ``StackedBiGRU``. Both outputs are concatenated and passed through a
    three-layer MLP that emits one logit per example (no sigmoid is applied
    here).

    Args:
        bert_model_name: Name passed to ``BertModel.from_pretrained``.
        num_numerical_features: Forwarded to ``StackedBiGRU`` as ``input_dim``.
        mcnn_num_filters: Output channels per convolution branch.
        mcnn_kernel_sizes: Kernel widths for the convolution branches. The
            default is an immutable tuple rather than a shared mutable list.
        bigru_hidden_dim: GRU hidden size per direction.
        bigru_num_layers: Number of stacked GRU layers.
        dropout: Dropout probability used by both branches and the classifier.
    """

    def __init__(
        self,
        bert_model_name='bert-base-uncased',
        num_numerical_features=5,
        mcnn_num_filters=128,
        mcnn_kernel_sizes=(3, 4, 5),
        bigru_hidden_dim=50,
        bigru_num_layers=2,
        dropout=0.5
    ):
        super().__init__()

        # BERT
        self.bert = BertModel.from_pretrained(bert_model_name)
        # Read the width from the loaded config so other BERT sizes work too.
        self.bert_hidden_size = self.bert.config.hidden_size

        # mCNN
        self.mcnn = MultichannelCNN(
            bert_hidden_size=self.bert_hidden_size,
            num_filters=mcnn_num_filters,
            kernel_sizes=mcnn_kernel_sizes,
            dropout=dropout
        )

        # sBiGRU
        self.sbigru = StackedBiGRU(
            input_dim=num_numerical_features,
            hidden_dim=bigru_hidden_dim,
            num_layers=bigru_num_layers,
            dropout=dropout
        )

        combined_dim = self.mcnn.output_dim + self.sbigru.output_dim

        # Classifier: combined features -> 128 -> 64 -> 1 logit
        self.classifier = nn.Sequential(
            nn.Linear(combined_dim, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 1)
        )

    def forward(self, input_ids, attention_mask, numerical_features):
        """Return one raw logit per example, shape ``[batch_size]``.

        Args:
            input_ids: Token ids passed to BERT.
            attention_mask: Attention mask passed to BERT.
            numerical_features: Feature tensor passed to the BiGRU branch.
        """
        # BERT + mCNN
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_sequence_output = bert_output.last_hidden_state
        mcnn_output = self.mcnn(bert_sequence_output)

        # sBiGRU
        sbigru_output = self.sbigru(numerical_features)

        # Concatenation & Classification
        combined = torch.cat((mcnn_output, sbigru_output), dim=1)
        logits = self.classifier(combined)

        # Drop the trailing size-1 dimension left by the final Linear(64, 1).
        return logits.squeeze(1)

In [None]:
# Build the hybrid model from the notebook configuration and move it to the
# selected device in one step.
model = SBiGRU_MCNN(
    bert_model_name=BERT_MODEL,
    num_numerical_features=5,
    mcnn_num_filters=MCNN_FILTERS,
    mcnn_kernel_sizes=MCNN_KERNEL_SIZES,
    bigru_hidden_dim=BIGRU_HIDDEN_DIM,
    bigru_num_layers=BIGRU_NUM_LAYERS,
    dropout=DROPOUT,
).to(device)

# Count all parameters and, separately, those that will receive gradients.
total_params = 0
trainable_params = 0
for param in model.parameters():
    total_params += param.numel()
    if param.requires_grad:
        trainable_params += param.numel()

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

## 7. Training

In [None]:
class EarlyStopping:
    """Track a monitored score and signal when it has stopped improving.

    Call the instance once per epoch with the current score and the model.
    Whenever the score improves on the best seen so far by more than
    ``min_delta``, an independent copy of the model's ``state_dict`` is
    stored. After ``patience`` consecutive calls without improvement,
    ``early_stop`` becomes True.

    Args:
        patience: Number of consecutive non-improving calls tolerated.
        min_delta: Minimum change that counts as an improvement.
        mode: ``'max'`` if larger scores are better; any other value is
            treated as "smaller is better".

    Attributes:
        counter: Consecutive non-improving calls since the last improvement
            (0 right after an improvement).
        best_score: Best score seen so far, or None before the first call.
        early_stop: True once patience has been exhausted.
        best_model_state: Deep copy of the state dict at the best score.
    """

    def __init__(self, patience=3, min_delta=0.001, mode='max'):
        self.patience = patience
        self.min_delta = min_delta
        self.mode = mode
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.best_model_state = None

    def __call__(self, score, model):
        """Record ``score`` for this epoch and update the stopping state."""
        if self.best_score is None or self._is_improvement(score):
            self.best_score = score
            self.best_model_state = self._snapshot(model)
            self.counter = 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True

    def _is_improvement(self, score):
        """Return True when ``score`` beats ``best_score`` by more than ``min_delta``."""
        if self.mode == 'max':
            return score > self.best_score + self.min_delta
        return score < self.best_score - self.min_delta

    @staticmethod
    def _snapshot(model):
        """Return a copy of the model's state dict that later training cannot alter."""
        import copy

        # state_dict() hands out tensors that share storage with the live
        # parameters, and dict.copy() is shallow, so a plain .copy() would keep
        # changing as the optimizer updates the model. deepcopy clones the
        # tensors while keeping the mapping type and its metadata.
        return copy.deepcopy(model.state_dict())

In [None]:
def train_epoch(model, dataloader, optimizer, criterion, device, max_grad_norm=1.0):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Callable taking (input_ids, attention_mask, numerical_features)
            and returning logits.
        dataloader: Iterable of dict batches with the keys 'input_ids',
            'attention_mask', 'numerical_features' and 'label'.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device every batch tensor is moved to.
        max_grad_norm: Gradient-norm clipping threshold applied before each step.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the per-batch loss averaged over the
        number of batches, and the accuracy of predictions thresholded at 0.5.
    """
    input_keys = ('input_ids', 'attention_mask', 'numerical_features')

    model.train()
    running_loss = 0
    predictions, targets = [], []

    for batch in dataloader:
        inputs = tuple(batch[key].to(device) for key in input_keys)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        logits = model(*inputs)

        loss = criterion(logits, labels)
        running_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()

        # Predictions come from the logits computed before this batch's update.
        hard_preds = (torch.sigmoid(logits) >= 0.5).float()
        predictions.extend(hard_preds.cpu().numpy())
        targets.extend(labels.cpu().numpy())

    mean_loss = running_loss / len(dataloader)
    return mean_loss, accuracy_score(targets, predictions)


def evaluate(model, dataloader, criterion, device):
    """Score ``model`` on ``dataloader`` without updating it.

    Args:
        model: Callable taking (input_ids, attention_mask, numerical_features)
            and returning logits.
        dataloader: Iterable of dict batches with the keys 'input_ids',
            'attention_mask', 'numerical_features' and 'label'.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device every batch tensor is moved to.

    Returns:
        Dict with 'loss' (mean per-batch loss), 'accuracy', the class-1
        metrics 'precision' / 'recall' / 'f1', the class-0 metrics
        'fake_precision' / 'fake_recall' / 'fake_f1', and the per-example
        lists 'predictions', 'labels' and 'probabilities'.
    """
    input_keys = ('input_ids', 'attention_mask', 'numerical_features')

    model.eval()
    loss_sum = 0
    preds_all, labels_all, probs_all = [], [], []

    with torch.no_grad():
        for batch in dataloader:
            inputs = tuple(batch[key].to(device) for key in input_keys)
            labels = batch['label'].to(device)

            logits = model(*inputs)
            loss_sum += criterion(logits, labels).item()

            probs = torch.sigmoid(logits)
            hard_preds = (probs >= 0.5).float()

            preds_all.extend(hard_preds.cpu().numpy())
            labels_all.extend(labels.cpu().numpy())
            probs_all.extend(probs.cpu().numpy())

    results = {
        'loss': loss_sum / len(dataloader),
        'accuracy': accuracy_score(labels_all, preds_all),
    }

    # Same three metrics for each class; zero_division=0 returns 0 instead of
    # warning when a class is never predicted or never present.
    for prefix, positive in (('', 1), ('fake_', 0)):
        for metric_name, metric_fn in (
            ('precision', precision_score),
            ('recall', recall_score),
            ('f1', f1_score),
        ):
            results[prefix + metric_name] = metric_fn(
                labels_all, preds_all, pos_label=positive, zero_division=0
            )

    results['predictions'] = preds_all
    results['labels'] = labels_all
    results['probabilities'] = probs_all
    return results

In [None]:
# Class counts of the training split. They are computed for reference only:
# pos_weight below is a fixed constant and does not read them.
train_targets = df_train['binary_label']
n_fake = (train_targets == 0).sum()
n_real = (train_targets == 1).sum()
pos_weight = torch.tensor([0.85]).to(device)  # Tuned value

# Loss, optimizer, scheduler. The scheduler and the early-stopping tracker are
# both in 'max' mode because they are driven by a validation F1 score.
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=1)
early_stopping = EarlyStopping(patience=PATIENCE, mode='max')

setup_summary = (
    f"Optimizer: AdamW (lr={LEARNING_RATE})",
    f"Loss: BCEWithLogitsLoss (pos_weight={pos_weight.item()})",
    f"Early Stopping: patience={PATIENCE}",
)
for line in setup_summary:
    print(line)

In [None]:
import time

# Per-epoch metrics, later used for the training-curve plots.
history = {
    'train_loss': [], 'train_acc': [],
    'val_loss': [], 'val_acc': [], 'val_f1': [],
    'val_fake_f1': [], 'lr': []
}

print("=" * 60)
print("TRAINING STARTED")
print("=" * 60)

for epoch in range(MAX_EPOCHS):
    start_time = time.time()

    print(f"\nEpoch {epoch+1}/{MAX_EPOCHS}")
    print("-" * 40)

    # Train
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, device)

    # Evaluate
    # NOTE(review): test_loader doubles as the validation set here, so LR
    # scheduling, early stopping and checkpoint selection are all driven by
    # test data - consider a separate held-out validation split.
    val_results = evaluate(model, test_loader, criterion, device)

    # Current LR (read before the scheduler may lower it for the next epoch)
    current_lr = optimizer.param_groups[0]['lr']

    # Save history
    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_results['loss'])
    history['val_acc'].append(val_results['accuracy'])
    history['val_f1'].append(val_results['f1'])
    history['val_fake_f1'].append(val_results['fake_f1'])
    history['lr'].append(current_lr)

    elapsed = time.time() - start_time

    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_results['loss']:.4f} | Val Acc: {val_results['accuracy']:.4f}")
    print(f"Val F1 (Real): {val_results['f1']:.4f} | Val F1 (Fake): {val_results['fake_f1']:.4f}")
    print(f"LR: {current_lr:.2e} | Time: {elapsed:.2f}s")

    # Scheduler & Early Stopping
    scheduler.step(val_results['f1'])
    early_stopping(val_results['f1'], model)

    # The tracker's counter is 0 only right after it accepted this epoch as
    # the new best. Comparing the score to best_score with == would also fire
    # when an epoch merely ties the best score, which the tracker treats as
    # "no improvement" and does not checkpoint.
    if early_stopping.counter == 0:
        print(f"*** New best model! F1: {val_results['f1']:.4f} ***")

    if early_stopping.early_stop:
        print(f"\nEarly stopping triggered after {epoch+1} epochs.")
        break

print("\n" + "=" * 60)
print("TRAINING COMPLETED")
print(f"Best Validation F1: {early_stopping.best_score:.4f}")
print("=" * 60)

## 8. Evaluation

In [None]:
# Restore the weights stored by the early-stopping tracker, then score the
# model on the test loader.
model.load_state_dict(early_stopping.best_model_state)
final_results = evaluate(model, test_loader, criterion, device)

banner = "=" * 60
print(banner)
print("FINAL TEST RESULTS")
print(banner)
print("\nOverall Metrics:")
print(f"  Accuracy:  {final_results['accuracy']:.4f}")

# Class-1 metrics use the unprefixed keys; class-0 metrics use the 'fake_' prefix.
for heading, prefix in (("Real (1) Class", ""), ("Fake (0) Class", "fake_")):
    print(f"\n{heading}:")
    print(f"  Precision: {final_results[prefix + 'precision']:.4f}")
    print(f"  Recall:    {final_results[prefix + 'recall']:.4f}")
    print(f"  F1 Score:  {final_results[prefix + 'f1']:.4f}")

In [None]:
# Per-class precision/recall/F1 table; target_names are listed in label order (0, then 1).
class_names = ['Fake (0)', 'Real (1)']

print("\nClassification Report:")
print("=" * 50)
report_text = classification_report(
    final_results['labels'],
    final_results['predictions'],
    target_names=class_names,
)
print(report_text)

In [None]:
# Confusion matrix of the final predictions: rows are actual classes, columns predicted.
cm = confusion_matrix(final_results['labels'], final_results['predictions'])
class_names = ['Fake (0)', 'Real (1)']

fig = plt.figure(figsize=(8, 6))
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_xlabel('Predicted', fontsize=12)
ax.set_ylabel('Actual', fontsize=12)
ax.set_title('Confusion Matrix - SBiGRU-MCNN', fontsize=14)
fig.tight_layout()
fig.savefig('./confusion_matrix.png', dpi=150)
plt.show()

In [None]:
# Training curves: loss, accuracy, per-class F1 and learning rate per epoch.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# (axis, title, [(history key, plot keyword arguments), ...]) for the three
# panels that share the same layout.
panels = [
    (axes[0, 0], 'Loss', [
        ('train_loss', {'label': 'Train', 'marker': 'o'}),
        ('val_loss', {'label': 'Val', 'marker': 's'}),
    ]),
    (axes[0, 1], 'Accuracy', [
        ('train_acc', {'label': 'Train', 'marker': 'o'}),
        ('val_acc', {'label': 'Val', 'marker': 's'}),
    ]),
    (axes[1, 0], 'F1 Scores', [
        ('val_f1', {'label': 'Real F1', 'marker': 's', 'color': 'green'}),
        ('val_fake_f1', {'label': 'Fake F1', 'marker': '^', 'color': 'red'}),
    ]),
]
for ax, title, series in panels:
    for key, plot_kwargs in series:
        ax.plot(history[key], **plot_kwargs)
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)

# The learning-rate panel has a single unlabeled series on a log axis.
lr_ax = axes[1, 1]
lr_ax.plot(history['lr'], marker='o', color='purple')
lr_ax.set_title('Learning Rate')
lr_ax.set_yscale('log')
lr_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('./training_curves.png', dpi=150)
plt.show()

## 9. Save Model

In [None]:
import os

# Create the target directory if the path has one. For a bare filename
# os.path.dirname() returns '', and os.makedirs('') raises FileNotFoundError,
# so the call is skipped in that case.
save_dir = os.path.dirname(MODEL_SAVE_PATH)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)

# The checkpoint bundles everything needed to rebuild the pipeline: the
# weights kept by the early-stopping tracker, the architecture and tokenizer
# settings, the headline test metrics, and the fitted scaler statistics used
# to standardize the numeric features.
torch.save({
    'model_state_dict': early_stopping.best_model_state,
    'config': {
        'bert_model': BERT_MODEL,
        'max_length': MAX_LENGTH,
        'mcnn_filters': MCNN_FILTERS,
        'mcnn_kernel_sizes': MCNN_KERNEL_SIZES,
        'bigru_hidden_dim': BIGRU_HIDDEN_DIM,
        'bigru_num_layers': BIGRU_NUM_LAYERS,
        'dropout': DROPOUT
    },
    'results': {
        'accuracy': final_results['accuracy'],
        'f1_real': final_results['f1'],
        'f1_fake': final_results['fake_f1']
    },
    'scaler_params': {
        'mean': scaler.mean_,
        'scale': scaler.scale_
    }
}, MODEL_SAVE_PATH)

print(f"Model saved to: {MODEL_SAVE_PATH}")

print(f"Model saved to: {MODEL_SAVE_PATH}")