# Import libraries

In [1]:
import pandas as pd
import numpy as np
import os
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from collections import Counter
import time

# Importing datasets and doing some labeling

In [2]:
DEVICE = torch.device("cuda")
df_original = pd.read_csv("/kaggle/input/training-set-fakenews/train.csv")[['text', 'target']].rename(columns={'target': 'label'})
print(f"Kaggle dataset: {len(df_original)} rows")
crisisnlp_paths = [
    "/kaggle/input/dataset-for-crisisnlp/2013_Pakistan_eq_CF_labeled_data.tsv",
    "/kaggle/input/dataset-for-crisisnlp/2014_California_Earthquake_CF_labeled_data.tsv",
    "/kaggle/input/dataset-for-crisisnlp/2014_Hurricane_Odile_Mexico_en_CF_labeled_data.tsv",
    "/kaggle/input/dataset-for-crisisnlp/2014_Chile_Earthquake_en_CF_labeled_data.tsv",
    "/kaggle/input/dataset-for-crisisnlp/2014_India_floods_CF_labeled_data.tsv",
    "/kaggle/input/dataset-for-crisisnlp/2014_MERS_en_CF_labeled_data.tsv",
    "/kaggle/input/dataset-for-crisisnlp/2014_Pakistan_floods_CF_labeled_data.tsv",
    "/kaggle/input/dataset-for-crisisnlp/2014_Philippines_Typhoon_Hagupit_en_CF_labeled_data.tsv",
    "/kaggle/input/dataset-for-crisisnlp/2014_ebola_CF_labeled_data.tsv",
    "/kaggle/input/dataset-for-crisisnlp/2015_Cyclone_Pam_en_CF_labeled_data.tsv",
    "/kaggle/input/dataset-for-crisisnlp/2015_Nepal_Earthquake_en_CF_labeled_data.tsv",
]
irrelevant_labels = ['not_related_or_irrelevant', 'sympathy_and_emotional_support']
all_crisisnlp = []
for filepath in crisisnlp_paths:
    df_temp = pd.read_csv(filepath, sep='\t', usecols=[1, 2],names=['text', 'original_label'], header=0)
    df_temp['label'] = df_temp['original_label'].apply(lambda x: 0 if x in irrelevant_labels else 1)
    all_crisisnlp.append(df_temp[['text', 'label']])
df_crisisnlp = pd.concat(all_crisisnlp, ignore_index=True)
print(f"CrisisNLP combination data: {len(df_crisisnlp)} rows")

Kaggle dataset: 8272 rows
CrisisNLP combination data: 20514 rows


# Concatenating both datasets and randomizing it

In [3]:
df_all = pd.concat([df_original, df_crisisnlp], ignore_index=True).drop_duplicates(subset=['text'])
df_all = df_all.sample(frac=1, random_state=42).reset_index(drop=True)
print(f"\nTotal clean data: {len(df_all)} rows")
print("Label distribution:")
print(df_all['label'].value_counts())


Total clean data: 28662 rows
Label distribution:
label
1    19353
0     9309
Name: count, dtype: int64


# Calculating and Applying Class Weights

In [4]:
counts = df_all['label'].value_counts().sort_index()
total = len(df_all)
weight_0 = total / (2 * counts.get(0, 1))
weight_1 = total / (2 * counts.get(1, 1))
class_weights = torch.tensor([weight_0, weight_1], dtype=torch.float32).to(DEVICE) 
print(f"Class weights: [{weight_0:.3f}, {weight_1:.3f}]")

Class weights: [1.539, 0.741]


# Train and test splitting

In [5]:
train_df, val_df = train_test_split(df_all, test_size=0.20, random_state=42, stratify=df_all['label'])
print(f"Training: {len(train_df)} samples | Validation: {len(val_df)} samples")

Training: 22929 samples | Validation: 5733 samples


# Normalization & Tokenization & Sequences

In [6]:
MAX_SEQ_LEN = 128
VOCAB_SIZE = 20000
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text
def text_to_indices(text, vocab, max_len):
    words = clean_text(text).split()
    indices = [vocab.get(word, vocab['<UNK>']) for word in words]
    if len(indices) > MAX_SEQ_LEN:
        indices = indices[:MAX_SEQ_LEN]
    return indices
def pad_sequences(sequences, max_len, pad_value=0):
    padded = []
    for seq in sequences:
        if len(seq) < MAX_SEQ_LEN:
            seq = seq + [pad_value] * (max_len - len(seq))
        padded.append(seq)
    return padded

In [7]:
word_counts = Counter(word for text in train_df['text'] for word in clean_text(text).split())
vocab = {'<PAD>': 0, '<UNK>': 1}
for word, count in word_counts.most_common(VOCAB_SIZE - 2):
    vocab[word] = len(vocab)
print(f"Vocabulary size: {len(vocab)}")

Vocabulary size: 20000


# Making Tensors

In [8]:
train_sequences = train_df['text'].apply(lambda x: text_to_indices(x, vocab, MAX_SEQ_LEN)).tolist()
val_sequences = val_df['text'].apply(lambda x: text_to_indices(x, vocab, MAX_SEQ_LEN)).tolist()

train_padded = pad_sequences(train_sequences, MAX_SEQ_LEN, vocab['<PAD>'])
val_padded = pad_sequences(val_sequences, MAX_SEQ_LEN, vocab['<PAD>'])

train_data = torch.tensor(train_padded, dtype=torch.long)
train_labels = torch.tensor(train_df['label'].values, dtype=torch.long)
val_data = torch.tensor(val_padded, dtype=torch.long)
val_labels = torch.tensor(val_df['label'].values, dtype=torch.long)

print(f"Training tensor shape: {train_data.shape}")

Training tensor shape: torch.Size([22929, 128])


# Bi-LSTM Classifier

In [9]:
EMBEDDING_DIM = 200
HIDDEN_DIM = 100
DROPOUT_RATE = 0.50
NUM_LAYERS = 2

class SimpleClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, pad_idx):
        super().__init__()  
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.dropout = nn.Dropout(dropout_rate)
        self.lstm = nn.LSTM(
            embedding_dim, 
            hidden_dim, 
            num_layers=num_layers,
            bidirectional=True, 
            batch_first=True, 
            dropout=dropout_rate if num_layers > 1 else 0
        )
        self.fc = nn.Linear(hidden_dim * 2, 2)
        self.temperature = nn.Parameter(torch.ones(1) * 1.5)
    def forward(self, x, use_temperature=False):
        embedded = self.dropout(self.embedding(x))
        lstm_out, _ = self.lstm(embedded)
        lstm_out = lstm_out.permute(0, 2, 1)
        pooled = F.max_pool1d(lstm_out, lstm_out.size(2)).squeeze(2)
        logits = self.fc(self.dropout(pooled))
        return logits
        
model = SimpleClassifier(
    vocab_size=len(vocab),
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    dropout_rate=DROPOUT_RATE,
    pad_idx=vocab['<PAD>']
).to(DEVICE)

total_params = sum(p.numel() for p in model.parameters())
print(f"\nModel parameters: {total_params:,}")


Model parameters: 4,483,603


# Model Training and Evaluation

In [10]:
BATCH_SIZE = 32
epochs = 100
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 0.001
PATIENCE = 5

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
criterion = nn.CrossEntropyLoss(weight=class_weights)
best_val_loss = float('inf')
best_val_f1 = 0.0
patience_counter = 0

for epoch in range(epochs):
    start_time = time.time()

    model.train()
    running_train_loss = 0.0
    y_train_preds = []
    y_train_labels = []

    perm = torch.randperm(len(train_data))
    train_data_shuffled = train_data[perm]
    train_labels_shuffled = train_labels[perm]

    num_batches = 0
    for i in range(0, len(train_data), BATCH_SIZE):
        batch_data = train_data_shuffled[i:i+BATCH_SIZE].to(DEVICE)
        batch_labels = train_labels_shuffled[i:i+BATCH_SIZE].to(DEVICE)

        optimizer.zero_grad()
        logits = model(batch_data)
        loss = criterion(logits, batch_labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        running_train_loss += loss.item()
        num_batches += 1

        probs = torch.softmax(logits, dim=1)
        predictions = torch.argmax(probs, dim=1)
        
        y_train_preds.extend(predictions.cpu().numpy())
        y_train_labels.extend(batch_labels.cpu().numpy())

    avg_train_loss = running_train_loss / num_batches
    
    train_f1 = f1_score(y_train_labels, y_train_preds, zero_division=0)
    train_precision = precision_score(y_train_labels, y_train_preds, zero_division=0)
    train_recall = recall_score(y_train_labels, y_train_preds, zero_division=0)

    model.eval()
    running_val_loss = 0.0
    y_val_preds = []
    y_val_labels = []

    with torch.no_grad():
        num_val_batches = 0
        for i in range(0, len(val_data), BATCH_SIZE):
            batch_data = val_data[i:i+BATCH_SIZE].to(DEVICE)
            batch_labels = val_labels[i:i+BATCH_SIZE].to(DEVICE)

            logits = model(batch_data)
            loss = criterion(logits, batch_labels)

            running_val_loss += loss.item()
            num_val_batches += 1
            
            probs = torch.softmax(logits, dim=1)
            predictions = torch.argmax(probs, dim=1)
            
            y_val_preds.extend(predictions.cpu().numpy())
            y_val_labels.extend(batch_labels.cpu().numpy())

        avg_val_loss = running_val_loss / num_val_batches
        
        val_f1 = f1_score(y_val_labels, y_val_preds, zero_division=0)
        val_precision = precision_score(y_val_labels, y_val_preds, zero_division=0)
        val_recall = recall_score(y_val_labels, y_val_preds, zero_division=0)

    
    improved = False
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_val_f1 = val_f1 
        patience_counter = 0
        improved = True

        torch.save({'model_state_dict': model.state_dict(), 'vocab': vocab, 'config': {
            'vocab_size': len(vocab), 'embedding_dim': EMBEDDING_DIM, 'hidden_dim': HIDDEN_DIM,
            'num_layers': NUM_LAYERS, 'dropout_rate': DROPOUT_RATE, 'pad_idx': vocab['<PAD>']
        }}, 'best_model.pt')
    else:
        patience_counter += 1

    elapsed = time.time() - start_time
    status = "Saved" if improved else f"({patience_counter}/{PATIENCE})"

    print(f"Epoch {epoch+1:2d}/{epochs} | Status: {status}")
    print(f"  Train Loss: {avg_train_loss:.4f} | F1: {train_f1:.4f} (P: {train_precision:.4f} R: {train_recall:.4f})")
    print(f"  Val Loss:   {avg_val_loss:.4f} | F1: {val_f1:.4f} (P: {val_precision:.4f} R: {val_recall:.4f})")

    if patience_counter >= PATIENCE:
        print(f"\nEarly stopping, validation not improving")
        break


print("Training complete.")
print("\n")

checkpoint = torch.load('best_model.pt')
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

print(f"\nBest validation loss: {best_val_loss:.4f}")
print(f"Best validation F1: {best_val_f1:.4f}")

Epoch  1/100 | Status: Saved
  Train Loss: 0.5322 | F1: 0.7855 (P: 0.8515 R: 0.7289)
  Val Loss:   0.4253 | F1: 0.8420 (P: 0.9233 R: 0.7740)
Epoch  2/100 | Status: Saved
  Train Loss: 0.4504 | F1: 0.8380 (P: 0.8927 R: 0.7896)
  Val Loss:   0.3980 | F1: 0.8626 (P: 0.9231 R: 0.8096)
Epoch  3/100 | Status: Saved
  Train Loss: 0.4181 | F1: 0.8569 (P: 0.9070 R: 0.8120)
  Val Loss:   0.3725 | F1: 0.8741 (P: 0.9316 R: 0.8233)
Epoch  4/100 | Status: (1/5)
  Train Loss: 0.3779 | F1: 0.8735 (P: 0.9210 R: 0.8306)
  Val Loss:   0.3732 | F1: 0.8544 (P: 0.9538 R: 0.7737)
Epoch  5/100 | Status: Saved
  Train Loss: 0.3560 | F1: 0.8860 (P: 0.9264 R: 0.8489)
  Val Loss:   0.3459 | F1: 0.9033 (P: 0.9170 R: 0.8900)
Epoch  6/100 | Status: Saved
  Train Loss: 0.3427 | F1: 0.8902 (P: 0.9322 R: 0.8518)
  Val Loss:   0.3329 | F1: 0.8959 (P: 0.9361 R: 0.8590)
Epoch  7/100 | Status: (1/5)
  Train Loss: 0.3292 | F1: 0.8962 (P: 0.9370 R: 0.8587)
  Val Loss:   0.3349 | F1: 0.8948 (P: 0.9337 R: 0.8590)
Epoch  8/100 

# Testing

In [11]:
CONFIDENCE_THRESHOLD = 0.65

def predict(text, threshold=CONFIDENCE_THRESHOLD):
    model.eval()
    
    indices = text_to_indices(text, vocab, MAX_SEQ_LEN)
    padded = indices + [vocab['<PAD>']] * (MAX_SEQ_LEN - len(indices))
    tensor = torch.tensor([padded], dtype=torch.long).to(DEVICE)
    
    with torch.no_grad():
        logits = model(tensor, use_temperature=True)
        probs = torch.softmax(logits, dim=1)
        confidence = probs[0, 1].item()
    
    is_real = confidence > threshold
    if is_real:
        if confidence > 0.85:
            priority = "HIGH - Immediate attention"
        elif confidence > 0.70:
            priority = "MEDIUM - Review within hour"
        else:
            priority = "LOW - Review when possible"
        prediction = "REAL DISASTER"
    else:
        priority = "IGNORE - Not real disaster"
        prediction = "FAKE/ABSURD"
    
    return {
        'prediction': prediction,
        'confidence': confidence,
        'priority': priority
    }

test_cases = [
    "Mandatory evacuation order for coastal zones due to Category 4 storm surge.",
    "Official update: 8 confirmed fatalities and critical infrastructure damaged by the flooding.",
    "Hospital running low on supplies after the storm; desperately need bandages and generators.",
    "ALERT: The government is shutting down the internet due to a fake solar flare. Stock up now!",
    "So excited for the new movie tonight! I heard the CGI is absolutely insane.",
    "Giant radioactive snail terrorizes Tokyo; please stay indoors and avoid salty snacks.",
    "Massive power outage downtown. Traffic lights are out, total gridlock.",
    "Thinking of everyone affected by the California fires. Stay safe and strong! üôè"
]

for text in test_cases:
    result = predict(text)
    print(f"\n{text}")
    print(f"{result['prediction']} ({result['confidence']:.1%})")
    print(f"{result['priority']}")


Mandatory evacuation order for coastal zones due to Category 4 storm surge.
REAL DISASTER (95.9%)
HIGH - Immediate attention

Official update: 8 confirmed fatalities and critical infrastructure damaged by the flooding.
REAL DISASTER (95.8%)
HIGH - Immediate attention

Hospital running low on supplies after the storm; desperately need bandages and generators.
REAL DISASTER (93.4%)
HIGH - Immediate attention

ALERT: The government is shutting down the internet due to a fake solar flare. Stock up now!
FAKE/ABSURD (20.5%)
IGNORE - Not real disaster

So excited for the new movie tonight! I heard the CGI is absolutely insane.
FAKE/ABSURD (4.2%)
IGNORE - Not real disaster

Giant radioactive snail terrorizes Tokyo; please stay indoors and avoid salty snacks.
FAKE/ABSURD (5.8%)
IGNORE - Not real disaster

Massive power outage downtown. Traffic lights are out, total gridlock.
FAKE/ABSURD (35.0%)
IGNORE - Not real disaster

Thinking of everyone affected by the California fires. Stay safe and str