In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

NLP Disaster Tweet Classification
Kaggle Competition: Natural Language Processing with Disaster Tweets

This notebook builds a binary classifier that separates tweets about real disasters (1) from tweets that are not about a disaster, such as metaphorical uses of disaster words (0)

BERT-Based Disaster Tweet Classification
F1-Score Target: 0.85-1.00 (State-of-the-art)

This notebook uses a transformer model (DistilBERT), which is how people achieve near-perfect scores.
Requires: transformers, torch



In [None]:
pip install transformers torch

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.optim import AdamW  # Import AdamW from torch instead of transformers
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

In [None]:
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item.

    Args:
        texts: Indexable, sized collection of tweets. Each item is converted
            with ``str`` before tokenization.
        targets: Indexable, sized collection of labels aligned with ``texts``,
            or ``None`` for unlabeled data. When ``None``, items carry no
            ``'targets'`` entry.
        tokenizer: Callable tokenizer. It is invoked as ``tokenizer(text, ...)``
            with ``return_tensors='pt'`` and must return a mapping that holds
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every sequence is padded or truncated to.

    Raises:
        ValueError: If ``targets`` is given and its length differs from
            ``texts``.
    """
    def __init__(self, texts, targets, tokenizer, max_len=128):
        # Fail fast on misaligned inputs instead of surfacing an IndexError
        # (or silently ignoring surplus labels) in the middle of an epoch.
        if targets is not None and len(targets) != len(texts):
            raise ValueError(
                f"texts and targets must have the same length, "
                f"got {len(texts)} and {len(targets)}"
            )
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of tweets."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the tweet at ``idx``.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors and,
            when labels were supplied, a scalar ``torch.long`` ``'targets'``
            tensor.
        """
        text = str(self.texts[idx])

        # Call the tokenizer directly: ``encode_plus`` is deprecated in favour
        # of ``__call__``, which accepts the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch axis of size 1; flatten
        # it away so the DataLoader can stack items into a batch.
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }
        if self.targets is not None:
            item['targets'] = torch.tensor(self.targets[idx], dtype=torch.long)
        return item

In [None]:
def train_epoch(model, data_loader, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    The model is switched to training mode; for every batch the gradients are
    reset, the model is called with the batch's ``'input_ids'``,
    ``'attention_mask'`` and ``'targets'`` (passed as ``labels``), and the
    returned ``loss`` is back-propagated before the optimizer steps.

    Returns:
        The mean of the per-batch loss values.
    """
    model.train()
    batch_losses = []

    for batch in tqdm(data_loader, desc="Training"):
        # Move the whole batch to the target device in one place.
        on_device = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'targets')
        }

        optimizer.zero_grad()
        step_output = model(
            input_ids=on_device['input_ids'],
            attention_mask=on_device['attention_mask'],
            labels=on_device['targets'],
        )

        batch_loss = step_output.loss
        batch_losses.append(batch_loss.item())

        batch_loss.backward()
        optimizer.step()

    return np.mean(batch_losses)

In [None]:
def eval_model(model, data_loader, device):
    """Collect predictions and reference labels for every batch in a loader.

    The model is switched to eval mode and called without gradient tracking.
    The predicted class of each row is the index of its largest logit along
    dimension 1.

    Returns:
        ``(predictions, real_values)``: two flat Python lists in loader order,
        the first with predicted class indices and the second with the
        batches' ``'targets'`` values.
    """
    model.eval()
    predicted_labels, true_labels = [], []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            batch_targets = batch['targets'].to(device)
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits

            # torch.max over a dim returns (values, indices); the indices are the classes.
            batch_preds = torch.max(logits, dim=1).indices

            predicted_labels += batch_preds.cpu().tolist()
            true_labels += batch_targets.cpu().tolist()

    return predicted_labels, true_labels

# DATA LOADING

In [None]:
# Directory holding the competition CSVs; change this one constant to run the
# notebook outside Kaggle.
DATA_DIR = '/kaggle/input/nlp-getting-started'

print("Loading datasets...")
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

# Fail fast if the columns used by later cells are missing.
assert {'text', 'target'} <= set(train_df.columns), "train.csv must provide 'text' and 'target'"
assert {'id', 'text'} <= set(test_df.columns), "test.csv must provide 'id' and 'text'"

print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

# MINIMAL PREPROCESSING

In [None]:
# BERT handles most preprocessing internally, we just need basic cleaning

def clean_text_bert(text):
    """Return ``text`` as a string, mapping missing values to ``""``.

    No other cleaning is performed: anything ``pd.isna`` reports as missing
    becomes the empty string, every other value is passed through ``str``.
    """
    return "" if pd.isna(text) else str(text)

# Run the cleaner over the text column of both frames, overwriting it in place.
for _frame in (train_df, test_df):
    _frame['text'] = _frame['text'].apply(clean_text_bert)

# SETUP

In [None]:
# Seed NumPy and PyTorch before the model is built and before any batches are
# drawn, so that weight initialisation and shuffling repeat across runs
# (as far as the compute backend allows).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Initialize tokenizer and model
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)
model = DistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
).to(device)

# Hyperparameters
BATCH_SIZE = 16
MAX_LEN = 128
EPOCHS = 5
LEARNING_RATE = 2e-5

# Split data: hold out 15% for validation, keeping the class ratio of the
# full training set in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    train_df['text'].values,
    train_df['target'].values,
    test_size=0.15,
    random_state=SEED,
    stratify=train_df['target'].values
)

# Every labelled row must land in exactly one of the two parts.
assert len(X_train) + len(X_val) == len(train_df)

print(f"\nTrain size: {len(X_train)}")
print(f"Validation size: {len(X_val)}")

# Create datasets
train_dataset = TweetDataset(X_train, y_train, tokenizer, MAX_LEN)
val_dataset = TweetDataset(X_val, y_val, tokenizer, MAX_LEN)

# Create data loaders (only the training batches are shuffled)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# TRAINING

In [None]:
print("\n" + "="*50)
print("Training BERT model...")
print("="*50)

# File that receives the weights of the best epoch seen so far.
CHECKPOINT_PATH = 'best_model.pth'

best_f1 = 0
# Tracks whether THIS run has written a checkpoint. Without it, a run whose
# validation F1 never rises above 0 would write nothing, and the load below
# would either fail or pick up a stale file left by an earlier session.
checkpoint_saved = False

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")

    # Train
    train_loss = train_epoch(model, train_loader, optimizer, device)
    print(f"Train loss: {train_loss:.4f}")

    # Evaluate
    predictions, real_values = eval_model(model, val_loader, device)
    f1 = f1_score(real_values, predictions)

    print(f"Validation F1-Score: {f1:.4f}")
    print(classification_report(real_values, predictions))

    # Always keep the first epoch, afterwards only strict improvements.
    if not checkpoint_saved or f1 > best_f1:
        best_f1 = f1
        torch.save(model.state_dict(), CHECKPOINT_PATH)
        checkpoint_saved = True
        print(f"✅ Best model saved! F1: {best_f1:.4f}")

# Load best model (skipped when no epoch ran, so no stale file is loaded).
# map_location places the saved tensors on the device currently in use.
if checkpoint_saved:
    model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=device))

# PREDICTION

In [None]:
print("\n" + "="*50)
print("Generating predictions on test set...")
print("="*50)

# Create test dataset. The test set has no labels, so an all-zero array stands
# in for the targets the dataset class expects; it is never read back below.
placeholder_targets = np.zeros(len(test_df))
test_dataset = TweetDataset(test_df['text'].values, placeholder_targets, tokenizer, MAX_LEN)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Predict: eval mode, no gradient tracking, class = index of the largest logit.
model.eval()
test_predictions = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting"):
        logits = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits
        test_predictions += torch.max(logits, dim=1).indices.cpu().tolist()

# SUBMISSION

In [None]:
# Pair every test id with its predicted class and write the submission file.
submission = pd.DataFrame({'id': test_df['id'], 'target': test_predictions})
submission.to_csv('submission_bert.csv', index=False)

print("\n✅ BERT pipeline completed!")
print(f"Best Validation F1-Score: {best_f1:.4f}")
print(f"Predicted distribution:")
# np.unique returns the distinct labels and their counts as two aligned arrays.
for label, count in zip(*np.unique(test_predictions, return_counts=True)):
    print(f"  Class {label}: {count} ({count/len(test_predictions)*100:.1f}%)")
print("\nSubmission file: submission_bert.csv")
print("\nNote: BERT typically achieves F1-scores of 0.85-1.00 on this competition!")