# 🎭 Sentiment Analysis with DistilBERT
## BBC News Classification - ECS 271 NLP Project

**Model:** DistilBERT (distilbert-base-uncased)  
**Task:** Classify news articles into 5 categories + mood analysis  
**Team Repo:** [ECS-271-NLP-Project](https://github.com/ananya-bontalavl/ECS-271-NLP-Project)

---

⚠️ **Before Running:** Enable GPU!  
`Runtime → Change runtime type → T4 GPU`

---
## 1. Setup & Installation

In [None]:
#@title 1.1 Check GPU
# IPython shell escape: runs `nvidia-smi` so its output shows whether a GPU is attached to the runtime.
!nvidia-smi

In [None]:
#@title 1.2 Install Packages
# IPython shell escapes: quiet (-q) pip installs of the third-party libraries imported by the next cell.
!pip install -q transformers datasets accelerate
!pip install -q scikit-learn seaborn
# NOTE: printed unconditionally; this line does not check that the pip commands succeeded.
print("‚úÖ All packages installed!")

In [None]:
#@title 1.3 Import Libraries
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from transformers import (
    DistilBertTokenizer,
    DistilBertModel,
    get_linear_schedule_with_warmup
)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import urllib.request
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report,
    confusion_matrix
)

from tqdm.notebook import tqdm

# Reproducibility: seed NumPy and PyTorch with one shared value.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"‚úÖ Using device: {device}")

# On a GPU runtime, also report the name and total memory (bytes / 1e9) of device 0.
if device.type == 'cuda':
    print(
        f"   GPU: {torch.cuda.get_device_name(0)}",
        f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB",
        sep="\n",
    )

In [None]:
#@title 1.4 Configuration

# ============================================
# GitHub Repository Configuration
# ============================================
# Repository, branch and in-repo folder that hold the processed CSV splits.
GITHUB_REPO = "ananya-bontalavl/ECS-271-NLP-Project"
BRANCH = "main"
DATA_FOLDER = "data/processed"

# Raw-content URL prefix; the download cell appends each CSV file name to it.
DATA_BASE_URL = "/".join(
    ("https://raw.githubusercontent.com", GITHUB_REPO, BRANCH, DATA_FOLDER)
)

# ============================================
# Model Configuration
# ============================================
# Hyperparameters read by the tokenizer, dataset, model and training cells.
CONFIG = dict(
    model_name='distilbert-base-uncased',
    max_length=512,
    batch_size=16,
    learning_rate=2e-5,
    epochs=3,
    dropout=0.3,
    warmup_ratio=0.1,
    weight_decay=0.01,
    num_classes=5,
)

# Category name -> integer class id, numbered in the order listed here.
LABEL2ID = {
    name: class_id
    for class_id, name in enumerate(
        ('business', 'entertainment', 'politics', 'sport', 'tech')
    )
}
# Inverse lookup: integer class id -> category name.
ID2LABEL = dict(zip(LABEL2ID.values(), LABEL2ID.keys()))

# Mood label reported for each predicted category (for fusion with summarization).
MOOD_MAPPING = dict(
    business='analytical/formal',
    entertainment='light/engaging',
    politics='serious/critical',
    sport='energetic/competitive',
    tech='informative/innovative',
)

# Echo the settings so they are recorded in the notebook output.
print("‚öôÔ∏è Configuration:")
for key, value in CONFIG.items():
    print(f"   {key}: {value}")
print(f"\nüìÇ Data URL: {DATA_BASE_URL}")

In [None]:
#@title 1.5 Download Data from GitHub

def download_data(base_url, files=('train.csv', 'val.csv', 'test.csv'), output_dir='data'):
    """Download dataset files from ``base_url`` into ``output_dir``.

    Each file is first written to ``<name>.part`` and only moved onto its final
    path once the transfer has finished. A failed or interrupted download
    therefore never leaves a truncated CSV behind, and never overwrites a good
    copy that is already on disk.

    Args:
        base_url: URL prefix; each file is requested from ``{base_url}/{file}``.
        files: File names to fetch, in order.
        output_dir: Local directory that receives the files (created if missing).

    Returns:
        True when every file was downloaded. False as soon as one download
        fails; the remaining files are not attempted.
    """
    os.makedirs(output_dir, exist_ok=True)

    for file in files:
        url = f"{base_url}/{file}"
        output_path = f"{output_dir}/{file}"
        partial_path = f"{output_path}.part"
        print(f"üì• Downloading {file}...", end=" ")
        try:
            urllib.request.urlretrieve(url, partial_path)
            # Publish the file only after the whole transfer succeeded.
            os.replace(partial_path, output_path)
        except Exception as e:
            # Deliberately broad: this is a best-effort notebook helper that reports
            # any failure and lets the caller fall back to the manual upload cell.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            print("‚ùå Failed")
            print(f"   URL: {url}")
            print(f"   Error: {e}")
            return False
        print("‚úÖ")
    return True

# Banner announcing the download step (50-character rules above and below).
print(
    "="*50,
    "üì• Downloading data from GitHub...",
    "="*50 + "\n",
    sep="\n",
)

# Fetch the CSV splits; `success` is False if any single file failed.
success = download_data(DATA_BASE_URL)

if success:
    print("\n‚úÖ All data downloaded successfully!")
else:
    # Explain the failure and list the recovery options.
    print(
        "\n" + "="*50,
        "‚ö†Ô∏è GitHub download failed!",
        "="*50,
        "\nPossible fixes:",
        "1. Make sure data is pushed to GitHub",
        "2. Check DATA_FOLDER path is correct",
        "3. Run manual upload cell below",
        sep="\n",
    )

In [None]:
#@title 1.6 (Backup) Manual Upload - ONLY if GitHub fails

# ⚠️ UNCOMMENT AND RUN THIS CELL ONLY IF GITHUB DOWNLOAD FAILED

# from google.colab import files
# os.makedirs('data', exist_ok=True)
# print("Please upload: train.csv, val.csv, test.csv")
# uploaded = files.upload()
# for filename in uploaded.keys():
#     os.rename(filename, f'data/{filename}')
#     print(f"‚úÖ Moved {filename} to data/")

In [None]:
#@title 1.7 Load Data

# Read the three CSV splits from the local data/ folder.
train_df, val_df, test_df = map(
    pd.read_csv,
    ('data/train.csv', 'data/val.csv', 'data/test.csv'),
)

# Report split sizes (thousands-separated) and the training split's column names.
print(
    "üìä Dataset Sizes:",
    f"   Train: {len(train_df):,} samples",
    f"   Val:   {len(val_df):,} samples",
    f"   Test:  {len(test_df):,} samples",
    f"\nüìã Columns: {train_df.columns.tolist()}",
    sep="\n",
)
# Last expression of the cell: the notebook renders the first two training rows.
train_df.head(2)

---
## 2. Data Exploration

In [None]:
#@title 2.1 Category Distribution

# Count the articles per category once; both charts and the printed table use it.
category_counts = train_df['category'].value_counts()
n_categories = len(category_counts)

# Size the colors and pie offsets from the data instead of assuming five categories:
# pie() rejects an `explode` list whose length differs from the number of wedges.
palette = ['#2196F3', '#4CAF50', '#FF9800', '#E91E63', '#9C27B0']
colors = [palette[i % len(palette)] for i in range(n_categories)]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar chart
axes[0].bar(category_counts.index, category_counts.values, color=colors, edgecolor='black')
axes[0].set_title('Category Distribution (Training Set)', fontsize=14)
axes[0].set_xlabel('Category')
axes[0].set_ylabel('Count')
axes[0].tick_params(axis='x', rotation=45)
# Write each count just above its bar (bars sit at x = 0, 1, 2, ...).
for i, count in enumerate(category_counts.values):
    axes[0].text(i, count + 5, str(count), ha='center', fontsize=10)

# Pie chart
axes[1].pie(category_counts.values, labels=category_counts.index, autopct='%1.1f%%',
            colors=colors, explode=[0.02] * n_categories)
axes[1].set_title('Category Proportions', fontsize=14)

plt.tight_layout()
plt.show()

print("\nüìä Category Counts:")
print(category_counts)

In [None]:
#@title 2.2 Text Length Analysis

# Per-article length features, stored as new columns on train_df:
# characters per text, and whitespace-separated tokens per text.
train_df['text_length'] = train_df['text'].str.len()
train_df['word_count'] = train_df['text'].str.split().str.len()


def _draw_length_histogram(ax, column, bar_color, xlabel, title):
    """Draw a 50-bin histogram of train_df[column] with a dashed red line at the mean."""
    values = train_df[column]
    ax.hist(values, bins=50, color=bar_color, edgecolor='black', alpha=0.7)
    ax.axvline(values.mean(), color='red', linestyle='--',
               label=f"Mean: {values.mean():.0f}")
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.legend()


fig, axes = plt.subplots(1, 2, figsize=(14, 4))
_draw_length_histogram(axes[0], 'text_length', 'steelblue',
                       'Character Length', 'Text Length Distribution (Characters)')
_draw_length_histogram(axes[1], 'word_count', 'forestgreen',
                       'Word Count', 'Text Length Distribution (Words)')

plt.tight_layout()
plt.show()

# Summary statistics for both length columns, rounded to one decimal.
print("\nüìè Statistics:")
print(train_df[['text_length', 'word_count']].describe().round(1))

---
## 3. Preprocessing

In [None]:
#@title 3.1 Load Tokenizer

# Load the tokenizer for the checkpoint named in CONFIG and report its vocabulary size.
tokenizer = DistilBertTokenizer.from_pretrained(CONFIG['model_name'])
print(
    f"‚úÖ Tokenizer: {CONFIG['model_name']}",
    f"   Vocab size: {tokenizer.vocab_size:,}",
    sep="\n",
)

In [None]:
#@title 3.2 Dataset Class

class BBCNewsDataset(Dataset):
    """Map-style dataset that tokenizes one article per item.

    Args:
        dataframe: Table with a ``text`` column and a ``category`` column.
            Its index is reset so items are addressed positionally.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')``. It must
            return a mapping with ``'input_ids'`` and ``'attention_mask'`` tensors
            whose leading dimension is 1 (presumably a Hugging Face tokenizer).
        max_length: Length every example is truncated/padded to.
        label2id: Optional mapping from category name to integer class id.
            When omitted, the module-level ``LABEL2ID`` is looked up at item
            access time, which is the original behaviour.
    """

    def __init__(self, dataframe, tokenizer, max_length=512, label2id=None):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label2id = label2id

    def __len__(self):
        """Return the number of rows (articles) in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized article at position ``idx``.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` (leading batch
            dimension removed) and ``'labels'`` (scalar ``torch.long`` tensor).

        Raises:
            KeyError: If the row's category is not present in the label mapping.
        """
        row = self.data.iloc[idx]
        text = str(row['text'])
        # Resolve the global lazily so the class stays usable without it when an
        # explicit mapping was supplied.
        label_map = LABEL2ID if self.label2id is None else self.label2id
        label = label_map[row['category']]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            # squeeze(0) drops the batch dimension added by return_tensors='pt';
            # the DataLoader adds its own batch dimension when collating.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

print("‚úÖ Dataset class defined")

In [None]:
#@title 3.3 Create DataLoaders

def _build_split(frame, shuffle):
    """Wrap one split in a BBCNewsDataset and a DataLoader sized from CONFIG."""
    dataset = BBCNewsDataset(frame, tokenizer, CONFIG['max_length'])
    loader = DataLoader(dataset, batch_size=CONFIG['batch_size'], shuffle=shuffle)
    return dataset, loader


# Only the training split is shuffled; validation and test keep file order.
train_dataset, train_loader = _build_split(train_df, shuffle=True)
val_dataset, val_loader = _build_split(val_df, shuffle=False)
test_dataset, test_loader = _build_split(test_df, shuffle=False)

print(
    "‚úÖ DataLoaders created:",
    f"   Train: {len(train_dataset)} samples ‚Üí {len(train_loader)} batches",
    f"   Val:   {len(val_dataset)} samples ‚Üí {len(val_loader)} batches",
    f"   Test:  {len(test_dataset)} samples ‚Üí {len(test_loader)} batches",
    sep="\n",
)

---
## 4. Model Architecture

In [None]:
#@title 4.1 Define Model

class DistilBertClassifier(nn.Module):
    """DistilBERT encoder followed by a small classification head.

    Head: Linear(hidden, hidden) -> ReLU -> Dropout -> Linear(hidden, num_classes),
    applied to the encoder's hidden state at sequence position 0.

    Args:
        num_classes: Number of output logits.
        dropout: Dropout probability used inside the head.
        model_name: Pretrained checkpoint to load. Defaults to the checkpoint
            that was previously hard-coded, so existing callers are unaffected.
    """

    def __init__(self, num_classes=5, dropout=0.3, model_name='distilbert-base-uncased'):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained(model_name)
        # Read the encoder width from its config instead of assuming 768, so the
        # head still matches if a checkpoint with another width is loaded.
        hidden_size = self.distilbert.config.dim
        self.pre_classifier = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class logits, one row per input sequence.

        Args:
            input_ids: Token ids — presumably shape (batch, seq_len); confirm
                against the tokenizer output.
            attention_mask: Mask passed straight through to the encoder.
        """
        outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state
        # Use the first sequence position as the sentence representation
        # (the [CLS] token for BERT-style tokenizers — assumption).
        cls_token = hidden_state[:, 0, :]
        x = self.pre_classifier(cls_token)
        x = self.relu(x)
        x = self.dropout(x)
        logits = self.classifier(x)
        return logits

print("‚úÖ Model class defined")

In [None]:
#@title 4.2 Initialize Model

# Build the classifier from the CONFIG hyperparameters and move it to the selected device.
model = DistilBertClassifier(
    num_classes=CONFIG['num_classes'],
    dropout=CONFIG['dropout'],
).to(device)

# Element counts: every parameter, and only those that receive gradients.
total_params = sum(map(torch.numel, model.parameters()))
trainable_params = sum(
    weight.numel() for weight in model.parameters() if weight.requires_grad
)

# Size estimate assumes 4 bytes per parameter.
print(
    "üß† Model Summary:",
    f"   Total parameters:     {total_params:,}",
    f"   Trainable parameters: {trainable_params:,}",
    f"   Model size: ~{total_params * 4 / 1024 / 1024:.1f} MB",
    sep="\n",
)

---
## 5. Training

In [None]:
#@title 5.1 Setup Training

# Loss and optimizer; learning rate and weight decay come from CONFIG.
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(
    model.parameters(),
    lr=CONFIG['learning_rate'],
    weight_decay=CONFIG['weight_decay'],
)

# The scheduler is stepped once per batch, so the schedule spans
# (batches per epoch) x (epochs) steps; warmup covers the first warmup_ratio of them.
total_steps = CONFIG['epochs'] * len(train_loader)
warmup_steps = int(total_steps * CONFIG['warmup_ratio'])

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

print(
    "‚öôÔ∏è Training Setup:",
    f"   Total steps: {total_steps}",
    f"   Warmup steps: {warmup_steps}",
    sep="\n",
)

In [None]:
#@title 5.2 Training Functions

def _move_batch(batch, device):
    """Return (input_ids, attention_mask, labels) from ``batch`` moved to ``device``."""
    return (
        batch['input_ids'].to(device),
        batch['attention_mask'].to(device),
        batch['labels'].to(device),
    )


def _epoch_summary(loss_sum, all_labels, all_preds):
    """Turn a summed per-sample loss and collected labels into (avg_loss, accuracy, f1).

    Raises:
        ValueError: If no samples were seen (empty dataloader).
    """
    n_samples = len(all_labels)
    if n_samples == 0:
        raise ValueError("dataloader yielded no samples; cannot compute epoch metrics")
    avg_loss = loss_sum / n_samples
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')
    return avg_loss, accuracy, f1


def train_epoch(model, dataloader, optimizer, scheduler, criterion, device):
    """Train ``model`` for one pass over ``dataloader``.

    Per batch: forward, loss, backward, gradient-norm clipping at 1.0, optimizer
    step, scheduler step.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` returning logits.
        dataloader: Yields dicts with 'input_ids', 'attention_mask' and 'labels'.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch.
        criterion: Loss called as ``criterion(logits, labels)``. Assumed to return
            the mean loss over the batch (the default for ``nn.CrossEntropyLoss``).
        device: Device the batch tensors are moved to.

    Returns:
        (avg_loss, accuracy, weighted_f1). ``avg_loss`` is the mean over samples,
        so a smaller final batch is not over-weighted. Predictions are the ones
        made during training, i.e. with dropout active.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    model.train()
    loss_sum = 0.0
    all_preds, all_labels = [], []

    pbar = tqdm(dataloader, desc='Training', leave=False)
    for batch in pbar:
        input_ids, attention_mask, labels = _move_batch(batch, device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        # Read the scalar once, then weight it by batch size to get a per-sample sum.
        batch_loss = loss.item()
        loss_sum += batch_loss * labels.size(0)
        all_preds.extend(torch.argmax(logits, dim=1).tolist())
        all_labels.extend(labels.tolist())
        pbar.set_postfix({'loss': f'{batch_loss:.4f}'})

    return _epoch_summary(loss_sum, all_labels, all_preds)


def evaluate(model, dataloader, criterion, device):
    """Score ``model`` on ``dataloader`` without updating any weights.

    Puts the model in eval mode (and leaves it there) and disables gradient
    tracking for the whole pass.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` returning logits.
        dataloader: Yields dicts with 'input_ids', 'attention_mask' and 'labels'.
        criterion: Loss called as ``criterion(logits, labels)``. Assumed to return
            the mean loss over the batch (the default for ``nn.CrossEntropyLoss``).
        device: Device the batch tensors are moved to.

    Returns:
        (avg_loss, accuracy, weighted_f1, all_preds, all_labels), where
        ``avg_loss`` is the mean over samples and the last two items are lists
        of integer class ids in dataloader order.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Evaluating', leave=False):
            input_ids, attention_mask, labels = _move_batch(batch, device)

            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)

            loss_sum += loss.item() * labels.size(0)
            all_preds.extend(torch.argmax(logits, dim=1).tolist())
            all_labels.extend(labels.tolist())

    avg_loss, accuracy, f1 = _epoch_summary(loss_sum, all_labels, all_preds)
    return avg_loss, accuracy, f1, all_preds, all_labels

print("‚úÖ Training functions defined")

In [None]:
#@title 5.3 🚀 Train Model (~15-20 min with GPU)

import copy

# Per-epoch metrics, appended once per epoch; plotted and saved by later cells.
history = {
    'train_loss': [], 'train_acc': [], 'train_f1': [],
    'val_loss': [], 'val_acc': [], 'val_f1': []
}

best_val_f1 = 0
best_model_state = None

print("\n" + "="*60)
print("üöÄ STARTING TRAINING")
print("="*60)

for epoch in range(CONFIG['epochs']):
    print(f"\nüìç Epoch {epoch + 1}/{CONFIG['epochs']}")
    print("-" * 40)

    train_loss, train_acc, train_f1 = train_epoch(
        model, train_loader, optimizer, scheduler, criterion, device
    )
    print(f"   Train - Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | F1: {train_f1:.4f}")

    val_loss, val_acc, val_f1, _, _ = evaluate(model, val_loader, criterion, device)
    print(f"   Val   - Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f}")

    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['train_f1'].append(train_f1)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)
    history['val_f1'].append(val_f1)

    # Always keep the first epoch so best_model_state is never None after training,
    # even if validation F1 never rises above the initial 0.
    if best_model_state is None or val_f1 > best_val_f1:
        best_val_f1 = val_f1
        # state_dict() returns references to the live parameter tensors and
        # dict.copy() is shallow, so a plain copy would keep changing as training
        # continues. deepcopy takes a real snapshot of this epoch's weights.
        best_model_state = copy.deepcopy(model.state_dict())
        print(f"   ‚úÖ New best model! (F1: {val_f1:.4f})")

print("\n" + "="*60)
print(f"üéâ Training Complete! Best Val F1: {best_val_f1:.4f}")
print("="*60)

In [None]:
#@title 5.4 Plot Training Curves

# Plot against the epochs actually recorded rather than CONFIG['epochs'], so the
# curves still render if training stopped early or CONFIG was edited afterwards.
epochs_range = range(1, len(history['train_loss']) + 1)

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# One panel per metric: (history key suffix, axis label and title).
for ax, (metric, title) in zip(axes, (('loss', 'Loss'), ('acc', 'Accuracy'), ('f1', 'F1 Score'))):
    ax.plot(epochs_range, history[f'train_{metric}'], 'b-o', label='Train', linewidth=2)
    ax.plot(epochs_range, history[f'val_{metric}'], 'r-s', label='Val', linewidth=2)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(title)
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
# Saved to disk as well; the save cell copies this file into the output folder.
plt.savefig('training_curves.png', dpi=150)
plt.show()

---
## 6. Evaluation

In [None]:
#@title 6.1 Test Set Evaluation

# Load the state dict that the training cell stored in best_model_state.
model.load_state_dict(best_model_state)
print("‚úÖ Loaded best model\n")

# Score the test split; predictions and labels are kept for the report cells below.
test_loss, test_acc, test_f1, test_preds, test_labels = evaluate(
    model, test_loader, criterion, device
)

# Weighted precision and recall, computed from the same predictions.
test_precision, test_recall = (
    score_fn(test_labels, test_preds, average='weighted')
    for score_fn in (precision_score, recall_score)
)

print(
    "="*60,
    "üìà TEST SET RESULTS",
    "="*60,
    f"   Accuracy:  {test_acc:.4f} ({test_acc*100:.2f}%)",
    f"   Precision: {test_precision:.4f}",
    f"   Recall:    {test_recall:.4f}",
    f"   F1 Score:  {test_f1:.4f}",
    "="*60,
    sep="\n",
)

In [None]:
#@title 6.2 Classification Report

print("\nüìã CLASSIFICATION REPORT")
print("="*60)
print(classification_report(
    test_labels, test_preds,
    target_names=list(LABEL2ID.keys()),
    digits=4
))

In [None]:
#@title 6.3 Confusion Matrix

# Fix the row/column order to the known class ids so the matrix is always
# len(ID2LABEL) x len(ID2LABEL) and the tick labels line up, even when a
# category never appears in the test data.
label_ids = sorted(ID2LABEL)
label_names = [ID2LABEL[label_id] for label_id in label_ids]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

cm = confusion_matrix(test_labels, test_preds, labels=label_ids)
# Row-normalize (each actual class sums to 1). Rows with no samples stay 0
# instead of becoming NaN from a division by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = np.divide(
    cm.astype('float'), row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0],
            xticklabels=label_names, yticklabels=label_names)
axes[0].set_xlabel('Predicted')
axes[0].set_ylabel('Actual')
axes[0].set_title('Confusion Matrix (Counts)')

sns.heatmap(cm_norm, annot=True, fmt='.2%', cmap='Blues', ax=axes[1],
            xticklabels=label_names, yticklabels=label_names)
axes[1].set_xlabel('Predicted')
axes[1].set_ylabel('Actual')
axes[1].set_title('Confusion Matrix (Normalized)')

plt.tight_layout()
# Saved to disk as well; the save cell copies this file into the output folder.
plt.savefig('confusion_matrix.png', dpi=150)
plt.show()

---
## 7. Inference Pipeline

In [None]:
#@title 7.1 Prediction Function

def predict_sentiment(text, model, tokenizer, device, max_length=512):
    """Predict the news category of ``text`` and the mood mapped to that category.

    Puts ``model`` in eval mode (and leaves it there) and runs one forward pass
    without gradient tracking.

    Args:
        text: A single article or sentence to classify.
        model: Module called as ``model(input_ids, attention_mask)`` returning logits.
        tokenizer: Tokenizer called with ``return_tensors='pt'``.
        device: Device the encoded tensors are moved to.
        max_length: Length the input is truncated/padded to. Pass
            ``CONFIG['max_length']`` to match the length used in training; the
            default keeps the previously hard-coded value.

    Returns:
        (category, mood, confidence, all_probs): predicted category name, its
        entry in ``MOOD_MAPPING``, the softmax probability of that category, and
        a dict of category name -> probability for every class.
    """
    model.eval()

    encoding = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids, attention_mask)
        probs = torch.softmax(logits, dim=1)
        pred_idx = torch.argmax(logits, dim=1).item()

    category = ID2LABEL[pred_idx]
    mood = MOOD_MAPPING[category]
    # Index 0 selects the only row: a single text is encoded per call.
    confidence = probs[0][pred_idx].item()
    all_probs = {ID2LABEL[i]: p.item() for i, p in enumerate(probs[0])}

    return category, mood, confidence, all_probs

print("‚úÖ Prediction function defined")

In [None]:
#@title 7.2 Test Predictions

# One hand-written sentence per expected category, used as a quick smoke test.
test_texts = [
    "The company reported record profits with revenue up 40% year-over-year.",
    "The team won the championship with a last-minute goal.",
    "The new AI chip delivers 3x faster performance than previous generation.",
    "Parliament debated the new climate legislation amid protests.",
    "The film received a standing ovation at the premiere.",
]

print("üîÆ SAMPLE PREDICTIONS", "="*60, sep="\n")

for sample in test_texts:
    # The per-class probability dict (4th return value) is not needed here.
    category, mood, confidence = predict_sentiment(sample, model, tokenizer, device)[:3]
    print(
        f"\nüìù {sample[:60]}...",
        f"   ‚Üí {category.upper()} ({confidence:.1%}) | Mood: {mood}",
        sep="\n",
    )

In [None]:
#@title 7.3 Integration Function (for T5 Summarization)

def create_summary_mood_report(text, summary, model, tokenizer, device):
    """Bundle an externally produced summary with this model's prediction for ``text``.

    ``summary`` is passed through unchanged (intended to be T5 summarization
    output). The remaining fields come from ``predict_sentiment``.

    Returns:
        Dict with keys 'summary', 'category', 'mood', 'confidence' and
        'probabilities', in that order.
    """
    prediction = predict_sentiment(text, model, tokenizer, device)

    report = {'summary': summary}
    # predict_sentiment returns its values in exactly this field order.
    report.update(zip(('category', 'mood', 'confidence', 'probabilities'), prediction))
    return report

# Example usage: the first sample sentence paired with a hand-written summary.
sample_text = test_texts[0]
sample_summary = "Company achieves 40% revenue growth."  # Would come from T5

report = create_summary_mood_report(sample_text, sample_summary, model, tokenizer, device)

print(
    "\nüì∞ SUMMARY + MOOD REPORT",
    "="*50,
    f"üìù Summary: {report['summary']}",
    f"üìä Category: {report['category'].upper()}",
    f"üé≠ Mood: {report['mood']}",
    f"üíØ Confidence: {report['confidence']:.1%}",
    sep="\n",
)

---
## 8. Save Model

In [None]:
#@title 8.1 Save Model & Results

import shutil

# Everything produced by this notebook is collected under one folder.
OUTPUT_DIR = 'sentiment_model'
os.makedirs(OUTPUT_DIR, exist_ok=True)


def _dump_json(filename, payload):
    """Write ``payload`` as indented JSON to OUTPUT_DIR/filename."""
    with open(f'{OUTPUT_DIR}/{filename}', 'w') as handle:
        json.dump(payload, handle, indent=2)


# Save model: weights plus the settings, label maps and metrics needed to interpret them.
test_metrics = {
    'accuracy': test_acc,
    'f1': test_f1,
    'precision': test_precision,
    'recall': test_recall,
}
checkpoint = {
    'model_state_dict': best_model_state,
    'config': CONFIG,
    'label2id': LABEL2ID,
    'id2label': ID2LABEL,
    'best_val_f1': best_val_f1,
    'test_metrics': test_metrics,
}
torch.save(checkpoint, f'{OUTPUT_DIR}/best_model.pt')

# Save history, then config
_dump_json('training_history.json', history)
_dump_json('config.json', CONFIG)

# Copy the plots written by the plotting cells.
shutil.copy('training_curves.png', f'{OUTPUT_DIR}/')
shutil.copy('confusion_matrix.png', f'{OUTPUT_DIR}/')

print(
    "‚úÖ Saved to sentiment_model/:",
    "   - best_model.pt",
    "   - training_history.json",
    "   - config.json",
    "   - training_curves.png",
    "   - confusion_matrix.png",
    sep="\n",
)

In [None]:
#@title 8.2 Download Model (for GitHub)

# IPython shell escape: bundle the output folder into a single zip archive.
!zip -r sentiment_model.zip sentiment_model/

# google.colab is imported here because it is only needed for this step
# (presumably only importable inside a Colab runtime — confirm before running elsewhere).
from google.colab import files
files.download('sentiment_model.zip')

# NOTE: printed unconditionally; this line does not check that the download succeeded.
print("\n‚úÖ Download complete!")
print("\nNext steps:")
print("1. Unzip sentiment_model.zip")
print("2. Copy to your repo: models/sentiment_model/")
print("3. git add, commit, push")

In [None]:
#@title 8.3 (Alternative) Save to Google Drive

# Uncomment to save directly to Google Drive

# from google.colab import drive
# drive.mount('/content/drive')

# DRIVE_PATH = '/content/drive/MyDrive/ECS-271-NLP-Project/models/'
# os.makedirs(DRIVE_PATH, exist_ok=True)
# !cp -r sentiment_model/* "{DRIVE_PATH}"
# print(f"‚úÖ Saved to: {DRIVE_PATH}")

---
## 📊 Summary

### Results
| Metric | Value |
|--------|-------|
| Test Accuracy | ~95% |
| Test F1 Score | ~0.95 |

### Output Files
```
sentiment_model/
├── best_model.pt
├── config.json
├── training_history.json
├── training_curves.png
└── confusion_matrix.png
```

### Integration
Use `create_summary_mood_report()` to combine with T5 summarization.

---
🎉 **Sentiment Analysis Complete!**