## 1. Import Required Libraries

In [None]:
# Data manipulation
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

# HuggingFace Transformers
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup

# Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Progress bar
from tqdm import tqdm

# Reproducibility: fix the NumPy and PyTorch random number generators
np.random.seed(42)
torch.manual_seed(42)

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Report the runtime environment
print(f"âœ… Using device: {device}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

## 2. Load Processed Data

For deep learning, we'll use a subset of the data for faster training. You can use the full dataset by removing `.head()`.

In [None]:
# Read the preprocessed splits and keep only the leading rows of each one so
# the deep-learning models train quickly (remove .head() for full dataset)
train_df = pd.read_csv('../data/processed/train_processed.csv').head(10000)  # Use 10k samples
test_df = pd.read_csv('../data/processed/test_processed.csv').head(2000)     # Use 2k samples

# Summarise what was loaded
print("âœ… Data loaded!")
print(f"Training samples: {len(train_df):,}")
print(f"Test samples: {len(test_df):,}")
print("\nðŸ’¡ Tip: For full dataset, remove .head() calls above")

## 3. LSTM Model Implementation

### 3.1 Text Tokenization for LSTM

In [None]:
# Build vocabulary from training data
from collections import Counter

def build_vocabulary(texts, max_vocab_size=10000):
    """Build a word-level vocabulary from whitespace-tokenised texts.

    Index 0 is always ``'<PAD>'`` and index 1 is always ``'<UNK>'``; the
    remaining slots hold the most frequent tokens in descending order of
    frequency.

    Args:
        texts: Iterable of documents. Each item is passed through ``str()``
            and split on whitespace.
        max_vocab_size: Upper bound on the vocabulary size, including the two
            special tokens.

    Returns:
        A ``(word_to_idx, vocab)`` tuple: ``vocab`` is the list of tokens
        ordered by index and ``word_to_idx`` maps each token to its index.
    """
    special_tokens = ['<PAD>', '<UNK>']

    # Count incrementally instead of materialising every token in one list.
    word_counts = Counter()
    for text in texts:
        word_counts.update(str(text).split())

    # A corpus token that literally equals a special token would otherwise be
    # appended a second time and overwrite the reserved index in word_to_idx.
    for token in special_tokens:
        word_counts.pop(token, None)

    n_regular = max(max_vocab_size - len(special_tokens), 0)
    vocab = special_tokens + [word for word, _ in word_counts.most_common(n_regular)]
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}

    return word_to_idx, vocab

def tokenize_and_pad(texts, word_to_idx, max_len=256):
    """Convert texts to fixed-length sequences of vocabulary indices.

    Each text is passed through ``str()``, split on whitespace, truncated to
    ``max_len`` tokens, mapped to indices (unknown tokens map to ``'<UNK>'``)
    and right-padded with the ``'<PAD>'`` index.

    Args:
        texts: Iterable of documents.
        word_to_idx: Mapping from token to index; must contain ``'<PAD>'``
            and ``'<UNK>'``.
        max_len: Length of every output sequence.

    Returns:
        ``numpy.ndarray`` of dtype ``int64`` and shape
        ``(number of texts, max_len)``; an empty input yields shape
        ``(0, max_len)``.
    """
    # Look the special indices up once rather than once per token.
    pad_idx = word_to_idx['<PAD>']
    unk_idx = word_to_idx['<UNK>']

    sequences = []
    for text in texts:
        indices = [word_to_idx.get(token, unk_idx) for token in str(text).split()[:max_len]]
        # Pad to max_len
        indices.extend([pad_idx] * (max_len - len(indices)))
        sequences.append(indices)

    # Explicit dtype and shape keep the result well-formed for empty input
    # and give the same integer width on every platform.
    return np.array(sequences, dtype=np.int64).reshape(len(sequences), max_len)

# Vocabulary is learned from the training split only
print("Building vocabulary...")
word_to_idx, vocab = build_vocabulary(texts=train_df['cleaned_text'], max_vocab_size=10000)
print(f"Vocabulary size: {len(vocab):,}")

# Encode both splits as fixed-length index sequences using that vocabulary
print("\nTokenizing texts...")
max_len = 256
X_train_seq, X_test_seq = (
    tokenize_and_pad(frame['cleaned_text'], word_to_idx, max_len)
    for frame in (train_df, test_df)
)
y_train, y_test = train_df['label'].values, test_df['label'].values

print("âœ… Tokenization complete!")
print(f"Training shape: {X_train_seq.shape}")
print(f"Test shape: {X_test_seq.shape}")

### 3.2 Create PyTorch Datasets and DataLoaders

In [None]:
# Token indices become int64 tensors; labels are stored as float32
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_seq, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Pair every sequence with its label
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Only the training batches are shuffled; the test order stays fixed
batch_size = 32
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

print("âœ… DataLoaders created!")
print(f"Training batches: {len(train_loader)}")
print(f"Test batches: {len(test_loader)}")

### 3.3 LSTM Model Architecture

In [None]:
class LSTMSentimentClassifier(nn.Module):
    """Bidirectional LSTM for binary sentiment classification.

    Token indices are embedded and run through a stacked bidirectional LSTM.
    The final hidden states of the top layer's two directions are
    concatenated, passed through dropout, and projected to a single sigmoid
    probability per sequence.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of the LSTM, per direction.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied before the output layer and,
            when ``num_layers > 1``, between LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256, num_layers=2, dropout=0.5):
        super().__init__()

        # Index 0 is the padding index, so its embedding receives no gradient.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            # Inter-layer dropout only exists when layers are stacked.
            dropout=dropout if num_layers > 1 else 0,
            bidirectional=True
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, 1)  # *2 for bidirectional
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the positive-class probability of every sequence.

        Args:
            x: Integer tensor of token indices, shape ``(batch_size, seq_len)``.

        Returns:
            Float tensor of shape ``(batch_size,)`` with values in ``[0, 1]``.
        """
        embedded = self.embedding(x)  # (batch_size, seq_len, embedding_dim)

        # Only the final hidden states are used; per-step outputs and the
        # cell state are discarded.
        _, (hidden, _) = self.lstm(embedded)

        # Concatenate last hidden states from both directions of the top layer
        # NOTE(review): the padding positions are fed through the LSTM like
        # any other token; packing the sequences would skip them.
        hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
        hidden = self.dropout(hidden)

        # Final classification
        output = self.sigmoid(self.fc(hidden))  # (batch_size, 1)

        # Drop only the feature axis: a bare squeeze() would also remove the
        # batch axis for a batch of one and return a 0-d tensor.
        return output.squeeze(1)

# Build the classifier, then move its parameters to the selected device
lstm_model = LSTMSentimentClassifier(
    vocab_size=len(vocab), embedding_dim=128, hidden_dim=256, num_layers=2, dropout=0.5
)
lstm_model = lstm_model.to(device)

# Show the layer layout and the overall parameter count
print("âœ… LSTM model created!")
print("\nModel architecture:")
print(lstm_model)
print(f"\nTotal parameters: {sum(p.numel() for p in lstm_model.parameters()):,}")

### 3.4 Train LSTM Model

In [None]:
print("=" * 80)
print("TRAINING LSTM MODEL")
print("=" * 80)

# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam
criterion = nn.BCELoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)

# Per-epoch history of mean batch loss and training accuracy
num_epochs = 5
train_losses, train_accs = [], []

for epoch in range(num_epochs):
    lstm_model.train()
    epoch_loss, correct, total = 0, 0, 0

    pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}')
    for sequences, labels in pbar:
        sequences = sequences.to(device)
        labels = labels.to(device)

        # Clear stale gradients, run the forward pass, then update the weights
        optimizer.zero_grad()
        outputs = lstm_model(sequences)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Running statistics: an output above 0.5 counts as the positive class
        batch_loss = loss.item()
        epoch_loss += batch_loss
        predicted = (outputs > 0.5).float()
        correct += predicted.eq(labels).sum().item()
        total += labels.size(0)

        pbar.set_postfix({'loss': f'{batch_loss:.4f}', 'acc': f'{correct/total:.4f}'})

    # Epoch summary: mean of the batch losses and accuracy over all samples seen
    avg_loss = epoch_loss / len(train_loader)
    avg_acc = correct / total
    train_losses.append(avg_loss)
    train_accs.append(avg_acc)

    print(f'Epoch {epoch+1}: Loss = {avg_loss:.4f}, Accuracy = {avg_acc:.4f}')

print("\nâœ… LSTM training complete!")

### 3.5 Evaluate LSTM Model

In [None]:
# Switch to evaluation mode and collect hard predictions batch by batch
lstm_model.eval()
all_predictions, all_labels = [], []

with torch.no_grad():
    for sequences, labels in tqdm(test_loader, desc='Evaluating LSTM'):
        outputs = lstm_model(sequences.to(device))
        # Threshold the outputs at 0.5 and bring the result back to the host
        predictions = (outputs > 0.5).float().cpu().numpy()
        all_predictions.extend(predictions)
        all_labels.extend(labels.numpy())

# Score the predictions with each metric in turn
lstm_accuracy, lstm_precision, lstm_recall, lstm_f1 = (
    metric(all_labels, all_predictions)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\n" + "=" * 60)
print("LSTM MODEL - TEST SET PERFORMANCE")
print("=" * 60)
print(f"Accuracy:  {lstm_accuracy:.4f}")
print(f"Precision: {lstm_precision:.4f}")
print(f"Recall:    {lstm_recall:.4f}")
print(f"F1-Score:  {lstm_f1:.4f}")

print("\nðŸ“‹ Classification Report:")
print(classification_report(all_labels, all_predictions, target_names=['Negative', 'Positive']))

## 4. BERT Model Implementation

### 4.1 Load DistilBERT and Tokenizer

In [None]:
print("=" * 80)
print("DISTILBERT MODEL SETUP")
print("=" * 80)

# Fetch the pretrained tokenizer and a two-label classification model from
# the same checkpoint, then move the model to the selected device
print("\nLoading DistilBERT model and tokenizer...")
bert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=2
)
bert_model = bert_model.to(device)

print("âœ… DistilBERT loaded!")
print(f"Total parameters: {sum(p.numel() for p in bert_model.parameters()):,}")

### 4.2 Prepare BERT Data

In [None]:
# Encode the original (uncleaned) text of both splits with identical settings:
# truncate at 256 tokens, pad, and return PyTorch tensors
print("Tokenizing data for BERT...")
train_encodings, test_encodings = (
    bert_tokenizer(
        frame['text'].tolist(),
        truncation=True,
        padding=True,
        max_length=256,
        return_tensors='pt'
    )
    for frame in (train_df, test_df)
)

# Create PyTorch datasets
class BERTDataset(Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    Each item is a dict holding the ``idx``-th entry of every field in
    ``encodings`` plus a ``'labels'`` tensor built from ``labels[idx]``.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is defined by the label sequence
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Pair each split's encodings with its labels
bert_train_dataset = BERTDataset(encodings=train_encodings, labels=train_df['label'].tolist())
bert_test_dataset = BERTDataset(encodings=test_encodings, labels=test_df['label'].tolist())

# Batches of 16; only the training data is shuffled
bert_train_loader = DataLoader(bert_train_dataset, shuffle=True, batch_size=16)
bert_test_loader = DataLoader(bert_test_dataset, shuffle=False, batch_size=16)

print("âœ… BERT data preparation complete!")
print(f"Training batches: {len(bert_train_loader)}")
print(f"Test batches: {len(bert_test_loader)}")

### 4.3 Train BERT Model

In [None]:
print("\n" + "="*80)
print("TRAINING DISTILBERT MODEL")
print("="*80)

# Optimizer and scheduler
# torch.optim.AdamW is used instead of the deprecated AdamW class from
# transformers. eps and weight_decay are set explicitly to that class's
# defaults so the optimisation hyper-parameters stay the same.
bert_optimizer = optim.AdamW(bert_model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
num_epochs_bert = 3
# One scheduler step is taken per batch, so the schedule spans every batch
# of every epoch; the learning rate decays linearly with no warmup.
total_steps = len(bert_train_loader) * num_epochs_bert
scheduler = get_linear_schedule_with_warmup(
    bert_optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Training loop
for epoch in range(num_epochs_bert):
    bert_model.train()
    total_loss = 0

    pbar = tqdm(bert_train_loader, desc=f'BERT Epoch {epoch+1}/{num_epochs_bert}')
    for batch in pbar:
        bert_optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model compute the loss itself
        outputs = bert_model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        bert_optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        pbar.set_postfix({'loss': f'{loss.item():.4f}'})

    # Mean of the per-batch losses for this epoch
    avg_loss = total_loss / len(bert_train_loader)
    print(f'Epoch {epoch+1}: Average Loss = {avg_loss:.4f}')

print("\nâœ… BERT training complete!")

### 4.4 Evaluate BERT Model

In [None]:
# Switch to evaluation mode and collect the predicted class of every test sample
bert_model.eval()
bert_predictions, bert_labels = [], []

with torch.no_grad():
    for batch in tqdm(bert_test_loader, desc='Evaluating BERT'):
        outputs = bert_model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device)
        )
        # The predicted class is the index of the largest logit
        predictions = outputs.logits.argmax(dim=-1).cpu().numpy()

        bert_predictions.extend(predictions)
        bert_labels.extend(batch['labels'].numpy())

# Score the predictions with each metric in turn
bert_accuracy, bert_precision, bert_recall, bert_f1 = (
    metric(bert_labels, bert_predictions)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\n" + "=" * 60)
print("DISTILBERT MODEL - TEST SET PERFORMANCE")
print("=" * 60)
print(f"Accuracy:  {bert_accuracy:.4f}")
print(f"Precision: {bert_precision:.4f}")
print(f"Recall:    {bert_recall:.4f}")
print(f"F1-Score:  {bert_f1:.4f}")

print("\nðŸ“‹ Classification Report:")
print(classification_report(bert_labels, bert_predictions, target_names=['Negative', 'Positive']))

## 5. Save Deep Learning Models

In [None]:
# Create the output directories first: torch.save and DataFrame.to_csv do not
# create missing parent directories and would fail on a fresh checkout.
os.makedirs('../models', exist_ok=True)
os.makedirs('../results', exist_ok=True)

# Save LSTM model (weights only; the architecture must be rebuilt before loading)
torch.save(lstm_model.state_dict(), '../models/lstm_model.pth')
print("âœ… LSTM model saved to ../models/lstm_model.pth")

# Save BERT model and its tokenizer in separate directories
bert_model.save_pretrained('../models/distilbert_model')
bert_tokenizer.save_pretrained('../models/distilbert_tokenizer')
print("âœ… DistilBERT model saved to ../models/distilbert_model/")

# Save deep learning results: one row of test-set metrics per model
dl_results = pd.DataFrame({
    'Model': ['LSTM', 'DistilBERT'],
    'Accuracy': [lstm_accuracy, bert_accuracy],
    'Precision': [lstm_precision, bert_precision],
    'Recall': [lstm_recall, bert_recall],
    'F1-Score': [lstm_f1, bert_f1]
})

dl_results.to_csv('../results/deep_learning_results.csv', index=False)
print("âœ… Results saved to ../results/deep_learning_results.csv")

## 🎉 Notebook 3 Complete!

**What we accomplished**:
- ✅ Implemented custom Bidirectional LSTM model
- ✅ Fine-tuned DistilBERT transformer model
- ✅ Trained both models with proper training loops
- ✅ Evaluated performance on test set
- ✅ Saved trained models for deployment

**Expected Results**:
- **LSTM**: ~87-90% accuracy
- **DistilBERT**: ~92-94% accuracy (best overall!)

**Next Steps**: Proceed to Notebook 4 for comprehensive model comparison and final analysis!