# Neural Networks Project: Text Generation with Transformer

This notebook demonstrates how to use the neural networks framework to train a Transformer model for text generation.

## Setup

First, let's import the necessary libraries and set up our environment.

In [None]:
import os
import sys
import zlib

import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.utils.data import DataLoader, random_split

# Add the project root to the path
sys.path.append('..')

# Import project modules
from src.models.transformer_model import TransformerModel
from src.utils.trainer import Trainer
from src.data.datasets import TextDataset
from src.config.config_manager import ConfigManager, get_default_config

# Seed both random number sources used in this notebook (NumPy and PyTorch)
np.random.seed(42)
torch.manual_seed(42)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## Create a Mock Tokenizer

For demonstration purposes, we'll create a simple tokenizer. In a real application, you would use a more sophisticated tokenizer like those from the Hugging Face `transformers` library.

In [None]:
class SimpleTokenizer:
    """Toy whitespace tokenizer that buckets every word into a hashed id.

    Ids 0-3 are reserved for special tokens (0=PAD, 1=BOS, 2=EOS, 3=UNK);
    every word is mapped into ``[4, vocab_size)``.

    The word -> id mapping uses CRC-32 rather than the built-in ``hash()``.
    ``hash()`` of a ``str`` is salted per interpreter process, so it would
    assign different ids after every kernel restart (and in worker
    processes), which makes saved checkpoints unusable with a fresh tokenizer.

    Args:
        vocab_size: Total number of ids, including the 4 special tokens.

    Raises:
        ValueError: If ``vocab_size`` leaves no id available for words.
    """

    PAD_ID = 0
    BOS_ID = 1
    EOS_ID = 2
    UNK_ID = 3
    NUM_SPECIAL_TOKENS = 4

    def __init__(self, vocab_size=10000):
        if vocab_size <= self.NUM_SPECIAL_TOKENS:
            raise ValueError(
                f"vocab_size must be greater than {self.NUM_SPECIAL_TOKENS}, got {vocab_size}"
            )
        self.vocab_size = vocab_size

    def __call__(self, text, max_length=128, padding='max_length', truncation=True, return_tensors=None):
        """Tokenize ``text`` into ids plus an attention mask.

        Args:
            text: Input string; words are split on whitespace.
            max_length: Length used for truncation and padding.
            padding: Pads with PAD up to ``max_length`` only when equal to
                ``'max_length'``; any other value disables padding.
            truncation: When true, sequences longer than ``max_length`` are
                cut and the last kept position is set to EOS.
            return_tensors: ``'pt'`` returns tensors with a leading batch
                dimension of 1; anything else returns plain Python lists.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'``.
        """
        # This is a very simplified tokenization for demonstration purposes.
        # In a real scenario, you would use a proper tokenizer (e.g., from Hugging Face).
        num_word_ids = self.vocab_size - self.NUM_SPECIAL_TOKENS
        word_ids = [
            # 'surrogatepass' keeps encoding from raising on unusual strings
            zlib.crc32(word.encode('utf-8', 'surrogatepass')) % num_word_ids + self.NUM_SPECIAL_TOKENS
            for word in text.split()
        ]

        # Wrap the words in BOS ... EOS
        tokens = [self.BOS_ID] + word_ids + [self.EOS_ID]

        # Truncate if necessary, keeping EOS as the final token
        if truncation and len(tokens) > max_length:
            tokens = tokens[:max_length-1] + [self.EOS_ID]

        # Pad if necessary (a negative count yields no padding)
        if padding == 'max_length':
            tokens = tokens + [self.PAD_ID] * (max_length - len(tokens))

        # Attention mask: 1 for real tokens, 0 for padding
        attention_mask = [1 if token != self.PAD_ID else 0 for token in tokens]

        # Convert to tensors if requested
        if return_tensors == 'pt':
            return {
                'input_ids': torch.tensor([tokens]),
                'attention_mask': torch.tensor([attention_mask])
            }

        return {
            'input_ids': tokens,
            'attention_mask': attention_mask
        }

## Generate Mock Dataset

Let's create a mock dataset for demonstration purposes. In a real application, you would load actual text data.

In [None]:
def generate_mock_dataset(num_samples, seq_length, vocab_size):
    """Generate a mock text dataset for demonstration purposes.

    Each text is a space-separated run of random integers (used as "words")
    drawn from ``[1, vocab_size)``. The number of words per text is drawn from
    ``[5, seq_length // 2)``.

    Args:
        num_samples: Number of texts to generate.
        seq_length: Controls the exclusive upper bound on words per text
            (``seq_length // 2``). When that bound would not exceed the
            5-word minimum, it is raised so every text has exactly 5 words
            instead of ``np.random.randint`` raising.
        vocab_size: Exclusive upper bound for the random word values.

    Returns:
        List of ``num_samples`` strings.

    Raises:
        ValueError: From ``np.random.randint`` if ``vocab_size`` is less
            than 2 and at least one sample is requested.
    """
    min_words = 5
    # randint's upper bound is exclusive and must be strictly above the lower one
    max_words_exclusive = max(seq_length // 2, min_words + 1)

    sequences = []
    for _ in range(num_samples):
        # Generate random "text" (just space-separated numbers as words)
        num_words = np.random.randint(min_words, max_words_exclusive)
        words = [str(np.random.randint(1, vocab_size)) for _ in range(num_words)]
        sequences.append(' '.join(words))

    return sequences

# Knobs for the synthetic corpus
vocab_size = 5000       # number of token ids available to the tokenizer
max_seq_length = 50     # passed to the generator as its seq_length
num_samples = 5000      # how many mock texts to create

# Build the mock corpus first ...
print("Generating mock dataset...")
all_texts = generate_mock_dataset(num_samples, max_seq_length, vocab_size)

# ... then the tokenizer that will encode it
tokenizer = SimpleTokenizer(vocab_size=vocab_size)

# Show the first generated text as a sanity check
print("\nSample text from dataset:", all_texts[0], sep="\n")

## Create Dataset and DataLoaders

Now let's create a dataset and dataloaders using our TextDataset class.

In [None]:
# Define collate function for batching
def collate_batch(batch):
    """Collate dataset items into a (source, target) pair for next-token training.

    Args:
        batch: Sequence of dicts, each holding an ``'input_ids'`` tensor.
            The tensors are stacked, so they must all share one shape.

    Returns:
        Tuple ``(source, target)``: ``source`` is every position except the
        last, ``target`` is every position except the first. ``target`` is
        therefore the input shifted left by one, so position ``t`` of
        ``source`` is paired with token ``t + 1``.
    """
    # Only input_ids are needed here; the attention mask was never used.
    input_ids = torch.stack([item['input_ids'] for item in batch])

    source = input_ids[:, :-1].contiguous()
    target = input_ids[:, 1:].contiguous()

    return source, target

# Create dataset
dataset = TextDataset(all_texts, tokenizer, max_length=max_seq_length)

# Split dataset 80/20. A dedicated seeded generator keeps the split identical
# every time this cell runs, independent of how much of the global torch RNG
# has been consumed by earlier cells or re-runs.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42)
)

# Create data loaders
batch_size = 32
# Load in the main process: collate_batch and the tokenizer class are defined
# inside this notebook, and worker processes started with the "spawn" method
# (the default on Windows and macOS) cannot import notebook-defined objects.
num_workers = 0
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch,
    num_workers=num_workers
)

val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch,
    num_workers=num_workers
)

print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

## Configure and Create the Model

Let's configure and create our Transformer model.

In [None]:
# Start with the default configuration
config_manager = ConfigManager(default_config=get_default_config())

# Update configuration for our transformer model
config_manager.set('transformer.vocab_size', vocab_size)
config_manager.set('transformer.d_model', 128)  # Smaller for faster training
config_manager.set('transformer.nhead', 4)
config_manager.set('transformer.num_encoder_layers', 3)
config_manager.set('transformer.num_decoder_layers', 3)
config_manager.set('transformer.dim_feedforward', 512)
config_manager.set('transformer.max_seq_length', max_seq_length)
config_manager.set('model.dropout_rate', 0.1)
config_manager.set('training.num_epochs', 3)  # Small number for demonstration
config_manager.set('training.learning_rate', 0.0005)

# Read the configuration only AFTER applying the overrides above, so the
# values used below include them whether get_all() returns a copy or a live
# view of the manager's state.
config = config_manager.get_all()

# Create model configuration
model_config = {
    'vocab_size': config['transformer']['vocab_size'],
    'd_model': config['transformer']['d_model'],
    'nhead': config['transformer']['nhead'],
    'num_encoder_layers': config['transformer']['num_encoder_layers'],
    'num_decoder_layers': config['transformer']['num_decoder_layers'],
    'dim_feedforward': config['transformer']['dim_feedforward'],
    'dropout': config['model']['dropout_rate'],
    'max_seq_length': config['transformer']['max_seq_length']
}

# Create the model and move it to the selected device
model = TransformerModel(model_config)
model = model.to(device)

# Print model summary
print(f"Transformer Model created with {model.get_parameter_count():,} trainable parameters")

## Prepare for Training

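We need to customize the model's forward pass for training. The wrapper below takes a single `(source, target)` input, unpacks it, calls the original forward method, and flattens the output to shape `(batch_size * seq_len, vocab_size)` for the cross-entropy loss.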

In [None]:
# Drop any instance-level `forward` left behind by a previous run of this cell,
# so the lookup below resolves to the model's real forward method. Without
# this, re-running the cell would capture the wrapper itself as "original".
vars(model).pop('forward', None)

# Save the original forward method
original_forward = model.forward

# Custom forward method for the trainer
def train_forward(x):
    """Run the model on a packed ``(src, tgt)`` input and flatten the output.

    Args:
        x: A pair ``(src, tgt)`` that is unpacked and forwarded to the
            model's original forward method.
            NOTE(review): assumes the Trainer hands the model the pair as a
            single argument — confirm against the Trainer implementation.

    Returns:
        The model output reshaped from three dimensions
        ``(batch, seq_len, vocab)`` to two, ``(batch * seq_len, vocab)``.
    """
    # Unpack source and target from input
    src, tgt = x
    # Call the original forward method
    output = original_forward(src, tgt)
    # Reshape output for cross-entropy loss (unpacking requires a 3-D output)
    n_batch, n_steps, n_vocab = output.size()
    return output.reshape(n_batch * n_steps, n_vocab)

# Monkey patch the forward method for training
model.forward = train_forward

## Set Up the Trainer

Now let's set up the training configuration and create our trainer.

In [None]:
# Assemble the settings handed to the Trainer
trainer_config = dict(
    # Values taken from the shared configuration
    learning_rate=config['training']['learning_rate'],
    weight_decay=config['training']['weight_decay'],
    num_epochs=config['training']['num_epochs'],
    batch_size=batch_size,
    # Optimisation strategy
    optimizer='adamw',              # AdamW optimizer
    scheduler='cosine',             # cosine annealing scheduler
    criterion='cross_entropy',      # cross-entropy loss
    clip_grad_norm=1.0,             # gradient clipping threshold
    early_stopping_patience=3,      # epochs without improvement before stopping
    # Checkpointing
    checkpoint_dir='../checkpoints/transformer',
    save_best_only=True,            # keep only the best model
)

# Make sure the checkpoint directory is present before training starts
os.makedirs(trainer_config['checkpoint_dir'], exist_ok=True)

# Build the trainer around the model
trainer = Trainer(model, trainer_config, device)

## Train the Model

Now we're ready to train our model.

In [None]:
# Start training
print(f"Starting training for {trainer_config['num_epochs']} epochs...")
try:
    stats = trainer.train(train_loader, val_loader)
finally:
    # Restore the original forward method even when training fails or is
    # interrupted, so later cells never see the training-only wrapper.
    model.forward = original_forward

# Print best results
print(f"\nBest validation accuracy: {stats['best_val_acc']:.2f}%")
print(f"Best validation loss: {stats['best_val_loss']:.4f} (epoch {stats['best_epoch']})")

## Visualize Training Results

Let's visualize how the training and validation metrics changed during training.

In [None]:
# One row, two panels: loss on the left, accuracy on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# (train key, train label, val key, val label, y-axis label, title) per panel
panels = [
    ('train_loss', 'Training Loss', 'val_loss', 'Validation Loss',
     'Loss', 'Training and Validation Loss'),
    ('train_acc', 'Training Accuracy', 'val_acc', 'Validation Accuracy',
     'Accuracy (%)', 'Training and Validation Accuracy'),
]

for ax, (train_key, train_label, val_key, val_label, y_label, title) in zip(axes, panels):
    # Epochs are numbered from 1 on the x-axis
    for key, label in ((train_key, train_label), (val_key, val_label)):
        history = stats[key]
        ax.plot(range(1, len(history) + 1), history, label=label)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.legend()
    ax.grid(True)

plt.tight_layout()
plt.show()

## Generate Text with the Trained Model

In [None]:
# Seed sequences that will be used as prompts for text generation
seed_texts = ["1000 2000 3000", "500 600 700 800", "100 200"]

# Restore the weights stored as 'best_model.pt' in the checkpoint directory
best_model_path = os.path.join(trainer_config['checkpoint_dir'], 'best_model.pt')
model.load(best_model_path)

# Function to generate text
def generate_text(model, seed_text, tokenizer, max_length=30, temperature=1.0):
    """Generate a token-id sequence that continues ``seed_text``.

    Args:
        model: Object exposing ``generate(input_ids, max_length=..., temperature=...)``.
        seed_text: Prompt string to tokenize.
        tokenizer: Callable that returns a dict with an ``'input_ids'`` tensor
            when called with ``return_tensors='pt'``.
        max_length: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.

    Returns:
        NumPy array with the ids returned by ``model.generate``.
    """
    # Tokenize the seed WITHOUT padding: padding to max_seq_length would make
    # the prompt longer than `max_length` and end it in a run of PAD tokens.
    encoding = tokenizer(seed_text, max_length=max_seq_length, padding=False, truncation=True, return_tensors='pt')
    input_ids = encoding['input_ids']

    # Drop a trailing EOS (id 2 in this notebook's tokenizer) so the prompt
    # ends on the last seed word rather than on an end-of-sequence marker.
    # NOTE(review): assumes model.generate continues the given prefix — confirm.
    eos_id = getattr(tokenizer, 'eos_token_id', 2)
    if input_ids.size(1) > 1 and input_ids[0, -1].item() == eos_id:
        input_ids = input_ids[:, :-1]
    input_ids = input_ids.to(device)

    # Generate sequence; no gradients are needed for inference
    with torch.no_grad():
        generated_ids = model.generate(input_ids, max_length=max_length, temperature=temperature)

    # Convert token IDs back to words (in a real scenario, you would use the tokenizer's decode method)
    # Here we just return the token IDs since our tokenizer is very simple
    return generated_ids.cpu().numpy()

# Run generation once per seed and report the resulting ids
print("\nGenerating text from seeds:")
model.eval()  # switch the model to evaluation mode before generating
for seed_number, seed_text in enumerate(seed_texts, start=1):
    generated_ids = generate_text(model, seed_text, tokenizer)
    print(f"\nSeed {seed_number}: {seed_text}")
    # Only the first row of the returned array is shown
    print(f"Generated sequence: {generated_ids[0]}")

## Save the Model Configuration

Let's save the model configuration for future reference.

In [None]:
# Persist the configuration next to the other outputs
config_path = '../outputs/transformer_config.yaml'
# Create the parent directory ('../outputs') if it is missing
os.makedirs(os.path.dirname(config_path), exist_ok=True)
config_manager.save_config(config_path)
print(f"Configuration saved to {config_path}")

## Conclusion

In this notebook, we have demonstrated how to use the neural networks framework to:

1. Create a custom tokenizer and generate a mock text dataset
2. Configure and create a Transformer model for text generation
3. Train the model using our training utilities
4. Generate text sequences using the trained model
5. Save the model and configuration for future use

For a real-world application, you would replace the mock dataset with actual text data and use a proper tokenizer like those from the Hugging Face `transformers` library. Additionally, you would likely train for more epochs and use a larger model for better results.