# 🚀 Train Large Model - VS Code Version

This notebook trains ONLY the large/bestfit model.

## Setup Instructions:
1. Make sure the dataset is at `../dataset/Pride_and_Prejudice-Jane_Austen.txt` (path relative to this notebook)
2. Run all cells
3. Results will be saved to the `../models/` and `../results/` folders (relative to this notebook)

In [1]:
# Report the interpreter version for reference. No package is imported or
# version-checked here; the dependencies are assumed to be installed already.
import sys
print(f"Python: {sys.version}")
print(f"‚úÖ Using existing environment")

Python: 3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)]
‚úÖ Using existing environment


## 📁 Step 2: Set Dataset Path

Using dataset from your project folder

In [2]:
import os

# Location of the training corpus, relative to the notebook's working directory.
dataset_filename = '../dataset/Pride_and_Prejudice-Jane_Austen.txt'

# Report whether the corpus is present. A missing file is only reported here;
# nothing is raised, so a later cell that opens the file is where it would fail.
dataset_found = os.path.exists(dataset_filename)
if not dataset_found:
    print(f"‚ùå Dataset not found at: {dataset_filename}")
    print("   Please check the path!")
else:
    size_kb = os.path.getsize(dataset_filename) / 1024
    print(f"‚úÖ Found dataset: {dataset_filename}")
    print(f"   Size: {size_kb:.2f} KB")

‚úÖ Found dataset: ../dataset/Pride_and_Prejudice-Jane_Austen.txt
   Size: 694.67 KB


## 🔧 Step 3: Setup Project Structure

In [3]:
# Make sure every output folder exists (paths are relative to the notebook
# location). Folders that already exist are left untouched.
for output_dir in (
    '../models',
    '../vocab',
    '../results/plots',
    '../results/metrics',
    '../results/logs',
):
    os.makedirs(output_dir, exist_ok=True)

print("‚úÖ Directory structure created")

‚úÖ Directory structure created


## 💻 Step 4: Define Source Code

In [4]:
# Configuration
def get_config(model_type='large'):
    """Return the hyperparameter dictionary for the large ("bestfit") model.

    ``model_type`` is only stored under the ``'model_type'`` key; it does not
    change any other value. ``'data_path'`` is read from the notebook-level
    ``dataset_filename`` variable at call time.
    """
    # Where inputs are read from and where outputs are written.
    paths = {
        'data_path': dataset_filename,
        'vocab_path': '../vocab/vocab.pkl',
        'model_save_dir': '../models/',
        'results_dir': '../results/',
    }

    # Windowing, vocabulary threshold and split proportions.
    data = {
        'seq_length': 35,
        'min_freq': 2,
        'batch_size': 64,
        'train_ratio': 0.8,
        'val_ratio': 0.1,
        'num_workers': 0,  # VS Code/Windows compatibility
    }

    # Optimisation schedule, early stopping and checkpoint cadence.
    training = {
        'num_epochs': 20,
        'learning_rate': 0.001,
        'grad_clip': 5.0,
        'patience': 5,
        'save_every': 5,
    }

    # Network size: embedding/hidden widths sit between the small (128/256)
    # and medium (256/512) models; depth equals the medium model's 2 layers.
    architecture = {
        'embedding_dim': 200,
        'hidden_dim': 400,
        'num_layers': 2,
        'dropout': 0.4,
        'model_type': model_type,
    }

    # Defaults used when sampling text from the trained model.
    generation = {
        'gen_length': 50,
        'temperature': 1.0,
    }

    # Merge in the same key order as the sections above.
    return {**paths, **data, **training, **architecture, **generation}

# Build the configuration dictionary read by the later cells and echo the
# main architecture settings.
config = get_config('large')
print("‚úÖ Configuration loaded - BESTFIT MODEL")
print(f"   Model: {config['model_type'].upper()} (Balanced)")
print(f"   Embedding: {config['embedding_dim']}, Hidden: {config['hidden_dim']}, Layers: {config['num_layers']}")
# The SMALL/MEDIUM sizes below are literal text, not values read from config.
print(f"   This is between SMALL (128/256/1) and MEDIUM (256/512/2)")
print(f"   Uses SAME vocab as your trained models ‚úÖ")

‚úÖ Configuration loaded - BESTFIT MODEL
   Model: LARGE (Balanced)
   Embedding: 200, Hidden: 400, Layers: 2
   This is between SMALL (128/256/1) and MEDIUM (256/512/2)
   Uses SAME vocab as your trained models ‚úÖ


In [5]:
import torch
import torch.nn as nn
from collections import Counter
import pickle

# Vocabulary class
class Vocabulary:
    """Word-level vocabulary over whitespace-separated tokens.

    Ids 0 and 1 are reserved for ``<pad>`` and ``<unk>``; every other word
    receives an id in ``build_vocab``.
    """

    def __init__(self, min_freq=2):
        """Create a vocabulary holding only the two reserved tokens."""
        self.min_freq = min_freq
        self.word2idx = {'<pad>': 0, '<unk>': 1}
        self.idx2word = {0: '<pad>', 1: '<unk>'}
        self.word_freq = Counter()

    def build_vocab(self, texts):
        """Count the words in ``texts`` and index those seen often enough.

        Words whose accumulated count is at least ``min_freq`` get ids
        starting at 2, in the order the counter first encountered them.
        """
        for text in texts:
            self.word_freq.update(text.split())

        frequent_words = (
            word for word, count in self.word_freq.items()
            if count >= self.min_freq
        )
        for idx, word in enumerate(frequent_words, start=2):
            self.word2idx[word] = idx
            self.idx2word[idx] = word

    def __len__(self):
        """Number of entries in ``word2idx`` (reserved tokens included)."""
        return len(self.word2idx)

    def encode(self, text):
        """Split ``text`` on whitespace and map each word to its id (1 if unknown)."""
        lookup = self.word2idx.get
        return [lookup(token, 1) for token in text.split()]

    def decode(self, indices):
        """Join the words for ``indices`` with spaces (``<unk>`` for unknown ids)."""
        lookup = self.idx2word.get
        return ' '.join(lookup(token_id, '<unk>') for token_id in indices)

print("‚úÖ Vocabulary class defined")

‚úÖ Vocabulary class defined


In [6]:
from torch.utils.data import Dataset, DataLoader

# Dataset class
class TextDataset(Dataset):
    """Sliding-window next-token dataset over an encoded text.

    Item ``i`` is a pair of ``seq_length``-long id tensors: the window that
    starts at position ``i`` and the same window shifted right by one token.
    """

    def __init__(self, text, vocab, seq_length):
        """Encode ``text`` once with ``vocab.encode`` and keep the id list."""
        self.vocab = vocab
        self.seq_length = seq_length
        self.encoded_text = vocab.encode(text)

    def __len__(self):
        """Number of full windows that still have a next token (never negative)."""
        window_count = len(self.encoded_text) - self.seq_length
        return window_count if window_count > 0 else 0

    def __getitem__(self, idx):
        """Return ``(inputs, targets)`` as ``torch.long`` tensors for window ``idx``."""
        stop = idx + self.seq_length
        inputs = torch.tensor(self.encoded_text[idx:stop], dtype=torch.long)
        targets = torch.tensor(self.encoded_text[idx + 1:stop + 1], dtype=torch.long)
        return inputs, targets

print("‚úÖ Dataset class defined")

‚úÖ Dataset class defined


In [7]:
# LSTM Model
class LSTMLanguageModel(nn.Module):
    """Embedding -> LSTM -> linear projection language model.

    The same dropout module is applied to the embeddings and to the LSTM
    outputs; the projection produces one logit per vocabulary entry.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout=0.5):
        """Build the layers; sizes are taken directly from the arguments."""
        super().__init__()
        # The LSTM's own dropout argument is only passed when there is more
        # than one layer; a single-layer LSTM gets 0.
        lstm_dropout = dropout if num_layers > 1 else 0

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=lstm_dropout,
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

    def forward(self, x, hidden=None):
        """Return ``(logits, hidden)`` for the id tensor ``x``.

        ``hidden`` is passed straight to the LSTM and the LSTM's updated
        state is returned alongside the logits.
        """
        embedded = self.dropout(self.embedding(x))
        lstm_out, hidden = self.lstm(embedded, hidden)
        return self.fc(self.dropout(lstm_out)), hidden

    def init_hidden(self, batch_size, device):
        """Return an all-zero ``(h0, c0)`` pair placed on ``device``."""
        state_shape = (self.num_layers, batch_size, self.hidden_dim)
        h0 = torch.zeros(state_shape).to(device)
        c0 = torch.zeros(state_shape).to(device)
        return (h0, c0)

print("‚úÖ LSTM Model class defined")

‚úÖ LSTM Model class defined


## 📚 Step 5: Prepare Dataset & Vocabulary

In [8]:
# Read the whole corpus into memory as a single string.
with open(dataset_filename, 'r', encoding='utf-8') as f:
    text = f.read()

# Reuse the vocabulary pickle when it exists so token ids stay identical
# between runs; otherwise build it from this corpus and write it out.
# NOTE: pickle.load can execute arbitrary code from the file, so only load a
# vocab.pkl that you created yourself.
vocab_path = config['vocab_path']
if not os.path.exists(vocab_path):
    vocab = Vocabulary(min_freq=config['min_freq'])
    vocab.build_vocab([text])
    with open(vocab_path, 'wb') as f:
        pickle.dump(vocab, f)
    print(f"‚úÖ Built NEW vocabulary")
else:
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    print(f"‚úÖ Loaded EXISTING vocabulary (same as small/medium models)")
print(f"   Vocab size: {len(vocab)}")

print(f"   Total words in dataset: {len(text.split())}")

# Contiguous train / validation / test split. The boundaries are character
# offsets into `text` (len(text) counts characters), so the word at a
# boundary can be cut in two.
corpus_chars = len(text)
train_size = int(config['train_ratio'] * corpus_chars)
val_size = int(config['val_ratio'] * corpus_chars)
val_end = train_size + val_size

train_text = text[:train_size]
val_text = text[train_size:val_end]
test_text = text[val_end:]

# One sliding-window dataset per split, all sharing the vocabulary.
train_dataset, val_dataset, test_dataset = (
    TextDataset(split_text, vocab, config['seq_length'])
    for split_text in (train_text, val_text, test_text)
)

# Only the training loader shuffles; batch size and worker count are shared.
loader_options = {
    'batch_size': config['batch_size'],
    'num_workers': config['num_workers'],
}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

print(f"‚úÖ Datasets created (using SAME split as small/medium)")
for split_label, split_dataset in (
    ('Train', train_dataset),
    ('Val', val_dataset),
    ('Test', test_dataset),
):
    print(f"   {split_label}: {len(split_dataset)} sequences")

‚úÖ Loaded EXISTING vocabulary (same as small/medium models)
   Vocab size: 6250
   Total words in dataset: 124970
‚úÖ Datasets created (using SAME split as small/medium)
   Train: 100000 sequences
   Val: 12655 sequences
   Test: 12211 sequences


## 🎯 Step 6: Initialize Model & Training

In [9]:
import math
from tqdm import tqdm

# Seed PyTorch's random number generator before anything random happens
# (weight initialisation in this cell, then shuffling and dropout during
# training) so that re-running the notebook starts from the same random state.
# An optional 'seed' entry in config overrides the default of 42. GPU kernels
# may still introduce small run-to-run differences.
torch.manual_seed(config.get('seed', 42))

# Check GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"üñ•Ô∏è  Using device: {device}")
if torch.cuda.is_available():
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# Create model: the output layer size follows the vocabulary, everything else
# comes from config.
model = LSTMLanguageModel(
    vocab_size=len(vocab),
    embedding_dim=config['embedding_dim'],
    hidden_dim=config['hidden_dim'],
    num_layers=config['num_layers'],
    dropout=config['dropout']
).to(device)

# Token-level cross-entropy loss and an Adam optimiser over all parameters.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'])

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"\n‚úÖ Model initialized")
print(f"   Total parameters: {total_params:,}")
print(f"   Trainable parameters: {trainable_params:,}")

üñ•Ô∏è  Using device: cuda
   GPU: NVIDIA GeForce RTX 3050 Laptop GPU
   Memory: 4.29 GB

‚úÖ Model initialized
   Total parameters: 6,002,650
   Trainable parameters: 6,002,650

‚úÖ Model initialized
   Total parameters: 6,002,650
   Trainable parameters: 6,002,650


## 🚀 Step 7: Train Large Model

In [10]:
import time

# Training function
def train_epoch(model, train_loader, criterion, optimizer, device, grad_clip):
    """Run one optimisation pass over ``train_loader``.

    Each batch is moved to ``device``, the gradient norm is clipped to
    ``grad_clip`` before the optimiser step, and the current batch loss is
    shown on the progress bar.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0

    with tqdm(train_loader, desc='Training', leave=False) as pbar:
        for batch_inputs, batch_targets in pbar:
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)

            optimizer.zero_grad()
            logits, _ = model(batch_inputs)

            # Collapse every leading dimension so the criterion receives one
            # row of logits per target id.
            flat_logits = logits.view(-1, logits.size(-1))
            flat_targets = batch_targets.view(-1)
            loss = criterion(flat_logits, flat_targets)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()

            batch_loss = loss.item()
            running_loss += batch_loss
            pbar.set_postfix({'loss': f'{batch_loss:.4f}'})

    return running_loss / len(train_loader)

# Validation function
def validate(model, val_loader, criterion, device):
    """Evaluate ``model`` on ``val_loader`` without computing gradients.

    Returns:
        ``(avg_loss, perplexity)`` where ``avg_loss`` is the sum of the
        per-batch losses divided by the number of batches and ``perplexity``
        is ``math.exp(avg_loss)``.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for batch_inputs, batch_targets in val_loader:
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)
            logits, _ = model(batch_inputs)
            # Collapse every leading dimension so the criterion receives one
            # row of logits per target id.
            loss = criterion(logits.view(-1, logits.size(-1)), batch_targets.view(-1))
            batch_losses.append(loss.item())

    avg_loss = sum(batch_losses) / len(val_loader)
    return avg_loss, math.exp(avg_loss)

print("‚úÖ Training functions defined")

‚úÖ Training functions defined


In [None]:
# Training loop: train and validate once per epoch, write periodic and
# best-so-far checkpoints, and stop early when validation loss stalls.
print("\n" + "="*70)
print("üöÄ TRAINING LARGE MODEL")
print("="*70)

# Early-stopping state and per-epoch history (one entry appended per epoch).
best_val_loss = float('inf')
patience_counter = 0
train_losses = []
val_losses = []
val_perplexities = []

start_time = time.time()

# Epochs are numbered from 1 so log lines and checkpoint names match.
for epoch in range(1, config['num_epochs'] + 1):
    epoch_start = time.time()
    
    # Train
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device, config['grad_clip'])
    train_losses.append(train_loss)
    
    # Validate
    val_loss, val_perplexity = validate(model, val_loader, criterion, device)
    val_losses.append(val_loss)
    val_perplexities.append(val_perplexity)
    
    epoch_time = time.time() - epoch_start
    
    print(f"Epoch {epoch:2d}/{config['num_epochs']} | "
          f"Train Loss: {train_loss:.4f} | "
          f"Val Loss: {val_loss:.4f} | "
          f"Val PPL: {val_perplexity:.2f} | "
          f"Time: {epoch_time:.1f}s")
    
    # Save checkpoint every config['save_every'] epochs, whether or not the
    # validation loss improved.
    if epoch % config['save_every'] == 0:
        checkpoint_path = os.path.join(config['model_save_dir'], f"large_epoch_{epoch}.pt")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': train_loss,
            'val_loss': val_loss,
            'val_perplexity': val_perplexity,
        }, checkpoint_path)
        print(f"   üíæ Checkpoint saved: large_epoch_{epoch}.pt")
    
    # Save best model: a strictly lower validation loss resets the patience
    # counter and overwrites large_model_best.pt.
    # NOTE: best_model_path is only assigned inside this branch; the test-set
    # evaluation cell reads it, so at least one improving epoch must complete.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        
        best_model_path = os.path.join(config['model_save_dir'], "large_model_best.pt")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_loss': val_loss,
            'val_perplexity': val_perplexity,
            'train_losses': train_losses,
            'val_losses': val_losses,
            'val_perplexities': val_perplexities,
            'config': config,
        }, best_model_path)
        print(f"   ‚≠ê Best model saved! (Val Loss: {val_loss:.4f})")
    else:
        # No improvement: count the epoch and stop once config['patience']
        # consecutive non-improving epochs have been seen.
        patience_counter += 1
        if patience_counter >= config['patience']:
            print(f"\n‚ö†Ô∏è  Early stopping triggered (patience={config['patience']})")
            break

# total_time is only set once the loop has finished or stopped early; the
# metrics and summary cells read it.
total_time = time.time() - start_time
print(f"\n‚úÖ Training complete! Total time: {total_time/60:.1f} minutes")
print(f"   Best validation loss: {best_val_loss:.4f}")
print(f"   Best validation perplexity: {math.exp(best_val_loss):.2f}")


üöÄ TRAINING LARGE MODEL


                                                                          

Epoch  1/20 | Train Loss: 5.4043 | Val Loss: 5.3586 | Val PPL: 212.42 | Time: 56.7s
   ‚≠ê Best model saved! (Val Loss: 5.3586)


                                                                          

Epoch  2/20 | Train Loss: 4.2693 | Val Loss: 5.5080 | Val PPL: 246.65 | Time: 56.6s


                                                                          

Epoch  3/20 | Train Loss: 3.7583 | Val Loss: 5.7679 | Val PPL: 319.86 | Time: 56.7s


Training:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 1095/1563 [00:38<00:16, 28.74it/s, loss=3.3162]

## 📊 Step 8: Plot Training Curves

In [None]:
import matplotlib.pyplot as plt

# Use 1-based epoch numbers on the x-axis so the curves line up with the
# "Epoch N" lines printed during training.
epochs = range(1, len(train_losses) + 1)

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Loss plot
axes[0].plot(epochs, train_losses, label='Train Loss', marker='o')
axes[0].plot(epochs, val_losses, label='Val Loss', marker='s')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss')
axes[0].set_title('Training and Validation Loss - Large Model')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Perplexity plot
axes[1].plot(epochs, val_perplexities, label='Val Perplexity', marker='s', color='orange')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Perplexity')
axes[1].set_title('Validation Perplexity - Large Model')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()

# Save under config['results_dir'] ('../results/'), the tree that the setup
# cell creates and the file-listing cell checks. The previous hard-coded
# 'results/plots/...' pointed at a different folder next to the notebook.
plot_dir = os.path.join(config['results_dir'], 'plots')
os.makedirs(plot_dir, exist_ok=True)
plot_path = os.path.join(plot_dir, 'large_model_training.png')
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"‚úÖ Plot saved: {plot_path}")

## 🧪 Step 9: Evaluate on Test Set

In [None]:
# Restore the weights stored in the best-validation checkpoint that the
# training loop wrote to best_model_path.
checkpoint = torch.load(best_model_path, map_location=device)
best_weights = checkpoint['model_state_dict']
model.load_state_dict(best_weights)

# Score the held-out test split with the same routine used for validation.
test_loss, test_perplexity = validate(model, test_loader, criterion, device)

rule = "=" * 70
print("\n" + rule)
print("üìä TEST SET EVALUATION")
print(rule)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Perplexity: {test_perplexity:.2f}")
print(rule)

## 📝 Step 10: Generate Text Samples

In [None]:
def generate_text(model, vocab, seed_text, length=50, temperature=1.0, device='cpu'):
    """Generate text by repeatedly sampling the next word from ``model``.

    The seed is split on whitespace and fed to the model in one step; after
    that only the most recently sampled token is fed back, together with the
    recurrent state returned by the previous step.

    Args:
        model: Callable as ``model(ids, hidden)`` returning ``(logits, hidden)``.
            It is switched to eval mode and left in that mode.
        vocab: Object exposing ``word2idx`` and ``idx2word`` dictionaries.
        seed_text: Starting words; words missing from ``word2idx`` map to id 1.
        length: Number of words to append to the seed.
        temperature: Divides the logits before the softmax. ``0`` selects the
            highest-scoring word at every step instead of sampling.
        device: Device on which the input tensors are created.

    Returns:
        The seed words followed by the generated words, joined by spaces.

    Raises:
        ValueError: If ``seed_text`` contains no words or ``temperature`` is
            negative.
    """
    tokens = seed_text.split()
    # An empty seed would produce a zero-length input that the model cannot
    # process; fail here with a clear message instead.
    if not tokens:
        raise ValueError("seed_text must contain at least one word")
    if temperature < 0:
        raise ValueError("temperature must be non-negative")

    model.eval()

    seed_ids = [vocab.word2idx.get(w, 1) for w in tokens]
    # Explicit integer dtype: token ids are used as embedding indices.
    input_seq = torch.tensor([seed_ids], dtype=torch.long, device=device)

    generated = list(tokens)
    hidden = None

    with torch.no_grad():
        for _ in range(length):
            output, hidden = model(input_seq, hidden)
            # Scores for the word that follows the last input position.
            logits = output[0, -1]

            if temperature == 0:
                # Dividing by zero would yield inf/nan logits; pick the
                # arg-max directly instead.
                next_token = int(torch.argmax(logits).item())
            else:
                probs = torch.softmax(logits / temperature, dim=0)
                next_token = torch.multinomial(probs, 1).item()

            generated.append(vocab.idx2word.get(next_token, '<unk>'))

            # Feed only the new token back; the context lives in `hidden`.
            input_seq = torch.tensor([[next_token]], dtype=torch.long, device=device)

    return ' '.join(generated)

# Produce one sample per seed phrase using the generation settings in config.
rule = "=" * 70
print("\n" + rule)
print("üìù GENERATED TEXT SAMPLES")
print(rule)

seed_texts = [
    "It is a truth universally acknowledged that",
    "Mr. Darcy was",
    "Elizabeth felt"
]

generated_samples = []
for sample_no, seed in enumerate(seed_texts, start=1):
    sample = generate_text(
        model,
        vocab,
        seed,
        length=config['gen_length'],
        temperature=config['temperature'],
        device=device,
    )
    generated_samples.append(sample)
    print(f"\nSample {sample_no}:")
    print(f"Seed: '{seed}'")
    print(f"Generated: {sample}")
    print("-" * 70)

## 💾 Step 11: Save Training Metrics

In [None]:
import json

# Save metrics: collect the training history, best-checkpoint figures and
# test results in one JSON-serialisable dictionary.
metrics = {
    'model_type': 'large',
    'total_epochs': len(train_losses),
    'best_epoch': checkpoint['epoch'],  # epoch stored in the loaded best checkpoint
    'best_val_loss': best_val_loss,
    'best_val_perplexity': math.exp(best_val_loss),
    'test_loss': test_loss,
    'test_perplexity': test_perplexity,
    'train_losses': train_losses,
    'val_losses': val_losses,
    'val_perplexities': val_perplexities,
    'training_time_minutes': total_time / 60,
    'total_parameters': total_params,
    'config': config,
}

# Write under config['results_dir'] ('../results/'), the tree that the setup
# cell creates and the file-listing cell checks. The previous hard-coded
# 'results/metrics/...' pointed at a different folder next to the notebook.
metrics_dir = os.path.join(config['results_dir'], 'metrics')
os.makedirs(metrics_dir, exist_ok=True)
metrics_path = os.path.join(metrics_dir, 'large_model_metrics.json')
with open(metrics_path, 'w', encoding='utf-8') as f:
    json.dump(metrics, f, indent=2)

print(f"‚úÖ Metrics saved: {metrics_path}")

## 📥 Step 12: Results Saved

All results are saved to your project folders

In [None]:
# Show which artefacts exist on disk after the run.
print("üì¶ Saved files:\n")

# Every file in ../models/ whose name starts with 'large_', in directory
# listing order.
print("Checkpoints:")
checkpoint_files = []
for entry in os.listdir('../models/'):
    if entry.startswith('large_'):
        checkpoint_files.append(entry)
for position, ckpt in enumerate(checkpoint_files, start=1):
    print(f"  {position}. models/{ckpt}")

# The vocabulary line is printed unconditionally (no existence check).
print("\nVocabulary:")
print(f"  ‚Ä¢ vocab/vocab.pkl")

# The plot and metrics lines only appear when the file is actually present.
print("\nPlots:")
plot_saved = os.path.exists('../results/plots/large_model_training.png')
if plot_saved:
    print(f"  ‚Ä¢ results/plots/large_model_training.png")

print("\nMetrics:")
metrics_saved = os.path.exists('../results/metrics/large_model_metrics.json')
if metrics_saved:
    print(f"  ‚Ä¢ results/metrics/large_model_metrics.json")

print("\n‚úÖ All files saved in your project folders!")
print(f"   Total checkpoints: {len(checkpoint_files)} files")

## 📊 Step 13: Final Summary

In [None]:
# Final report. Every value is read from variables set by earlier cells
# (config, total_params, total_time, train_losses, checkpoint, best_val_loss,
# test_loss, test_perplexity, checkpoint_files), so those cells must have run.
print("\n" + "="*70)
print("üéâ TRAINING COMPLETE - SUMMARY")
print("="*70)
print(f"\nüìà Model Performance:")
print(f"   ‚Ä¢ Model Type: LARGE (bestfit)")
print(f"   ‚Ä¢ Architecture: {config['embedding_dim']}‚Üí{config['hidden_dim']}√ó{config['num_layers']} layers")
print(f"   ‚Ä¢ Total Parameters: {total_params:,}")
print(f"   ‚Ä¢ Training Time: {total_time/60:.1f} minutes")
# Epochs actually run versus the configured maximum (they differ after early stopping).
print(f"   ‚Ä¢ Epochs Trained: {len(train_losses)}/{config['num_epochs']}")
print(f"\nüéØ Best Results:")
# checkpoint is the best-model checkpoint loaded in the test-evaluation cell.
print(f"   ‚Ä¢ Best Epoch: {checkpoint['epoch']}")
print(f"   ‚Ä¢ Best Val Loss: {best_val_loss:.4f}")
print(f"   ‚Ä¢ Best Val Perplexity: {math.exp(best_val_loss):.2f}")
print(f"\nüß™ Test Performance:")
print(f"   ‚Ä¢ Test Loss: {test_loss:.4f}")
print(f"   ‚Ä¢ Test Perplexity: {test_perplexity:.2f}")
# The file names below are literal text; nothing here checks that they exist.
print(f"\nüíæ Saved Files:")
print(f"   ‚Ä¢ Model: large_model_best.pt")
print(f"   ‚Ä¢ Vocabulary: vocab.pkl")
print(f"   ‚Ä¢ Plot: large_model_training.png")
print(f"   ‚Ä¢ Metrics: large_model_metrics.json")
print(f"   ‚Ä¢ Checkpoints: {len(checkpoint_files)} files")
print("\n" + "="*70)
# NOTE(review): this "Upload these files" message conflicts with the earlier
# "All files saved in your project folders!" message -- confirm whether it is
# still wanted in this local version of the notebook.
print("‚úÖ Upload these files to your local project's models/ and vocab/ folders")
print("="*70)