In [1]:
!pip install evaluate gensim

Defaulting to user installation because normal site-packages is not writeable
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting gensim
  Downloading gensim-4.4.0.tar.gz (23.3 MB)
     ---------------------------------------- 0.0/23.3 MB ? eta -:--:--
     - -------------------------------------- 1.0/23.3 MB 32.1 MB/s eta 0:00:01
     - -------------------------------------- 1.0/23.3 MB 32.1 MB/s eta 0:00:01
     ----- ---------------------------------- 3.1/23.3 MB 6.8 MB/s eta 0:00:03
     ------- -------------------------------- 4.2/23.3 MB 6.3 MB/s eta 0:00:04
     ------- -------------------------------- 4.2/23.3 MB 6.3 MB/s eta 0:00:04
     ------- -------------------------------- 4.2/23.3 MB 6.3 MB/s eta 0:00:04
     ------- -------------------------------- 4.2/23.3 MB 6.3 MB/s eta 0:00:04
     ------- -------------------------------- 4.2/23.3 MB 6.3 MB/s eta 0:00:04
     --------- ------------------------------ 5.2/23.3 MB 2.8 MB/s eta

  error: subprocess-exited-with-error
  
  × Building wheel for gensim (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [807 lines of output]
      running bdist_wheel
      running build
      running build_py
      creating build\lib.win-amd64-cpython-314\gensim
      copying gensim\downloader.py -> build\lib.win-amd64-cpython-314\gensim
      copying gensim\interfaces.py -> build\lib.win-amd64-cpython-314\gensim
      copying gensim\matutils.py -> build\lib.win-amd64-cpython-314\gensim
      copying gensim\nosy.py -> build\lib.win-amd64-cpython-314\gensim
      copying gensim\utils.py -> build\lib.win-amd64-cpython-314\gensim
      copying gensim\__init__.py -> build\lib.win-amd64-cpython-314\gensim
      creating build\lib.win-amd64-cpython-314\gensim\corpora
      copying gensim\corpora\bleicorpus.py -> build\lib.win-amd64-cpython-314\gensim\corpora
      copying gensim\corpora\csvcorpus.py -> build\lib.win-amd64-cpython-314\gensim\corpora
      copying gensim\cor

In [2]:
import torch , math
from torch.utils.data import Dataset
from tqdm import tqdm
import os , re , json
import pickle
import hashlib
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import load_dataset
from gensim.models import KeyedVectors
import gensim.downloader as api
import time
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict , Counter


In [3]:
from model.DecoderTransformer import DecoderTransformer
from Dataset.Vocabulary import Vocabulary
from Dataset.TinyStories import TinyStoriesDataset
from Dataset.load_fasttext_model import load_fasttext_model

In [12]:
# Single source of truth for the baseline run. Later cells read these keys
# by name, so key spelling and insertion order are kept exactly as before.
CONFIG = dict(
    # --- run identification ---
    name='baseline',
    description='Standard baseline configuration from assignment',
    # --- model shape ---
    context_length=64,      # handed to TinyStoriesDataset and to the model as max_seq_len
    num_layers=3,
    num_heads=8,            # 296 / 8 = 37 dimensions per head
    d_model=296,
    d_ff=1184,              # 4 * d_model
    dropout=0.1,
    # --- optimisation ---
    batch_size=32,
    learning_rate=3e-4,
    num_epochs=4,
    # --- data subset sizes ---
    max_train_samples=15000,
    max_val_samples=5000,
    # --- output locations ---
    save_dir='checkpoints/baseline',
    plot_dir='plots/baseline',
)

In [None]:
# Runtime setup: pick a device, load embeddings and data, build (or reload) the
# vocabulary, wrap the text in datasets/loaders, and create the model,
# optimizer and loss used by the experiment cells below.

def print_section(title):
    """Print ``title`` between two 50-character ``=`` rules, after a blank line."""
    rule = "=" * 50
    print("\n" + rule)
    print(title)
    print(rule)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
print("\nConfiguration:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

# Load FastText
print_section("Loading FastText embeddings...")
fasttext_model = load_fasttext_model()

# Load Dataset
print_section("Loading TinyStories dataset...")
dataset = load_dataset("roneneldan/TinyStories")

# Materialise the text subsets once with a single slice per split. The same
# lists feed both the vocabulary builder and the datasets, so the training
# split is no longer walked twice one row at a time.
num_train = min(CONFIG['max_train_samples'], len(dataset['train']))
num_val = min(CONFIG['max_val_samples'], len(dataset['validation']))
train_texts = dataset['train'][:num_train]['text']
val_texts = dataset['validation'][:num_val]['text']

print_section("Building vocabulary...")
# Make sure the checkpoint directory exists before anything is written to it.
# Whether Vocabulary.save creates missing directories is not visible from here.
os.makedirs(CONFIG['save_dir'], exist_ok=True)
vocab_path = f"{CONFIG['save_dir']}/vocab.json"

# NOTE(review): the cached vocabulary is keyed only by save_dir; if
# max_train_samples changes, delete vocab.json to force a rebuild.
if os.path.exists(vocab_path):
    print("Loading existing vocabulary...")
    vocab = Vocabulary.load(vocab_path, fasttext_model)
else:
    vocab = Vocabulary(fasttext_model)
    # Build vocabulary from training data
    for text in tqdm(train_texts, desc="Building vocabulary"):
        for word in vocab.tokenize(text):
            vocab.add_word(word)
    vocab.save(vocab_path)

print(f"Vocabulary size: {len(vocab)}")

# Create Datasets
print_section("Creating datasets...")

train_dataset = TinyStoriesDataset(
    train_texts,
    vocab,
    CONFIG['context_length'],
    CONFIG['max_train_samples']
)

val_dataset = TinyStoriesDataset(
    val_texts,
    vocab,
    CONFIG['context_length'],
    CONFIG['max_val_samples']
)

# Create DataLoaders (only the training loader shuffles)
train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'],
                          shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=CONFIG['batch_size'],
                        shuffle=False, num_workers=0)

# Initialize Model
print_section("Initializing model...")
embedding_matrix = vocab.create_embedding_matrix()

model = DecoderTransformer(
    vocab_size=len(vocab),
    d_model=CONFIG['d_model'],
    num_layers=CONFIG['num_layers'],
    num_heads=CONFIG['num_heads'],
    d_ff=CONFIG['d_ff'],
    max_seq_len=CONFIG['context_length'],
    dropout=CONFIG['dropout'],
    pretrained_embeddings=embedding_matrix
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=CONFIG['learning_rate'])
# Padding positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=vocab.word2idx[vocab.PAD_TOKEN])

Using device: cuda

Configuration:
  name: baseline
  description: Standard baseline configuration from assignment
  context_length: 64
  num_layers: 3
  num_heads: 8
  d_model: 296
  d_ff: 1184
  dropout: 0.1
  batch_size: 32
  learning_rate: 0.0003
  num_epochs: 4
  max_train_samples: 15000
  max_val_samples: 5000
  save_dir: checkpoints/baseline
  plot_dir: plots/baseline

Loading FastText embeddings...
Loading FastText model from cache...
Model loaded successfully!

Loading TinyStories dataset...

Building vocabulary...
Loading existing vocabulary...
Vocabulary size: 10598

Creating datasets...
Preparing dataset...


100%|██████████| 15000/15000 [00:08<00:00, 1816.32it/s]


Created 3083375 sequences
Preparing dataset...


100%|██████████| 5000/5000 [00:02<00:00, 2105.63it/s]


Created 925828 sequences

Initializing model...
Found 9972/10598 words in FastText


In [None]:
def train_with_gradient_accumulation(
    model, dataloader, optimizer, criterion, device, accumulation_steps=1, epoch=1
):
    """Train ``model`` for one pass over ``dataloader`` with gradient accumulation.

    Each batch is split into inputs ``batch[:, :-1]`` and next-token targets
    ``batch[:, 1:]``. Gradients are accumulated over groups of
    ``accumulation_steps`` consecutive batches; after each group the gradient
    norm is clipped to 1.0 and the optimizer takes one step.

    Every batch's loss is divided by the size of the group it belongs to, so
    the trailing group (which can be shorter than ``accumulation_steps``)
    still contributes a correctly averaged gradient instead of a shrunken one.

    Args:
        model: Module called as ``model(inputs)``; its output is reshaped to
            ``(-1, logits.size(-1))`` before being passed to ``criterion``.
        dataloader: Sized iterable of batch tensors.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Callable ``criterion(flat_logits, flat_targets)`` returning
            a scalar loss tensor.
        device: Device each batch is moved to.
        accumulation_steps: Number of batches per optimizer step (>= 1).
        epoch: Epoch number, used only in the progress-bar label.

    Returns:
        ``(avg_loss, perplexity)`` where ``avg_loss`` is the mean of the
        per-batch losses and ``perplexity`` is ``exp(avg_loss)``.

    Raises:
        ValueError: If ``accumulation_steps`` is below 1 or the dataloader
            has no batches.
    """
    if accumulation_steps < 1:
        raise ValueError(f"accumulation_steps must be >= 1, got {accumulation_steps}")
    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError("dataloader is empty; nothing to train on")

    model.train()
    total_loss = 0.0
    optimizer.zero_grad()
    has_pending_grads = False

    def apply_update():
        # One optimizer update: clip, step, then clear for the next group.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        optimizer.zero_grad()

    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch} (accum={accumulation_steps})")

    for batch_idx, batch in enumerate(progress_bar):
        batch = batch.to(device)
        inputs = batch[:, :-1]
        targets = batch[:, 1:]

        # Forward pass
        logits = model(inputs)
        loss = criterion(logits.reshape(-1, logits.size(-1)), targets.reshape(-1))
        batch_loss = loss.item()

        # Normalise by the real size of this batch's accumulation group; only
        # the final group can be shorter than accumulation_steps.
        group_start = batch_idx - batch_idx % accumulation_steps
        group_size = max(1, min(accumulation_steps, num_batches - group_start))
        (loss / group_size).backward()
        has_pending_grads = True

        # Update weights once a full group has been accumulated
        if (batch_idx + 1) % accumulation_steps == 0:
            apply_update()
            has_pending_grads = False

        total_loss += batch_loss
        progress_bar.set_postfix({"loss": f"{batch_loss:.4f}"})

    # Flush the gradients of a trailing, incomplete group
    if has_pending_grads:
        apply_update()

    avg_loss = total_loss / num_batches
    perplexity = np.exp(avg_loss)
    return avg_loss, perplexity


def evaluate_model(model, dataloader, criterion, device):
    """Compute the per-target average loss and perplexity of ``model``.

    The model is switched to eval mode and run without gradient tracking. Each
    batch is split into inputs ``batch[:, :-1]`` and targets ``batch[:, 1:]``.

    Batch losses are weighted by the number of targets that actually count
    towards the loss (targets different from ``criterion.ignore_index`` when
    the criterion has that attribute, otherwise every target). A short final
    batch or a heavily padded batch therefore no longer carries the same
    weight as a full one, and the perplexity is a true per-target value.
    Batches in which every target is ignored are skipped.

    Args:
        model: Module called as ``model(inputs)``.
        dataloader: Iterable of batch tensors.
        criterion: Loss callable. It is assumed to return the mean over scored
            targets unless its ``reduction`` attribute is ``"sum"``, in which
            case its value is taken as an already-summed loss.
        device: Device each batch is moved to.

    Returns:
        ``(avg_loss, perplexity)`` with ``perplexity = exp(avg_loss)``.

    Raises:
        ValueError: If the dataloader yields no scored targets at all.
    """
    model.eval()
    loss_sum = 0.0
    scored_targets = 0
    ignore_index = getattr(criterion, "ignore_index", None)
    loss_is_summed = getattr(criterion, "reduction", "mean") == "sum"

    with torch.no_grad():
        for batch in dataloader:
            batch = batch.to(device)
            inputs = batch[:, :-1]
            targets = batch[:, 1:].reshape(-1)

            if ignore_index is None:
                batch_targets = targets.numel()
            else:
                batch_targets = int((targets != ignore_index).sum().item())
            if batch_targets == 0:
                # A mean over zero targets is NaN and would poison the total.
                continue

            logits = model(inputs)
            loss = criterion(logits.reshape(-1, logits.size(-1)), targets)
            batch_loss = loss.item()

            loss_sum += batch_loss if loss_is_summed else batch_loss * batch_targets
            scored_targets += batch_targets

    if scored_targets == 0:
        raise ValueError("dataloader produced no scored targets; cannot evaluate")

    avg_loss = loss_sum / scored_targets
    perplexity = np.exp(avg_loss)
    return avg_loss, perplexity


def experiment_gradient_accumulation(
    model, train_loader, val_loader, optimizer, criterion, device,
    num_epochs=3, save_dir="results"
):
    """Train with accumulation steps 1, 2, 4 and 8 from the same starting point.

    For every setting the model is trained for ``num_epochs`` epochs with
    ``train_with_gradient_accumulation`` and evaluated after each epoch with
    ``evaluate_model``. Metrics are written to
    ``{save_dir}/result_accum_{steps}.json``.

    The starting model and optimizer states are deep-copied once before the
    sweep and restored after every setting. ``state_dict()`` returns references
    to the live tensors, so without the copy the "restore" would load the
    trained weights back onto themselves and each setting would continue from
    the previous one.

    Args:
        model: Model to train; left at its starting weights when the sweep ends.
        train_loader: Training loader; its ``batch_size`` attribute is used to
            report the effective batch size.
        val_loader: Validation loader.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss callable forwarded to training and evaluation.
        device: Device forwarded to training and evaluation.
        num_epochs: Epochs per setting (>= 1).
        save_dir: Directory for the JSON result files; created if missing.

    Raises:
        ValueError: If ``num_epochs`` is below 1.
    """
    import copy  # local import: only this function needs it

    if num_epochs < 1:
        raise ValueError(f"num_epochs must be >= 1, got {num_epochs}")

    print("\nGradient Accumulation Experiment...")

    os.makedirs(save_dir, exist_ok=True)

    accumulation_configs = [1, 2, 4, 8]

    # Independent snapshots of the starting point shared by every setting.
    initial_model_state = copy.deepcopy(model.state_dict())
    initial_opt_state = copy.deepcopy(optimizer.state_dict())

    for accum_steps in accumulation_configs:
        effective_batch_size = train_loader.batch_size * accum_steps

        print(f"\n{'='*50}")
        print(f"Accumulation Steps: {accum_steps}")
        print(f"Effective Batch Size: {effective_batch_size}")
        print(f"{'='*50}")

        epoch_times = []
        train_losses = []
        train_perplexities = []
        val_losses = []
        val_perplexities = []

        for epoch in range(1, num_epochs + 1):
            start_time = time.time()

            train_loss, train_ppl = train_with_gradient_accumulation(
                model,
                train_loader,
                optimizer,
                criterion,
                device,
                accumulation_steps=accum_steps,
                epoch=epoch,
            )

            # Evaluate on validation set
            val_loss, val_ppl = evaluate_model(model, val_loader, criterion, device)

            # The timed span covers training and the validation pass.
            epoch_time = time.time() - start_time
            epoch_times.append(epoch_time)
            # float() keeps the values JSON-serialisable whatever numeric
            # type the helpers return.
            train_losses.append(float(train_loss))
            train_perplexities.append(float(train_ppl))
            val_losses.append(float(val_loss))
            val_perplexities.append(float(val_ppl))

            print(f"Epoch {epoch}:")
            print(f"  Train - Loss: {train_loss:.4f}, Perplexity: {train_ppl:.2f}")
            print(f"  Val   - Loss: {val_loss:.4f}, Perplexity: {val_ppl:.2f}")
            print(f"  Time: {epoch_time:.2f}s")

        # Save results for this accumulation config. accumulation_steps,
        # effective_batch_size and epoch_times are additions to the file;
        # all earlier keys are unchanged.
        result = {
            "accumulation_steps": accum_steps,
            "effective_batch_size": effective_batch_size,
            "num_epochs": num_epochs,
            "epoch_times": epoch_times,
            "train_losses": train_losses,
            "train_perplexities": train_perplexities,
            "val_losses": val_losses,
            "val_perplexities": val_perplexities,
            "best_train_loss": min(train_losses),
            "best_train_ppl": min(train_perplexities),
            "best_val_loss": min(val_losses),
            "best_val_ppl": min(val_perplexities)
        }

        save_path = f"{save_dir}/result_accum_{accum_steps}.json"
        with open(save_path, 'w') as f:
            json.dump(result, f, indent=2)

        print(f"✓ Saved: {save_path}")

        # Restore the starting point for a fair comparison. The optimizer gets
        # a fresh copy each time so its state can never alias the snapshot.
        model.load_state_dict(initial_model_state)
        optimizer.load_state_dict(copy.deepcopy(initial_opt_state))

    print(f"\n{'='*50}")
    print(f"All results saved to: {save_dir}/")
    print(f"{'='*50}\n")


# Launch the accumulation sweep on the objects built in the setup cell:
# one epoch per setting, JSON results written under "results".
experiment_gradient_accumulation(
    model, train_loader, val_loader, optimizer, criterion, device,
    num_epochs=1, save_dir="results",
)


Gradient Accumulation Experiment...

Accumulation Steps: 1
Effective Batch Size: 32


Epoch 1 (accum=1):   0%|          | 286/96356 [00:09<50:15, 31.86it/s, loss=4.1640] 

In [None]:
import json
import os
import matplotlib.pyplot as plt
import numpy as np

# Compare the accumulation-step runs: final training loss and first-epoch time.
# The files are read from where the experiment cell writes them:
# results/result_accum_<steps>.json.
RESULTS_DIR = 'results'
accumulation_steps = [1, 2, 4, 8]
data = {}

for step in accumulation_steps:
    filename = f'{RESULTS_DIR}/result_accum_{step}.json'
    try:
        with open(filename, 'r') as f:
            data[step] = json.load(f)
        print(f"Loaded {filename}")
    except FileNotFoundError:
        print(f"Warning: {filename} not found, skipping...")
    except json.JSONDecodeError as e:
        print(f"Error decoding {filename}: {e}")

if not data:
    # Raise instead of calling exit(): exit() inside a notebook asks the
    # kernel to shut down rather than just stopping this cell.
    raise FileNotFoundError("No data files found!")


def annotate_points(ax, xs, ys, template, facecolor):
    """Label every (x, y) point on ``ax`` with ``template.format(y)`` in a rounded box."""
    for x, y in zip(xs, ys):
        ax.annotate(template.format(y),
                    xy=(x, y),
                    xytext=(0, 10),
                    textcoords='offset points',
                    ha='center',
                    fontsize=10,
                    fontweight='bold',
                    bbox=dict(boxstyle='round,pad=0.3', facecolor=facecolor, alpha=0.7))


# Create figure with two subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Training Loss vs Accumulation Steps
steps_list = sorted(data.keys())
final_losses = [data[step]['train_losses'][-1] for step in steps_list]

ax1.plot(steps_list, final_losses, marker='o', linewidth=2.5,
         color='#3b82f6', markersize=10, markerfacecolor='#60a5fa',
         markeredgewidth=2, markeredgecolor='#3b82f6')
annotate_points(ax1, steps_list, final_losses, '{:.4f}', 'yellow')

ax1.set_xlabel('Accumulation Steps', fontsize=12, fontweight='bold')
ax1.set_ylabel('Training Loss', fontsize=12, fontweight='bold')
ax1.set_title('Training Loss vs Accumulation Steps', fontsize=14, fontweight='bold')
ax1.set_xticks(steps_list)
ax1.grid(True, alpha=0.3)
ax1.set_xlim(0.5, max(steps_list) + 0.5)

# Plot 2: Epoch Times vs Accumulation Steps
# Result files written without an 'epoch_times' entry are tolerated: only the
# runs that recorded timings are drawn.
timed_steps = [step for step in steps_list if data[step].get('epoch_times')]
epoch_times = [data[step]['epoch_times'][0] for step in timed_steps]

if timed_steps:
    ax2.plot(timed_steps, epoch_times, marker='s', linewidth=2.5,
             color='#10b981', markersize=10, markerfacecolor='#34d399',
             markeredgewidth=2, markeredgecolor='#10b981')
    # The loop variable is deliberately not called `time`: that name would
    # overwrite the `time` module the experiment cell relies on.
    annotate_points(ax2, timed_steps, epoch_times, '{:.1f}s', 'lightblue')
else:
    ax2.text(0.5, 0.5, 'No epoch times recorded',
             ha='center', va='center', transform=ax2.transAxes)

ax2.set_xlabel('Accumulation Steps', fontsize=12, fontweight='bold')
ax2.set_ylabel('Epoch Time (seconds)', fontsize=12, fontweight='bold')
ax2.set_title('Training Time vs Accumulation Steps', fontsize=14, fontweight='bold')
ax2.set_xticks(steps_list)
ax2.grid(True, alpha=0.3)
ax2.set_xlim(0.5, max(steps_list) + 0.5)

# Adjust layout and save
plt.tight_layout()
output_filename = 'results/accumulation/comparison_plot.png'
# savefig does not create missing directories.
os.makedirs(os.path.dirname(output_filename), exist_ok=True)
plt.savefig(output_filename, dpi=300, bbox_inches='tight')
print(f"\nPlot saved as: {output_filename}")

# Print summary statistics
print("\n" + "="*60)
print("SUMMARY STATISTICS")
print("="*60)
for step in steps_list:
    run = data[step]
    run_times = run.get('epoch_times')
    print(f"\nAccumulation Steps = {step}:")
    print(f"  Effective batch size: {run.get('effective_batch_size', 'n/a')}")
    print(f"  Final training loss: {run['train_losses'][-1]:.4f}")
    if run_times:
        print(f"  Epoch time: {run_times[0]:.2f} seconds")
    else:
        print("  Epoch time: n/a")
print("="*60)

plt.show()