In [1]:
!pip3 install torch

Collecting torch
  Using cached torch-2.7.1-cp313-none-macosx_11_0_arm64.whl.metadata (29 kB)
Collecting filelock (from torch)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4.10.0 (from torch)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Collecting setuptools (from torch)
  Using cached setuptools-80.9.0-py3-none-any.whl.metadata (6.6 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Using cached networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec (from torch)
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy>=1.13.3->torch)
  Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Collecting MarkupSafe>=2.0 (from jinja2->torch)
  Using cached MarkupSafe

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import random

# ---------------------------------------------
# Compute device: the MPS backend when PyTorch reports it, otherwise the CPU
# ---------------------------------------------
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# ---------------------------------------------
# Dimensions of the synthetic token dataset
# ---------------------------------------------
vocab_size = 100    # token ids are drawn from [0, vocab_size)
seq_length = 128    # tokens per sequence
batch_size = 8      # sequences per batch
num_batches = 50    # batches (one optimizer step each) per training run

def generate_fake_data(num_sequences=None, num_tokens=None, num_classes=None):
    """Return one batch of uniformly random token ids.

    Called with no arguments this behaves as before and reads the module-level
    ``batch_size``, ``seq_length`` and ``vocab_size`` at call time. Each
    argument overrides the corresponding global so a differently shaped batch
    can be produced without mutating module state.

    Args:
        num_sequences: Rows in the batch; defaults to ``batch_size``.
        num_tokens: Tokens per row; defaults to ``seq_length``.
        num_classes: Exclusive upper bound for token ids; defaults to
            ``vocab_size``.

    Returns:
        A ``torch.long`` tensor of shape ``(num_sequences, num_tokens)`` with
        values in ``[0, num_classes)``.
    """
    # Fall back to the globals lazily so later edits to them are still honoured.
    rows = batch_size if num_sequences is None else num_sequences
    cols = seq_length if num_tokens is None else num_tokens
    high = vocab_size if num_classes is None else num_classes
    return torch.randint(0, high, (rows, cols), dtype=torch.long)

# ---------------------------------------------
# Tiny Transformer-style LLM (MiniGPT)
# ---------------------------------------------
class MiniGPT(nn.Module):
    """Small Transformer-encoder language model over integer token ids.

    The model is: token embedding -> stack of ``nn.TransformerEncoderLayer``
    -> linear projection to vocabulary logits. No attention mask is passed to
    the encoder and no positional encoding is added, so every position attends
    to the whole sequence.

    Args:
        vocab_size: Number of distinct token ids (embedding rows and output
            logits).
        embed_dim: Width of the embeddings and of the encoder layers.
        n_heads: Attention heads per encoder layer.
        n_layers: Number of stacked encoder layers.
    """

    def __init__(self, vocab_size, embed_dim=128, n_heads=4, n_layers=2):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        # batch_first=True lets the encoder consume (B, T, C) directly, so the
        # forward pass no longer has to permute to (T, B, C) and back. The
        # parameter names and shapes are the same as with the default layout.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=n_heads, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        self.fc_out = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """Map token ids of shape (B, T) to logits of shape (B, T, vocab_size)."""
        hidden = self.token_embedding(x)   # (B, T, C)
        hidden = self.transformer(hidden)  # (B, T, C)
        return self.fc_out(hidden)

# ---------------------------------------------
# Custom IDAM Optimizer
# ---------------------------------------------
class IDAM(torch.optim.Optimizer):
    """Gradient descent with a per-element step size damped by recent movement.

    For every parameter element the update is::

        d   = p - prev_param
        eta = alpha / (sqrt(1 + d**2) + eps)
        p   = p - eta * grad

    where ``prev_param`` is the value the element had immediately before the
    previous update (so ``d`` is zero on the first step).

    Args:
        params: Iterable of parameters or parameter-group dicts.
        alpha: Base step size; the effective step never exceeds it.
        eps: Small constant added to the denominator.

    Raises:
        ValueError: If ``alpha`` or ``eps`` is negative.
    """

    def __init__(self, params, alpha=1e-2, eps=1e-8):
        if alpha < 0.0:
            raise ValueError(f"Invalid alpha: {alpha}")
        if eps < 0.0:
            raise ValueError(f"Invalid eps: {eps}")
        defaults = dict(alpha=alpha, eps=eps)
        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure=None):
        """Apply one update to every parameter that has a gradient.

        Args:
            closure: Optional callable that re-evaluates the model and returns
                the loss. It is run with gradients enabled.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure is
            given.
        """
        loss = None
        if closure is not None:
            # step() runs under no_grad, so re-enable autograd for the closure.
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            alpha = group['alpha']
            eps = group['eps']
            for p in group['params']:
                if p.grad is None:
                    continue
                state = self.state[p]
                prev = state.get('prev_param')
                if prev is None:
                    prev = p.detach().clone()
                    state['prev_param'] = prev
                displacement = p - prev
                eta = alpha / (torch.sqrt(1 + displacement**2) + eps)
                # Snapshot the pre-update value in place; the next step measures
                # its displacement against this.
                prev.copy_(p)
                p.sub_(eta * p.grad)

        return loss

# ---------------------------------------------
# Training Function
# ---------------------------------------------
def train_model(optimizer_name):
    """Train a fresh MiniGPT on synthetic batches with the named optimizer.

    A new model is built on every call. Each of the ``num_batches`` steps draws
    a new random batch from ``generate_fake_data`` and uses the input tokens
    themselves as the targets, i.e. a per-position copy task rather than a
    shifted next-token objective.

    Args:
        optimizer_name: ``"SGD"``, ``"Adam"`` or ``"IDAM"``. All three are
            configured with a step size of 0.1.

    Returns:
        ``(avg_loss, runtime)``: the mean cross-entropy loss over all steps and
        the duration of the training loop in seconds. Model construction is
        not included in the timing.

    Raises:
        ValueError: If ``optimizer_name`` is not one of the supported names.
    """
    model = MiniGPT(vocab_size).to(device)
    criterion = nn.CrossEntropyLoss()

    if optimizer_name == "SGD":
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    elif optimizer_name == "Adam":
        # NOTE(review): 0.1 is 100x Adam's default lr of 1e-3 and the recorded
        # Adam loss (5.41) is above ln(100) ~= 4.61, the uniform-guess level;
        # kept as-is so all three optimizers share the same nominal step size.
        optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
    elif optimizer_name == "IDAM":
        optimizer = IDAM(model.parameters(), alpha=0.1)
    else:
        raise ValueError("Unsupported optimizer")

    model.train()
    total_loss = 0.0
    # perf_counter is monotonic and high-resolution; time.time() can jump if
    # the system clock is adjusted mid-run.
    start_time = time.perf_counter()

    for _ in range(num_batches):
        inputs = generate_fake_data().to(device)
        # Targets are the inputs themselves; nothing below mutates either
        # tensor, so no defensive copy is needed.
        targets = inputs

        optimizer.zero_grad()
        logits = model(inputs)
        # Flatten using the logits' own class dimension instead of the global
        # vocab_size; reshape also tolerates non-contiguous outputs.
        loss = criterion(logits.reshape(-1, logits.size(-1)), targets.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    runtime = time.perf_counter() - start_time
    # With zero batches there is no loss to average; report NaN, not a crash.
    avg_loss = total_loss / num_batches if num_batches else float("nan")
    return avg_loss, runtime

# ---------------------------------------------
# Run Training with All Optimizers
# ---------------------------------------------
# Re-seed before every run so each optimizer starts from the same initial
# weights and is fed the same sequence of synthetic batches. Without this the
# comparison mixes optimizer behaviour with the luck of the random draw and
# cannot be reproduced.
COMPARISON_SEED = 0

# NOTE(review): the first optimizer in the list may also absorb one-time
# backend warm-up cost, so its runtime_sec is possibly pessimistic; confirm
# by timing a throwaway run first.
results = {}
for opt_name in ["SGD", "Adam", "IDAM"]:
    torch.manual_seed(COMPARISON_SEED)
    loss, runtime = train_model(opt_name)
    results[opt_name] = {"avg_loss": loss, "runtime_sec": runtime}

# ---------------------------------------------
# Print Results
# ---------------------------------------------
import pandas as pd
# Transpose so each optimizer is a row and each metric a column.
df = pd.DataFrame(results).T
print(df)




      avg_loss  runtime_sec
SGD   1.290987     0.633550
Adam  5.409834     0.733557
IDAM  1.342105     0.703720
