In [1]:
import numpy as np
import tnn
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.nn.functional as f

from datasets import load_dataset

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

In [2]:
dataset = load_dataset("ylecun/mnist", num_proc=2)
train_size = 60000
test_size = 10000

train = dataset.get("train")
test = dataset.get("test")

train_indices = np.random.choice(len(train), size=train_size, replace=False)
test_indices = np.random.choice(len(test), size=test_size, replace=False)

train = train.select(train_indices)
test = test.select(test_indices)

In [3]:
def to_numpy(example):
    arr = np.reshape(example["image"], -1) / 255.0
    example["input"] = arr
    return example


train_dataset = train.map(to_numpy, num_proc=2).select_columns(["input", "label"])
test_dataset = test.map(to_numpy, num_proc=2).select_columns(["input", "label"])

Map (num_proc=2):   0%|          | 0/60000 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [4]:
def collate_fn(batch):
    inputs = torch.tensor([ex["input"] for ex in batch]).float()
    labels = torch.tensor([ex["label"] for ex in batch]).long()
    return inputs, labels


trainloader = data.DataLoader(
    train_dataset,
    batch_size=len(train_dataset),
    shuffle=True,
    drop_last=False,
    collate_fn=collate_fn,
    num_workers=2,
)
testloader = data.DataLoader(
    test_dataset,
    batch_size=len(test_dataset),
    shuffle=False,
    drop_last=False,
    collate_fn=collate_fn,
    num_workers=2,
)

## Batch Gradient Descent

In [6]:
lr = 1e-1
loss_fn = nn.CrossEntropyLoss()
model = tnn.Model(tnn.MLP())
optim = torch.optim.SGD(model.parameters(), lr=lr)

In [7]:
trainer = tnn.Trainer(
    model,
    optim,
    loss_fn,
    trainloader,
    testloader,
    save_weights=False,
    device=device,
    path="../training/mnist-batch.h5",
    verbose=10,
)

In [8]:
batch_metrics = trainer.train(epochs=80)

model using cuda
training started
(epoch: 10): (train loss: 1.4270, test loss: 1.7843, train acc: 0.4883, test acc: 0.4816)
(epoch: 20): (train loss: 0.9668, test loss: 0.7176, train acc: 0.6687, test acc: 0.7613)
(epoch: 30): (train loss: 0.5820, test loss: 0.4427, train acc: 0.8017, test acc: 0.8569)
(epoch: 40): (train loss: 0.5093, test loss: 0.4371, train acc: 0.8344, test acc: 0.8577)
(epoch: 50): (train loss: 0.3980, test loss: 0.3329, train acc: 0.8807, test acc: 0.8971)
(epoch: 60): (train loss: 0.3509, test loss: 0.2790, train acc: 0.8932, test acc: 0.9156)
(epoch: 70): (train loss: 0.3067, test loss: 0.2400, train acc: 0.9085, test acc: 0.9284)
(epoch: 80): (train loss: 0.2847, test loss: 0.2219, train acc: 0.9148, test acc: 0.9339)
training complete
train_losses saved to ../training/mnist-batch.h5/metrics/train_losses
test_losses saved to ../training/mnist-batch.h5/metrics/test_losses
train_accs saved to ../training/mnist-batch.h5/metrics/train_accs
test_accs saved to ../tr

## Stochastic Gradient Descent

In [9]:
inputs = torch.tensor([ex["input"] for ex in train_dataset]).float().to(device)
labels = torch.tensor([ex["label"] for ex in train_dataset]).long().to(device)
train_tensor_dataset = data.TensorDataset(inputs, labels)

trainloader = data.DataLoader(
    train_tensor_dataset, batch_size=1, shuffle=True, drop_last=False
)

In [10]:
model = tnn.Model(tnn.MLP())
optim = torch.optim.SGD(model.parameters(), lr=lr)

In [11]:
trainer = tnn.Trainer(
    model,
    optim,
    loss_fn,
    trainloader,
    testloader,
    save_weights=False,
    device=device,
    path="../training/mnist-sgd.h5",
    verbose=10,
)

In [12]:
sgd_metrics = trainer.train(epochs=80)

model using cuda
training started
(epoch: 10): (train loss: 0.0860, test loss: 0.0914, train acc: 0.9745, test acc: 0.9761)
(epoch: 20): (train loss: 0.0526, test loss: 0.0768, train acc: 0.9840, test acc: 0.9810)
(epoch: 30): (train loss: 0.0390, test loss: 0.0719, train acc: 0.9882, test acc: 0.9819)
(epoch: 40): (train loss: 0.0308, test loss: 0.0752, train acc: 0.9903, test acc: 0.9850)
(epoch: 50): (train loss: 0.0250, test loss: 0.0681, train acc: 0.9920, test acc: 0.9851)
(epoch: 60): (train loss: 0.0213, test loss: 0.0735, train acc: 0.9934, test acc: 0.9840)
(epoch: 70): (train loss: 0.0187, test loss: 0.0738, train acc: 0.9939, test acc: 0.9859)
(epoch: 80): (train loss: 0.0141, test loss: 0.0774, train acc: 0.9956, test acc: 0.9877)
training complete
train_losses saved to ../training/mnist-sgd.h5/metrics/train_losses
test_losses saved to ../training/mnist-sgd.h5/metrics/test_losses
train_accs saved to ../training/mnist-sgd.h5/metrics/train_accs
test_accs saved to ../training

## Mini-batch Gradient Descent

### Batch size 32

In [13]:
trainloader = data.DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    drop_last=False,
    collate_fn=collate_fn,
    num_workers=2,
)

In [14]:
model = tnn.Model(tnn.MLP())
optim = torch.optim.SGD(model.parameters(), lr=lr)

In [15]:
trainer = tnn.Trainer(
    model,
    optim,
    loss_fn,
    trainloader,
    testloader,
    save_weights=False,
    device=device,
    path="../training/mnist-mini-batch-32.h5",
    verbose=10,
)

In [16]:
mini_batch_32_metrics = trainer.train(epochs=80)

model using cuda
training started
(epoch: 10): (train loss: 0.0550, test loss: 0.0626, train acc: 0.9818, test acc: 0.9820)
(epoch: 20): (train loss: 0.0318, test loss: 0.0572, train acc: 0.9891, test acc: 0.9851)
(epoch: 30): (train loss: 0.0216, test loss: 0.0524, train acc: 0.9923, test acc: 0.9873)
(epoch: 40): (train loss: 0.0169, test loss: 0.0572, train acc: 0.9943, test acc: 0.9861)
(epoch: 50): (train loss: 0.0127, test loss: 0.0596, train acc: 0.9953, test acc: 0.9866)
(epoch: 60): (train loss: 0.0105, test loss: 0.0565, train acc: 0.9965, test acc: 0.9873)
(epoch: 70): (train loss: 0.0100, test loss: 0.0698, train acc: 0.9964, test acc: 0.9849)
(epoch: 80): (train loss: 0.0079, test loss: 0.0603, train acc: 0.9971, test acc: 0.9872)
training complete
train_losses saved to ../training/mnist-mini-batch-32.h5/metrics/train_losses
test_losses saved to ../training/mnist-mini-batch-32.h5/metrics/test_losses
train_accs saved to ../training/mnist-mini-batch-32.h5/metrics/train_accs


### Batch size 64

In [17]:
trainloader = data.DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    drop_last=False,
    collate_fn=collate_fn,
    num_workers=2,
)

In [18]:
model = tnn.Model(tnn.MLP())
optim = torch.optim.SGD(model.parameters(), lr=lr)

In [19]:
trainer = tnn.Trainer(
    model,
    optim,
    loss_fn,
    trainloader,
    testloader,
    save_weights=False,
    device=device,
    path="../training/mnist-mini-batch-64.h5",
    verbose=10,
)

In [20]:
mini_batch_64_metrics = trainer.train(epochs=80)

model using cuda
training started
(epoch: 10): (train loss: 0.0574, test loss: 0.0665, train acc: 0.9812, test acc: 0.9811)
(epoch: 20): (train loss: 0.0342, test loss: 0.0611, train acc: 0.9887, test acc: 0.9834)
(epoch: 30): (train loss: 0.0241, test loss: 0.0583, train acc: 0.9918, test acc: 0.9844)
(epoch: 40): (train loss: 0.0150, test loss: 0.0587, train acc: 0.9950, test acc: 0.9858)
(epoch: 50): (train loss: 0.0131, test loss: 0.0608, train acc: 0.9958, test acc: 0.9856)
(epoch: 60): (train loss: 0.0099, test loss: 0.0628, train acc: 0.9968, test acc: 0.9858)
(epoch: 70): (train loss: 0.0092, test loss: 0.0708, train acc: 0.9969, test acc: 0.9843)
(epoch: 80): (train loss: 0.0072, test loss: 0.0631, train acc: 0.9975, test acc: 0.9864)
training complete
train_losses saved to ../training/mnist-mini-batch-64.h5/metrics/train_losses
test_losses saved to ../training/mnist-mini-batch-64.h5/metrics/test_losses
train_accs saved to ../training/mnist-mini-batch-64.h5/metrics/train_accs


### Batch size 128

In [21]:
trainloader = data.DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    drop_last=False,
    collate_fn=collate_fn,
    num_workers=2,
)

In [22]:
model = tnn.Model(tnn.MLP())
optim = torch.optim.SGD(model.parameters(), lr=lr)

In [23]:
trainer = tnn.Trainer(
    model,
    optim,
    loss_fn,
    trainloader,
    testloader,
    save_weights=False,
    device=device,
    path="../training/mnist-mini-batch-128.h5",
    verbose=10,
)

In [24]:
mini_batch_128_metrics = trainer.train(epochs=80)

model using cuda
training started
(epoch: 10): (train loss: 0.0620, test loss: 0.0641, train acc: 0.9800, test acc: 0.9809)
(epoch: 20): (train loss: 0.0356, test loss: 0.0562, train acc: 0.9882, test acc: 0.9835)
(epoch: 30): (train loss: 0.0229, test loss: 0.0528, train acc: 0.9921, test acc: 0.9853)
(epoch: 40): (train loss: 0.0196, test loss: 0.0599, train acc: 0.9933, test acc: 0.9852)
(epoch: 50): (train loss: 0.0143, test loss: 0.0564, train acc: 0.9951, test acc: 0.9860)
(epoch: 60): (train loss: 0.0111, test loss: 0.0604, train acc: 0.9965, test acc: 0.9863)
(epoch: 70): (train loss: 0.0096, test loss: 0.0589, train acc: 0.9968, test acc: 0.9861)
(epoch: 80): (train loss: 0.0083, test loss: 0.0588, train acc: 0.9971, test acc: 0.9862)
training complete
train_losses saved to ../training/mnist-mini-batch-128.h5/metrics/train_losses
test_losses saved to ../training/mnist-mini-batch-128.h5/metrics/test_losses
train_accs saved to ../training/mnist-mini-batch-128.h5/metrics/train_ac

### Batch size 256

In [25]:
trainloader = data.DataLoader(
    train_dataset,
    batch_size=256,
    shuffle=True,
    drop_last=False,
    collate_fn=collate_fn,
    num_workers=2,
)

In [26]:
model = tnn.Model(tnn.MLP())
optim = torch.optim.SGD(model.parameters(), lr=lr)

In [27]:
trainer = tnn.Trainer(
    model,
    optim,
    loss_fn,
    trainloader,
    testloader,
    save_weights=False,
    device=device,
    path="../training/mnist-mini-batch-256.h5",
    verbose=10,
)

In [28]:
mini_batch_256_metrics = trainer.train(epochs=80)

model using cuda
training started
(epoch: 10): (train loss: 0.0694, test loss: 0.0949, train acc: 0.9776, test acc: 0.9717)
(epoch: 20): (train loss: 0.0414, test loss: 0.0616, train acc: 0.9862, test acc: 0.9833)
(epoch: 30): (train loss: 0.0287, test loss: 0.0569, train acc: 0.9900, test acc: 0.9840)
(epoch: 40): (train loss: 0.0224, test loss: 0.0536, train acc: 0.9921, test acc: 0.9859)
(epoch: 50): (train loss: 0.0159, test loss: 0.0623, train acc: 0.9947, test acc: 0.9848)
(epoch: 60): (train loss: 0.0137, test loss: 0.0582, train acc: 0.9952, test acc: 0.9862)
(epoch: 70): (train loss: 0.0136, test loss: 0.0800, train acc: 0.9951, test acc: 0.9807)
(epoch: 80): (train loss: 0.0106, test loss: 0.0602, train acc: 0.9964, test acc: 0.9865)
training complete
train_losses saved to ../training/mnist-mini-batch-256.h5/metrics/train_losses
test_losses saved to ../training/mnist-mini-batch-256.h5/metrics/test_losses
train_accs saved to ../training/mnist-mini-batch-256.h5/metrics/train_ac

### Batch size 512

In [29]:
trainloader = data.DataLoader(
    train_dataset,
    batch_size=512,
    shuffle=True,
    drop_last=False,
    collate_fn=collate_fn,
    num_workers=2,
)

In [30]:
model = tnn.Model(tnn.MLP())
optim = torch.optim.SGD(model.parameters(), lr=lr)

In [31]:
trainer = tnn.Trainer(
    model,
    optim,
    loss_fn,
    trainloader,
    testloader,
    save_weights=False,
    device=device,
    path="../training/mnist-mini-batch-512.h5",
    verbose=10,
)

In [32]:
mini_batch_512_metrics = trainer.train(epochs=80)

model using cuda
training started
(epoch: 10): (train loss: 0.0852, test loss: 0.0772, train acc: 0.9727, test acc: 0.9763)
(epoch: 20): (train loss: 0.0521, test loss: 0.0832, train acc: 0.9830, test acc: 0.9762)
(epoch: 30): (train loss: 0.0374, test loss: 0.0647, train acc: 0.9874, test acc: 0.9817)
(epoch: 40): (train loss: 0.0287, test loss: 0.0619, train acc: 0.9908, test acc: 0.9830)
(epoch: 50): (train loss: 0.0230, test loss: 0.0564, train acc: 0.9920, test acc: 0.9848)
(epoch: 60): (train loss: 0.0176, test loss: 0.0618, train acc: 0.9942, test acc: 0.9851)
(epoch: 70): (train loss: 0.0175, test loss: 0.0614, train acc: 0.9939, test acc: 0.9853)
(epoch: 80): (train loss: 0.0132, test loss: 0.0653, train acc: 0.9954, test acc: 0.9845)
training complete
train_losses saved to ../training/mnist-mini-batch-512.h5/metrics/train_losses
test_losses saved to ../training/mnist-mini-batch-512.h5/metrics/test_losses
train_accs saved to ../training/mnist-mini-batch-512.h5/metrics/train_ac

## SGD w/ Momentum 

### No Nesterov Accelerated Gradient

In [7]:
trainloader = data.DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    drop_last=False,
    collate_fn=collate_fn,
    num_workers=2,
)

In [8]:
model = tnn.Model(tnn.MLP())
optim = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)

In [9]:
trainer = tnn.Trainer(
    model,
    optim,
    loss_fn,
    trainloader,
    testloader,
    save_weights=True,
    device=device,
    path="../training/mnist-momentum-no-nag.h5",
    verbose=10,
)

In [10]:
trainer.train(epochs=80)

model using cuda
weights saved to ../training/mnist-momentum-no-nag.h5/trajectory/weights-epoch-0
training started
(epoch: 10): (train loss: 0.0669, test loss: 0.0743, train acc: 0.9790, test acc: 0.9801)
weights saved to ../training/mnist-momentum-no-nag.h5/trajectory/weights-epoch-10
(epoch: 20): (train loss: 0.0402, test loss: 0.0702, train acc: 0.9873, test acc: 0.9807)
weights saved to ../training/mnist-momentum-no-nag.h5/trajectory/weights-epoch-20
(epoch: 30): (train loss: 0.0257, test loss: 0.0757, train acc: 0.9917, test acc: 0.9812)
weights saved to ../training/mnist-momentum-no-nag.h5/trajectory/weights-epoch-30
(epoch: 40): (train loss: 0.0220, test loss: 0.0655, train acc: 0.9924, test acc: 0.9847)
weights saved to ../training/mnist-momentum-no-nag.h5/trajectory/weights-epoch-40
(epoch: 50): (train loss: 0.0165, test loss: 0.0667, train acc: 0.9944, test acc: 0.9852)
weights saved to ../training/mnist-momentum-no-nag.h5/trajectory/weights-epoch-50
(epoch: 60): (train loss:

{'train_losses': [0.5985611695418755,
  0.18856167202318708,
  0.1434230515262733,
  0.1201763673953712,
  0.10605379535742104,
  0.09637066434745988,
  0.08485990781184907,
  0.07843096272163093,
  0.07438694721724218,
  0.0668565278552783,
  0.06471786569419007,
  0.057502524716391536,
  0.05444102565497936,
  0.052016794004718155,
  0.05131923339942781,
  0.048752644522177674,
  0.04460433808189506,
  0.04290967062970934,
  0.04076763595228937,
  0.040228486071926695,
  0.03784696913231164,
  0.03789304332807272,
  0.03708780899144476,
  0.032885170380092074,
  0.03362413924287927,
  0.030258482327896248,
  0.028606511689382993,
  0.029410228717078764,
  0.03131446189238535,
  0.025725869717646857,
  0.026558552697271805,
  0.026257856425953408,
  0.02492504226627255,
  0.025677308864719816,
  0.023794044868836257,
  0.023801514842642063,
  0.023825092202108742,
  0.02145008636948866,
  0.022041363735190438,
  0.0220118609846589,
  0.020919278764414292,
  0.020032052988017676,
  0.0

### With Nesterov Accelerated Gradient

In [11]:
model = tnn.Model(tnn.MLP())
optim = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, nesterov=True)

In [12]:
trainer = tnn.Trainer(
    model,
    optim,
    loss_fn,
    trainloader,
    testloader,
    save_weights=True,
    device=device,
    path="../training/mnist-momentum-nag.h5",
    verbose=10,
)

In [13]:
trainer.train(epochs=80)

model using cuda
weights saved to ../training/mnist-momentum-nag.h5/trajectory/weights-epoch-0
training started
(epoch: 10): (train loss: 0.0654, test loss: 0.0740, train acc: 0.9798, test acc: 0.9790)
weights saved to ../training/mnist-momentum-nag.h5/trajectory/weights-epoch-10
(epoch: 20): (train loss: 0.0391, test loss: 0.0704, train acc: 0.9871, test acc: 0.9826)
weights saved to ../training/mnist-momentum-nag.h5/trajectory/weights-epoch-20
(epoch: 30): (train loss: 0.0277, test loss: 0.0634, train acc: 0.9905, test acc: 0.9844)
weights saved to ../training/mnist-momentum-nag.h5/trajectory/weights-epoch-30
(epoch: 40): (train loss: 0.0227, test loss: 0.0645, train acc: 0.9925, test acc: 0.9847)
weights saved to ../training/mnist-momentum-nag.h5/trajectory/weights-epoch-40
(epoch: 50): (train loss: 0.0177, test loss: 0.0647, train acc: 0.9940, test acc: 0.9856)
weights saved to ../training/mnist-momentum-nag.h5/trajectory/weights-epoch-50
(epoch: 60): (train loss: 0.0143, test loss

{'train_losses': [0.7204050019135078,
  0.1927170776426792,
  0.14414930815870564,
  0.12046453702238699,
  0.10292337784587095,
  0.09469479335245366,
  0.08658997339668373,
  0.07900292803120489,
  0.0719273826321587,
  0.06536854678097491,
  0.06216247117669942,
  0.057235438624909145,
  0.053561639556987214,
  0.05370467489737397,
  0.05186383808501996,
  0.04699741806386349,
  0.04464017447271229,
  0.04171012033326163,
  0.04085260028645086,
  0.03909053338547625,
  0.03779841918401265,
  0.03583321638777076,
  0.034455858610364765,
  0.033279461566354925,
  0.032560931285800565,
  0.03073949133998831,
  0.031593134674528844,
  0.02856901146681048,
  0.02837536054282682,
  0.027659829418054626,
  0.025913673863492054,
  0.02326272172590737,
  0.026765040010239074,
  0.022067675690071578,
  0.023893739136727526,
  0.023025026109656514,
  0.02332136068728287,
  0.02223326201994981,
  0.02103977776548709,
  0.022715109101529622,
  0.022078231856899704,
  0.01985949269994841,
  0.019