In [None]:
import numpy as np
import tnn
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.nn.functional as f

from datasets import load_dataset
from typing import Dict


# Fixed seed so the optimiser comparison in this notebook is repeatable on
# Restart-and-Run-All. This seeds the global NumPy RNG (used by the
# np.random.choice calls in the data cell) and PyTorch's default RNG (used for
# weight initialisation, dropout and DataLoader shuffling).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer CUDA, then Apple MPS, and fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [2]:
# Load MNIST through `datasets.load_dataset` (num_proc=2).
dataset = load_dataset("ylecun/mnist", num_proc=2)
train_size, test_size = 60000, 10000

# Training split: draw `train_size` distinct row indices and keep those rows.
train = dataset.get("train")
train_indices = np.random.choice(len(train), size=train_size, replace=False)
train = train.select(train_indices)

# Test split: same procedure with `test_size` rows.
test = dataset.get("test")
test_indices = np.random.choice(len(test), size=test_size, replace=False)
test = test.select(test_indices)

In [3]:
def to_numpy(example):
    """Attach a flattened, 1/255-scaled copy of the image to ``example``.

    Reads ``example["image"]``, flattens it to one dimension, divides the
    values by 255.0 and stores the result under ``example["input"]``.
    The same (mutated) mapping is returned.
    """
    flat_pixels = np.reshape(example["image"], -1)
    example["input"] = flat_pixels / 255.0
    return example


# Add the flattened "input" column to both splits, then keep only the two
# columns that the loaders read.
train_dataset = (
    train.map(to_numpy, num_proc=2)
    .select_columns(["input", "label"])
)
test_dataset = (
    test.map(to_numpy, num_proc=2)
    .select_columns(["input", "label"])
)

Map (num_proc=2):   0%|          | 0/60000 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [4]:
def collate_fn(batch):
    """Stack a list of examples into an ``(inputs, labels)`` tensor pair.

    Each element of ``batch`` is indexed with ``"input"`` and ``"label"``.
    Inputs are converted with ``.float()`` and labels with ``.long()``;
    both keep the order of ``batch``.
    """
    features, targets = [], []
    for sample in batch:
        features.append(sample["input"])
        targets.append(sample["label"])
    return torch.tensor(features).float(), torch.tensor(targets).long()


# Full-batch loaders: batch_size equals the dataset length, so each epoch
# yields exactly one batch holding every example of the split.
trainloader = data.DataLoader(
    dataset=train_dataset,
    batch_size=len(train_dataset),
    collate_fn=collate_fn,
    shuffle=True,
    drop_last=False,
    num_workers=2,
    pin_memory=True,
)
# The test loader is never shuffled; later cells reuse it for every run.
testloader = data.DataLoader(
    dataset=test_dataset,
    batch_size=len(test_dataset),
    collate_fn=collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=2,
    pin_memory=True,
)

In [None]:
class MLP(tnn.Model):
    """Multi-layer perceptron mapping 28*28 input features to 10 logits.

    Three hidden stages of width 512, each ``Linear -> LayerNorm -> ReLU``.
    The first two stages are followed by dropout (p=0.4 and p=0.2). A final
    linear layer produces the 10 logits.
    """

    def __init__(self) -> None:
        super().__init__()
        in_features, hidden, n_classes = 28 * 28, 512, 10

        # Stage 1
        self.linear_1 = nn.Linear(in_features, hidden)
        self.norm_1 = nn.LayerNorm(hidden)
        self.drop_1 = nn.Dropout(0.4)
        # Stage 2
        self.linear_2 = nn.Linear(hidden, hidden)
        self.norm_2 = nn.LayerNorm(hidden)
        self.drop_2 = nn.Dropout(0.2)
        # Stage 3 (no dropout) and the output projection
        self.linear_3 = nn.Linear(hidden, hidden)
        self.norm_3 = nn.LayerNorm(hidden)
        self.linear_4 = nn.Linear(hidden, n_classes)

    def forward(self, x: torch.Tensor) -> Dict[str, torch.Tensor]:
        """Run the network on ``x`` and return ``{"logits": tensor}``."""
        stage_1 = self.drop_1(f.relu(self.norm_1(self.linear_1(x))))
        stage_2 = self.drop_2(f.relu(self.norm_2(self.linear_2(stage_1))))
        stage_3 = f.relu(self.norm_3(self.linear_3(stage_2)))
        return {"logits": self.linear_4(stage_3)}

## Batch Gradient Descent

In [5]:
# `lr` and `loss_fn` are defined once here and reused by every later run in
# this notebook; `model` and `optim` are rebuilt before each run.
lr = 1e-1
loss_fn = nn.CrossEntropyLoss()
model = MLP()
optim = torch.optim.SGD(model.parameters(), lr=lr)

In [None]:
# Full-batch run. tnn.Trainer is defined outside this notebook, so the notes
# on individual keywords below are based on this run's recorded output and
# should be confirmed against tnn.
trainer = tnn.Trainer(
    model,
    optim,
    loss_fn,
    trainloader,
    testloader,
    epochs=50,
    unpack_inputs=False,
    device=device,
    pin_memory=True,
    non_blocking=True,
    path="../training/mnist-batch.h5",  # recorded output reports metrics saved under this path
    verbose=10,  # recorded output shows one log line every 10 epochs
    # NOTE(review): only this run passes profile=True, yet the recorded outputs
    # of the later runs also print the "gpu memory profile" lines; confirm
    # what this flag controls.
    profile=True,
)

In [7]:
# Run the full-batch training; the return value is kept as `batch_metrics`.
batch_metrics = trainer.train()

model using cuda
training started
(epoch: 10/50): (train loss: 1.4240, test loss: 0.9099, train acc: 52.52%, test acc: 70.87%)
(gpu memory profile): (average allocated: 211.0 MB, average reserved: 1503.0 MB)
(epoch: 20/50): (train loss: 0.8854, test loss: 0.7624, train acc: 70.61%, test acc: 73.76%)
(gpu memory profile): (average allocated: 211.0 MB, average reserved: 1503.0 MB)
(epoch: 30/50): (train loss: 0.5818, test loss: 0.5544, train acc: 82.04%, test acc: 82.56%)
(gpu memory profile): (average allocated: 211.0 MB, average reserved: 1503.0 MB)
(epoch: 40/50): (train loss: 0.6385, test loss: 0.4184, train acc: 78.42%, test acc: 86.86%)
(gpu memory profile): (average allocated: 211.0 MB, average reserved: 1503.0 MB)
(epoch: 50/50): (train loss: 0.3498, test loss: 0.2704, train acc: 89.68%, test acc: 91.77%)
(gpu memory profile): (average allocated: 211.0 MB, average reserved: 1503.0 MB)
training complete
train_losses saved to ../training/mnist-batch.h5/metrics/train_losses
test_los

## Stochastic Gradient Descent

In [22]:
# Materialise the whole training split as two tensors on `device`, so that
# single-example batches are sliced from tensors instead of being collated
# from dataset rows.
inputs = torch.tensor([row["input"] for row in train_dataset]).float()
inputs = inputs.to(device, non_blocking=True)
labels = torch.tensor([row["label"] for row in train_dataset]).long()
labels = labels.to(device, non_blocking=True)
train_tensor_dataset = data.TensorDataset(inputs, labels)

# batch_size=1: every batch holds a single example. No worker processes and
# no pinning are requested for this loader.
trainloader = data.DataLoader(
    dataset=train_tensor_dataset,
    batch_size=1,
    shuffle=True,
    drop_last=False,
    num_workers=0,
    pin_memory=False,
)

In [23]:
# New MLP instance and a new SGD optimizer for the batch-size-1 run, so it
# does not continue from the previous run's trained weights.
model = MLP()
optim = torch.optim.SGD(model.parameters(), lr=lr)

In [None]:
# Batch-size-1 run; evaluation still uses the full-batch `testloader`.
# NOTE(review): `trainloader` here yields tensors that were already moved to
# `device` when the TensorDataset was built, while pin_memory=True and
# non_blocking=True are still passed; confirm tnn.Trainer handles that as
# intended.
trainer = tnn.Trainer(
    model,
    optim,
    loss_fn,
    trainloader,
    testloader,
    epochs=50,
    unpack_inputs=False,
    device=device,
    pin_memory=True,
    non_blocking=True,
    path="../training/mnist-sgd.h5",  # recorded output reports metrics saved under this path
    verbose=10,  # recorded output shows one log line every 10 epochs
)

In [25]:
# Run the batch-size-1 training; the return value is kept as `sgd_metrics`.
sgd_metrics = trainer.train()

model using cuda
training started
(epoch: 10/50): (train loss: 0.0837, test loss: 0.1049, train acc: 97.55%, test acc: 97.09%)
(gpu memory profile): (average allocated: 400.0 MB, average reserved: 1691.0 MB)
(epoch: 20/50): (train loss: 0.0530, test loss: 0.0785, train acc: 98.40%, test acc: 98.15%)
(gpu memory profile): (average allocated: 400.0 MB, average reserved: 1691.0 MB)
(epoch: 30/50): (train loss: 0.0382, test loss: 0.0773, train acc: 98.85%, test acc: 98.39%)
(gpu memory profile): (average allocated: 400.0 MB, average reserved: 1691.0 MB)
(epoch: 40/50): (train loss: 0.0304, test loss: 0.0700, train acc: 99.03%, test acc: 98.45%)
(gpu memory profile): (average allocated: 400.0 MB, average reserved: 1691.0 MB)
(epoch: 50/50): (train loss: 0.0264, test loss: 0.0692, train acc: 99.19%, test acc: 98.65%)
(gpu memory profile): (average allocated: 400.0 MB, average reserved: 1691.0 MB)
training complete
train_losses saved to ../training/mnist-sgd.h5/metrics/train_losses
test_losse

## Mini-batch Gradient Descent

### Batch size 32

In [26]:
# Mini-batch loader: 32 examples per batch, shuffled, last partial batch kept.
trainloader = data.DataLoader(
    dataset=train_dataset,
    batch_size=32,
    collate_fn=collate_fn,
    shuffle=True,
    drop_last=False,
    num_workers=2,
    pin_memory=True,
)

In [27]:
# New MLP instance and a new SGD optimizer for the batch-size-32 run.
model = MLP()
optim = torch.optim.SGD(model.parameters(), lr=lr)

In [None]:
# Batch-size-32 run: same trainer arguments as the batch-size-1 run; only the
# train loader and the output path differ.
trainer = tnn.Trainer(
    model,
    optim,
    loss_fn,
    trainloader,
    testloader,
    epochs=50,
    unpack_inputs=False,
    device=device,
    pin_memory=True,
    non_blocking=True,
    path="../training/mnist-mini-batch-32.h5",  # recorded output reports metrics saved under this path
    verbose=10,  # recorded output shows one log line every 10 epochs
)

In [29]:
# Run the batch-size-32 training; the return value is kept as `mini_batch_32_metrics`.
mini_batch_32_metrics = trainer.train()

model using cuda
training started
(epoch: 10/50): (train loss: 0.0563, test loss: 0.0594, train acc: 98.19%, test acc: 98.19%)
(gpu memory profile): (average allocated: 400.0 MB, average reserved: 1691.0 MB)
(epoch: 20/50): (train loss: 0.0295, test loss: 0.0559, train acc: 99.00%, test acc: 98.53%)
(gpu memory profile): (average allocated: 400.0 MB, average reserved: 1691.0 MB)
(epoch: 30/50): (train loss: 0.0204, test loss: 0.0555, train acc: 99.33%, test acc: 98.54%)
(gpu memory profile): (average allocated: 400.0 MB, average reserved: 1691.0 MB)
(epoch: 40/50): (train loss: 0.0149, test loss: 0.0537, train acc: 99.49%, test acc: 98.64%)
(gpu memory profile): (average allocated: 400.0 MB, average reserved: 1691.0 MB)
(epoch: 50/50): (train loss: 0.0135, test loss: 0.0678, train acc: 99.53%, test acc: 98.56%)
(gpu memory profile): (average allocated: 400.0 MB, average reserved: 1691.0 MB)
training complete
train_losses saved to ../training/mnist-mini-batch-32.h5/metrics/train_losses


### Batch size 64

In [30]:
# Mini-batch loader: 64 examples per batch, shuffled, last partial batch kept.
trainloader = data.DataLoader(
    dataset=train_dataset,
    batch_size=64,
    collate_fn=collate_fn,
    shuffle=True,
    drop_last=False,
    num_workers=2,
    pin_memory=True,
)

In [31]:
# New MLP instance and a new SGD optimizer for the batch-size-64 run.
model = MLP()
optim = torch.optim.SGD(model.parameters(), lr=lr)

In [None]:
# Batch-size-64 run: same trainer arguments as the other mini-batch runs; only
# the train loader and the output path differ.
trainer = tnn.Trainer(
    model,
    optim,
    loss_fn,
    trainloader,
    testloader,
    epochs=50,
    unpack_inputs=False,
    device=device,
    pin_memory=True,
    non_blocking=True,
    path="../training/mnist-mini-batch-64.h5",  # recorded output reports metrics saved under this path
    verbose=10,  # recorded output shows one log line every 10 epochs
)

In [33]:
# Run the batch-size-64 training; the return value is kept as `mini_batch_64_metrics`.
mini_batch_64_metrics = trainer.train()

model using cuda
training started
(epoch: 10/50): (train loss: 0.0582, test loss: 0.0648, train acc: 98.01%, test acc: 98.16%)
(gpu memory profile): (average allocated: 401.0 MB, average reserved: 1691.0 MB)
(epoch: 20/50): (train loss: 0.0328, test loss: 0.0580, train acc: 98.91%, test acc: 98.45%)
(gpu memory profile): (average allocated: 401.0 MB, average reserved: 1691.0 MB)
(epoch: 30/50): (train loss: 0.0228, test loss: 0.0621, train acc: 99.22%, test acc: 98.36%)
(gpu memory profile): (average allocated: 401.0 MB, average reserved: 1691.0 MB)
(epoch: 40/50): (train loss: 0.0159, test loss: 0.0532, train acc: 99.45%, test acc: 98.60%)
(gpu memory profile): (average allocated: 401.0 MB, average reserved: 1691.0 MB)
(epoch: 50/50): (train loss: 0.0135, test loss: 0.0637, train acc: 99.54%, test acc: 98.53%)
(gpu memory profile): (average allocated: 401.0 MB, average reserved: 1691.0 MB)
training complete
train_losses saved to ../training/mnist-mini-batch-64.h5/metrics/train_losses


### Batch size 128

In [34]:
# Mini-batch loader: 128 examples per batch, shuffled, last partial batch kept.
trainloader = data.DataLoader(
    dataset=train_dataset,
    batch_size=128,
    collate_fn=collate_fn,
    shuffle=True,
    drop_last=False,
    num_workers=2,
    pin_memory=True,
)

In [35]:
# New MLP instance and a new SGD optimizer for the batch-size-128 run.
model = MLP()
optim = torch.optim.SGD(model.parameters(), lr=lr)

In [None]:
# Batch-size-128 run: same trainer arguments as the other mini-batch runs;
# only the train loader and the output path differ.
trainer = tnn.Trainer(
    model,
    optim,
    loss_fn,
    trainloader,
    testloader,
    epochs=50,
    unpack_inputs=False,
    device=device,
    pin_memory=True,
    non_blocking=True,
    path="../training/mnist-mini-batch-128.h5",  # recorded output reports metrics saved under this path
    verbose=10,  # recorded output shows one log line every 10 epochs
)

In [37]:
# Run the batch-size-128 training; the return value is kept as `mini_batch_128_metrics`.
mini_batch_128_metrics = trainer.train()

model using cuda
training started
(epoch: 10/50): (train loss: 0.0618, test loss: 0.0589, train acc: 97.99%, test acc: 98.23%)
(gpu memory profile): (average allocated: 401.0 MB, average reserved: 1691.0 MB)
(epoch: 20/50): (train loss: 0.0364, test loss: 0.0563, train acc: 98.78%, test acc: 98.19%)
(gpu memory profile): (average allocated: 401.0 MB, average reserved: 1691.0 MB)
(epoch: 30/50): (train loss: 0.0256, test loss: 0.0604, train acc: 99.12%, test acc: 98.44%)
(gpu memory profile): (average allocated: 401.0 MB, average reserved: 1691.0 MB)
(epoch: 40/50): (train loss: 0.0177, test loss: 0.0568, train acc: 99.39%, test acc: 98.56%)
(gpu memory profile): (average allocated: 401.0 MB, average reserved: 1691.0 MB)
(epoch: 50/50): (train loss: 0.0138, test loss: 0.0573, train acc: 99.51%, test acc: 98.54%)
(gpu memory profile): (average allocated: 401.0 MB, average reserved: 1691.0 MB)
training complete
train_losses saved to ../training/mnist-mini-batch-128.h5/metrics/train_losses

### Batch size 256

In [38]:
# Mini-batch loader: 256 examples per batch, shuffled, last partial batch kept.
trainloader = data.DataLoader(
    dataset=train_dataset,
    batch_size=256,
    collate_fn=collate_fn,
    shuffle=True,
    drop_last=False,
    num_workers=2,
    pin_memory=True,
)

In [39]:
# New MLP instance and a new SGD optimizer for the batch-size-256 run.
model = MLP()
optim = torch.optim.SGD(model.parameters(), lr=lr)

In [None]:
# Batch-size-256 run: same trainer arguments as the other mini-batch runs;
# only the train loader and the output path differ.
trainer = tnn.Trainer(
    model,
    optim,
    loss_fn,
    trainloader,
    testloader,
    epochs=50,
    unpack_inputs=False,
    device=device,
    pin_memory=True,
    non_blocking=True,
    path="../training/mnist-mini-batch-256.h5",  # recorded output reports metrics saved under this path
    verbose=10,  # recorded output shows one log line every 10 epochs
)

In [41]:
# Run the batch-size-256 training; the return value is kept as `mini_batch_256_metrics`.
mini_batch_256_metrics = trainer.train()

model using cuda
training started
(epoch: 10/50): (train loss: 0.0700, test loss: 0.0706, train acc: 97.73%, test acc: 97.85%)
(gpu memory profile): (average allocated: 401.0 MB, average reserved: 1693.0 MB)
(epoch: 20/50): (train loss: 0.0414, test loss: 0.0645, train acc: 98.57%, test acc: 98.16%)
(gpu memory profile): (average allocated: 401.0 MB, average reserved: 1693.0 MB)
(epoch: 30/50): (train loss: 0.0297, test loss: 0.0613, train acc: 98.98%, test acc: 98.35%)
(gpu memory profile): (average allocated: 401.0 MB, average reserved: 1693.0 MB)
(epoch: 40/50): (train loss: 0.0214, test loss: 0.0568, train acc: 99.26%, test acc: 98.46%)
(gpu memory profile): (average allocated: 401.0 MB, average reserved: 1693.0 MB)
(epoch: 50/50): (train loss: 0.0177, test loss: 0.0562, train acc: 99.42%, test acc: 98.60%)
(gpu memory profile): (average allocated: 401.0 MB, average reserved: 1693.0 MB)
training complete
train_losses saved to ../training/mnist-mini-batch-256.h5/metrics/train_losses

### Batch size 512

In [42]:
# Mini-batch loader: 512 examples per batch, shuffled, last partial batch kept.
trainloader = data.DataLoader(
    dataset=train_dataset,
    batch_size=512,
    collate_fn=collate_fn,
    shuffle=True,
    drop_last=False,
    num_workers=2,
    pin_memory=True,
)

In [43]:
# New MLP instance and a new SGD optimizer for the batch-size-512 run.
model = MLP()
optim = torch.optim.SGD(model.parameters(), lr=lr)

In [None]:
# Batch-size-512 run: same trainer arguments as the other mini-batch runs;
# only the train loader and the output path differ.
trainer = tnn.Trainer(
    model,
    optim,
    loss_fn,
    trainloader,
    testloader,
    epochs=50,
    unpack_inputs=False,
    device=device,
    pin_memory=True,
    non_blocking=True,
    path="../training/mnist-mini-batch-512.h5",  # recorded output reports metrics saved under this path
    verbose=10,  # recorded output shows one log line every 10 epochs
)

In [45]:
# Run the batch-size-512 training; the return value is kept as `mini_batch_512_metrics`.
mini_batch_512_metrics = trainer.train()

model using cuda
training started
(epoch: 10/50): (train loss: 0.0852, test loss: 0.1488, train acc: 97.33%, test acc: 95.85%)
(gpu memory profile): (average allocated: 402.0 MB, average reserved: 1699.0 MB)
(epoch: 20/50): (train loss: 0.0525, test loss: 0.0652, train acc: 98.24%, test acc: 98.15%)
(gpu memory profile): (average allocated: 402.0 MB, average reserved: 1699.0 MB)
(epoch: 30/50): (train loss: 0.0383, test loss: 0.0607, train acc: 98.72%, test acc: 98.25%)
(gpu memory profile): (average allocated: 402.0 MB, average reserved: 1699.0 MB)
(epoch: 40/50): (train loss: 0.0286, test loss: 0.0656, train acc: 99.03%, test acc: 98.26%)
(gpu memory profile): (average allocated: 402.0 MB, average reserved: 1699.0 MB)
(epoch: 50/50): (train loss: 0.0226, test loss: 0.0669, train acc: 99.25%, test acc: 98.33%)
(gpu memory profile): (average allocated: 402.0 MB, average reserved: 1699.0 MB)
training complete
train_losses saved to ../training/mnist-mini-batch-512.h5/metrics/train_losses