In [1]:
import numpy as np
import tnn
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.nn.functional as f

from datasets import load_dataset
from torch.utils.data import DataLoader

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

In [2]:
dataset = load_dataset("ylecun/mnist", num_proc=2)
train_size = 10000
test_size = 1000

train = dataset.get("train")
test = dataset.get("test")

train_indices = np.random.choice(len(train), size=train_size, replace=False)
test_indices = np.random.choice(len(test), size=test_size, replace=False)

train = train.select(train_indices)
test = test.select(test_indices)

In [3]:
def to_numpy(example):
    arr = np.reshape(example["image"], -1) / 255.0
    example["input"] = arr
    return example


train_dataset = train.map(to_numpy, num_proc=2).select_columns(["input", "label"])
test_dataset = test.map(to_numpy, num_proc=2).select_columns(["input", "label"])

Map (num_proc=2):   0%|          | 0/10000 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [4]:
def collate_fn(batch):
    inputs = torch.tensor([ex["input"] for ex in batch]).float()
    labels = torch.tensor([ex["label"] for ex in batch]).long()
    return inputs, labels


trainloader = data.DataLoader(
    train_dataset,
    batch_size=len(train_dataset),
    shuffle=True,
    drop_last=False,
    collate_fn=collate_fn,
    num_workers=2,
)
testloader = data.DataLoader(
    test_dataset,
    batch_size=len(test_dataset),
    shuffle=False,
    drop_last=False,
    collate_fn=collate_fn,
    num_workers=2,
)

In [5]:
class MLP(nn.Module):

    def __init__(self):
        super().__init__()
        self.linear_1 = nn.Linear(784, 512)
        self.norm_1 = nn.LayerNorm(512)
        self.drop_1 = nn.Dropout(0.4)
        self.linear_2 = nn.Linear(512, 512)
        self.norm_2 = nn.LayerNorm(512)
        self.drop_2 = nn.Dropout(0.2)
        self.linear_3 = nn.Linear(512, 512)
        self.norm_3 = nn.LayerNorm(512)
        self.drop_3 = nn.Dropout(0.2)
        self.linear_4 = nn.Linear(512, 10)

    def forward(self, x):
        x = self.norm_1(self.linear_1(x))
        x = self.drop_1(f.relu(x))

        x = self.norm_2(self.linear_2(x))
        x = self.drop_2(f.relu(x))

        x = self.norm_3(self.linear_3(x))
        x = self.drop_3(f.relu(x))

        x = f.relu(self.linear_4(x))
        return {"logits": x}

## Batch Gradient Descent

In [6]:
lr = 5e-2
model = tnn.Model(MLP())
loss_fn = nn.CrossEntropyLoss()
optim = torch.optim.SGD(model.parameters(), lr=lr)

In [7]:
trainer = tnn.Trainer(
    model,
    optim,
    loss_fn,
    trainloader,
    testloader,
    path="./batch.h5",
    device=device,
    verbose=10,
)

In [8]:
metrics = trainer.train(epochs=100)

model using cuda
weights saved to ./batch.h5/trajectory/weights-epoch-0
training started
(epoch: 10): (train loss: 1.8252, test loss: 1.4482, train acc: 0.4764, test acc: 0.5930)
weights saved to ./batch.h5/trajectory/weights-epoch-10
(epoch: 20): (train loss: 1.0882, test loss: 0.7085, train acc: 0.7310, test acc: 0.8070)
weights saved to ./batch.h5/trajectory/weights-epoch-20
(epoch: 30): (train loss: 0.7735, test loss: 0.5069, train acc: 0.8008, test acc: 0.8480)
weights saved to ./batch.h5/trajectory/weights-epoch-30
(epoch: 40): (train loss: 0.6233, test loss: 0.4158, train acc: 0.8319, test acc: 0.8770)
weights saved to ./batch.h5/trajectory/weights-epoch-40
(epoch: 50): (train loss: 0.5426, test loss: 0.3629, train acc: 0.8501, test acc: 0.8960)
weights saved to ./batch.h5/trajectory/weights-epoch-50
(epoch: 60): (train loss: 0.4766, test loss: 0.3313, train acc: 0.8685, test acc: 0.9000)
weights saved to ./batch.h5/trajectory/weights-epoch-60
(epoch: 70): (train loss: 0.4308, t

## Mini-batch Gradient Descent

In [9]:
trainloader = data.DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    drop_last=False,
    collate_fn=collate_fn,
    num_workers=2,
)

In [10]:
model = tnn.Model(MLP())
optim = torch.optim.SGD(model.parameters(), lr=lr)

In [11]:
trainer = tnn.Trainer(
    model,
    optim,
    loss_fn,
    trainloader,
    testloader,
    path="./mini-batch.h5",
    device=device,
    verbose=10,
)

In [12]:
metrics = trainer.train(epochs=100)

model using cuda
weights saved to ./mini-batch.h5/trajectory/weights-epoch-0
training started
(epoch: 10): (train loss: 0.1344, test loss: 1.6508, train acc: 0.9577, test acc: 0.6770)
weights saved to ./mini-batch.h5/trajectory/weights-epoch-10
(epoch: 20): (train loss: 0.0637, test loss: 0.1532, train acc: 0.9794, test acc: 0.9590)
weights saved to ./mini-batch.h5/trajectory/weights-epoch-20
(epoch: 30): (train loss: 0.0370, test loss: 0.1474, train acc: 0.9870, test acc: 0.9690)
weights saved to ./mini-batch.h5/trajectory/weights-epoch-30
(epoch: 40): (train loss: 0.0271, test loss: 0.1448, train acc: 0.9905, test acc: 0.9640)
weights saved to ./mini-batch.h5/trajectory/weights-epoch-40
(epoch: 50): (train loss: 0.0198, test loss: 0.1390, train acc: 0.9932, test acc: 0.9730)
weights saved to ./mini-batch.h5/trajectory/weights-epoch-50
(epoch: 60): (train loss: 0.0169, test loss: 0.1873, train acc: 0.9942, test acc: 0.9560)
weights saved to ./mini-batch.h5/trajectory/weights-epoch-60


## Stochastic Gradient Descent

In [13]:
inputs = torch.tensor([ex["input"] for ex in train_dataset]).float().to(device)
labels = torch.tensor([ex["label"] for ex in train_dataset]).long().to(device)
train_tensor_dataset = data.TensorDataset(inputs, labels)

trainloader = data.DataLoader(
    train_tensor_dataset,
    batch_size=1,
    shuffle=True,
    drop_last=False,
    collate_fn=None,
    num_workers=0,
)

In [17]:
model = tnn.Model(MLP())
optim = torch.optim.SGD(model.parameters(), lr=1e-1)

In [18]:
trainer = tnn.Trainer(
    model,
    optim,
    loss_fn,
    trainloader,
    testloader,
    path="./sgd.h5",
    device=device,
    verbose=10,
)

In [19]:
metrics = trainer.train(epochs=100)

model using cuda
weights saved to ./sgd.h5/trajectory/weights-epoch-0
training started
(epoch: 10): (train loss: 2.3026, test loss: 2.3026, train acc: 0.1015, test acc: 0.0950)
weights saved to ./sgd.h5/trajectory/weights-epoch-10
(epoch: 20): (train loss: 2.3026, test loss: 2.3026, train acc: 0.1015, test acc: 0.0950)
weights saved to ./sgd.h5/trajectory/weights-epoch-20


KeyboardInterrupt: 