In [1]:
import datetime
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as functional

from torchvision import datasets, transforms

In [2]:
DATA_PATH = '../datasets/'
BATCH_SIZE = 100
MOMENTUM = 0.9
EPOCHS = 20

In [3]:
def train(n_epochs, optimizer, model, loss_fn, train_loader):
    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0
        for imgs, labels in train_loader:
            imgs = imgs.to(device=device)
            labels = labels.to(device=device)
            outputs = model(imgs)
            loss = loss_fn(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_train += loss.item()

        print('{} Epoch {}, Training loss {}'.format(
            datetime.datetime.now(), epoch,
            loss_train / len(train_loader)))


def calculate_accuracy(model, train_loader, test_loader):
    accdict = {}
    for name, loader in [("train", train_loader), ("test", test_loader)]:
        correct = 0
        total = 0

        with torch.no_grad():
            for imgs, labels in loader:
                imgs = imgs.to(device=device)
                labels = labels.to(device=device)
                outputs = model(imgs)
                _, predicted = torch.max(outputs, dim=1)
                total += labels.shape[0]
                correct += int((predicted == labels).sum())

        print("Accuracy {}: {:.3f}".format(name, correct / total))
        accdict[name] = correct / total
    return accdict

In [4]:
loss_fn = nn.CrossEntropyLoss()
device = (torch.device('cuda') if torch.cuda.is_available()
          else torch.device('cpu'))
device

device(type='cuda')

In [5]:
mnist_train = datasets.MNIST(
    DATA_PATH, train=True, download=True, transform=transforms.ToTensor())
mnist_test = datasets.MNIST(
    DATA_PATH, train=False, download=True, transform=transforms.ToTensor())

In [6]:
cifar10_train = datasets.CIFAR10(
    DATA_PATH, train=True, download=True, transform=transforms.ToTensor())
cifar10_test = datasets.CIFAR10(
    DATA_PATH, train=False, download=True, transform=transforms.ToTensor())

Files already downloaded and verified
Files already downloaded and verified


### LeNet, MNIST

In [7]:
class LeNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5, padding=2)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        out = functional.max_pool2d(torch.relu(self.conv1(x)), 2)
        out = functional.max_pool2d(torch.relu(self.conv2(out)), 2)
        out = out.view(-1, 16 * 5 * 5)
        out = torch.tanh(self.fc1(out))
        out = torch.tanh(self.fc2(out))
        out = self.fc3(out)
        return out

In [8]:
model = LeNet().to(device=device)
train_loader = torch.utils.data.DataLoader(
    mnist_train, batch_size=BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    mnist_test, batch_size=BATCH_SIZE, shuffle=True)

**SGD**

In [9]:
optimizer = optim.SGD(model.parameters(), lr=1e-2)
train(n_epochs=EPOCHS, optimizer=optimizer, model=model,
      loss_fn=loss_fn, train_loader=train_loader)
calculate_accuracy(model, train_loader, test_loader)

2024-12-26 08:13:20.581924 Epoch 1, Training loss 2.276938224633535
2024-12-26 08:13:26.627528 Epoch 2, Training loss 1.1631027681132158
2024-12-26 08:13:32.945105 Epoch 3, Training loss 0.39932225505510965
2024-12-26 08:13:39.018924 Epoch 4, Training loss 0.26489552290489277
2024-12-26 08:13:44.808644 Epoch 5, Training loss 0.20278167814016343
2024-12-26 08:13:50.754194 Epoch 6, Training loss 0.1656926867303749
2024-12-26 08:13:56.592733 Epoch 7, Training loss 0.14115072692433994
2024-12-26 08:14:02.668240 Epoch 8, Training loss 0.12361197590517502
2024-12-26 08:14:09.077670 Epoch 9, Training loss 0.1108416389580816
2024-12-26 08:14:15.295437 Epoch 10, Training loss 0.10060321598934631
2024-12-26 08:14:22.146360 Epoch 11, Training loss 0.09229257516562939
2024-12-26 08:14:28.009884 Epoch 12, Training loss 0.08563395408292611
2024-12-26 08:14:34.067016 Epoch 13, Training loss 0.08017338744985561
2024-12-26 08:14:40.146595 Epoch 14, Training loss 0.07517853382974864
2024-12-26 08:14:46.

{'train': 0.9850833333333333, 'test': 0.98}

**Adadelta**

In [10]:
optimizer = optim.Adadelta(model.parameters(), lr=1e-2)
train(n_epochs=EPOCHS, optimizer=optimizer, model=model,
      loss_fn=loss_fn, train_loader=train_loader)
calculate_accuracy(model, train_loader, test_loader)

2024-12-26 08:15:28.198869 Epoch 1, Training loss 0.04960355846909806
2024-12-26 08:15:34.391344 Epoch 2, Training loss 0.04829390860628337
2024-12-26 08:15:40.797905 Epoch 3, Training loss 0.047366416196649275
2024-12-26 08:15:46.780371 Epoch 4, Training loss 0.04657051068653042
2024-12-26 08:15:52.630566 Epoch 5, Training loss 0.04581610673262427
2024-12-26 08:15:58.558954 Epoch 6, Training loss 0.045076508945785465
2024-12-26 08:16:04.354194 Epoch 7, Training loss 0.04435825238237157
2024-12-26 08:16:10.155746 Epoch 8, Training loss 0.043641508157985905
2024-12-26 08:16:16.346295 Epoch 9, Training loss 0.042999777543203284
2024-12-26 08:16:22.247407 Epoch 10, Training loss 0.042249247402263185
2024-12-26 08:16:27.986949 Epoch 11, Training loss 0.04172372637549415
2024-12-26 08:16:33.767577 Epoch 12, Training loss 0.04111467192104707
2024-12-26 08:16:39.662801 Epoch 13, Training loss 0.040542408785937976
2024-12-26 08:16:45.471972 Epoch 14, Training loss 0.03997549627286692
2024-12-2

{'train': 0.9904166666666666, 'test': 0.9853}

**NAG**

In [11]:
optimizer = optim.SGD(model.parameters(), lr=1e-2,
                      momentum=MOMENTUM, nesterov=True)
train(n_epochs=EPOCHS, optimizer=optimizer, model=model,
      loss_fn=loss_fn, train_loader=train_loader)
calculate_accuracy(model, train_loader, test_loader)

2024-12-26 08:17:31.703078 Epoch 1, Training loss 0.06964853660979618
2024-12-26 08:17:37.748810 Epoch 2, Training loss 0.049897559319312376
2024-12-26 08:17:43.760447 Epoch 3, Training loss 0.04047720941094061
2024-12-26 08:17:49.538135 Epoch 4, Training loss 0.03361775763876115
2024-12-26 08:17:55.321488 Epoch 5, Training loss 0.028085828527497747
2024-12-26 08:18:01.289329 Epoch 6, Training loss 0.02278367590760657
2024-12-26 08:18:07.645114 Epoch 7, Training loss 0.019558110081707127
2024-12-26 08:18:13.650253 Epoch 8, Training loss 0.01661251410279268
2024-12-26 08:18:19.592713 Epoch 9, Training loss 0.014719425882018793
2024-12-26 08:18:25.619131 Epoch 10, Training loss 0.011715772923198529
2024-12-26 08:18:31.605980 Epoch 11, Training loss 0.0102217508504206
2024-12-26 08:18:37.572838 Epoch 12, Training loss 0.008248848584771621
2024-12-26 08:18:43.615405 Epoch 13, Training loss 0.006561602674143311
2024-12-26 08:18:49.526835 Epoch 14, Training loss 0.005262437969310365
2024-12-

{'train': 0.9999833333333333, 'test': 0.9906}

**Adam**

In [12]:
optimizer = optim.Adam(model.parameters(), lr=1e-3)
train(n_epochs=EPOCHS, optimizer=optimizer, model=model,
      loss_fn=loss_fn, train_loader=train_loader)
calculate_accuracy(model, train_loader, test_loader)

2024-12-26 08:19:36.512101 Epoch 1, Training loss 0.0323631153270253
2024-12-26 08:19:43.053631 Epoch 2, Training loss 0.02537921016589583
2024-12-26 08:19:49.288710 Epoch 3, Training loss 0.021104802537932604
2024-12-26 08:19:55.389502 Epoch 4, Training loss 0.019093650427773053
2024-12-26 08:20:01.341908 Epoch 5, Training loss 0.01346805975146708
2024-12-26 08:20:07.282793 Epoch 6, Training loss 0.013038066118145555
2024-12-26 08:20:13.492932 Epoch 7, Training loss 0.013758047778489223
2024-12-26 08:20:19.642327 Epoch 8, Training loss 0.011388000254385891
2024-12-26 08:20:25.662690 Epoch 9, Training loss 0.012107427830123925
2024-12-26 08:20:31.949977 Epoch 10, Training loss 0.010359086387082547
2024-12-26 08:20:38.166003 Epoch 11, Training loss 0.007441450822346572
2024-12-26 08:20:44.420340 Epoch 12, Training loss 0.007820859958919756
2024-12-26 08:20:50.891369 Epoch 13, Training loss 0.006391670997181791
2024-12-26 08:20:57.202230 Epoch 14, Training loss 0.006589637215286833
2024-

{'train': 0.9982666666666666, 'test': 0.9885}

| Optimizer | Epochs | Learning rate | Accuracy (test) |
| --------  | ------ | ------------- | --------------- |
| SGD       | 20     | 1e-2          | 0.981           |
| Adadelta  | 20     | 1e-2          | 0.986           |  
| NAG       | 20     | 1e-2          | 0.990           |
| Adam      | 20     | 1e-3          | 0.990           |

### VGG16, CIFAR10

In [13]:
class VGG16(nn.Module):
    def __init__(self, dropout_p):
        super().__init__()
        self.conv11 = nn.Conv2d(3, 128, kernel_size=3, padding=1)
        self.conv12 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        self.conv21 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.conv22 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.conv31 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
        self.conv32 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(512 * 4 * 4, 1024)
        self.fc1_dropout = nn.Dropout(p=dropout_p)
        self.fc2 = nn.Linear(1024, 1024)
        self.fc2_dropout = nn.Dropout(p=dropout_p)
        self.fc3 = nn.Linear(1024, 10)

    def forward(self, x):
        out = torch.relu(self.conv11(x))     # 3 x 32 x 32   -> 128 x 32 x 32
        out = torch.relu(self.conv12(out))   # 128 x 32 x 32 -> 128 x32 x 32
        out = functional.max_pool2d(out, 2)  # 128 x 32 x 32 -> 128 x 16 x 16
        out = torch.relu(self.conv21(out))   # 128 x 16 x 16 -> 256 x 16 x 16
        out = torch.relu(self.conv22(out))   # 256 x 16 x 16 -> 256 x 16 x 16
        out = functional.max_pool2d(out, 2)  # 256 x 16 x 16 -> 256 x 8 x 8
        out = torch.relu(self.conv31(out))   # 256 x 8 x 8   -> 512 x 8 x 8
        out = torch.relu(self.conv32(out))   # 512 x 8 x 8   -> 512 x 8 x 8
        out = functional.max_pool2d(out, 2)  # 512 x 8 x 8   -> 512 x 4 x 4
        out = out.view(-1, 512 * 4 * 4)
        out = torch.relu(self.fc1(out))      # 512 x 4 x 4 -> 1024
        out = self.fc1_dropout(out)
        out = torch.relu(self.fc2(out))      # 1024 -> 1024
        out = self.fc2_dropout(out)
        out = self.fc3(out)                  # 1024 -> 10
        return out

In [14]:
model = VGG16(dropout_p=0.4).to(device=device)
train_loader = torch.utils.data.DataLoader(
    cifar10_train, batch_size=BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    cifar10_test, batch_size=BATCH_SIZE, shuffle=True)

**SGD**

In [15]:
optimizer = optim.SGD(model.parameters(), lr=1e-1)
train(n_epochs=EPOCHS, optimizer=optimizer, model=model,
      loss_fn=loss_fn, train_loader=train_loader)
calculate_accuracy(model, train_loader, test_loader)

2024-12-26 08:22:25.196666 Epoch 1, Training loss 2.3028539319038392
2024-12-26 08:23:11.712005 Epoch 2, Training loss 2.296835040092468
2024-12-26 08:23:58.653550 Epoch 3, Training loss 2.267014611721039
2024-12-26 08:24:45.559300 Epoch 4, Training loss 2.0661916205883024
2024-12-26 08:25:31.281106 Epoch 5, Training loss 1.8112792677879332
2024-12-26 08:26:17.066253 Epoch 6, Training loss 1.60315660071373
2024-12-26 08:27:03.240924 Epoch 7, Training loss 1.3987265512943268
2024-12-26 08:27:49.075935 Epoch 8, Training loss 1.1913824157714843
2024-12-26 08:28:34.879118 Epoch 9, Training loss 1.0195326380729675
2024-12-26 08:29:20.679660 Epoch 10, Training loss 0.8745715825557708
2024-12-26 08:30:06.541475 Epoch 11, Training loss 0.7405826632976532
2024-12-26 08:30:53.917452 Epoch 12, Training loss 0.6213848806619644
2024-12-26 08:31:39.889440 Epoch 13, Training loss 0.5191734665632248
2024-12-26 08:32:25.716378 Epoch 14, Training loss 0.4214385597705841
2024-12-26 08:33:11.560023 Epoch 

{'train': 0.98256, 'test': 0.7803}

**Adadelta**

In [16]:
optimizer = optim.Adadelta(model.parameters(), lr=1e-1)
train(n_epochs=EPOCHS, optimizer=optimizer, model=model,
      loss_fn=loss_fn, train_loader=train_loader)
calculate_accuracy(model, train_loader, test_loader)

2024-12-26 08:38:14.851763 Epoch 1, Training loss 0.03297575830109417
2024-12-26 08:39:04.360485 Epoch 2, Training loss 0.021022967101132963
2024-12-26 08:39:53.967982 Epoch 3, Training loss 0.013720194828259992
2024-12-26 08:40:43.649051 Epoch 4, Training loss 0.011445752762781921
2024-12-26 08:41:33.247102 Epoch 5, Training loss 0.010177779191813898
2024-12-26 08:42:22.888684 Epoch 6, Training loss 0.009154703930675169
2024-12-26 08:43:12.510709 Epoch 7, Training loss 0.00784739794989582
2024-12-26 08:44:02.209933 Epoch 8, Training loss 0.007509752997120814
2024-12-26 08:44:51.838453 Epoch 9, Training loss 0.006772421129848226
2024-12-26 08:45:41.567886 Epoch 10, Training loss 0.009229511368524982
2024-12-26 08:46:31.239815 Epoch 11, Training loss 0.007513171138441976
2024-12-26 08:47:20.790808 Epoch 12, Training loss 0.006206588208439825
2024-12-26 08:48:10.239790 Epoch 13, Training loss 0.00581864067404058
2024-12-26 08:48:59.805371 Epoch 14, Training loss 0.006599108374612115
2024

{'train': 0.99866, 'test': 0.7968}

**NAG**

In [17]:
optimizer = optim.SGD(model.parameters(), lr=1e-3,
                      momentum=MOMENTUM, nesterov=True)
train(n_epochs=EPOCHS, optimizer=optimizer, model=model,
      loss_fn=loss_fn, train_loader=train_loader)
calculate_accuracy(model, train_loader, test_loader)

2024-12-26 08:55:08.938122 Epoch 1, Training loss 0.0019071426413001973
2024-12-26 08:55:55.987443 Epoch 2, Training loss 0.0016463169206604106
2024-12-26 08:56:43.022090 Epoch 3, Training loss 0.0010902917907933442
2024-12-26 08:57:30.055482 Epoch 4, Training loss 0.0009569177530449906
2024-12-26 08:58:17.105728 Epoch 5, Training loss 0.0009727753662818941
2024-12-26 08:59:04.113517 Epoch 6, Training loss 0.0010030303802368507
2024-12-26 08:59:51.105327 Epoch 7, Training loss 0.0006386276791116643
2024-12-26 09:00:38.088887 Epoch 8, Training loss 0.0005571369862813639
2024-12-26 09:01:24.992995 Epoch 9, Training loss 0.0007281336642593032
2024-12-26 09:02:11.882607 Epoch 10, Training loss 0.0006397370681983148
2024-12-26 09:02:58.746065 Epoch 11, Training loss 0.0007244740190581069
2024-12-26 09:03:45.634915 Epoch 12, Training loss 0.0004910186269695487
2024-12-26 09:04:32.584037 Epoch 13, Training loss 0.0007263162776123408
2024-12-26 09:05:19.560412 Epoch 14, Training loss 0.0007747

{'train': 0.99992, 'test': 0.8022}

**Adam**

In [25]:
optimizer = optim.Adam(model.parameters(), lr=1e-3)
train(n_epochs=EPOCHS, optimizer=optimizer, model=model,
      loss_fn=loss_fn, train_loader=train_loader)
calculate_accuracy(model, train_loader, test_loader)

2024-12-26 09:49:19.821065 Epoch 1, Training loss 49.034860153198245
2024-12-26 09:50:07.408478 Epoch 2, Training loss 48.15267549133301


RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
