In [1]:
import datetime
import torch
import torch.nn as nn
import torch.optim as optim

from torchvision import datasets, transforms, models

In [2]:
DATA_PATH = '../../datasets/'
BATCH_SIZE = 100
MOMENTUM = 0.9
EPOCHS = 20

In [3]:
def train(n_epochs, optimizer, model, loss_fn, train_loader):
    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0
        for imgs, labels in train_loader:
            imgs = imgs.to(device=device)
            labels = labels.to(device=device)
            outputs = model(imgs)
            loss = loss_fn(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_train += loss.item()

        print('{} Epoch {}, Training loss {}'.format(
            datetime.datetime.now(), epoch,
            loss_train / len(train_loader)))


def calculate_accuracy(model, train_loader, test_loader):
    accdict = {}
    for name, loader in [("train", train_loader), ("test", test_loader)]:
        correct = 0
        total = 0

        with torch.no_grad():
            for imgs, labels in loader:
                imgs = imgs.to(device=device)
                labels = labels.to(device=device)
                outputs = model(imgs)
                _, predicted = torch.max(outputs, dim=1)
                total += labels.shape[0]
                correct += int((predicted == labels).sum())

        print("Accuracy {}: {:.3f}".format(name, correct / total))
        accdict[name] = correct / total
    return accdict

In [4]:
device = (torch.device('cuda') if torch.cuda.is_available()
          else torch.device('cpu'))
print(f'Using {device}')
loss_fn = nn.CrossEntropyLoss()

Using cuda


In [5]:
mnist_train = datasets.MNIST(
    DATA_PATH, train=True, download=True, transform=transforms.Compose([
        transforms.Resize((32, 32)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.1307,), std=(0.3081,))]))
mnist_test = datasets.MNIST(
    DATA_PATH, train=False, download=True, transform=transforms.Compose([
        transforms.Resize((32, 32)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.1325,), std=(0.3105,))]))

In [6]:
cifar10_train = datasets.CIFAR10(
    DATA_PATH, train=True, download=True, transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4915, 0.4823, 0.4468),
                             (0.2470, 0.2435, 0.2616))
    ]))
cifar10_test = datasets.CIFAR10(
    DATA_PATH, train=False, download=True, transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4915, 0.4823, 0.4468),
                             (0.2470, 0.2435, 0.2616))
    ]))

Files already downloaded and verified
Files already downloaded and verified


### LeNet, MNIST

In [7]:
class LeNet5(nn.Module):
    def __init__(self, num_classes):
        super(LeNet5, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=0),
            nn.BatchNorm2d(6),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(400, 120)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(120, 84)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(84, num_classes)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        out = self.relu(out)
        out = self.fc1(out)
        out = self.relu1(out)
        out = self.fc2(out)
        return out

In [8]:
model = LeNet5(num_classes=10).to(device=device)
train_loader = torch.utils.data.DataLoader(
    mnist_train, batch_size=BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    mnist_test, batch_size=BATCH_SIZE, shuffle=True)

**SGD**

In [9]:
optimizer = optim.SGD(model.parameters(), lr=1e-2)
train(n_epochs=EPOCHS, optimizer=optimizer, model=model,
      loss_fn=loss_fn, train_loader=train_loader)
calculate_accuracy(model, train_loader, test_loader)

2024-12-27 05:43:44.382239 Epoch 1, Training loss 0.9709130334854126
2024-12-27 05:43:54.280244 Epoch 2, Training loss 0.18116920304174225
2024-12-27 05:44:04.439525 Epoch 3, Training loss 0.12263486022129655
2024-12-27 05:44:14.654213 Epoch 4, Training loss 0.09835940724549194
2024-12-27 05:44:24.710068 Epoch 5, Training loss 0.08443749769901236
2024-12-27 05:44:35.998078 Epoch 6, Training loss 0.07507873718005915
2024-12-27 05:44:46.857432 Epoch 7, Training loss 0.06755098795052618
2024-12-27 05:44:57.171180 Epoch 8, Training loss 0.06253591927078862
2024-12-27 05:45:07.343682 Epoch 9, Training loss 0.05784833033025886
2024-12-27 05:45:17.392449 Epoch 10, Training loss 0.053663668102429556
2024-12-27 05:45:27.203085 Epoch 11, Training loss 0.05041135947375248
2024-12-27 05:45:37.214248 Epoch 12, Training loss 0.048087020746121804
2024-12-27 05:45:47.751013 Epoch 13, Training loss 0.04544614834245295
2024-12-27 05:45:57.912969 Epoch 14, Training loss 0.04318678759775745
2024-12-27 05:

{'train': 0.9907333333333334, 'test': 0.9875}

**Adadelta**

In [10]:
optimizer = optim.Adadelta(model.parameters(), lr=1e-2)
train(n_epochs=EPOCHS, optimizer=optimizer, model=model,
      loss_fn=loss_fn, train_loader=train_loader)
calculate_accuracy(model, train_loader, test_loader)

2024-12-27 05:47:16.143790 Epoch 1, Training loss 0.029975940874622513
2024-12-27 05:47:25.918703 Epoch 2, Training loss 0.029087861885588307
2024-12-27 05:47:35.624744 Epoch 3, Training loss 0.028585986309141543
2024-12-27 05:47:45.388101 Epoch 4, Training loss 0.02808055791441196
2024-12-27 05:47:55.125202 Epoch 5, Training loss 0.02759824851178564
2024-12-27 05:48:04.913413 Epoch 6, Training loss 0.027496049007750115
2024-12-27 05:48:14.691488 Epoch 7, Training loss 0.026986650489270687
2024-12-27 05:48:24.347665 Epoch 8, Training loss 0.02632795707070424
2024-12-27 05:48:34.006511 Epoch 9, Training loss 0.026086838807871875
2024-12-27 05:48:43.648947 Epoch 10, Training loss 0.025703956344902204
2024-12-27 05:48:53.302229 Epoch 11, Training loss 0.025484344362048432
2024-12-27 05:49:02.964348 Epoch 12, Training loss 0.025385530125931838
2024-12-27 05:49:12.623512 Epoch 13, Training loss 0.025172311485318156
2024-12-27 05:49:22.284246 Epoch 14, Training loss 0.024544269107670214
2024

{'train': 0.9940333333333333, 'test': 0.9898}

**NAG**

In [11]:
optimizer = optim.SGD(model.parameters(), lr=1e-2,
                      momentum=MOMENTUM, nesterov=True)
train(n_epochs=EPOCHS, optimizer=optimizer, model=model,
      loss_fn=loss_fn, train_loader=train_loader)
calculate_accuracy(model, train_loader, test_loader)

2024-12-27 05:50:41.102758 Epoch 1, Training loss 0.051500526732221864
2024-12-27 05:50:50.907622 Epoch 2, Training loss 0.03743715446481171
2024-12-27 05:51:00.600226 Epoch 3, Training loss 0.031433000217657533
2024-12-27 05:51:10.340814 Epoch 4, Training loss 0.026698848272887214
2024-12-27 05:51:20.042751 Epoch 5, Training loss 0.022512387492267104
2024-12-27 05:51:29.713219 Epoch 6, Training loss 0.019742955657517693
2024-12-27 05:51:39.482034 Epoch 7, Training loss 0.01834316719992785
2024-12-27 05:51:49.153628 Epoch 8, Training loss 0.01563745901391182
2024-12-27 05:51:59.059716 Epoch 9, Training loss 0.01371840792632914
2024-12-27 05:52:08.804944 Epoch 10, Training loss 0.012235049629598507
2024-12-27 05:52:18.392998 Epoch 11, Training loss 0.01039045274189751
2024-12-27 05:52:28.154860 Epoch 12, Training loss 0.009174281926437591
2024-12-27 05:52:38.032180 Epoch 13, Training loss 0.008772535492856453
2024-12-27 05:52:47.768919 Epoch 14, Training loss 0.008014437909053717
2024-1

{'train': 0.9994666666666666, 'test': 0.9933}

**Adam**

In [12]:
optimizer = optim.Adam(model.parameters(), lr=1e-2)
train(n_epochs=EPOCHS, optimizer=optimizer, model=model,
      loss_fn=loss_fn, train_loader=train_loader)
calculate_accuracy(model, train_loader, test_loader)

2024-12-27 05:54:07.475926 Epoch 1, Training loss 0.12787433224720493
2024-12-27 05:54:17.350632 Epoch 2, Training loss 0.051721842843010866
2024-12-27 05:54:27.302634 Epoch 3, Training loss 0.043220377285033466
2024-12-27 05:54:37.256713 Epoch 4, Training loss 0.03932285569045538
2024-12-27 05:54:47.138577 Epoch 5, Training loss 0.0364250664539577
2024-12-27 05:54:57.071279 Epoch 6, Training loss 0.0349117018582668
2024-12-27 05:55:07.952487 Epoch 7, Training loss 0.03298534250941884
2024-12-27 05:55:18.584845 Epoch 8, Training loss 0.03568884287732847
2024-12-27 05:55:28.474390 Epoch 9, Training loss 0.03184653880197099
2024-12-27 05:55:38.487818 Epoch 10, Training loss 0.031566978905151095
2024-12-27 05:55:48.401318 Epoch 11, Training loss 0.029672477661588346
2024-12-27 05:55:58.301295 Epoch 12, Training loss 0.02824134395073391
2024-12-27 05:56:08.058776 Epoch 13, Training loss 0.025393053956080016
2024-12-27 05:56:18.018293 Epoch 14, Training loss 0.02602242799771678
2024-12-27 0

{'train': 0.9964333333333333, 'test': 0.9898}

### VGG16, CIFAR10

In [13]:
model = models.vgg16(num_classes=10, dropout=0.5).to(device=device)
train_loader = torch.utils.data.DataLoader(
    cifar10_train, batch_size=BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    cifar10_test, batch_size=BATCH_SIZE, shuffle=True)

**SGD**

In [14]:
optimizer = optim.SGD(model.parameters(), lr=1e-2)
train(n_epochs=EPOCHS, optimizer=optimizer, model=model,
      loss_fn=loss_fn, train_loader=train_loader)
calculate_accuracy(model, train_loader, test_loader)

2024-12-27 05:58:29.126492 Epoch 1, Training loss 2.2296725933551786
2024-12-27 05:59:28.596724 Epoch 2, Training loss 1.9435393800735474
2024-12-27 06:00:28.311521 Epoch 3, Training loss 1.6871578221321106
2024-12-27 06:01:28.338746 Epoch 4, Training loss 1.4983220281600953
2024-12-27 06:02:29.377392 Epoch 5, Training loss 1.368983939409256
2024-12-27 06:03:29.966802 Epoch 6, Training loss 1.249396910905838
2024-12-27 06:04:30.393539 Epoch 7, Training loss 1.1458671696186065
2024-12-27 06:05:30.666567 Epoch 8, Training loss 1.0404935233592987
2024-12-27 06:06:30.964003 Epoch 9, Training loss 0.9448642734289169
2024-12-27 06:07:31.294690 Epoch 10, Training loss 0.8627091385126114
2024-12-27 06:08:31.504153 Epoch 11, Training loss 0.7808680199384689
2024-12-27 06:09:31.710665 Epoch 12, Training loss 0.7030427494049072
2024-12-27 06:10:31.980979 Epoch 13, Training loss 0.6319894831180572
2024-12-27 06:11:32.273003 Epoch 14, Training loss 0.5574026307463646
2024-12-27 06:12:32.531915 Epoc

{'train': 0.94304, 'test': 0.7295}

**Adadelta**

In [15]:
optimizer = optim.Adadelta(model.parameters(), lr=1e-2)
train(n_epochs=EPOCHS, optimizer=optimizer, model=model,
      loss_fn=loss_fn, train_loader=train_loader)
calculate_accuracy(model, train_loader, test_loader)

2024-12-27 06:19:41.523693 Epoch 1, Training loss 0.05363255119137466
2024-12-27 06:21:19.083092 Epoch 2, Training loss 0.018434931389987468
2024-12-27 06:22:56.620713 Epoch 3, Training loss 0.008190690333023668
2024-12-27 06:24:34.154383 Epoch 4, Training loss 0.003232448733091587
2024-12-27 06:26:11.647139 Epoch 5, Training loss 0.0014996128559287172
2024-12-27 06:27:49.105822 Epoch 6, Training loss 0.0008537345769873355
2024-12-27 06:29:26.660707 Epoch 7, Training loss 0.000410893400603527
2024-12-27 06:31:04.214972 Epoch 8, Training loss 0.00027842877916737054
2024-12-27 06:32:41.727296 Epoch 9, Training loss 0.00028123685726313854
2024-12-27 06:34:19.239466 Epoch 10, Training loss 0.0001833191162404546
2024-12-27 06:35:56.814737 Epoch 11, Training loss 0.00015718891753203933
2024-12-27 06:37:34.366307 Epoch 12, Training loss 0.00018403081660108
2024-12-27 06:39:11.937801 Epoch 13, Training loss 0.00018820938594581093
2024-12-27 06:40:49.400105 Epoch 14, Training loss 0.00013267117

{'train': 1.0, 'test': 0.7599}

**NAG**

In [16]:
optimizer = optim.SGD(model.parameters(), lr=1e-2,
                      momentum=MOMENTUM, nesterov=True)
train(n_epochs=EPOCHS, optimizer=optimizer, model=model,
      loss_fn=loss_fn, train_loader=train_loader)
calculate_accuracy(model, train_loader, test_loader)

2024-12-27 06:52:15.905873 Epoch 1, Training loss 2.102879531441773
2024-12-27 06:53:27.389289 Epoch 2, Training loss 2.242182055234909
2024-12-27 06:54:38.725647 Epoch 3, Training loss 2.303988904476166
2024-12-27 06:55:50.138673 Epoch 4, Training loss 2.131973477602005
2024-12-27 06:57:01.763997 Epoch 5, Training loss 1.6647806313037872
2024-12-27 06:58:13.541781 Epoch 6, Training loss 1.2597789784669877
2024-12-27 06:59:25.301577 Epoch 7, Training loss 1.014926502943039
2024-12-27 07:00:37.038996 Epoch 8, Training loss 0.8506213674545288
2024-12-27 07:01:48.703864 Epoch 9, Training loss 0.7294409753680229
2024-12-27 07:03:00.512596 Epoch 10, Training loss 0.6243732739686966
2024-12-27 07:04:12.309809 Epoch 11, Training loss 0.5337138203382492
2024-12-27 07:05:24.071524 Epoch 12, Training loss 0.46514758133888245
2024-12-27 07:06:35.825504 Epoch 13, Training loss 0.39540938463807107
2024-12-27 07:07:47.595111 Epoch 14, Training loss 0.34314043936133387
2024-12-27 07:08:59.366145 Epoc

{'train': 0.96302, 'test': 0.7851}

**Adam**

In [17]:
optimizer = optim.Adam(model.parameters(), lr=1e-2)
train(n_epochs=EPOCHS, optimizer=optimizer, model=model,
      loss_fn=loss_fn, train_loader=train_loader)
calculate_accuracy(model, train_loader, test_loader)

2024-12-27 07:16:50.751684 Epoch 1, Training loss 17664.79536634597
2024-12-27 07:18:12.638721 Epoch 2, Training loss 2.45382753944397
2024-12-27 07:19:34.602374 Epoch 3, Training loss 2.3197092747688295
2024-12-27 07:20:56.613386 Epoch 4, Training loss 2.324620099067688
2024-12-27 07:22:18.622945 Epoch 5, Training loss 2.313235326766968
2024-12-27 07:23:40.604565 Epoch 6, Training loss 2.3107501273155213
2024-12-27 07:25:02.554130 Epoch 7, Training loss 2.3041173877716066
2024-12-27 07:26:24.490521 Epoch 8, Training loss 2.3037308440208437
2024-12-27 07:27:46.431350 Epoch 9, Training loss 2.308174916267395
2024-12-27 07:29:08.440138 Epoch 10, Training loss 66.16066323995591
2024-12-27 07:30:30.345933 Epoch 11, Training loss 18.246604873180388
2024-12-27 07:31:52.184555 Epoch 12, Training loss 2.3064777994155885
2024-12-27 07:33:13.997023 Epoch 13, Training loss 2.3039369015693665
2024-12-27 07:34:35.852795 Epoch 14, Training loss 2.30405695104599
2024-12-27 07:35:57.677704 Epoch 15, T

{'train': 0.1, 'test': 0.0999}

### ResNet34, CIFAR10

In [18]:
model = models.resnet34(num_classes=10).to(device)

**SGD**

In [19]:
optimizer = optim.SGD(model.parameters(), lr=1e-2)
train(n_epochs=EPOCHS, optimizer=optimizer, model=model,
      loss_fn=loss_fn, train_loader=train_loader)
calculate_accuracy(model, train_loader, test_loader)

2024-12-27 07:43:56.413408 Epoch 1, Training loss 1.741152853012085
2024-12-27 07:44:35.481356 Epoch 2, Training loss 1.3936313543319703
2024-12-27 07:45:14.534921 Epoch 3, Training loss 1.2149140868186952
2024-12-27 07:45:53.648642 Epoch 4, Training loss 1.078296981573105
2024-12-27 07:46:32.794918 Epoch 5, Training loss 0.9506015863418579
2024-12-27 07:47:11.869833 Epoch 6, Training loss 0.8397922549247742
2024-12-27 07:47:50.979581 Epoch 7, Training loss 0.7313443795442581
2024-12-27 07:48:30.101202 Epoch 8, Training loss 0.6348851273655891
2024-12-27 07:49:09.222406 Epoch 9, Training loss 0.5431101906299591
2024-12-27 07:49:48.344859 Epoch 10, Training loss 0.4654421541392803
2024-12-27 07:50:27.463641 Epoch 11, Training loss 0.39376586812734604
2024-12-27 07:51:06.545274 Epoch 12, Training loss 0.33358875414729117
2024-12-27 07:51:45.634584 Epoch 13, Training loss 0.2801771402209997
2024-12-27 07:52:24.659222 Epoch 14, Training loss 0.2407137423604727
2024-12-27 07:53:03.672738 Ep

{'train': 0.96778, 'test': 0.6135}

**Adadelta**

In [20]:
optimizer = optim.Adadelta(model.parameters(), lr=1e-2)
train(n_epochs=EPOCHS, optimizer=optimizer, model=model,
      loss_fn=loss_fn, train_loader=train_loader)
calculate_accuracy(model, train_loader, test_loader)

2024-12-27 07:57:20.737752 Epoch 1, Training loss 0.04060892623476684
2024-12-27 07:58:05.610311 Epoch 2, Training loss 0.014850982825038954
2024-12-27 07:58:50.481070 Epoch 3, Training loss 0.008961891505285166
2024-12-27 07:59:35.365232 Epoch 4, Training loss 0.006469234968884848
2024-12-27 08:00:20.248606 Epoch 5, Training loss 0.005384723340626806
2024-12-27 08:01:05.147839 Epoch 6, Training loss 0.004588261961791432
2024-12-27 08:01:50.012632 Epoch 7, Training loss 0.0038700731814606116
2024-12-27 08:02:34.912241 Epoch 8, Training loss 0.0036040999270335306
2024-12-27 08:03:19.799730 Epoch 9, Training loss 0.0027631831181643067
2024-12-27 08:04:04.672032 Epoch 10, Training loss 0.0027456491850316525
2024-12-27 08:04:49.414105 Epoch 11, Training loss 0.003078521876304876
2024-12-27 08:05:34.239618 Epoch 12, Training loss 0.0018907826265494804
2024-12-27 08:06:19.134727 Epoch 13, Training loss 0.00195129486406222
2024-12-27 08:07:04.040736 Epoch 14, Training loss 0.00141282574192155

{'train': 0.9998, 'test': 0.6435}

**NAG**

In [21]:
optimizer = optim.SGD(model.parameters(), lr=1e-2,
                      momentum=MOMENTUM, nesterov=True)
train(n_epochs=EPOCHS, optimizer=optimizer, model=model,
      loss_fn=loss_fn, train_loader=train_loader)
calculate_accuracy(model, train_loader, test_loader)

2024-12-27 08:12:30.904321 Epoch 1, Training loss 1.300702776776656
2024-12-27 08:13:11.814550 Epoch 2, Training loss 0.9454583877325058
2024-12-27 08:13:52.726813 Epoch 3, Training loss 0.7825373379588128
2024-12-27 08:14:33.629662 Epoch 4, Training loss 0.6566253761053086
2024-12-27 08:15:14.414279 Epoch 5, Training loss 0.5409152450561523
2024-12-27 08:15:55.307757 Epoch 6, Training loss 0.46013272497057917
2024-12-27 08:16:36.229785 Epoch 7, Training loss 0.3908095486164093
2024-12-27 08:17:17.157041 Epoch 8, Training loss 0.30847922000288963
2024-12-27 08:17:58.092999 Epoch 9, Training loss 0.2497867405563593
2024-12-27 08:18:39.016776 Epoch 10, Training loss 0.2046178793683648
2024-12-27 08:19:19.903111 Epoch 11, Training loss 0.16985211800038816
2024-12-27 08:20:00.787487 Epoch 12, Training loss 0.15192330961674452
2024-12-27 08:20:41.650029 Epoch 13, Training loss 0.1256606623530388
2024-12-27 08:21:22.555863 Epoch 14, Training loss 0.11229557209089398
2024-12-27 08:22:03.45760

{'train': 0.98554, 'test': 0.7475}

**Adam**

In [22]:
optimizer = optim.Adam(model.parameters(), lr=1e-2)
train(n_epochs=EPOCHS, optimizer=optimizer, model=model,
      loss_fn=loss_fn, train_loader=train_loader)
calculate_accuracy(model, train_loader, test_loader)

2024-12-27 08:26:27.388833 Epoch 1, Training loss 2.0096703776605427
2024-12-27 08:27:10.112766 Epoch 2, Training loss 1.2805144143104554
2024-12-27 08:27:52.789318 Epoch 3, Training loss 0.9999506863355636
2024-12-27 08:28:35.464893 Epoch 4, Training loss 0.8496742109060288
2024-12-27 08:29:18.116056 Epoch 5, Training loss 0.7380919264554977
2024-12-27 08:30:00.781509 Epoch 6, Training loss 0.6429173493981362
2024-12-27 08:30:43.505143 Epoch 7, Training loss 0.5645195926427842
2024-12-27 08:31:26.074909 Epoch 8, Training loss 0.47160966649651526
2024-12-27 08:32:08.684066 Epoch 9, Training loss 0.40327101328969
2024-12-27 08:32:51.335693 Epoch 10, Training loss 0.34618036714196204
2024-12-27 08:33:33.910905 Epoch 11, Training loss 0.28865102064609527
2024-12-27 08:34:16.569229 Epoch 12, Training loss 0.2504079294204712
2024-12-27 08:34:59.229089 Epoch 13, Training loss 0.19812865860760212
2024-12-27 08:35:41.838836 Epoch 14, Training loss 0.17374683013558387
2024-12-27 08:36:24.453321

{'train': 0.97606, 'test': 0.7418}