In [1]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.nn import functional as F
from sklearn.metrics import accuracy_score
import torchvision.datasets as datasets
from torchvision import transforms
from optimizers import DemonRanger

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
class Cnn(nn.Module):
    def __init__(self, dropout=0.5):
        super(Cnn, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
        self.conv2_drop = nn.Dropout2d(p=dropout)
        self.fc1 = nn.Linear(1600, 100) # 1600 = number channels * width * height
        self.fc2 = nn.Linear(100, 10)
        self.fc1_drop = nn.Dropout(p=dropout)

    def forward(self, x):
        x = torch.relu(F.max_pool2d(self.conv1(x), 2))
        x = torch.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        
        # flatten over channel, height and width = 1600
        x = x.view(-1, x.size(1) * x.size(2) * x.size(3))
        
        x = torch.relu(self.fc1_drop(self.fc1(x)))
        x = torch.softmax(self.fc2(x), dim=-1)
        return x

In [3]:
class config:
    def __init__(self):
       self.batch_size = 1024
       self.wd = 0
       self.lr = 0.002
       self.view_every = 100
       self.epochs = 10
       
config = config()
net_demon = Cnn()
net_demon.to(device)
criterion = nn.CrossEntropyLoss()

In [4]:
transform = transforms.Compose([
    transforms.ToTensor()])
#Data
mnist_trainset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
mnist_testset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

#Loaders
trainloader = torch.utils.data.DataLoader(mnist_trainset, batch_size=config.batch_size, shuffle=True, num_workers=2)
testloader = torch.utils.data.DataLoader(mnist_testset, batch_size=config.batch_size, shuffle=True, num_workers=2)

In [5]:
optimizer = DemonRanger(params=net_demon.parameters(),
                        lr=config.lr,
                        weight_decay=config.wd,
                        epochs = config.epochs,
                        step_per_epoch = len(trainloader), 
                        betas=(0.9,0.999,0.999), # restore default AdamW betas
                        nus=(1.0,1.0), # disables QHMomentum
                        k=0,  # disables lookahead
                        alpha=1.0, 
                        IA=False, # enables Iterate Averaging
                        rectify=False, # disables RAdam Recitification
                        AdaMod=False, #disables AdaMod
                        AdaMod_bias_correct=False, #disables AdaMod bias corretion (not used originally)
                        use_demon=True, #enables Decaying Momentum (DEMON)
                        use_gc=False, #disables gradient centralization
                        amsgrad=False # disables amsgrad
                       )
IA_activate = False 

In [6]:
# Train
for epoch in range(config.epochs):  # loop over the dataset multiple times
    print('EPOCH {:d} / {:d}'.format(epoch + 1, config.epochs))
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data[0].to(device), data[1].to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net_demon(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step(IA_activate)

        # print statistics
        running_loss += loss.item()
        if i % config.view_every == config.view_every-1:    # print every 2000 mini-batches
            print('Batch {:d} / {:d}: loss = {:.4f}'.format(i+1,len(trainloader),running_loss/config.view_every))
            running_loss = 0.0

print('Finished Training')

EPOCH 1 / 10


	addcmul_(Number value, Tensor tensor1, Tensor tensor2)
Consider using one of the following signatures instead:
	addcmul_(Tensor tensor1, Tensor tensor2, *, Number value) (Triggered internally at  C:\cb\pytorch_1000000000000\work\torch\csrc\utils\python_arg_parser.cpp:1055.)
  exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)


EPOCH 2 / 10
EPOCH 3 / 10
EPOCH 4 / 10
EPOCH 5 / 10
EPOCH 6 / 10
EPOCH 7 / 10
EPOCH 8 / 10
EPOCH 9 / 10
EPOCH 10 / 10
Finished Training


In [7]:
correct = 0
total = 0
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for data in testloader:
        images, labels = data[0].to(device), data[1].to(device)
        # calculate outputs by running images through the network 
        outputs = net_demon(images).to(device)
        # the class with the highest energy is what we choose as prediction
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the test images: %d %%' % (
    100 * correct / total))

Accuracy of the network on the test images: 97 %
