In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torch.autograd import Variable

import math
import pprint
from functools import partial
import numpy as np
import argparse
import visdom
from tqdm import tqdm_notebook

# Custom Optimizers

## SGD

In [2]:
class CustomSGD(optim.Optimizer):
    """Plain stochastic gradient descent: ``p <- p - lr * grad``.

    Args:
        params: iterable of parameters (or parameter-group dicts) to optimize.
        lr (float): learning rate, stored as the per-group default ``'lr'``.
    """

    def __init__(self, params, lr):
        defaults = dict(lr=lr)
        super(CustomSGD, self).__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure (callable, optional): re-evaluates the model and returns
                the loss. It is run with gradients enabled before the update.

        Returns:
            Whatever ``closure`` returned, or ``None`` when no closure is given.
        """
        loss = None
        if closure is not None:
            # The step itself runs under no_grad, so re-enable autograd for
            # the closure, which typically calls backward().
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            lr = group['lr']
            for p in group['params']:
                # Parameters that did not take part in the backward pass have
                # no gradient and are left untouched.
                if p.grad is None:
                    continue
                # Keyword ``alpha`` replaces the deprecated add_(scalar, tensor) overload.
                p.add_(p.grad, alpha=-lr)
        return loss

## Momentum

In [3]:
class CustomMomentum(optim.Optimizer):
    """SGD with a momentum buffer.

    For every parameter the optimizer keeps ``buf`` and applies::

        buf <- momentum * buf + grad      (buf starts as a copy of grad)
        p   <- p - lr * buf

    Args:
        params: iterable of parameters (or parameter-group dicts) to optimize.
        lr (float): learning rate.
        momentum (float): decay factor applied to the buffer (default: 0.5).
    """

    def __init__(self, params, lr, momentum=0.5):
        defaults = dict(lr=lr, momentum=momentum)
        super(CustomMomentum, self).__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure (callable, optional): re-evaluates the model and returns
                the loss. It is run with gradients enabled before the update.

        Returns:
            Whatever ``closure`` returned, or ``None`` when no closure is given.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad

                param_state = self.state[p]
                if 'momentum_buffer' not in param_state:
                    # First step: the buffer is just the gradient. Clone it so
                    # later in-place buffer updates never alias p.grad.
                    buf = param_state['momentum_buffer'] = torch.clone(grad).detach()
                else:
                    buf = param_state['momentum_buffer']
                    buf.mul_(momentum).add_(grad)

                # Keyword ``alpha`` replaces the deprecated add_(scalar, tensor) overload.
                p.add_(buf, alpha=-lr)
        return loss

## Adam

In [4]:
class CustomAdam(optim.Optimizer):
    """Adam: per-parameter step sizes from running gradient moments.

    Args:
        params: iterable of parameters (or parameter-group dicts) to optimize.
        lr (float): learning rate.
        betas (tuple[float, float]): decay rates for the running averages of
            the gradient and of its square (default: (0.9, 0.999)).
        eps (float): term added to the denominator for numerical stability
            (default: 1e-8).
    """

    def __init__(self, params, lr, betas=(0.9, 0.999), eps=1e-8):
        defaults = dict(lr=lr, betas=betas, eps=eps)
        super(CustomAdam, self).__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure (callable, optional): re-evaluates the model and returns
                the loss. It is run with gradients enabled before the update.

        Returns:
            Whatever ``closure`` returned, or ``None`` when no closure is given.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            beta1, beta2 = group['betas']

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad

                state = self.state[p]

                # Lazy state initialization on the first step for this parameter
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                state['step'] += 1

                # Update the running first and second moments in place.
                # Keyword ``alpha``/``value`` replace the deprecated
                # positional-scalar overloads.
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                # eps is added to the *uncorrected* second-moment root; the bias
                # corrections are folded into the scalar step size below.
                denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                p.addcdiv_(exp_avg, denom, value=-step_size)
        return loss

# Configurations

In [5]:
def parse_config(parse=True, **optional_kwargs):
    """Build a ``Config`` from command-line options plus keyword overrides.

    Args:
        parse (bool): when true, parse ``sys.argv`` strictly with
            ``parse_args``; otherwise use ``parse_known_args`` so that
            unrecognised arguments are ignored.
        **optional_kwargs: values that override (or extend) the parsed options.

    Returns:
        Config: the resulting configuration, which is also printed.
    """
    # (flag, type, default, metavar, help) for every supported option
    option_specs = (
        ('--batch-size', int, 100, 'N',
         'input batch size for training (default: 100)'),
        ('--test-batch-size', int, 1000, 'N',
         'input batch size for testing (default: 1000)'),
        ('--epochs', int, 10, 'N',
         'number of epochs to train (default: 10)'),
        ('--optimizer', str, 'sgd', 'OPTIM',
         'which optimizer to use (default: SGD)'),
        ('--dataset', str, 'mnist', 'M',
         'which dataset to evaluate (default: MNIST)'),
        ('--lr', float, 0.01, 'LR',
         'learning rate (default: 0.01)'),
        ('--seed', int, 1, 'S',
         'random seed (default: 1)'),
    )

    cli = argparse.ArgumentParser(description='Custom Optimizer Experiments')
    for flag, value_type, default, metavar, help_text in option_specs:
        cli.add_argument(flag, type=value_type, default=default,
                         metavar=metavar, help=help_text)

    namespace = cli.parse_args() if parse else cli.parse_known_args()[0]

    # Namespace => dictionary, with explicit keyword overrides taking priority
    settings = dict(vars(namespace), **optional_kwargs)

    config = Config(**settings)
    print(config)
    return config

class Config(object):
    """Attribute bag for experiment settings.

    Every keyword passed to the constructor becomes an instance attribute. The
    ``optimizer`` keyword is special: its string value is translated into an
    optimizer constructor by ``set_optimizer``.
    """

    def __init__(self, **kwargs):
        """Store each keyword argument as an attribute of the same name."""
        for name, value in kwargs.items():
            if name == 'optimizer':
                # Swap the optimizer's name for the callable that builds it
                value = self.set_optimizer(value)
            setattr(self, name, value)

    def set_optimizer(self, optim_key='sgd'):
        """Return the optimizer constructor registered under ``optim_key``.

        The lookup is case-insensitive; an unknown key raises ``KeyError``.
        """
        wanted = optim_key.lower()
        registry = dict(
            sgd=optim.SGD,
            momentum=partial(optim.SGD, momentum=0.5),
            adam=optim.Adam,
            custom_sgd=CustomSGD,
            custom_momentum=CustomMomentum,
            custom_adam=CustomAdam,
        )
        return registry[wanted]

    def __repr__(self):
        """Return a header line followed by the pretty-printed attributes."""
        return 'Configurations\n' + pprint.pformat(vars(self))

In [6]:
# Build the default configuration. parse=False makes parse_config use
# parse_known_args, so unrecognised entries in sys.argv are ignored.
config = parse_config(False)
# parse_config() already prints the configuration, so displaying the bare
# expression below shows the same text a second time.
config

Configurations
{'batch_size': 100,
 'dataset': 'mnist',
 'epochs': 10,
 'lr': 0.01,
 'optimizer': <class 'torch.optim.sgd.SGD'>,
 'seed': 1,
 'test_batch_size': 1000}


Configurations
{'batch_size': 100,
 'dataset': 'mnist',
 'epochs': 10,
 'lr': 0.01,
 'optimizer': <class 'torch.optim.sgd.SGD'>,
 'seed': 1,
 'test_batch_size': 1000}

# Data Loader

In [7]:
def get_loader(config):
    """Create the train and test ``DataLoader`` objects for ``config.dataset``.

    Args:
        config: object exposing ``dataset`` (``'mnist'`` or ``'cifar10'``,
            case-insensitive), ``batch_size`` and ``test_batch_size``.

    Returns:
        tuple: ``(train_loader, test_loader)``. The training loader shuffles,
        the test loader does not.

    Raises:
        ValueError: if ``config.dataset`` is not a supported dataset name
            (previously this fell through to an ``UnboundLocalError``).
    """
    dataset_name = config.dataset.lower()

    # 1*28*28
    if dataset_name == 'mnist':
        dataset_cls = datasets.MNIST
        mean, std = [0.1307], [0.3081]
    # 3*32*32
    elif dataset_name == 'cifar10':
        dataset_cls = datasets.CIFAR10
        # NOTE(review): these look like the ImageNet channel statistics rather
        # than values computed on CIFAR-10 — confirm this is intended.
        mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
    else:
        raise ValueError(
            "Unsupported dataset {!r}; expected 'mnist' or 'cifar10'".format(config.dataset))

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std)])

    # Only the training split triggers a download, and it is built first.
    train_loader = DataLoader(
        dataset_cls('./data', train=True, download=True, transform=transform),
        batch_size=config.batch_size, shuffle=True, num_workers=1, pin_memory=True)
    test_loader = DataLoader(
        dataset_cls('./data', train=False, transform=transform),
        batch_size=config.test_batch_size, shuffle=False, num_workers=1, pin_memory=True)

    return train_loader, test_loader

# Networks

In [8]:
class MNIST_Net(nn.Module):
    """Small CNN: two 5x5 conv stages followed by two fully connected layers.

    The flatten step requires 320 features per sample after the second pooling
    stage (20 channels of 4x4, which is what 1x28x28 inputs produce). The
    network outputs log-probabilities over 10 classes.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        # Classifier head
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images."""
        # Stage 1: conv -> 2x2 max-pool -> ReLU
        stage1 = self.conv1(x)
        stage1 = F.relu(F.max_pool2d(stage1, 2))

        # Stage 2: conv -> channel dropout -> 2x2 max-pool -> ReLU
        stage2 = self.conv2_drop(self.conv2(stage1))
        stage2 = F.relu(F.max_pool2d(stage2, 2))

        flat = stage2.view(-1, 320)

        hidden = F.relu(self.fc1(flat))
        # Functional dropout must be told explicitly whether we are training
        hidden = F.dropout(hidden, training=self.training)

        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)

In [9]:
class CIFAR_Net(nn.Module):
    """LeNet-style CNN: two conv/pool stages and three fully connected layers.

    The flatten step requires 16 * 5 * 5 features per sample after the second
    pooling stage (which is what 3x32x32 inputs produce). The network outputs
    log-probabilities over 10 classes.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor (one pooling layer shared by both stages)
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # Classifier head
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images."""
        # Two rounds of conv -> ReLU -> 2x2 max-pool
        features = x
        for conv in (self.conv1, self.conv2):
            features = self.pool(F.relu(conv(features)))

        flat = features.view(-1, 16 * 5 * 5)

        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        logits = self.fc3(hidden)
        return F.log_softmax(logits, dim=1)

In [10]:
def get_networks(dataset):
    """Return the network class (not an instance) matching ``dataset``.

    Args:
        dataset (str): ``'mnist'`` or ``'cifar10'``. The comparison is
            case-insensitive, matching how ``get_loader`` treats the name.

    Returns:
        type: ``MNIST_Net`` or ``CIFAR_Net``.

    Raises:
        ValueError: for any other dataset name (previously ``None`` was
            returned silently and the failure surfaced later at the call site).
    """
    name = dataset.lower()
    if name == 'mnist':
        return MNIST_Net
    if name == 'cifar10':
        return CIFAR_Net
    raise ValueError(
        "Unsupported dataset {!r}; expected 'mnist' or 'cifar10'".format(dataset))

# Train / Test

In [11]:
def train(config, model, optimizer, train_loader):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        config: unused; kept so the signature stays parallel to ``test``.
        model: network whose output is log-probabilities, as ``F.nll_loss``
            expects.
        optimizer: optimizer already bound to ``model``'s parameters.
        train_loader: iterable yielding ``(data, target)`` batches.

    Returns:
        None. ``model`` is left in training mode with updated parameters.
    """
    model.train()

    # Follow the model's device instead of hard-coding CUDA, so the same code
    # runs on CPU-only machines. A parameter-less model leaves batches in place.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    for data, target in train_loader:
        if device is not None:
            data, target = data.to(device), target.to(device)

        optimizer.zero_grad()
        loss = F.nll_loss(model(data), target)
        loss.backward()
        optimizer.step()

def test(config, model, test_loader):
    
    model.eval()
    test_loss = 0
    correct = 0
    for data, target in test_loader:
        data, target = data.cuda(), target.cuda()
        data, target = Variable(data, volatile=True), Variable(target)
        
        output = model(data)
        
        test_loss += F.nll_loss(output, target, size_average=False).data[0] # sum up batch loss
        
        pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability
        correct += pred.eq(target.data.view_as(pred)).cpu().sum()

    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)
    
    return test_loss, accuracy

# Run

In [12]:
def main(dataset='mnist'):
    """Train one fresh model per optimizer on ``dataset`` and plot the results.

    For each built-in and custom optimizer a new network is trained for
    ``config.epochs`` epochs. After every epoch the test loss and accuracy are
    appended to two visdom line plots (environment ``'optimizers'``), one
    trace per optimizer.

    Args:
        dataset (str): dataset name passed to ``parse_config``,
            ``get_networks`` and ``get_loader`` (default: ``'mnist'``).
    """
    vis_engine = visdom.Visdom(env='optimizers')
    # Use the GPU when there is one, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Window handles; created by the first plotted point, reused afterwards.
    loss_window = None
    accuracy_window = None

    for optimizer_name in ['SGD', 'Momentum', 'Adam', 'Custom_SGD', 'Custom_Momentum', 'Custom_Adam']:
        config = parse_config(False, optimizer=optimizer_name.lower(), dataset=dataset)

        # Random number generator: seed the CPU generator as well as CUDA.
        # Layers are initialised on the CPU before being moved, so seeding
        # only CUDA left the initial weights different on every run and for
        # every optimizer.
        torch.manual_seed(config.seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(config.seed)

        # Build model
        model = get_networks(dataset)().to(device)

        # Build dataset
        train_loader, test_loader = get_loader(config)

        # Build Optimizer
        optimizer = config.optimizer(model.parameters(), lr=config.lr)

        # Train & Test & Plot
        for epoch in tqdm_notebook(range(1, config.epochs + 1), desc='Epoch'):
            train(config, model, optimizer, train_loader)
            test_loss, accuracy = test(config, model, test_loader)

            if loss_window is None:
                # Very first point: create both windows
                loss_window = vis_engine.line(
                    X=np.array([epoch]),
                    Y=np.array([test_loss]),
                    opts=dict(
                        title=config.dataset.upper(),
                        xlabel='Epoch',
                        ylabel='Test Loss',
                        legend=[optimizer_name]))

                accuracy_window = vis_engine.line(
                    X=np.array([epoch]),
                    Y=np.array([accuracy]),
                    opts=dict(
                        title=config.dataset.upper(),
                        xlabel='Epoch',
                        ylabel='Accuracy (%)',
                        legend=[optimizer_name]))

            else:
                # Visdom.updateTrace was removed from visdom; appending to a
                # named trace is done with line(..., update='append').
                vis_engine.line(
                    X=np.array([epoch]),
                    Y=np.array([test_loss]),
                    win=loss_window,
                    name=optimizer_name,
                    update='append')

                vis_engine.line(
                    X=np.array([epoch]),
                    Y=np.array([accuracy]),
                    win=accuracy_window,
                    name=optimizer_name,
                    update='append')

In [13]:
# Run the full optimizer comparison once per dataset; each call to main()
# trains a fresh model for every optimizer and plots the results to visdom.
for dataset in ['mnist', 'cifar10']:
    main(dataset)

Configurations
{'batch_size': 100,
 'dataset': 'mnist',
 'epochs': 10,
 'lr': 0.01,
 'optimizer': <class 'torch.optim.sgd.SGD'>,
 'seed': 1,
 'test_batch_size': 1000}



Configurations
{'batch_size': 100,
 'dataset': 'mnist',
 'epochs': 10,
 'lr': 0.01,
 'optimizer': functools.partial(<class 'torch.optim.sgd.SGD'>, momentum=0.5),
 'seed': 1,
 'test_batch_size': 1000}



Configurations
{'batch_size': 100,
 'dataset': 'mnist',
 'epochs': 10,
 'lr': 0.01,
 'optimizer': <class 'torch.optim.adam.Adam'>,
 'seed': 1,
 'test_batch_size': 1000}



Configurations
{'batch_size': 100,
 'dataset': 'mnist',
 'epochs': 10,
 'lr': 0.01,
 'optimizer': <class '__main__.CustomSGD'>,
 'seed': 1,
 'test_batch_size': 1000}



Configurations
{'batch_size': 100,
 'dataset': 'mnist',
 'epochs': 10,
 'lr': 0.01,
 'optimizer': <class '__main__.CustomMomentum'>,
 'seed': 1,
 'test_batch_size': 1000}



Configurations
{'batch_size': 100,
 'dataset': 'mnist',
 'epochs': 10,
 'lr': 0.01,
 'optimizer': <class '__main__.CustomAdam'>,
 'seed': 1,
 'test_batch_size': 1000}



Configurations
{'batch_size': 100,
 'dataset': 'cifar10',
 'epochs': 10,
 'lr': 0.01,
 'optimizer': <class 'torch.optim.sgd.SGD'>,
 'seed': 1,
 'test_batch_size': 1000}
Files already downloaded and verified



Configurations
{'batch_size': 100,
 'dataset': 'cifar10',
 'epochs': 10,
 'lr': 0.01,
 'optimizer': functools.partial(<class 'torch.optim.sgd.SGD'>, momentum=0.5),
 'seed': 1,
 'test_batch_size': 1000}
Files already downloaded and verified



Configurations
{'batch_size': 100,
 'dataset': 'cifar10',
 'epochs': 10,
 'lr': 0.01,
 'optimizer': <class 'torch.optim.adam.Adam'>,
 'seed': 1,
 'test_batch_size': 1000}
Files already downloaded and verified



Configurations
{'batch_size': 100,
 'dataset': 'cifar10',
 'epochs': 10,
 'lr': 0.01,
 'optimizer': <class '__main__.CustomSGD'>,
 'seed': 1,
 'test_batch_size': 1000}
Files already downloaded and verified



Configurations
{'batch_size': 100,
 'dataset': 'cifar10',
 'epochs': 10,
 'lr': 0.01,
 'optimizer': <class '__main__.CustomMomentum'>,
 'seed': 1,
 'test_batch_size': 1000}
Files already downloaded and verified



Configurations
{'batch_size': 100,
 'dataset': 'cifar10',
 'epochs': 10,
 'lr': 0.01,
 'optimizer': <class '__main__.CustomAdam'>,
 'seed': 1,
 'test_batch_size': 1000}
Files already downloaded and verified



