In [1]:
from scipy.stats import levy_stable
import numpy as np
import torch
import math
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import torch.nn.functional as F
from torch import optim
import os
import torch
from torch.optim import Optimizer
from torch.optim.lr_scheduler import StepLR

In [2]:
class simpleNet(nn.Module):
    """Bias-free fully connected ReLU classifier.

    The network is a stack of ``depth`` linear layers: one from ``input_dim``
    to ``width``, ``depth - 2`` hidden ``width -> width`` layers, and one from
    ``width`` to ``num_classes``. Every linear layer except the last is
    followed by a ReLU, and no layer has a bias term.

    Args:
        input_dim: number of features each sample is flattened to.
        width: number of units in every hidden layer.
        depth: total number of linear layers.
        num_classes: size of the output vector.
    """

    def __init__(self, input_dim=28*28 , width=400, depth=4, num_classes=10):
        super().__init__()
        self.input_dim, self.width = input_dim, width
        self.depth, self.num_classes = depth, num_classes

        # The hidden stack is instantiated before the first/last layers so the
        # order in which weights are drawn from the RNG stays the same.
        hidden_stack = self.get_layers()

        modules = [
            nn.Linear(self.input_dim, self.width, bias=False),
            nn.ReLU(inplace=True),
        ]
        modules.extend(hidden_stack)
        modules.append(nn.Linear(self.width, self.num_classes, bias=False))
        self.fc = nn.Sequential(*modules)

    def get_layers(self):
        """Return a fresh list of ``depth - 2`` (Linear, ReLU) pairs, flattened."""
        blocks = []
        for _ in range(self.depth - 2):
            blocks += [nn.Linear(self.width, self.width, bias=False), nn.ReLU()]
        return blocks

    def forward(self, x):
        """Flatten each sample to ``input_dim`` features and return the network output."""
        flat = x.view(x.size(0), self.input_dim)
        return self.fc(flat)

In [3]:
# load MNIST data
batch_size = 128

# Preprocessing: convert each image to a tensor, then normalise it with
# mean 0.1307 and std 0.3081.
data_tf = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
])

# Only the training split requests a download; both splits share ./data.
train_dataset = datasets.MNIST('./data', train=True, download=True, transform=data_tf)
test_dataset = datasets.MNIST('./data', train=False, transform=data_tf)

# Mini-batches are reshuffled at every pass over the training set.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

In [4]:
class SGLD(Optimizer):
    """
    SGLD optimiser based on pytorch's SGD.
    Note that the weight decay is specified in terms of the gaussian prior sigma.

    Each step applies, for every parameter ``p`` with gradient ``g``::

        p <- p - (lr / 2) * (g + weight_decay * p) - sqrt(lr) * eps

    where ``eps`` is standard normal noise (only when ``addnoise`` is true)
    and ``weight_decay = 1 / norm_sigma ** 2``.

    Args:
        params: iterable of parameters or parameter groups.
        lr: step size; must be non-negative.
        norm_sigma: standard deviation of the Gaussian prior. ``0`` (the
            default) means "no prior", i.e. no weight decay.
        addnoise: whether to inject the Langevin noise term.

    Raises:
        ValueError: if ``lr`` or the derived weight decay is negative.
    """

    def __init__(self, params, lr, norm_sigma=0, addnoise=True):

        # norm_sigma == 0 is the default; dividing by it used to raise
        # ZeroDivisionError, so it is treated as "prior disabled" instead.
        weight_decay = 0.0 if norm_sigma == 0 else 1 / (norm_sigma ** 2)

        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        if lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))

        defaults = dict(lr=lr, weight_decay=weight_decay, addnoise=addnoise)

        super(SGLD, self).__init__(params, defaults)

    def step(self, closure=None):
        """
        Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure is given.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:

            lr = group['lr']
            weight_decay = group['weight_decay']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                if weight_decay != 0:
                    # Out-of-place so the stored gradient is not modified.
                    d_p = d_p.add(p.data, alpha=weight_decay)

                # Drift term: half a gradient step.
                p.data.add_(d_p, alpha=-0.5 * lr)

                if group['addnoise']:
                    # Scale by sqrt(lr) directly; the equivalent
                    # lr * (noise / sqrt(lr)) produced NaN for lr == 0.
                    langevin_noise = p.data.new(p.data.size()).normal_(mean=0, std=1)
                    p.data.add_(langevin_noise, alpha=-float(np.sqrt(lr)))

        return loss


In [5]:
import torch
from torch.optim import Optimizer


class PSGHMC(Optimizer):
    '''
    Penalized SGHMC algorithm

    A momentum-based sampler whose gradient is augmented with a penalty that
    is active whenever the ``lp``-norm of a parameter exceeds its constraint.

    Args:
        params: iterable of parameters or parameter groups.
        lr: step size ``t`` of the discretisation; must be non-negative.
        delta: weight of the norm penalty added to the gradient.
        gamma: friction coefficient.
        constrain_list: per-parameter norm bounds, indexed by the position of
            the parameter inside its parameter group.
        lp: order of the norm used by the penalty.

    Raises:
        ValueError: if ``lr`` is negative.
    '''

    def __init__(self, params, lr, delta, gamma, constrain_list, lp = 1):
        if lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))

        defaults = dict(lr=lr, delta=delta, gamma = gamma, constrain_list = constrain_list, lp = lp)
        super(PSGHMC, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(PSGHMC, self).__setstate__(state)
        for group in self.param_groups:
            # 'nesterov' is never read by step(); the default is kept so that
            # restored param groups look the same as before.
            group.setdefault('nesterov', False)

#     coefficients of the update rule
    def phi0(self, t, gamma):
        """Return ``exp(-gamma * t)`` as a 0-dim tensor."""
        return torch.exp(torch.tensor(-t * gamma))

    def phi1(self, t, gamma):
        """Return ``(1 - exp(-gamma * t)) / gamma`` as a 0-dim tensor."""
        # NOTE(review): evaluated in torch's default dtype; in float32 the
        # exponential rounds to 1 for very small t * gamma and this expression
        # cancels to exactly 0 -- confirm this is acceptable for the step
        # sizes in use.
        return - 1/gamma * torch.exp(torch.tensor(-t * gamma)) + 1 / gamma

    def phi2(self, t, gamma):
        """Return ``(exp(-gamma * t) - 1) / gamma**2 + t / gamma`` as a 0-dim tensor."""
        # NOTE(review): same cancellation caveat as phi1; for very small
        # t * gamma in float32 only the t / gamma term survives.
        tmp = 1/gamma ** 2
        tmp = tmp * (torch.exp(torch.tensor(-gamma*t))-1)
        tmp = tmp + t/gamma
        return tmp

#     covariance of the two correlated noise terms
    def covariance(self, t, gamma):
        """Return the symmetric 2x2 covariance ``[[a11, a12], [a12, a22]]`` as a numpy array."""
        # NOTE(review): a12 and a22 are differences of much larger terms, so
        # for very small t * gamma they may be dominated by round-off -- verify.
        e2gt = np.exp(-gamma*t*2)
        egt = np.exp(-gamma*t)
        a11 = - (1/(2*gamma)) * e2gt + 1/(2*gamma)
        a12 = e2gt / (2*gamma*gamma) - egt/(gamma**2) - 1/(2*(gamma**2)) + 1/(gamma ** 2)
        a22 = -e2gt/(2*(gamma ** 3)) + 2*egt / (gamma**3) + 1/(2 * (gamma ** 3)) - 2/(gamma ** 3) + t/(gamma ** 2)
        tmp = np.array([[a11, a12], [a12,a22]])
        return tmp

#     generate related noise based on covariance
    def gen_noise(self, t, gamma, dim, device=None, dtype=None):
        """Draw two correlated Gaussian noise tensors of shape ``dim``.

        Every element is an independent draw from a zero-mean bivariate normal
        with covariance ``self.covariance(t, gamma)``; the first component goes
        into the first returned tensor and the second into the other.

        Args:
            t: step size passed to ``covariance``.
            gamma: friction coefficient passed to ``covariance``.
            dim: shape of each returned tensor (any number of dimensions).
            device: target device. Defaults to CUDA when it is available and
                to CPU otherwise (previously ``.cuda()`` was unconditional,
                which failed on CPU-only machines).
            dtype: target dtype. Defaults to the float64 produced by numpy.

        Returns:
            Tuple ``(noise1, noise2)`` of tensors.
        """
        cov = self.covariance(t, gamma)
        rand = np.random.multivariate_normal([0]*2, cov, size = (dim))
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # Ellipsis indexing works for any rank; the old rand[:, :, k] form
        # only supported 2-D shapes.
        rand1 = torch.tensor(rand[..., 0], dtype=dtype, device=device)
        rand2 = torch.tensor(rand[..., 1], dtype=dtype, device=device)
        return rand1, rand2


    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure is given.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            delta = group['delta']
            constrain_list = group['constrain_list']
            lp = group['lp']
            lr = group['lr']
            gamma = group['gamma']

            # These coefficients depend only on the group settings, so they
            # are computed once per group instead of once per parameter.
            phi0 = float(self.phi0(lr, gamma))
            phi1 = float(self.phi1(lr, gamma))
            phi2 = float(self.phi2(lr, gamma))
            noise_scale = float(torch.sqrt(torch.tensor(2*gamma)))

            # enumerate keeps constrain_list aligned with the parameter's
            # position even when an earlier parameter has no gradient (a
            # manually advanced counter was skipped by `continue`).
            for idx, p in enumerate(group['params']):
                if p.grad is None:
                    continue
                d_p = p.grad.data
                norm_ord = torch.norm(p.data, p = lp)
                constrain = constrain_list[idx]
                if norm_ord > constrain:
                    # Penalty direction: (||p|| - c) * (|p| / ||p||)^(lp-1) * sign(p)
                    g1 = torch.pow(torch.abs(p.data)/norm_ord, lp-1)
                    tmp_con = (norm_ord-constrain) * (g1 * torch.sign(p.data))
                    # Out-of-place so the stored gradient is not modified.
                    d_p = d_p.add(tmp_con, alpha=delta)

                # Noise is created on the parameter's device and dtype.
                noise1, noise2 = self.gen_noise(lr, gamma, p.data.shape,
                                                device=p.device, dtype=p.dtype)

                param_state = self.state[p]
                if 'momentum_buffer' not in param_state:
                    # NOTE(review): the buffer starts as +gradient and gets no
                    # noise on the first step -- confirm this is intended.
                    buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                else:
                    buf = param_state['momentum_buffer']
                    buf.mul_(phi0).add_(d_p, alpha=-phi1).add_(noise1, alpha=noise_scale)

                # NOTE(review): the position update reads the buffer after it
                # has been refreshed above -- confirm against the intended scheme.
                p.data.add_(buf, alpha=phi1).add_(d_p, alpha=-phi2).add_(noise2, alpha=noise_scale)

        return loss


In [7]:
lr_list = [5 * (0.1)**8]
delta = (0.1) ** 3

# 1-norm of the result from SGLD, used as the constraints in the constrained network
norm_list = torch.tensor([5677.8647 , 4067.0469 , 4065.2566 ,  150.35751])
# we can change the value of s here
constrain_list = norm_list * 0.8

# select 1-norm
lp = 1
epoch = 400
# calculate the averaged result from 5 runs
N = 5
gamma = 0.1

# path to save models
PATH = './MNIST-3FCN-Pen/constrain08/SGHMC/lp1'
# The path is nested, so create missing parents too. os.mkdir failed when a
# parent was absent and the error was swallowed, which only surfaced much
# later when torch.save could not write the checkpoint.
os.makedirs(PATH, exist_ok=True)

# Denominator for the accuracy, taken from the loader instead of hard-coding 60000.
num_train = len(train_loader.dataset)
use_cuda = torch.cuda.is_available()

train_acc_all = []

for n in range(N):
    print(n)
    for i in range(len(lr_list)):
        learning_rate = lr_list[i]
        trainErrorList=[]
        trainAccList=[]

        model = simpleNet()
        if use_cuda:
            model = model.cuda()
        # Not used by the loop below; the binding is kept in case later cells
        # reference it.
        criterion = nn.functional.nll_loss
        # SGLD(model.parameters(), lr=learning_rate, norm_sigma=1, addnoise=True)
        # is the unconstrained alternative to the optimizer below.
        optimizer = PSGHMC(model.parameters(), lr=learning_rate, delta = delta, gamma = gamma, constrain_list = constrain_list, lp = lp)
        # Multiply the step size by 0.9 every 50 epochs.
        scheduler = StepLR(optimizer, step_size=50, gamma=0.9)

        for l in range(epoch):
            train_acc=0
            for img, label in train_loader:
                img = img.view(img.size(0), -1)
                if use_cuda:
                    img = img.cuda()
                    label = label.cuda()
                out = model(img)
                # Summed (not averaged) loss over the mini-batch.
                loss = F.cross_entropy(out, label, reduction='sum')
                _, pred = torch.max(out.data, 1)
                train_acc += pred.eq(label.view_as(pred)).sum().item()
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            epoch_acc = train_acc / num_train
            # Records the loss of the last mini-batch of the epoch only.
            trainErrorList.append(loss.item())
            trainAccList.append(epoch_acc)
            scheduler.step()
            if l % 50 == 0 and l != 0:
                print(epoch_acc)
            # Checkpoint whenever this epoch is the best so far and reaches 70%.
            if epoch_acc >= 0.7 and epoch_acc == max(trainAccList):
                tmp_path = PATH + '/model' + '{}'.format(n) +'.pth'
                torch.save(model, tmp_path)
        train_acc_all.append(trainAccList)
        print(train_acc / num_train)

0
0.4459
0.6184333333333333


  "type " + obj.__name__ + ". It won't be checked "


0.7102166666666667
0.7923
0.8310833333333333
0.8534166666666667
0.86655
0.8743666666666666
1
0.6074166666666667
0.6786
0.7121666666666666
0.7804666666666666
0.8149166666666666
0.8388833333333333
0.8552833333333333
0.8663166666666666
2
0.5657
0.5825666666666667
0.7217666666666667
0.8001333333333334
0.8353166666666667
0.8557333333333333
0.8675333333333334
0.8753333333333333
3
0.6229333333333333
0.6082166666666666
0.70935
0.7892166666666667
0.8232833333333334
0.84565
0.85935
0.8697166666666667
4
0.5727
0.6400166666666667
0.751
0.8071833333333334
0.84065
0.8593166666666666
0.8709833333333333
0.8778333333333334
