In [1]:
import numpy as np
import torch
import torch.optim
import torch.nn as nn
import torch.nn.functional as F
from tensorflow.keras.datasets import cifar10 # to load dataset

from utils import compute_stats, get_grad
from MB_LBFGS import LBFGS

In [15]:
max_iter = 10 * (50000 // 256)                      # note each iteration is NOT an epoch
ghost_batch = 128
overlap_ratio = 0.25                # should be in (0, 0.5)
lr = 1.0

In [16]:
(X_train, y_train), (X_test, y_test) = cifar10.load_data()
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train = X_train/255
X_test = X_test/255

X_train = np.transpose(X_train, (0, 3, 1, 2))
X_test = np.transpose(X_test, (0, 3, 1, 2))

In [17]:
X_train.shape, X_test.shape

((50000, 3, 32, 32), (10000, 3, 32, 32))

In [18]:
class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 1000)
        self.fc2 = nn.Linear(1000, 10)
        
    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [19]:
cuda = torch.cuda.is_available()
    
#%% Create neural network model
        
if(cuda):
    torch.cuda.manual_seed(2018)
    model = ConvNet().cuda() 
else:
    torch.manual_seed(2018)
    model = ConvNet()
    
#%% Define helper functions

# Forward pass
if(cuda):
    opfun = lambda X: model.forward(torch.from_numpy(X).cuda())
else:
    opfun = lambda X: model.forward(torch.from_numpy(X))

# Forward pass through the network given the input
if(cuda):
    predsfun = lambda op: np.argmax(op.cpu().data.numpy(), 1)
else:
    predsfun = lambda op: np.argmax(op.data.numpy(), 1)

# Do the forward pass, then compute the accuracy
accfun   = lambda op, y: np.mean(np.equal(predsfun(op), y.squeeze()))*100

In [20]:
batch_size = 256
optimizer = LBFGS(model.parameters(), lr=lr, history_size=10, line_search='None', debug=True)

#%% Main training loop

Ok_size = int(overlap_ratio*batch_size)
Nk_size = int((1 - 2*overlap_ratio)*batch_size)

# sample previous overlap gradient
random_index = np.random.permutation(range(X_train.shape[0]))
Ok_prev = random_index[0:Ok_size]
g_Ok_prev, obj_Ok_prev = get_grad(optimizer, X_train[Ok_prev], y_train[Ok_prev], opfun)

for n_iter in range(max_iter):
    
    # sample current non-overlap and next overlap gradient
    random_index = np.random.permutation(range(X_train.shape[0]))
    Ok = random_index[0:Ok_size]
    Nk = random_index[Ok_size:(Ok_size + Nk_size)]
    
    # compute overlap gradient and objective
    g_Ok, obj_Ok = get_grad(optimizer, X_train[Ok], y_train[Ok], opfun)
    
    # compute non-overlap gradient and objective
    g_Nk, obj_Nk = get_grad(optimizer, X_train[Nk], y_train[Nk], opfun)
    
    # compute accumulated gradient over sample
    g_Sk = overlap_ratio*(g_Ok_prev + g_Ok) + (1 - 2*overlap_ratio)*g_Nk
        
    # two-loop recursion to compute search direction
    p = optimizer.two_loop_recursion(-g_Sk)
                
    # perform line search step
    lr = optimizer.step(p, g_Ok, g_Sk=g_Sk)
    
    # compute previous overlap gradient for next sample
    Ok_prev = Ok
    g_Ok_prev, obj_Ok_prev = get_grad(optimizer, X_train[Ok_prev], y_train[Ok_prev], opfun)
    
    # curvature update
    optimizer.curvature_update(g_Ok_prev, eps=0.2, damping=True)
    
    # compute statistics
    if n_iter % int(50000 / 256) == 0:
        with torch.no_grad():
            train_loss, test_loss, test_acc = compute_stats(X_train, y_train, X_test, 
                                                            y_test, opfun, accfun, ghost_batch=128)
            print('Iter:',n_iter, 'lr:', lr, 'Training Loss:', train_loss, 
                  'Test Loss:', test_loss, 'Test Accuracy:', test_acc)


Applying Powell damping...
Iter: 0 lr: 1.0 Training Loss: 2.3005438346052167 Test Loss: 2.300460264635086 Test Accuracy: 9.99
Applying Powell damping...
Iter: 195 lr: 1.0 Training Loss: 1.8164344183778762 Test Loss: 1.8156640972018245 Test Accuracy: 34.2
Iter: 390 lr: 1.0 Training Loss: 1.6860568000459666 Test Loss: 1.6852852741122248 Test Accuracy: 39.780000000000015
Iter: 585 lr: 1.0 Training Loss: 1.6034406588268284 Test Loss: 1.606543820750714 Test Accuracy: 42.47000000000003
Applying Powell damping...
Iter: 780 lr: 1.0 Training Loss: 1.5694034939670554 Test Loss: 1.5759196725606912 Test Accuracy: 43.44
Iter: 975 lr: 1.0 Training Loss: 1.5486393788290023 Test Loss: 1.5583452074170112 Test Accuracy: 43.98
Iter: 1170 lr: 1.0 Training Loss: 1.5222140706181528 Test Loss: 1.5313565752744667 Test Accuracy: 45.02000000000001
Iter: 1365 lr: 1.0 Training Loss: 1.504505547719001 Test Loss: 1.516534092879296 Test Accuracy: 45.280000000000015
Iter: 1560 lr: 1.0 Training Loss: 1.497580412230491

In [13]:
cuda = torch.cuda.is_available()
    
#%% Create neural network model
        
if(cuda):
    torch.cuda.manual_seed(2018)
    model = ConvNet().cuda() 
else:
    torch.manual_seed(2018)
    model = ConvNet()
    
#%% Define helper functions

# Forward pass
if(cuda):
    opfun = lambda X: model.forward(torch.from_numpy(X).cuda())
else:
    opfun = lambda X: model.forward(torch.from_numpy(X))

# Forward pass through the network given the input
if(cuda):
    predsfun = lambda op: np.argmax(op.cpu().data.numpy(), 1)
else:
    predsfun = lambda op: np.argmax(op.data.numpy(), 1)

# Do the forward pass, then compute the accuracy
accfun   = lambda op, y: np.mean(np.equal(predsfun(op), y.squeeze()))*100

In [14]:
batch_size = 256
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for n_iter in range(max_iter):
    
    # sample current non-overlap and next overlap gradient
    random_index = np.random.permutation(range(X_train.shape[0]))
    idx = random_index[0:batch_size]
    optimizer.zero_grad()
    
    X_batch = torch.from_numpy(X_train[idx]).cuda()
    y_batch = torch.from_numpy(y_train[idx]).cuda().long().squeeze()
    
    loss = F.cross_entropy(model(X_batch), y_batch)
    loss.backward()
    
    optimizer.step()
    
    # compute statistics
    if n_iter % int(50000 / 256) == 0:
        with torch.no_grad():
            train_loss, test_loss, test_acc = compute_stats(X_train, y_train, X_test, 
                                                            y_test, opfun, accfun, ghost_batch=128)
            print('Iter:',n_iter, 'lr:', lr, 'Training Loss:', train_loss, 
                  'Test Loss:', test_loss, 'Test Accuracy:', test_acc)


Iter: 0 lr: 1.0 Training Loss: 2.3017808129262916 Test Loss: 2.3018424204111105 Test Accuracy: 10.679999999999998
Iter: 195 lr: 1.0 Training Loss: 1.556580683267116 Test Loss: 1.5562827260375027 Test Accuracy: 43.57999999999999
Iter: 390 lr: 1.0 Training Loss: 1.414362938771247 Test Loss: 1.427319056892395 Test Accuracy: 48.050000000000004


KeyboardInterrupt: 