In [1]:
# some standard imports
import matplotlib.pyplot as plt
import autograd.numpy as np
from autograd import grad as compute_grad   
from datetime import datetime 

# this is needed to compensate for matplotlib notebook's tendency to blow up images when plotted inline
%matplotlib notebook
from matplotlib import rcParams
rcParams['figure.autolayout'] = True

%load_ext autoreload
%autoreload 2

# Gradient descent

In [208]:
# gradient descent function
def gradient_descent(g,w,alpha,max_its,version):
    """Run fixed-steplength gradient descent on the cost function ``g``.

    Parameters
    ----------
    g : callable
        Cost function; called as ``g(w)`` and differentiated with autograd.
    w : array or nested container of arrays
        Initial weights. They are flattened internally with ``flatten_func``.
    alpha : float
        Steplength used for every step.
    max_its : int
        Number of descent steps to take.
    version : str
        ``'normalized'`` rescales each gradient to unit length before the
        step; any other value takes a standard (unnormalized) step.

    Returns
    -------
    list
        The unflattened weights at the initial point and after every step,
        i.e. ``max_its + 1`` entries.
    """
    # flatten the input function, create gradient based on flat function
    g_flat, unflatten, w = flatten_func(g, w)
    grad = compute_grad(g_flat)

    # record history, starting from the initial point
    w_hist = [unflatten(w)]

    for _ in range(max_its):
        # evaluate the gradient at the current weights, shaped like the flat weights
        # (np.reshape instead of assigning to .shape, which mutates the array in place)
        grad_eval = np.reshape(grad(w), np.shape(w))

        ### normalized or unnormalized descent step? ###
        if version == 'normalized':
            grad_norm = np.linalg.norm(grad_eval)
            # a zero gradient has no direction to normalize: leave it at zero
            # (deterministic, and avoids dividing by zero)
            if grad_norm != 0:
                grad_eval = grad_eval / grad_norm

        # take plain descent step (no momentum term is used here)
        w = w - alpha*grad_eval

        # record weight update
        w_hist.append(unflatten(w))

    return w_hist

In [3]:
from autograd.misc.flatten import flatten_func
from autograd import grad as compute_grad   

# testing stochastic

In [4]:
# load in large-ish dataset: a comma-delimited text file read into one 2-D array
datapath = '../../mlrefined_datasets/convnet_datasets/feat_face_data.csv'
# alternative dataset path (disabled):
# datapath = '../../mlrefined_datasets/superlearn_datasets/3d_classification_data_v0.csv'
data = np.loadtxt(datapath,delimiter = ',')

In [7]:
# inputs are every column but the last; labels are the last column (kept as a 2-D column)
x, y = data[:,:-1], data[:,-1:]

Nice compact cost function.

In [8]:
# the softmax cost function written more compactly
def softmax(w):
    """Two-class softmax cost of the weights ``w``, summed over every data point.

    Reads the notebook-level arrays ``x`` (inputs) and ``y`` (labels) instead of
    taking the data as arguments. ``w[0]`` is used as the bias and ``w[1:]`` as
    the feature-touching weights. Labels are presumably +1/-1 -- confirm against
    the dataset.
    """
    # per-point exponent: -y_p * (w_0 + x_p . w_[1:])
    exponents = (-y)*(w[0] + np.dot(x,w[1:]))
    # logaddexp(0, t) equals log(1 + exp(t)) but does not overflow when t is large
    cost  = np.sum(np.logaddexp(0, exponents))
    return cost

Compute gradient.

In [9]:
# autograd gradient of the full-batch softmax cost with respect to its argument w
grad = compute_grad(softmax)

Test out cost with an initial weight vector.

In [10]:
# random initial weights: one bias plus one weight per column of x, as a column vector
# NOTE(review): no random seed is set in the cells shown, so w_init changes on every run
scale = 0.1
w_init = scale*np.random.randn(np.shape(x)[1]+1,1)

In [11]:
# wall-clock time for one evaluation of the full-batch cost at w_init
startTime= datetime.now() 

print (softmax(w_init))

timeElapsed=datetime.now()-startTime 
print('Time elpased (hh:mm:ss.ms) {}'.format(timeElapsed))

6854.28678158
Time elpased (hh:mm:ss.ms) 0:00:00.049834


Compare full gradient to stochastic evaluation.

In [12]:
# wall-clock time for one evaluation of the full-batch gradient (the result is discarded)
startTime= datetime.now() 

grad(w_init)

timeElapsed=datetime.now()-startTime 
print('Time elpased (hh:mm:ss.ms) {}'.format(timeElapsed))

Time elpased (hh:mm:ss.ms) 0:00:00.069863


Test stochastic.  First the cost.

In [43]:
# 2-class logistic
def stochastic_softmax(w,iter):
    """Two-class softmax cost of ``w`` over the data point(s) selected by ``iter``.

    ``iter`` indexes the rows of the notebook-level arrays ``x`` and ``y``; the
    cells below pass a single integer index. ``w[0]`` is used as the bias and
    ``w[1:]`` as the feature-touching weights.
    """
    # get subset of points
    x_p = x[iter,:]
    y_p = y[iter]

    # compute cost over just this subset; logaddexp(0, t) equals log(1 + exp(t))
    # but does not overflow when t is large
    exponents = (-y_p)*(w[0] + np.dot(x_p,w[1:]))
    cost  = np.sum(np.logaddexp(0, exponents))
    return cost

Evaluate the cost.

In [44]:
# wall-clock time for the cost evaluated on the single data point at row index p
startTime= datetime.now() 

p = 10
stochastic_softmax(w_init,p)

timeElapsed=datetime.now()-startTime 
print('Time elpased (hh:mm:ss.ms) {}'.format(timeElapsed))

Time elpased (hh:mm:ss.ms) 0:00:00.000371


Now the gradient.

In [45]:
# autograd gradient of the single-point cost; the point index is passed through as the second argument
stchastic_grad = compute_grad(stochastic_softmax)

In [46]:
# wall-clock time for the gradient evaluated on the single data point at row index p
startTime= datetime.now() 
p = 10
stchastic_grad(w_init,p)

timeElapsed=datetime.now()-startTime 
print('Time elpased (hh:mm:ss.ms) {}'.format(timeElapsed))

Time elpased (hh:mm:ss.ms) 0:00:00.002691


In [15]:
# scratch: pull out the input row at index 10 (x_blah is not used again in the cells shown here)
x_blah = x[10,:]

# minibatch tests with autograd

In [59]:
# 2-class logistic
def minibatch_softmax(w,iter):
    """Two-class softmax cost of ``w`` over the mini-batch of points selected by ``iter``.

    ``iter`` indexes the rows of the notebook-level arrays ``x`` and ``y``; the
    cells below pass an index array or a list of indices. ``w[0]`` is used as
    the bias and ``w[1:]`` as the feature-touching weights.
    """
    # get subset of points
    x_p = x[iter,:]
    y_p = y[iter]

    # compute cost over just this subset; logaddexp(0, t) equals log(1 + exp(t))
    # but does not overflow when t is large
    exponents = (-y_p)*(w[0] + np.dot(x_p,w[1:]))
    cost  = np.sum(np.logaddexp(0, exponents))
    return cost

Test cost.

In [60]:
# sanity check: shape of the feature-touching weights (everything except the bias w_init[0])
print (w_init[1:].shape)

(784, 1)


In [63]:
# wall-clock time for the cost over a mini-batch of row indices 10..99
startTime= datetime.now() 

p = np.arange(10,100)
minibatch_softmax(w_init,p)

timeElapsed=datetime.now()-startTime 
print('Time elpased (hh:mm:ss.ms) {}'.format(timeElapsed))

Time elpased (hh:mm:ss.ms) 0:00:00.001419


In [64]:
# autograd gradient of the mini-batch cost; the batch indices are passed through as the second argument
minibatch_grad = compute_grad(minibatch_softmax)

In [65]:
# wall-clock time for the gradient over a two-point mini-batch (row indices 10 and 11)
startTime= datetime.now() 

p = [10,11]
minibatch_grad(w_init,p)

timeElapsed=datetime.now()-startTime 
print('Time elpased (hh:mm:ss.ms) {}'.format(timeElapsed))

Time elpased (hh:mm:ss.ms) 0:00:00.000903


# Speed tests

In [178]:
# gradient descent function
def minibatch_gradient_descent(g,w,alpha,batch_size,max_its,version):
    """Run mini-batch gradient descent on the cost function ``g``.

    Each of the ``max_its`` outer iterations sweeps once over the data in
    consecutive mini-batches (in index order, no shuffling) and takes one
    descent step per mini-batch.

    Note: the number of data points is read from the notebook-level variable
    ``num_pts``; it is not an argument of this function.

    Parameters
    ----------
    g : callable
        Cost function; called as ``g(w, batch_inds)`` where ``batch_inds`` is an
        array of data indices, and differentiated with respect to ``w``.
    w : array or nested container of arrays
        Initial weights. They are flattened internally with ``flatten_func``.
    alpha : float
        Steplength used for every step.
    batch_size : int
        Number of points per mini-batch; the last batch of a sweep may be smaller.
    max_its : int
        Number of full sweeps over the data.
    version : str
        ``'normalized'`` rescales each gradient to unit length before the
        step; any other value takes a standard (unnormalized) step.

    Returns
    -------
    list
        The unflattened weights at the initial point and after every full
        sweep, i.e. ``max_its + 1`` entries.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    # a non-positive batch size would otherwise divide by zero or silently take no steps
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    # flatten the input function, create gradient based on flat function
    g_flat, unflatten, w = flatten_func(g, w)
    grad = compute_grad(g_flat)

    # record history, starting from the initial point
    w_hist = [unflatten(w)]

    # how many mini-batches equal the entire dataset?
    num_batches = int(np.ceil(np.divide(num_pts,batch_size)))

    for _ in range(max_its):
        # loop over each minibatch
        for b in range(num_batches):
            # collect indices of current mini-batch (clipped at the end of the data)
            batch_inds = np.arange(b*batch_size,min((b+1)*batch_size,num_pts))

            # evaluate the gradient on this mini-batch, shaped like the flat weights
            # (np.reshape instead of assigning to .shape, which mutates the array in place)
            grad_eval = np.reshape(grad(w,batch_inds), np.shape(w))

            ### normalized or unnormalized descent step? ###
            if version == 'normalized':
                grad_norm = np.linalg.norm(grad_eval)
                # a zero gradient has no direction to normalize: leave it at zero
                # (deterministic, and avoids dividing by zero)
                if grad_norm != 0:
                    grad_eval = grad_eval / grad_norm

            # take plain descent step (no momentum term is used here)
            w = w - alpha*grad_eval

        # record weight update once per sweep over the data
        w_hist.append(unflatten(w))

    return w_hist

Our corresponding cost function.

In [189]:
# load in dataset (same file as loaded near the top of the notebook), rebinding `data`
datapath = '../../mlrefined_datasets/convnet_datasets/feat_face_data.csv'
data = np.loadtxt(datapath,delimiter = ',')

In [240]:
# split the loaded data into training and testing sets:
# shuffle the row indices once, then hold out the last num_test shuffled rows for
# testing and train on the rest, so the two sets share no points
num_test = 2000
ind = np.random.permutation(len(data))
data_train = data[ind[:-num_test],:]
data_test = data[ind[-num_test:],:]

In [193]:
# training inputs are every column but the last; labels are the last column (kept as a 2-D column)
x, y = data_train[:,:-1], data_train[:,-1:]
# number of training points, read by minibatch_gradient_descent
num_pts = y.shape[0]

In [241]:
# 2-class logistic
def minibatch_softmax(w,iter):
    """Two-class softmax cost of ``w`` over the mini-batch of points selected by ``iter``.

    ``iter`` indexes the rows of the notebook-level arrays ``x`` and ``y``
    (the training inputs and labels at this point in the notebook). ``w[0]`` is
    used as the bias and ``w[1:]`` as the feature-touching weights.
    """
    # get subset of points
    x_p = x[iter,:]
    y_p = y[iter]

    # compute cost over just this subset; logaddexp(0, t) equals log(1 + exp(t))
    # but does not overflow when t is large
    exponents = (-y_p)*(w[0] + np.dot(x_p,w[1:]))
    cost  = np.sum(np.logaddexp(0, exponents))
    return cost

In [242]:
# run 1: full-batch gradient descent on the softmax cost, starting from the existing w_init
g = softmax
scale = 0.1  # not used in this cell
w = w_init
alpha = 10**(-3)
max_its = 100
version = 'unnormalized'
weight_history_1 =  gradient_descent(g,w,alpha,max_its,version)

In [249]:
# run 2: mini-batch gradient descent with batches of 100 points and a larger steplength
# NOTE(review): this draws a fresh w_init, so this run does not start from the same
# point as weight_history_1 -- confirm that is intended for the comparison
g = minibatch_softmax
scale = 0.1
w_init = scale*np.random.randn(np.shape(x)[1]+1,1)
w = w_init
alpha = 10**(-1)
max_its = 100
version = 'unnormalized'
batch_size = 100
weight_history_2 =  minibatch_gradient_descent(g,w,alpha,batch_size,max_its,version)

In [250]:
# compare both runs on the same axes; plot_history labels the histories by their position in this list
weight_histories = [weight_history_1,weight_history_2]
plot_history(x,y,weight_histories)

<IPython.core.display.Javascript object>

In [222]:
# import plotting library and other necessities
import matplotlib.pyplot as plt
from matplotlib import gridspec
import copy

# our plotting function
def plot_history(x,y,weight_histories,labels=None):
    '''
    Plot the cost and misclassification histories of one or more gradient descent runs.

    x, y             : the data the histories are evaluated on (inputs and labels).
                       The misclassification count assumes labels of +1/-1 -- confirm
                       against the dataset.
    weight_histories : list of weight histories, one per run, each a list of weights
                       whose first entry is the bias and the rest the feature weights.
    labels           : optional list of legend labels, one per run. Defaults to
                       'full grad', 'mini-batch', 'stochastic' by position; runs beyond
                       the supplied labels are labelled 'run <k>'.

    The left panel shows the softmax cost per recorded weight, the right panel the
    number of misclassifications.
    '''

    # local cost / counting functions so they use the x, y passed in, not notebook globals
    def cost(w):
        # logaddexp(0, t) equals log(1 + exp(t)) but does not overflow when t is large
        return np.sum(np.logaddexp(0, (-y)*(w[0] + np.dot(x,w[1:]))))

    def count(w):
        # each point contributes 0.25*(sign(model) - y)**2
        return 0.25*np.sum((np.sign(w[0] + np.dot(x,w[1:])) - y)**2)

    if labels is None:
        labels = ['full grad','mini-batch','stochastic']

    # initialize figure
    fig = plt.figure(figsize = (9,3))

    # create two side-by-side panels: cost on the left, misclassifications on the right
    gs = gridspec.GridSpec(1, 2)
    ax1 = fig.add_subplot(gs[0])
    ax2 = fig.add_subplot(gs[1])

    # loop over histories and plot all
    for run, weight_history in enumerate(weight_histories):
        # cost and misclassification count for every recorded weight of this run
        cost_history = [cost(weight) for weight in weight_history]
        count_history = [count(weight) for weight in weight_history]

        # pick this run's legend label, falling back to a generic one if none was supplied
        if run < len(labels):
            label = labels[run]
        else:
            label = 'run {}'.format(run + 1)

        # now plot each, one per panel
        ax1.plot(cost_history)
        ax2.plot(count_history,label = label)

    # label each panel
    ax1.set_xlabel('iteration')
    ax1.set_ylabel('cost function val')
    ax1.set_title('cost function history')

    ax2.set_xlabel('iteration')
    ax2.set_ylabel('misclassifications')
    ax2.set_title('number of misclassifications')

    ax2.legend()

    plt.show()