In [5]:
from scipy import signal as sig

# imports from custom library
import sys
sys.path.append('../../')
import matplotlib.pyplot as plt
plt.rc('text', usetex=True)
from mlrefined_libraries import convnets_library as convlib
from mlrefined_libraries import basics_library as baslib
from mlrefined_libraries import superlearn_library as superlearn
from mlrefined_libraries import multilayer_perceptron_library as network_lib
from autograd import grad as compute_grad  

import autograd.numpy as np
from autograd import grad as compute_grad   
import numpy as npo

import pandas as pd
import cv2
import csv
import pickle
import glob
import time
import copy
from datetime import datetime 

#this is needed to compensate for matplotlib notebook's tendancy to blow up images when plotted inline
%matplotlib notebook
from matplotlib import rcParams
rcParams['figure.autolayout'] = True

%load_ext autoreload
%autoreload 2

## Load in data

Load in data.

In [8]:
# load in dataset
datapath = '../../mlrefined_datasets/convnet_datasets/feat_face_data.csv'
data = np.loadtxt(datapath,delimiter = ',')

Contrast normalize images.

In [9]:
# extract input tensor
tensor = data[:,:-1]
y = data[:,-1:]

# contrast normalize image data
def contrast_normalize(data):
    data_means = np.mean(data,axis = 0)
    data = data - data_means
    data_stds = np.std(data,axis = 0)
    data = data/data_stds
    return data

# contrast normalize the input 
tensor_decontrast = contrast_normalize(tensor.T).T

# make new data based on fixed kernel convolutions
new_data = np.concatenate((tensor_decontrast,y),axis = 1)

In [14]:
# pick a few points for testing
ind = np.random.permutation(len(new_data))
data_train = new_data[ind[:10],:]
x = data_train[:,:-1]
y = data_train[:,-1:]

Pluck out a few datapoints for testing.

# Adjusting gradient descent

We need to be careful about how we shape tensors here.

In [6]:
# gradient descent function
def gradient_descent(g,w,alpha,max_its,version,**kwargs):       
    # compute gradient function
    grad = compute_grad(g)
    
    # flatten the input function, create gradient based on flat function
    w_unflat = copy.deepcopy(w)
    g_flat, unflatten, w = flatten_func(g, w)

    # record history
    w_hist = []
    w_hist.append(w_unflat)

    # over the line
    for k in range(max_its):   
        # plug in value into func and derivative
        grad_eval = grad(w_unflat)
        
        # flatten gradient for descent step
        grad_eval, _ = flatten(grad_eval)

        ### normalized or unnormalized descent step? ###
        if version == 'normalized':
            grad_norm = np.linalg.norm(grad_eval)
            if grad_norm == 0:
                grad_norm += 10**-6*np.sign(2*np.random.rand(1) - 1)
            grad_eval /= grad_norm

        # take descent step 
        w = w - alpha*grad_eval

        # unflatten weight update for storing
        w_unflat = unflatten(w)
        
        # store weight update
        w_hist.append(w_unflat)

    return w_hist

# Our naive convolution functionality

In [72]:
# choose a nonlinear activation function 
def activation(t):
    # a relu function
    nonlinearity = np.maximum(0,t)  # default is the relu function
    return nonlinearity

# sliding window for image augmentation
def sliding_window_image(image, kernel_size, stride):
    windowed_image = []
    for i in np.arange(0, np.shape(image)[0]-kernel_size+1, stride):
        for j in np.arange(0, np.shape(image)[1]-kernel_size+1, stride):
             windowed_image.append(image[i:i+kernel_size, j:j+kernel_size].flatten())
            
    return np.array(windowed_image)

# pad image with appropriate number of zeros for convolution
def pad_image(image,kernel_size):
    odd_nums = np.array([int(2*n + 1) for n in range(100)])
    pad_val = np.argwhere(odd_nums == kernel_size)[0][0]
    image_padded = np.zeros((np.shape(image) + 2*pad_val))
    image_padded[pad_val:-pad_val,pad_val:-pad_val] = image
    return image_padded          

def conv_layer(data,kernels):
    # loop over input and reshape to be square
    image_tensor = np.zeros((len(y),28,28))
    for i in range(0,np.shape(data)[0]):
        image_tensor[i,:,:] = np.reshape(data[i,:],(28,28),1)
        
    ### loop over images, produce convolution feature maps, downsample
    new_tensors = []
    kernel_size = kernels[0].shape[0]
    kernel_stride = 1
    pool_kernel_size = 6
    pool_stride = 3
    for image in image_tensor:
        # pad image with zeros
        padded_image = pad_image(image,kernel_size)

        #### loop over kernels and construct feature map for each kernel
        downsampled_feature_maps = []
        for kernel in kernels:
            # window image
            wind_img = sliding_window_image(padded_image,kernel_size,stride = kernel_stride)

            # make convolution feature map - via matrix multiplication over windowed tensor 
            feature_map = np.dot(wind_img,kernel.flatten()[:,np.newaxis])

            # reshape convolution feature map into array
            feature_map = np.reshape(feature_map,(np.shape(image)))

            # now shove result through nonlinear activation
            feature_map = activation(feature_map)

            #### now pool / downsample feature map, first window then pool on each window
            wind_featmap = sliding_window_image(feature_map,kernel_size,stride = pool_stride)

            # max pool on each collected patch
            max_pool = np.max(wind_featmap,axis = 1)

            # reshape into new tensor
            max_pool = np.reshape(max_pool, (int((np.size(max_pool))**(0.5)),int((np.size(max_pool))**(0.5))))

            # reshape into new downsampled pooled feature map
            downsampled_feature_maps.append(max_pool)

        ## re-shape downsampled_feature_maps and store
        new_tensors.append(downsampled_feature_maps)

    # reshape new tensor properly
    new_tensors = np.reshape(new_tensors, (np.shape(new_tensors)[0],np.shape(new_tensors)[1],np.shape(new_tensors)[2]*np.shape(new_tensors)[3]))
    new_tensors = np.reshape(new_tensors, (np.shape(new_tensors)[0],np.shape(new_tensors)[1]*np.shape(new_tensors)[2]),order = 'F')
    
    return np.array(new_tensors)

# A basic architecture

Lets start by tacking on the convolution layer to our basic multilayer perceptron / feedforward architecture - i.e., no normalization of activation outputs, non-maxout activation, etc,.

We simply need to change

- input weights: we should include kernels now
- before entering the fully connected network we pass our data (and kernels) through the convolutional layer

In [67]:
# fully evaluate our network features using the tensor of weights in omega_inner
def compute_features(x, kernels,omega_inner):
    #### NEW --- pass input data and kernels through convolutional layer #####
    x_conv = conv_layer(x,kernels)
    
    #### pass result through fully connected multilayer perceptron ####
    o = np.ones((np.shape(x_conv)[0],1))
    a_padded = np.concatenate((o,x_conv),axis = 1)
    
    # loop through each layer matrix
    for W in omega_inner:
        # output of layer activation
        a = activation(np.dot(a_padded,W))
                
        #  pad with ones (to compactly take care of bias) for next layer computation
        o = np.ones((np.shape(a)[0],1))
        a_padded = np.concatenate((o,a),axis = 1)
        
    return a_padded

# our predict function 
def predict(x,omega):     
    # compute network features - here omega[0] contains the entire tensor of internal weights
    f = compute_features(x,omega[0],omega[1])
    
    # compute linear model compactly via inner product - here omega[1] contains only those weights in the final linear combination of network features
    vals = np.dot(f,omega[2])
    return vals

# the softmax cost function written more compactly
def softmax(w):
    cost  = np.sum(np.log(1 + np.exp((-y)*(predict(x,w)))))
    return cost

# Initialization

Our initialization has to be adjusted equivalently - we need to initialize kernels!

To understand what size our initial input layer must have we can pass our data and test kernels through the convolution layer.

In [73]:
# set number of kernels and kernel size, and pass a test set of kernels and data through the convolutional layer
num_kernels = 2
kernel_size = 3
scale = 0.1
kernels = np.random.randn(num_kernels,kernel_size,kernel_size)

In [74]:
# push data and kernels through conv-layer
x_convolved = conv_layer(x,kernels)
L1 = np.shape(x_convolved)[1]

With layer 1 size determined, we can now initialize our network ``layer_sizes``.

In [76]:
def initialize_general_network_weights(num_kernels,kernel_size,layer_sizes,scale):
    # container for entire weight tensor
    weights = []
    kernel_weights = []

    # loop over desired kernel sizes and create appropriately sized initial 
    # weight matrix for each kernel
    for k in range(num_kernels):
        # make weight matrix
        weight = scale*np.random.randn(kernel_size,kernel_size)
        kernel_weights.append(weight)
    kernel_weights = np.asarray(kernel_weights)

    # loop over desired layer sizes and create appropriately sized initial 
    # weight matrix for each layer
    for k in range(len(layer_sizes)-1):
        # get layer sizes for current weight matrix
        U_k = layer_sizes[k]
        U_k_plus_1 = layer_sizes[k+1]

        # make weight matrix
        weight = scale*np.random.randn(U_k + 1,U_k_plus_1)
        weights.append(weight)

    # re-express weights so that w_init[0] = omega_inner contains all 
    # internal weight matrices, and w_init[1] = w contains weights of 
    # final linear combination in predict function
    w_init = [kernel_weights,weights[:-1],weights[-1]]
    return w_init

Now we can initialize weights for this network.

In [77]:
layer_sizes
num_kernels = 2
kernel_size = 3
scale = 0.1
layer_sizes = [L1,5,1]
w_init = initialize_general_network_weights(num_kernels,kernel_size,layer_sizes,scale)

Lets test out our initialization by passing it and our data through the network.

In [78]:
predict(x,w_init)

array([[ 0.10518486],
       [ 0.10027592],
       [ 0.09576042],
       [ 0.10333978],
       [ 0.09782135],
       [ 0.08641705],
       [ 0.11559694],
       [ 0.14429247],
       [ 0.13310843],
       [ 0.08974176]])

And our cost function

In [79]:
softmax(w_init)

7.1521834774557522

# The gradient

In [80]:
from autograd import grad as compute_grad   
test_grad = compute_grad(softmax)

In [81]:
wa = test_grad(w_init)

AttributeError: 'ArrayBox' object has no attribute 'exp'

## learned convolution features

In [88]:
# load in data - for this one split up training and testing
ind = np.random.permutation(len(new_data))

# split it up
data_train = new_data[ind[:3],:]
data_test = new_data[ind[3:],:]

# load data into network
demo = convlib.network_learner.Network()
demo.input_data(data_train,data_test,normalize = True)

In [89]:
# choose cost function
demo.choose_cost(cost_name = 'twoclass_softmax')

# setup network architecture
activation_name = 'linear'
num_kernels = 1
layer_sizes = [64,2,1]
demo.architecture_settings(activation_name,layer_sizes,num_kernels)

In [90]:
# setup optimizer
demo.optimizer_settings(alpha = 10**(-1),max_its = 10,version = 'unnormalized',scale = 0.1)

In [91]:
# demo.fit(verbose = True)

In [92]:
# # compute cost plots on training and testing data
# demo.compute_cost_plots()

# # produce cost functio plots for training and testing data
# demo.plot_histories(start = 0)

The prediction function.

In [93]:
# demo.weight_history[6]
# demo.w_init

# just the gradient

The gradient

In [94]:
test_grad = compute_grad(demo.training_cost)
wa = test_grad(demo.w_init)

(3, 65)
(65, 2)
(1, 65, 2)
Autograd ArrayBox with value [[-0.12022168 -0.13153025]
 [-0.03854317 -0.18003865]
 [-0.12064693 -0.33380425]]


In [83]:
demo.w_init[1]

array([[[-0.08748114, -0.05287551, -0.12917575],
        [-0.02690774,  0.05879566, -0.03019546],
        [ 0.00932586,  0.16003246,  0.02650719]]])

In [76]:
wa

[[array([[ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.

In [68]:
demo.w_init[1]

array([[[ 0.09398295,  0.06761082,  0.0184683 ],
        [-0.19839479,  0.12737906, -0.04069006],
        [ 0.03058614, -0.02851461, -0.10171031]]])

In [15]:
np.shape(demo.x_train)

(10, 784)