In [8]:
# from google.colab import drive
# drive.mount('/content/drive/')

In [9]:
# %cd "drive/My Drive"

In [10]:
# %cd "part1"

In [11]:
# %cd "starter_code"

In [12]:
# some useful functions
import numpy as np
from xman import *

In [13]:
# # TODO: Uncomment the lines below and replace <FILL IN> with appropriate code to declare all operations

class f(XManFunctions):
    """Operator declarations added on top of those inherited from XManFunctions.

    Every static method forwards its argument(s), together with the operator's
    string name, to ``XManFunctions.registerDefinedByOperator``.  The names used
    here ('square', 'mean', 'softMax', 'crossEnt', 'relu') are the same keys
    that appear in the EVAL_FUNS and BP_FUNS tables of this notebook.
    NOTE(review): the return value is presumably a register object that can be
    composed into larger expressions -- confirm against xman.
    """
    @staticmethod
    def square(a):
        """Declare the 'square' operator applied to ``a``."""
        return XManFunctions.registerDefinedByOperator('square',a)
    
    # @staticmethod
    # def mean(a):
    #     return <FILL_IN>

    ### BEGIN SOLUTION
    @staticmethod
    def mean(a):
        """Declare the 'mean' operator applied to ``a``."""
        return XManFunctions.registerDefinedByOperator('mean',a)
    ### END SOLUTION
    
    # @staticmethod
    # def softMax(a):
    #     return <FILL_IN>

    ### BEGIN SOLUTION
    @staticmethod
    def softMax(a):
        """Declare the 'softMax' operator applied to ``a``."""
        return XManFunctions.registerDefinedByOperator('softMax',a)
    ### END SOLUTION

    # @staticmethod
    # def crossEnt(a):
    #     return <FILL_IN>

    ### BEGIN SOLUTION
    # Unlike the one-argument starter stub above, the solution takes two operands.
    @staticmethod
    def crossEnt(a,b):
        """Declare the 'crossEnt' operator applied to the pair ``(a, b)``."""
        return XManFunctions.registerDefinedByOperator('crossEnt',a,b)
    ### END SOLUTION

    # @staticmethod
    # def relu(a):
    #     return <FILL_IN>
    
    ### BEGIN SOLUTION
    @staticmethod
    def relu(a):
        """Declare the 'relu' operator applied to ``a``."""
        return XManFunctions.registerDefinedByOperator('relu',a)
    ### END SOLUTION

In [14]:
# the functions that autograd.eval will use to evaluate each function,
# to be called with the functions actual inputs as arguments

def _softMax(x):
    maxes = np.amax(x, axis=1)
    # print ("line number 35", x.shape, maxes.shape)
    maxes = maxes.reshape(maxes.shape[0], 1)
    # print ("line number 37", maxes.shape)
    e_x = np.exp(x - maxes)
    sums = np.sum(e_x, axis=1)
    # print ("line number 40",  e_x.shape, sums.shape)
    sums = sums.reshape(sums.shape[0], 1)
    # print ("line number 42", sums.shape)
    dist = e_x / sums
    return dist

def _crossEnt(x,y):
    log_x = np.nan_to_num(np.log(x))
    return - np.multiply(y,log_x).sum(axis=1, keepdims=True)

# Forward-pass implementations, keyed by operator name.  Each callable takes
# the operator's actual input values and returns the operator's output value.
EVAL_FUNS = {
    'add':      lambda x1,x2: x1+x2,
    'subtract': lambda x1,x2: x1-x2,
    'square':   np.square,
    'mul':      lambda x1,x2: np.dot(x1,x2),  # np.dot (matrix product), not an element-wise multiply
    'mean':     lambda x:x.mean(),            # no axis given: reduces over every element
    'softMax':  _softMax,
    'crossEnt': _crossEnt,
    'relu': lambda x: np.maximum(0,x)         # element-wise max(0, x)
    }

In [15]:
# the functions that autograd.bprop will use in reverse mode
# differentiation.  BP_FUNS[f] is a list of functions df1,....,dfk
# where dfi is used in propagating errors to the i-th input xi of f.
# Specifically, dfi is called with the ordinary inputs to f, with two
# additions: the incoming error, and the output of the function, which
# was computed by autograd.eval in the eval stage.  dfi will return
# delta * df/dxi [f(x1,...,xk)]
#
# NOTE: Autograd has an optimization where if it finds a softMax op
# followed by crossEnt op, it combines the backward pass for both. So
# you only need to implement the BP_FUNS for the combined operation
# crossEnt-softMax below.

def _derivDot1(delta,out,x1,x2):
    return np.dot(delta, x2.transpose())

def _derivDot2(delta,out,x1,x2):
    return np.dot(x1.transpose(), delta)

def _derivAdd(delta,x1):
    if delta.shape!=x1.shape:
        # broadcast, sum along axis=0
        if delta.shape[1]!=x1.shape[0]:
            raise ValueError("Dimension Mismatch")
        return delta.sum(axis=0) #we sum the gradients over the batch
    else: return delta

def _derivSoftMax(delta,out,x):
    return (delta[:,:,None]*(out[:,:,None]*(np.eye(out.shape[1])[None,:,:] - out[:,None,:]))).sum(axis=1)

def _derivCrossEnt1(delta,out,x,y):
    return -y*np.reciprocal(x)

def _derivCrossEnt2(delta,out,x,y):
    return -np.log(x)

# Backward-pass implementations, keyed by operator name.  Entry i of each list
# propagates the error to the i-th input of the operator and is called as
# dfi(delta, out, *inputs).
BP_FUNS = {
    'add':              [lambda delta,out,x1,x2: _derivAdd(delta,x1),    lambda delta,out,x1,x2: _derivAdd(delta,x2)],
    'subtract':         [lambda delta,out,x1,x2: _derivAdd(delta,x1),    lambda delta,out,x1,x2: -_derivAdd(delta,x2)],
    'square':           [lambda delta,out,x : delta * 2.0 * x],
    'mul':              [_derivDot1, _derivDot2],
    # EVAL_FUNS['mean'] averages over *every* element of x, so each element's
    # share of the gradient is 1/x.size (the old 1/x.shape[0] was only right
    # for column vectors, where the two coincide).
    'mean':             [lambda delta,out,x : delta * 1.0/float(x.size)*np.ones(x.shape)],
    'relu':             [lambda delta,out,x : delta * ((x>0).astype(np.float64))],
    'softMax':          [_derivSoftMax],
    'crossEnt':         [_derivCrossEnt1, _derivCrossEnt2],
    # Combined rule used when softMax feeds straight into crossEnt; x is the
    # pre-softmax input.  The second entry is the gradient w.r.t. the targets
    # (the original author notes it is hardly ever used).
    'crossEnt-softMax': [lambda delta,out,x,y: delta*(_softMax(x)*y.sum(axis=1)[:,None] - y),  lambda delta,out,x,y:-delta*np.log(_softMax(x))],
    }

In [16]:
# Unit tests for EVAL_FUNS / BP_FUNS.  They execute as plain top-level
# assertions whenever this cell runs (originally: `python functions.py`).
# Shared fixtures: x is 3x2, y is 2x2, z holds one-hot rows, v is a 2x1 column.
x = np.array([
    [ 0.76677119,  0.12815245],
    [ 0.4007303 ,  0.77046941],
    [ 0.00574018,  0.71242641]])
y = np.array([
    [-0.06655641,  0.10877971],
    [ 0.13663944, -0.12461873]])
z = np.array([[0., 1.], [0., 1.], [1., 0.]])
v =np.array([[ 0.96894013], [ 0.07382228]])
# Eval mul
expected_x_mul_y =  np.array([[-0.03352286,  0.06743895],
    [ 0.07860534, -0.05242359],
    [ 0.0969635 , -0.08815726]])
np.testing.assert_allclose(EVAL_FUNS['mul'](x, y), expected_x_mul_y)
expected_relu_y = np.array([
    [ 0.        ,  0.10877971],
    [ 0.13663944,  0.        ]])
# Eval relu
np.testing.assert_allclose(EVAL_FUNS['relu'](y), expected_relu_y)
expected_softMax_x = np.array([
    [ 0.65444116,  0.34555884],
    [ 0.40860406,  0.59139594],
    [ 0.33033148,  0.66966852]])
# Eval softMax
np.testing.assert_allclose(EVAL_FUNS['softMax'](x), expected_softMax_x)
expected_crossEnt_softMax_x_z = np.array([
    [ 1.06259235],
    [ 0.52526954],
    [ 1.10765864]])
# Eval crossEnt (fed the expected softMax output, with z as the targets)
np.testing.assert_allclose(EVAL_FUNS['crossEnt'](expected_softMax_x, z), expected_crossEnt_softMax_x_z)
# Eval mean
expected_mean_v = 0.52138120499999996
np.testing.assert_allclose(EVAL_FUNS['mean'](v), expected_mean_v)
# BP mul: entry [0] is the gradient w.r.t. x, entry [1] w.r.t. y
delta_x_mul_y = np.array([
    [ 0.12523631,  0.00680066],
    [ 0.48109275,  0.95663136],
    [ 0.40436419,  0.56481742]])
np.testing.assert_allclose(BP_FUNS['mul'][0](delta_x_mul_y, expected_x_mul_y, x, y), np.array([
    [-0.00759551,  0.01626473],
    [ 0.07204228, -0.05347794],
    [ 0.03452765, -0.01513473]]), rtol=1e-06)
np.testing.assert_allclose(BP_FUNS['mul'][1](delta_x_mul_y, expected_x_mul_y, x, y), np.array([
    [ 0.29113716,  0.39180788],
    [ 0.67479632,  1.14031757]]))
# BP relu
delta_relu_y = np.array([
    [ 0.66202207,  0.59765468],
    [ 0.01812402,  0.58537534]])
np.testing.assert_allclose(BP_FUNS['relu'][0](delta_relu_y, expected_relu_y, y), np.array([
    [ 0.        ,  0.59765468],
    [ 0.01812402,  0.        ]]))
# BP crossEnt-softMax (combined rule; receives the pre-softmax input x)
delta_crossEnt_softMax_x_z = np.array([
    [  5.69906247e-01],
    [  8.66851385e-01],
    [  2.79581480e-04]])
np.testing.assert_allclose(BP_FUNS['crossEnt-softMax'][0](delta_crossEnt_softMax_x_z, expected_crossEnt_softMax_x_z, x, z), np.array([
    [  3.72970104e-01,  -3.72970104e-01],
    [  3.54198998e-01,  -3.54198998e-01],
    [ -1.87226917e-04,   1.87226917e-04]]))
# BP mean (scalar incoming error spread over the two elements of v)
np.testing.assert_allclose(BP_FUNS['mean'][0](0.19950823, expected_mean_v, v), np.array([
    [ 0.09975412],
    [ 0.09975412]]))

In [None]:
"""
Multilayer Perceptron for character level entity classification
"""
import argparse
import numpy as np
from xman import *
from utils import *
from autograd import *

# Seed NumPy's global RNG so the random default parameters and inputs created
# in MLP._build are the same on every run.
np.random.seed(0)

# Step size used by grad_check for its central finite-difference estimate.
EPS=1e-4

def fwd(network, valueDict):
    """Run the forward pass of ``network`` up to its loss.

    Builds an Autograd over ``network.my_xman`` and evaluates the operation
    sequence leading to ``my_xman.loss`` using the values in ``valueDict``.
    Returns whatever ``Autograd.eval`` returns (callers index it by register
    name, e.g. ``['loss']``).
    """
    xm = network.my_xman
    engine = Autograd(xm)
    op_sequence = xm.operationSequence(xm.loss)
    return engine.eval(op_sequence, valueDict)

def bwd(network, valueDict):
    """Run reverse-mode differentiation of the loss of ``network``.

    Builds an Autograd over ``network.my_xman`` and back-propagates through
    the operation sequence leading to ``my_xman.loss``, seeding the error at
    the loss with 1.0.  Returns whatever ``Autograd.bprop`` returns (callers
    index it by register name to obtain gradients).

    ``valueDict`` is passed straight to ``Autograd.bprop``.
    NOTE(review): presumably it must already contain the forward-pass values
    -- confirm against autograd.
    """
    ad = Autograd(network.my_xman)
    # np.float64 is the type the np.float_ alias referred to; the alias itself
    # was removed in NumPy 2.0.
    return ad.bprop(network.my_xman.operationSequence(network.my_xman.loss), valueDict,loss=np.float64(1.0))

def update(network, dataParamDict, grads, rate):
    """Apply one gradient-descent step to every parameter that has a gradient.

    For each name in ``grads`` that ``network.my_xman.isParam`` accepts, the
    entry in ``dataParamDict`` is rebound to ``value - rate * gradient``.
    Other entries are left alone.  The same (mutated) dict is returned.
    """
    is_param = network.my_xman.isParam
    for name in grads:
        if not is_param(name):
            continue
        step = rate*grads[name]
        # Rebind rather than subtract in place, so the old array is untouched.
        dataParamDict[name] = dataParamDict[name] - step
    return dataParamDict

def accuracy(probs, targets):
    """Fraction of rows whose arg-max in ``probs`` matches the arg-max in ``targets``.

    Both arguments are reduced with ``np.argmax`` along axis 1; the result is
    a Python float.
    """
    predicted = np.argmax(probs, axis=1)
    actual = np.argmax(targets, axis=1)
    n_correct = (predicted == actual).sum()
    return float(n_correct) / predicted.shape[0]

def grad_check(network):
    """Compare back-propagated gradients with central finite differences.

    Uses the network's default inputs and parameters
    (``network.my_xman.inputDict()``).  For every parameter that has a
    gradient, only its first element (in ``ravel()`` order) is checked: the
    element is nudged by +EPS and -EPS, the loss is re-evaluated each time,
    and the element is nudged back afterwards.

    Raises
    ------
    ValueError
        If the analytic and numeric values differ by more than what
        ``np.isclose(..., atol=1e-3)`` tolerates.
    """
    defaults = network.my_xman.inputDict()
    fd = fwd(network, defaults)
    grads = bwd(network, fd)
    for rname in grads:
        if not network.my_xman.isParam(rname):
            continue
        # loss at theta + EPS
        fd[rname].ravel()[0] += EPS
        loss_plus = fwd(network, fd)['loss']
        # loss at theta - EPS
        fd[rname].ravel()[0] -= 2*EPS
        loss_minus = fwd(network, fd)['loss']
        # move the element back to where it started
        fd[rname].ravel()[0] += EPS
        auto = grads[rname].ravel()[0]
        num = (loss_plus-loss_minus)/(2*EPS)
        if np.isclose(auto, num, atol=1e-3):
            continue
        raise ValueError("gradients not close for %s, Auto %.5f Num %.5f"
                % (rname, auto, num))

def glorot(m,n):
    """Return the Glorot initialisation scale ``sqrt(6 / (m + n))``.

    ``m`` and ``n`` are the two layer dimensions; MLP._build multiplies the
    result into a uniform(-1, 1) sample.
    """
    fan_total = m+n
    return np.sqrt(6./fan_total)

class MLP(object):
    """
    Multilayer Perceptron
    Accepts list of layer sizes [in_size, hid_size1, hid_size2, ..., out_size]

    The expression graph is built once in the constructor and kept in
    ``self.my_xman``; ``self.params`` and ``self.inputs`` hold the declared
    parameter and input registers by name.
    """
    def __init__(self, layer_sizes):
        # One weight/bias pair per consecutive pair of sizes.
        self.num_layers = len(layer_sizes)-1
        self.my_xman = self._build(layer_sizes) # DO NOT REMOVE THIS LINE. Store the output of xman.setup() in this variable
        print (self.my_xman.operationSequence(self.my_xman.loss))

    def _build(self, layer_sizes):
        """Declare parameters and inputs, wire up the network, return ``XMan.setup()``."""
        print("INITIAZLIZING with layer_sizes:", layer_sizes)

        # Parameters: W_k gets a Glorot-scaled uniform default, b_k a small
        # uniform default.  Random draws happen in the order W1, b1, W2, b2, ...
        self.params = {}
        for k in range(1, self.num_layers+1):
            fan_in, fan_out = layer_sizes[k-1], layer_sizes[k]
            w_name, b_name = 'W'+str(k), 'b'+str(k)
            scale = glorot(fan_in, fan_out)
            w_default = scale*np.random.uniform(low=-1.,high=1.,
                    size=(fan_in, fan_out))
            self.params[w_name] = f.param(name=w_name, default=w_default)
            b_default = 0.1*np.random.uniform(low=-1.,high=1.,size=(fan_out,))
            self.params[b_name] = f.param(name=b_name, default=b_default)

        # Inputs: a single random row each for X and y as default values.
        self.inputs = {}
        self.inputs['X'] = f.input(name='X', default=np.random.rand(1,layer_sizes[0]))
        self.inputs['y'] = f.input(name='y', default=np.random.rand(1,layer_sizes[-1]))

        x = XMan()
        # relu is applied after every affine layer, the last one included, so
        # softMax receives non-negative activations.
        activation = self.inputs['X']
        for k in range(1, self.num_layers+1):
            affine = f.mul(activation,self.params['W'+str(k)]) + self.params['b'+str(k)]
            activation = f.relu( affine )

        x.output = f.softMax(activation)
        # loss: mean of the per-example cross entropy against y
        x.loss = f.mean(f.crossEnt(x.output, self.inputs['y']))
        return x.setup()

    def data_dict(self, X, y):
        """Bundle a batch into a dict with the keys 'X' and 'y'."""
        return {'X': X, 'y': y}

def main(params):
    """Train the MLP, validate after every epoch, then score the test split.

    Parameters
    ----------
    params : dict
        Must contain 'epochs', 'max_len', 'num_hid', 'batch_size', 'dataset',
        'init_lr', 'output_file' and 'train_loss_file'.

    Side effects
    ------------
    * Writes a text log named
      '<dataset>_mlp4c_L<max_len>_H<num_hid>_B<batch_size>_E<epochs>_lr<init_lr>.txt'.
    * Saves the per-minibatch training losses with ``np.save(train_loss_file, ...)``.
    * Saves the stacked test-set outputs with ``np.save(output_file, ...)``.
    * Prints progress to stdout.

    Changes from the previous version: ``dict.items()`` replaces
    ``iteritems()`` (which does not exist on Python 3), the log file is closed
    via a ``with`` block, and the test pass falls back to the final
    parameters if no epoch ever improved on the initial ``min_loss``.
    """
    epochs = params['epochs']
    max_len = params['max_len']
    num_hid = params['num_hid']
    batch_size = params['batch_size']
    dataset = params['dataset']
    init_lr = params['init_lr']
    output_file = params['output_file']
    train_loss_file = params['train_loss_file']

    # load data and preprocess
    dp = DataPreprocessor()
    data = dp.preprocess('%s.train'%dataset, '%s.valid'%dataset, '%s.test'%dataset)
    # minibatches (validation and test are each served as one full-size batch)
    mb_train = MinibatchLoader(data.training, batch_size, max_len,
           len(data.chardict), len(data.labeldict))
    mb_valid = MinibatchLoader(data.validation, len(data.validation), max_len,
           len(data.chardict), len(data.labeldict), shuffle=False)
    mb_test = MinibatchLoader(data.test, len(data.test), max_len,
           len(data.chardict), len(data.labeldict), shuffle=False)

    # build
    print ("building mlp...")
    mlp = MLP([max_len*mb_train.num_chars,num_hid,mb_train.num_labels])
    grad_check(mlp)

    print ("done")

    # train
    print ("training...")
    log_path = ('%s_mlp4c_L%d_H%d_B%d_E%d_lr%.3f.txt'%
            (dataset,max_len,num_hid,batch_size,epochs,init_lr))
    # get default data and params
    value_dict = mlp.my_xman.inputDict()
    min_loss = 1e5
    lr = init_lr
    train_loss = np.ndarray([0])
    best_param_dict = {}
    # The context manager guarantees the log is flushed and closed, even if
    # training raises part-way through.
    with open(log_path, 'w') as logger:
        for i in range(epochs):
            for ii, (idxs,e,l) in enumerate(mb_train):
                # prepare input: flatten each example to a single row
                data_dict = mlp.data_dict(e.reshape((e.shape[0],e.shape[1]*e.shape[2])),l)
                for k,v in data_dict.items():
                    value_dict[k] = v
                # fwd-bwd
                vd = fwd(mlp,value_dict)
                # NOTE(review): bwd is given value_dict rather than vd; this
                # relies on fwd having filled value_dict in place -- confirm
                # against Autograd.eval.
                gd = bwd(mlp,value_dict)
                value_dict = update(mlp, value_dict, gd, lr)
                message = 'TRAIN loss = %.3f' % vd['loss']
                logger.write(message+'\n')
                train_loss = np.append(train_loss, vd['loss'])
            print (ii)
            # validate
            tot_loss, n= 0., 0
            probs = []
            targets = []
            for (idxs,e,l) in mb_valid:
                # prepare input
                data_dict = mlp.data_dict(e.reshape((e.shape[0],e.shape[1]*e.shape[2])),l)
                for k,v in data_dict.items():
                    value_dict[k] = v
                # fwd
                vd = fwd(mlp, value_dict)
                tot_loss += vd['loss']
                probs.append(vd['output'])
                targets.append(l)
                n += 1
            acc = accuracy(np.vstack(probs), np.vstack(targets))
            c_loss = tot_loss/n
            if c_loss<min_loss:
                # snapshot everything so later updates cannot alter the best model
                min_loss = c_loss
                for k,v in value_dict.items():
                    best_param_dict[k] = np.copy(v)
            message = ('Epoch %d VAL loss %.3f min_loss %.3f acc %.3f' %
                    (i,c_loss,min_loss,acc))
            logger.write(message+'\n')
            print (message)

    np.save(train_loss_file, train_loss)

    if not best_param_dict:
        # No epoch beat the initial min_loss (or epochs == 0): evaluate the
        # final parameters instead of failing on missing entries.
        best_param_dict = dict((k, np.copy(v)) for k,v in value_dict.items())

    tot_loss, n = 0., 0
    probs = []
    targets = []
    for (idxs,e,l) in mb_test:
        # prepare input
        data_dict = mlp.data_dict(e.reshape((e.shape[0],e.shape[1]*e.shape[2])),l)
        for k,v in data_dict.items():
            best_param_dict[k] = v
        # fwd
        vd = fwd(mlp,best_param_dict)
        tot_loss += vd['loss']
        probs.append(vd['output'])
        targets.append(l)
        n += 1
    acc = accuracy(np.vstack(probs), np.vstack(targets))
    c_loss = tot_loss/n
    np.save(output_file, np.vstack(probs))
    print ("done, test loss = %.3f acc = %.3f" % (c_loss, acc))


# parser = argparse.ArgumentParser()
# parser.add_argument('--max_len', dest='max_len', type=int, default=10)
# parser.add_argument('--num_hid', dest='num_hid', type=int, default=50)
# parser.add_argument('--batch_size', dest='batch_size', type=int, default=64)
# parser.add_argument('--dataset', dest='dataset', type=str, default='tiny')
# parser.add_argument('--epochs', dest='epochs', type=int, default=15)
# parser.add_argument('--init_lr', dest='init_lr', type=float, default=0.5)
# parser.add_argument('--output_file', dest='output_file', type=str, default='output')
# parser.add_argument('--train_loss_file', dest='train_loss_file', type=str, default='train_loss')
# params = vars(parser.parse_args())
# Run configuration; the values mirror the defaults of the commented-out
# argparse block above, except that 'dataset' points at a relative data path.
params = {
    'max_len': 10,
    'num_hid': 50,
    'batch_size': 64,
    'dataset': '../../data/tiny',
    'epochs': 15,
    'init_lr': 0.5,
    'output_file': 'output',
    'train_loss_file': 'train_loss',
}
main(params)

constructing vocabulary
# chars in training  132
# chars in validation  78
# chars in testing  78
# chars in (testing-training-validation)  0
# labels 5
preparing training data
num_rows: 1857  index 1857
preparing validation data
num_rows: 221  index 221
preparing test data
num_rows: 221  index 221
building mlp...
INITIAZLIZING with layer_sizes: [1340, 50, 5]
[('z7', 'mul', <map object at 0x11bd58490>), ('z6', 'add', <map object at 0x11bd58510>), ('z5', 'relu', <map object at 0x11bd585d0>), ('z4', 'mul', <map object at 0x11bd58650>), ('z3', 'add', <map object at 0x11bd58710>), ('z2', 'relu', <map object at 0x11bd58790>), ('output', 'softMax', <map object at 0x11bd587d0>), ('z1', 'crossEnt', <map object at 0x11bd58850>), ('loss', 'mean', <map object at 0x11bd588d0>)]
first
> /Users/zacharypeng/Downloads/part1/starter_code/autograd.py(86)optimizeForBProp()
-> mapList = [i for i in opseq[k][2]]
(Pdb) k
1
(Pdb) opseq[k][0]
'z1'
(Pdb) opseq[k][1]
'crossEnt'
(Pdb) opseq[2][0]
'output'
(Pdb) 