# Exercise 2
Yong Wu<br>
ZiWei Liu<br>
WenZhuo Chen

# 3 Programming a neural network

In [19]:
from __future__ import print_function
import argparse
import numpy as np
from sklearn import datasets
from sklearn.model_selection import StratifiedKFold

In [10]:
class ReLULayer(object):
    """Parameter-free rectified-linear activation layer."""

    def forward(self, input):
        """Return ``max(0, input)`` element-wise.

        The input is stored on the instance so that ``backward`` can tell
        which units were active.
        """
        # remember the input for later backpropagation
        self.input = input
        return np.maximum(0, input)

    def backward(self, upstream_gradient):
        """Return the gradient with respect to the stored input.

        The gradient passes through unchanged where the stored input is
        positive and is set to zero where the stored input is <= 0.
        The caller's ``upstream_gradient`` is left untouched.
        """
        # Work on a copy: zeroing entries in place would silently corrupt
        # an array that the caller may still be using.
        downstream_gradient = np.array(upstream_gradient, copy=True)
        downstream_gradient[self.input <= 0] = 0
        return downstream_gradient

    def update(self, learning_rate):
        """Do nothing; ReLU has no trainable parameters."""
        pass

In [11]:
class OutputLayer(object):
    """Softmax output layer combined with a cross-entropy loss gradient."""

    def __init__(self, n_classes):
        """Store the number of classes (the width of the one-hot labels)."""
        self.n_classes = n_classes

    def forward(self, input):
        """Return the row-wise softmax of ``input``.

        ``input`` is expected to be 2-D with one row per instance; the
        normalisation runs along axis 1. The raw input is stored on the
        instance.
        """
        # remember the input for later backpropagation
        self.input = input
        # Softmax is invariant to subtracting a per-row constant. Shifting by
        # the row maximum keeps every exponent <= 0, so np.exp cannot overflow
        # to inf (which would otherwise produce inf / inf = nan).
        shifted = input - np.max(input, axis=1, keepdims=True)
        exponentials = np.exp(shifted)
        softmax = exponentials / np.sum(exponentials, axis=1, keepdims=True)
        return softmax

    def backward(self, predicted_posteriors, true_labels):
        """Return the loss gradient with respect to the layer input.

        Uses the combined softmax / cross-entropy derivative
        ``(posteriors - one_hot(labels))``, divided by the number of rows
        in ``predicted_posteriors``. ``true_labels`` must contain integer
        class indices because it is used to index rows of an identity matrix.
        """
        one_hot_labels = np.eye(self.n_classes)[true_labels]
        batch_size = predicted_posteriors.shape[0]
        downstream_gradient = (predicted_posteriors - one_hot_labels) / batch_size
        return downstream_gradient

    def update(self, learning_rate):
        """Do nothing; softmax has no trainable parameters."""
        pass

In [12]:
class LinearLayer(object):
    """Fully connected affine layer computing ``input . B + b``."""

    def __init__(self, n_inputs, n_outputs):
        """Create the layer with normally distributed weights and intercepts.

        ``B`` has shape (n_inputs, n_outputs) and ``b`` has shape
        (1, n_outputs); both are drawn from a standard normal distribution.
        """
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
        # weights are drawn first, intercepts second
        self.B = np.random.normal(size=(n_inputs, n_outputs))
        self.b = np.random.normal(size=(1, n_outputs))

    def forward(self, input):
        """Return the preactivations for the following non-linear layer.

        The input is kept on the instance because ``backward`` needs it to
        compute the weight gradient.
        """
        self.input = input
        return np.dot(input, self.B) + self.b

    def backward(self, upstream_gradient):
        """Store the parameter gradients and return the input gradient.

        ``grad_b`` is the upstream gradient summed over axis 0, ``grad_B`` is
        the transposed stored input times the upstream gradient. The return
        value is the gradient handed to the preceding layer.
        """
        stored_input = self.input
        self.grad_b = upstream_gradient.sum(axis=0)
        self.grad_B = np.dot(stored_input.T, upstream_gradient)
        return np.dot(upstream_gradient, self.B.T)

    def update(self, learning_rate):
        """Take one gradient-descent step using the stored gradients."""
        weight_step = learning_rate * self.grad_B
        intercept_step = learning_rate * self.grad_b
        # rebind (rather than modify in place) so earlier references to the
        # old parameter arrays stay unchanged
        self.B = self.B - weight_step
        self.b = self.b - intercept_step

In [13]:
class MLP(object):
    """Multi-layer perceptron: ReLU hidden layers and a softmax output.

    The network predicts class posteriors for a classification problem.
    """

    def __init__(self, n_features, layer_sizes):
        """Build the layer stack.

        n_features:  number of inputs
        layer_sizes: number of neurons per layer; the last entry is the
                     number of classes, all earlier entries are hidden layers
        """
        self.n_layers = len(layer_sizes)
        hidden_sizes, n_classes = layer_sizes[:-1], layer_sizes[-1]

        stack = []
        fan_in = n_features
        # every hidden layer is a linear map followed by a ReLU
        for width in hidden_sizes:
            stack += [LinearLayer(fan_in, width), ReLULayer()]
            fan_in = width
        # the final linear map feeds the softmax output layer
        stack += [LinearLayer(fan_in, n_classes), OutputLayer(n_classes)]
        self.layers = stack

    def forward(self, X):
        """Run the mini-batch ``X`` through all layers and return the result.

        Each instance is flattened to a vector first (in case instances are
        images). The layers store their inputs for later backpropagation.
        """
        activations = X.reshape(X.shape[0], -1)
        for layer in self.layers:
            activations = layer.forward(activations)
        return activations

    def backward(self, predicted_posteriors, true_classes):
        """Backpropagate through the stack for the latest mini-batch.

        The output layer receives the posteriors and the true classes; every
        earlier layer receives the gradient returned by its successor.
        """
        output_layer = self.layers[-1]
        gradient = output_layer.backward(predicted_posteriors, true_classes)
        for layer in reversed(self.layers[:-1]):
            gradient = layer.backward(gradient)

    def update(self, X, Y, learning_rate):
        """Do one forward pass, one backward pass and one parameter update."""
        self.backward(self.forward(X), Y)
        for layer in self.layers:
            layer.update(learning_rate)

    def train(self, x, y, n_epochs, batch_size, learning_rate):
        """Train with mini-batch gradient descent for ``n_epochs`` epochs.

        The data is reshuffled every epoch, i.e. mini-batches are sampled
        without replacement. Only ``len(x) // batch_size`` full batches are
        used per epoch; leftover instances are skipped in that epoch.
        """
        n_samples = len(x)
        full_batches = n_samples // batch_size
        for _ in range(n_epochs):
            order = np.random.permutation(n_samples)
            for k in range(full_batches):
                picked = order[k * batch_size:(k + 1) * batch_size]
                self.update(x[picked], y[picked], learning_rate)

In [22]:
def cross_validation(N, folds, n_epochs, batch_size, learning_rate):
    """Compare four MLP sizes on the two-moons data via stratified k-fold CV.

    N:             number of samples generated by ``make_moons``
    folds:         number of cross-validation splits
    n_epochs, batch_size, learning_rate: passed on to ``MLP.train``

    For every fold, networks with two hidden layers of width 2, 3, 5 and 30
    are created, trained on the training split and evaluated on the test
    split. The error rate averaged over all folds is printed per network.
    """
    n_features = 2
    n_classes = 2
    hidden_widths = (2, 3, 5, 30)
    display_names = ("2*2", "3*3", "5*5", "30*30")

    splitter = StratifiedKFold(n_splits=folds)  # cross-validation
    X_all, Y_all = datasets.make_moons(N, noise=0.05)

    # running sum of the per-fold error rate, one slot per network size
    summed_errors = [0] * len(hidden_widths)

    for train_idx, test_idx in splitter.split(X_all, Y_all):
        X_train, Y_train = X_all[train_idx], Y_all[train_idx]
        X_test, Y_test = X_all[test_idx], Y_all[test_idx]

        # Rescale with statistics of the training split only: training
        # features end up in [-1, 1], test features get the same transform.
        lowest = X_train.min(axis=0)
        span = X_train.max(axis=0) - lowest
        X_train = ((X_train - lowest) / span - 0.5) * 2.0
        X_test = ((X_test - lowest) / span - 0.5) * 2.0

        # Create all networks first, then train them, then test them, so
        # the random number generator is consumed in a fixed order.
        networks = [MLP(n_features, [width, width, n_classes])
                    for width in hidden_widths]

        for network in networks:
            network.train(X_train, Y_train, n_epochs, batch_size, learning_rate)

        for slot, network in enumerate(networks):
            posteriors = network.forward(X_test)
            # winner-takes-all: the predicted class has the largest posterior
            predicted_classes = np.argmax(posteriors, axis=1)
            fold_error = np.sum(predicted_classes != Y_test) / len(Y_test)
            summed_errors[slot] = summed_errors[slot] + fold_error

    for name, total in zip(display_names, summed_errors):
        print("{} MLP network error rate is:{}".format(name, total / folds))


In [55]:
def main():
    """Parse the experiment settings and run the cross-validation.

    The parser is always given an empty argument list, so the defaults
    below are the values actually used.
    """
    # (flag, type, default, metavar, help) for every supported option
    option_table = (
        ('--dataset-num', int, 4000, 'N',
         'input dataset num for dataset (default: 4000)'),
        ('--batch-size', int, 200, 'N',
         'input batch size for training (default: 200)'),
        ('--epochs', int, 5, 'N',
         'number of epochs to train (default: 5)'),
        ('--lr', float, 0.05, 'LR',
         'learning rate (default: 0.05)'),
        ('--fold', int, 5, 'N',
         'input k-folds for cross-validation (default: 5)'),
    )

    parser = argparse.ArgumentParser(description='MLP network')
    for flag, value_type, default, metavar, help_text in option_table:
        parser.add_argument(flag, type=value_type, default=default,
                            metavar=metavar, help=help_text)
    settings = parser.parse_args(args=[])

    cross_validation(
        settings.dataset_num,
        settings.fold,
        settings.epochs,
        settings.batch_size,
        settings.lr,
    )

In [56]:
# Run the experiment only when this file is executed as the main program,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

2*2 MLP network error rate is:0.31625000000000003
3*3 MLP network error rate is:0.23775
5*5 MLP network error rate is:0.12075
30*30 MLP network error rate is:0.0017500000000000003
