In [1]:
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from scipy.stats import multivariate_normal, norm
import numpy as np
import itertools

2 .	Compute the complete update (back propagation) equations for all weights (𝑤1⁡~⁡𝑤8) in the following neural networks. This time, the activation function in the hidden layer is ReLU and that in the output layer remains sigmoid. In addition, use softmax as the cost function. 

![](1.png)

\[
    f(x)= 
\begin{cases}
    1,& \text{if } x >  0\\
    0,& \text{if } x \leq 0\\
\end{cases}
\]

$\frac{\partial\varepsilon}{\partial w_{1}}$ = -d1(1-y1)w5 f(q1) x1 - d2(1-y2)w7 f(q1) x1

$\frac{\partial\varepsilon}{\partial w_{2}}$ = -d1(1-y1)w5 f(q1) x2 - d2(1-y2)w7 f(q1) x2

$\frac{\partial\varepsilon}{\partial w_{3}}$ = -d1(1-y1)w6 f(q2) x1 - d2(1-y2)w8 f(q2) x1

$\frac{\partial\varepsilon}{\partial w_{4}}$ = -d1(1-y1)w6 f(q2) x2 - d2(1-y2)w8 f(q2) x2

$\frac{\partial\varepsilon}{\partial w_{5}}$ = -d1(1-y1)h1

$\frac{\partial\varepsilon}{\partial w_{6}}$ = -d1(1-y1)h2

$\frac{\partial\varepsilon}{\partial w_{7}}$ = -d2(1-y2)h1

$\frac{\partial\varepsilon}{\partial w_{8}}$ = -d2(1-y2)h2

3 .	Write a program to implement the neural network with your back propagation equations in problem 2. To test your network, train it to distinguish the classes of versicolor and virginica in the Iris dataset using only the third and fourth features (i.e., petal length and petal width) as the inputs. As usual, use 70% of the data for training and the rest for testing. Repeat the experiments 10 times to find the average accuracy. During training, set the desired output as 1.0 for in class data and 0.0 for out of class data. Don’t forget to use random numbers as the initial weights. In addition, monitor the cost (loss) function with respect to the training epochs. Do you observe any difference when compared with the cost function plot using MSE? 

In [2]:
iris = load_iris()
feature_names = iris.feature_names.copy()
iris_X = iris.data
iris_y = iris.target
print(iris_X.shape, iris_y.shape)
iris.target_names

(150, 4) (150,)


array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [3]:
iris_X = iris_X[50:, [2, 3]]
iris_y = iris_y[50:] - 1

In [177]:
class NN:
    def __init__(self, n_epoch=50):
        np.random.seed(40)
        self.W = np.random.random((2, 2, 2))
        self.n_epoch = n_epoch
        self.lr = 0.2
        self.enc = preprocessing.OneHotEncoder()
        
    def _sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def _relu(self, x):
        return np.maximum(x, 0)
    
    def _f(self, x):
        return (x > 0).astype('int')
    
    def predict(self, X):
        h = self._relu(X.dot(self.W[0, :, :]))
        y = self._sigmoid(h.dot(self.W[1, :, :]))
        return y, h, X.dot(self.W[0, :, :])
    
    def bp(self, x, y_pred, y_true, h, q, lr):
        # error = y_pred - y_true
           
            
        derivative_z_x = -y_true*(1-y_pred)
        h = h.reshape(-1, 1)
        
        diff1 = h.dot(derivative_z_x.reshape(1, -1))
        self.W[1, :, :] = self.W[1, :, :] - lr*diff1

        diff2 = x.dot(self._f(q)*((derivative_z_x).dot(self.W[1, :, :].T)).reshape(1, -1))
        self.W[0, :, :] = self.W[0, :, :] - lr*diff2
        
    def fit(self, X, y):
        one_hot_y = self.enc.fit_transform(y.reshape(-1, 1)).toarray()
        
        for i in range(self.n_epoch):
            for x, y in zip(X, one_hot_y):
                y_pred, layer_output, q = self.predict(x.reshape(1, -1))
                self.bp(x.reshape(-1, 1), y_pred, y, layer_output, q[0], self.lr)
            
        return self
    
    def score(self, X, y):
        y_pred, _, _ = self.predict(X)
        # print(y_pred)
        return accuracy_score(y, np.argmax(y_pred, axis=1))

In [163]:
def nor(X):
    return (X - np.mean(X, axis=0)) / np.std(X)

In [178]:
model = NN()
model.fit(nor(iris_X), iris_y)
model.score(nor(iris_X), iris_y)

0.96

In [179]:
acc = []
for i in range(10):
    model = NN()
    train_X, test_X, train_y, test_y = train_test_split(iris_X, 
                                                        iris_y, 
                                                        train_size=0.7, 
                                                        test_size = 0.3)
    train_X, test_X,  = nor(train_X), nor(test_X)
    model.fit(train_X, train_y)
    acc.append(model.score(test_X, test_y))
print(acc)
print('avg acc: %.3f'%(np.mean(acc)))

[0.9666666666666667, 0.9666666666666667, 0.9666666666666667, 0.9666666666666667, 0.9666666666666667, 0.9666666666666667, 0.9666666666666667, 0.9666666666666667, 0.9666666666666667, 0.9666666666666667]
avg acc: 0.967


4 .	The following shows the LeNet-5 architecture. If we use the weight-sharing approach for the first layer, compute the number of connections and trainable weights from the input layer to the first hidden layer C1. To compute the results, you need the following parameters: kernel size = 5x5, stride = 1, and no zero-padding. 
![](2.png)

5 .	Assuming that the following is a part of convolution neural networks. Compute the resultant values (size of 3x3) if it has two input planes, stride of one, no zero-padding, and using the ReLU activation function.
![](3.png)