# Q2

In [1]:
import numpy as np

class Layer:
    def __init__(self, W, b, activation, d_activation):
        self._weights = W # W should be matrix outDim*inDim
        self._bias = b # W should be vector outDim*1
        self._activation = activation
        self._d_activation = d_activation
        
    def forward(self, v_prev):
        self._v_prev = v_prev
        self._z = np.dot(self._weights, self._v_prev) + self._bias
        self._V = self._activation(self._z)
        
        return self._V
        
    def backward(self, err_next):
        self.G = np.multiply(self._d_activation(self._z), err_next)
        self._w_grad = np.dot(self.G, self._v_prev.transpose())
        
        self._b_grad = np.dot(self.G, np.ones([self.G.shape[1], 1]))
        
        self._err = np.dot(self._weights.transpose(), err_next)
                       
        return self._err
        
    def step(self, rate):
        self._weights -= rate*self._w_grad
        self._bias -= rate*self._b_grad
        
    def predict(self, X):
        Z = np.dot(self._weights, X) + self._bias
        V = self._activation(Z)
        
        return V

In [2]:
class LinearLayer(Layer):
    identity = lambda x: x
    didentity = lambda x: 1
    
    def __init__(self, W, b):
        super().__init__(W, b, LinearLayer.identity, LinearLayer.didentity)

        
class RelULayer(Layer):
    RelU = lambda x: np.maximum(x,0)
    dRelU = lambda x: (x > 0) * 1
    
    def __init__(self, W, b):
        super().__init__(W, b, RelULayer.RelU, RelULayer.dRelU)
        
        
class SigmoidLayer(Layer):
    sigmoid = lambda x: 1/(1 + np.exp(-x))
    dsigmoid = lambda x: x * (1 - x)
    
    def __init__(self, W, b):
        super().__init__(W, b, SigmoidLayer.sigmoid, SigmoidLayer.dsigmoid)

## Checking HW inputs

In [3]:
# Manually running the network with HW inputs

X = np.array([[1],[2],[-1]])
Y = np.array([[0]])

W_1 = np.ones([2,3])
b_1 = np.ones([2,1])
L1 = RelULayer(W_1, b_1)

W_2 = np.ones([2,2])
b_2 = np.ones([2,1])
L2 = RelULayer(W_2, b_2)

W_3 = np.ones([1,2])
b_3 = np.ones([1,1])
L3 = LinearLayer(W_3, b_3)

In [4]:
v1 = L1.forward(X)
v2 = L2.forward(v1)
v3 = L3.forward(v2)
y_hat = v3

In [5]:
print("L1 output:\n", v1)
print("=========================")
print("L2 output:\n", v2)
print("=========================")
print("L3 output (y_hat):\n", v3)

L1 output:
 [[3.]
 [3.]]
L2 output:
 [[7.]
 [7.]]
L3 output (y_hat):
 [[15.]]


In [6]:
loss = 2*(y_hat-Y)

err3 = L3.backward(loss)
err2 = L2.backward(err3)
err1 = L1.backward(err2)

In [7]:
print("L1 W grad:\n", L1._w_grad)
print("L1 b grad:\n", L1._b_grad)
print("=========================")
print("L2 W grad:\n", L2._w_grad)
print("L2 b grad:\n", L2._b_grad)
print("=========================")
print("L3 W grad:\n", L3._w_grad)
print("L3 b grad:\n", L3._b_grad)

L1 W grad:
 [[ 60. 120. -60.]
 [ 60. 120. -60.]]
L1 b grad:
 [[60.]
 [60.]]
L2 W grad:
 [[90. 90.]
 [90. 90.]]
L2 b grad:
 [[30.]
 [30.]]
L3 W grad:
 [[210. 210.]]
L3 b grad:
 [[30.]]


## Putting it all together in a NN object
- Note - added Q4 requirements in the training function  

In [8]:
class NN:
    def __init__(self, layers):
        self._layers = layers
        self._iter_loss = [np.inf]
        
    def train(self, X, Y, loss, d_loss, max_iterations=1000, rate=1e-5, convergece_threshold=1e-10, n_batches=1, decay_rate=None):       
        X_batches = np.array_split(X.transpose(), n_batches)
        Y_batches = np.array_split(Y.transpose(), n_batches)
        
        self._initial_learning_rate = rate
                
        for i in range(max_iterations):                
            if decay_rate:
                rate = self._initial_learning_rate * np.power(decay_rate, i/n_batches)
            for X_b, Y_b in zip(X_batches, Y_batches):
                out = X_b.transpose()
                for l in self._layers:
                    out = l.forward(out)
                                    
                err = d_loss(out, Y_b.transpose())
                for l in reversed(self._layers):
                    err = l.backward(err)
                    l.step(rate)
        
            mean_loss = np.mean(loss(self.predict(X), Y))
            self._iter_loss.append(mean_loss)
            mean_loss_diff = np.abs(self._iter_loss[-1] - self._iter_loss[-2])
            if mean_loss_diff < convergece_threshold:
                return True
            
        return False
    
                
    def predict(self, X):
        pred = X
        for l in self._layers:
            pred = l.predict(pred)
            
        return pred

## Bonus - batch normalization layer

In [9]:
class BatchNormalizationLayer(LinearLayer):
    def __init__(self, gamma, b, K):
        super().__init__(gamma, b)
        self._K = K # Number of batches
        self._all_batches_means = None
        self._all_batches_vars = None
    
    def _update_global_mean_var(self, batch_means, batch_vars):
        if self._all_batches_means is None:
            self._all_batches_means = batch_means / self._K
            self._all_batches_vars = batch_vars / self._K
        else:
            self._all_batches_means += batch_means / self._K
            self._all_batches_vars += batch_vars / self._K
            
    def _standardize_batch(self, X):
        self._m = X.mean(axis=1).reshape([X.shape[0],1])
        self._v = X.var(axis=1).reshape([X.shape[0],1])
        X_stand = np.subtract(X,self._m)/np.sqrt(self._v + 1e-8)
        self._update_global_mean_var(self._m, self._v)
        
        return X_stand
                
    def forward(self, v_prev):
        self._v_prev = v_prev
        self._z = self._standardize_batch(v_prev)
        self._V = np.multiply(self._weights, self._z) + self._bias
        
        return self._V


    def backward(self, err_next):
        
        self.G = err_next
        self._b_grad = np.dot(self.G, np.ones([self.G.shape[1], 1]))
        self._w_grad = np.dot(self.G, self._z.transpose())
        self._w_grad = np.sum(self._w_grad, axis=1).reshape(self._b_grad.shape)
        
        self._err = np.multiply(self._weights, err_next)
        # rescale the error for the previous layer
        self._err = np.multiply(self._err, self._v)
        self._err = np.add(self._err, self._m)
                       
        return self._err
        
        
    def predict(self, X):
        X_stand = np.subtract(X,self._all_batches_means)/np.sqrt(self._all_batches_vars)
        
        Z = np.multiply(self._weights, X_stand) + self._bias
        
        return Z

## Usage examples using Q4 data - out of the scope
- **Note - this is not the answer for Q4**

In [10]:
n_train = 10000
n_test = 1000
n = n_train + n_test
p = 4

X = np.random.uniform(0, 1, [n, p])
X_train, X_test = X[:n_train].transpose(), X[(n_train+1):].transpose()
X = X.transpose()

noise = np.random.normal(0, 1, n)
Y = X[0] - 2*X[1] + 3*X[2] - 4*X[3] + noise
Y = Y.reshape([n, 1])
Y_train, Y_test = Y[:n_train].transpose(), Y[(n_train+1):].transpose()

In [11]:
def get_hw_network(batch_norm=False, K=1):
    W_1 = np.ones([2,4])
    b_1 = np.ones([2,1])
    L1 = RelULayer(W_1, b_1)

    W_2 = np.ones([2,2])
    b_2 = np.ones([2,1])
    L2 = RelULayer(W_2, b_2)

    W_3 = np.ones([1,2])
    b_3 = np.ones([1,1])
    L3 = LinearLayer(W_3, b_3)
    
    nn = NN([L1, L2, L3])
    
    if batch_norm:
        bn_gamma_1 = np.ones([4,1])
        bn_b_1 = np.ones([4,1])
        BN1 = BatchNormalizationLayer(bn_gamma_1, bn_b_1, K)
        
        bn_gamma_2 = np.ones([2,1])
        bn_b_2 = np.ones([2,1])
        BN2 = BatchNormalizationLayer(bn_gamma_2, bn_b_2, K)
        
        bn_gamma_3 = np.ones([2,1])
        bn_b_3 = np.ones([2,1])
        BN3 = BatchNormalizationLayer(bn_gamma_3, bn_b_3, K)
            
        nn = NN([BN1, L1, BN2, L2, BN3, L3])
            
    return nn

In [12]:
from time import perf_counter

rss = lambda y_hat, y: np.power((y_hat - y), 2)
d_rss = lambda y_hat, y: 2*(y_hat - y)


In [13]:
t1_start = perf_counter()

nn1 = get_hw_network()
nn1_converged = nn1.train(X_train, Y_train, rss, d_rss)
nn1_mse = np.mean(rss(nn1.predict(X_test), Y_test))

t1_stop = perf_counter()

print("Converged?", nn1_converged)
print("Test MSE:", nn1_mse)
print("Number of iterations:", len(nn1._iter_loss))
print("Elapsed time:", t1_stop-t1_start)

Converged? True
Test MSE: 3.3706496873518628
Number of iterations: 54
Elapsed time: 0.09917958400183124


In [14]:
t1_start = perf_counter()

nn2 = get_hw_network()
nn2_converged = nn2.train(X_train, Y_train, rss, d_rss, decay_rate=0.96)
nn2_mse = np.mean(rss(nn2.predict(X_test), Y_test))


t1_stop = perf_counter()

print("Converged?", nn2_converged)
print("Test MSE:", nn2_mse)
print("Number of iterations:", len(nn2._iter_loss))
print("Elapsed time:", t1_stop-t1_start)

Converged? True
Test MSE: 3.369592016213544
Number of iterations: 305
Elapsed time: 0.37996045899853925


In [15]:
t1_start = perf_counter()

nn3 = get_hw_network()
nn3_converged = nn3.train(X_train, Y_train, rss, d_rss, n_batches=50)
nn3_mse = np.mean(rss(nn3.predict(X_test), Y_test))


t1_stop = perf_counter()

print("Converged?", nn3_converged)
print("Test MSE:", nn3_mse)
print("Number of iterations:", len(nn3._iter_loss))
print("Elapsed time:", t1_stop-t1_start)

Converged? False
Test MSE: 0.9715176866335166
Number of iterations: 1001
Elapsed time: 4.799457793997135


In [16]:
t1_start = perf_counter()

nn3 = get_hw_network()
nn3_converged = nn3.train(X_train, Y_train, rss, d_rss, n_batches=50, max_iterations=10000)
nn3_mse = np.mean(rss(nn3.predict(X_test), Y_test))


t1_stop = perf_counter()

print("Converged?", nn3_converged)
print("Test MSE:", nn3_mse)
print("Number of iterations:", len(nn3._iter_loss))
print("Elapsed time:", t1_stop-t1_start)

Converged? True
Test MSE: 0.9721611706145762
Number of iterations: 8094
Elapsed time: 39.67634789300064


In [17]:
t1_start = perf_counter()

nn4 = get_hw_network(batch_norm=True, K=50)
nn4_converged = nn4.train(X_train, Y_train, rss, d_rss, n_batches=50, max_iterations=10000)
nn4_mse = np.mean(rss(nn4.predict(X_test), Y_test))


t1_stop = perf_counter()

print("Converged?", nn4_converged)
print("Test MSE:", nn4_mse)
print("Number of iterations:", len(nn4._iter_loss))
print("Elapsed time:", t1_stop-t1_start)

Converged? False
Test MSE: 3.67399584700498
Number of iterations: 10001
Elapsed time: 181.0727358749973
