In [1]:
import numpy as np
import random
from sklearn.datasets import  load_iris
from sklearn import preprocessing
iris = load_iris()

Will use iris sepal length + sepal width (ie index 0 & index 1). Grab 5 samples to train with.

In [2]:
features2use = [0,1]
seed_from_id = 57
input_nodes = 3
hidden_nodes = 3
output_nodes = 3

def grab_training_samples(samples_features,samples_target,samples2grab,seed):
    random.seed(seed)
    indices = random.sample(range(0,len(samples_features)),samples2grab)
    return samples_features[indices],samples_target[indices]

def cnv_target_to_prob(targets,types=3):
    target_probs = []
    for target in targets:
        target_probs.append([1 if x==target else 0 for x in range(types)])
    return target_probs
    
def cost(truth,prediction):
    #implemented as cross-entropy cost
    #check to see that truth is in the desired form...
    def cost_elem(ti,pi):
        return sum([-t*np.log2(p) for t,p in zip(ti,pi)])
    return sum([cost_elem(ti,pi) for ti,pi in zip(truth,prediction)])/len(truth)

# Grab the features corresponding to index0 + index1 (sepal length+sepal width). Normalize them via the min/max scaler, and convert the target classes from 0,1,2 to [1,0,0],[0,1,0],[0,0,1]

In [3]:
training_features,training_target = grab_training_samples(iris.data,iris.target,5,seed_from_id)
#grab only indices of interest from feature set.
training_features = [ft[features2use] for ft in training_features]
minmax_scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
training_features = minmax_scaler.fit_transform(training_features)
training_target = cnv_target_to_prob(training_target)

In [4]:
def activate(inputs,weights):
    if len(inputs)==len(weights):
        return np.dot(weights,inputs)
    else:
        return np.dot(weights[0:-1],inputs)+weights[-1]
    
def sigmoid(z):
    return 1/(1+np.exp(-z))
def sigmoid_deriv(z):
    return sigmoid(z)*(1-sigmoid(z))

def softmax(z,normalize=1):
    #REMEMBER TO DIVIDE BY sum(softmax)!
    return np.exp(z)/1

***Our network will be the following three layers:***
* Input (3 neurons), take x1,x2,1 and feed them through
* Hidden (3 neurons), each neuron takes x1,x2,1, multiplies by weight, and then passes them through a sigmoid activation
* Output (3 neurons), each neuron takes h1,h2,h3 (ie a hidden node) and 1, multiplies by weight and then passes them through a softmax activation

***Configured with random weights***

In [5]:
def create_network(n_inputs,hidden_nodes,output_nodes,seed_from_id):
    network=list()
    random.seed(seed_from_id)
    #generate hidden
    hidden_layer = [{'weights':[random.random() for __ in range(n_inputs+1)]} for n in range(hidden_nodes)]
    network.append(hidden_layer)
    #generate output
    output_layer = [{'weights':[random.random() for __ in range(hidden_nodes+1)]} for n in range(output_nodes)]
    network.append(output_layer)
    
    for idx,__ in enumerate(network[0]):
        network[0][idx]["activate"]=activate
        network[0][idx]["transfer"]=sigmoid
        network[0][idx]["name"]="0,{}".format(idx)
    for idx,__ in enumerate(network[1]):
        network[1][idx]["activate"]=activate
        network[1][idx]["transfer"]=softmax
        network[1][idx]["name"]="1,{}".format(idx)
    return network

***For the creation of this network, we have a list of:***
<br>
<br>
[NetworkLayer0;NetworkLayer1]
<br>
<br>
***Each "NetworkLayer" is composed of a set of neurons, and each neuron contains:***
* Weights
* Activate Function
* Transfer Function
* Name
* Output (ie, yj)
* dwij

In [6]:
network = create_network(2,3,3,seed_from_id)
network

[[{'weights': [0.04256571358257688, 0.5896864504016538, 0.019310811347186485],
   'activate': <function __main__.activate(inputs, weights)>,
   'transfer': <function __main__.sigmoid(z)>,
   'name': '0,0'},
  {'weights': [0.5141261392922695, 0.9766461363238077, 0.29024680692058136],
   'activate': <function __main__.activate(inputs, weights)>,
   'transfer': <function __main__.sigmoid(z)>,
   'name': '0,1'},
  {'weights': [0.9742198470014191, 0.6360311719284631, 0.8839870868472304],
   'activate': <function __main__.activate(inputs, weights)>,
   'transfer': <function __main__.sigmoid(z)>,
   'name': '0,2'}],
 [{'weights': [0.019670627013046116,
    0.7356439915697156,
    0.5167042010454891,
    0.604345615005703],
   'activate': <function __main__.activate(inputs, weights)>,
   'transfer': <function __main__.softmax(z, normalize=1)>,
   'name': '1,0'},
  {'weights': [0.3779795944555536,
    0.976973385429604,
    0.6552766031176774,
    0.7255780591443041],
   'activate': <function _

***For every iteration during training, we restart the propagation (ie, set output and dwij sums to 0)
Additionally, the predict function will be how we propagate forward***

In [7]:
def restart_propagation(network):
    for layer in network:
        for neuron in layer:
            neuron["dwij"]=np.zeros(len(neuron["weights"]))
            
#generate our prediction given our input
def predict(network,inputs,softmax_normalize=1):
    for layer in network:
        #weights at this layer.
        new_inputs=[]
        for neuron in layer:
            #weights for this neuron
            z=neuron["activate"](inputs,neuron["weights"])
            y=neuron["transfer"](z)
            neuron["output"] = y
            new_inputs.append(y)
        inputs=new_inputs
    if softmax_normalize:
        #print(inputs)
        input_sum = sum(inputs)
        inputs = [val/input_sum for val in inputs]
        for idx,neuron in enumerate(network[-1]):
            neuron["output"] = neuron["output"]/input_sum
    return inputs

# For backpropagation derivation, we want:
* w = weights
* y = output from activation
* z = output from A*x+B

$$\frac{d(E)}{d(w_{ij})} = \frac{d(E)}{d(y_{j})} * \frac{d(y_{j})}{d(w_{ij})} = \frac{d(E)}{d(y_{j})} * \frac{d(y_{j})}{d(z_{j})} * \frac{d(z{j})}{d(w_{ij})}$$

We know that $$\frac{d(z_{j})}{d(w_{ij})} = y_{j}$$ and know that $$\frac{d(y_{j})}{d(z_{j})} = \frac{d(activation)}{d(z_{j})}$$

Which reduces
$$\frac{d(E)}{d(w_{ij})} = \frac{d(E)}{d(y_{j})} * \frac{d(activation)}{d(z_{j})} * y_{i}$$

We know that for cross-entropy loss, $$\frac{d(E)}{d(z_{j})} = p_{j} - t_{j}$$

Therefore, our backpropagation @ the output is:
$$\frac{d(E)}{d(w_{ij})} = (p_j-t_j)y_{i}$$
And for the hidden layers, we have:
$$\frac{d(E)}{d(w_{ij})} = [\sum_{k = output}((p_k-t_k)(y_k)w_{kj})]f(z_j)(1-f(z_j))y_i$$
Where $f(z_{j})$ is the sigmoid activation in the hidden layer, and $y_i$ is the input into the hidden node.**

In [8]:
def backward_propagate_hidden_output(network,y):
    #for backwards training, j = the output, i = the previous node
    #p_j is softmax prediction, t_j is the corresponding index
    for j,nj in enumerate(network[-1]):
        de_dz=(nj["output"]-y[j])
        for i,ni in enumerate(network[-2]):
            nj["dwij"][i] += de_dz*ni["output"]
        nj["dwij"][-1] += de_dz
    
    
def backward_propagate_input_hidden(network,x):
    #for backwards training, j=the output, i=the previous node
    for j,nj in enumerate(network[-2]):
        #need to grab the dwij from the previous node (call it k)
        de_dz = 0
        for k,nk in enumerate(network[-1]):
            de_dz += nk["dwij"][k]*nk["weights"][j]
        #now can set our dj
        f_zj = nj["output"]
        for i,val in enumerate(x):
            nj["dwij"][i] += de_dz*(f_zj*(1-f_zj))*val
        nj["dwij"][-1] += de_dz*(f_zj*(1-f_zj))

def update_weights(network, learningRate, N):
    for layer in network:
        for neuron in layer:
            weights = neuron["weights"]
            dwij = neuron["dwij"]
            weights = weights - learningRate*(dwij/N)
            neuron["weights"] = weights

In [9]:
def BGD(network,epochs,learning_rate,X,Y):
    for itr in range(epochs):
        restart_propagation(network)
        prediction = []
        for x,y in zip(X,Y):
            prediction.append(predict(network,x)) #/forward propagate
            backward_propagate_hidden_output(network,y)
            backward_propagate_input_hidden(network,x)
        update_weights(network,learning_rate,len(Y))
        print("Itr, XEB Cost = {}, {}".format(itr,cost(Y,prediction)))


# (i) For calculation of XEB after two rounds of training, with the initial configuration of seed:

In [10]:
BGD(network,2,0.5,training_features,training_target)

Itr, XEB Cost = 0, 1.6945662449154297
Itr, XEB Cost = 1, 1.5999797078522264


# (ii) For how back propagation for hidden->output was calculated see below (and see above for the associated derivation)
```
def backward_propagate_hidden_output(network,y):
    #for backwards training, j = the output, i = the previous node
    #p_j is softmax prediction, t_j is the corresponding index
    for j,nj in enumerate(network[-1]):
        de_dz=(nj["output"]-y[j])
        for i,ni in enumerate(network[-2]):
            nj["dwij"][i] += de_dz*ni["output"]
        nj["dwij"][-1] += de_dz
    
```
# (iii) For how back propagation for input->hidden was calculated, see below (and see above for the associated derivation)
```
def backward_propagate_input_hidden(network,x):
    #for backwards training, j=the output, i=the previous node
    for j,nj in enumerate(network[-2]):
        #need to grab the dwij from the previous node (call it k)
        de_dz = 0
        for k,nk in enumerate(network[-1]):
            de_dz += nk["dwij"][k]*nk["weights"][j]
        #now can set our dj
        f_zj = nj["output"]
        for i,val in enumerate(x):
            nj["dwij"][i] += de_dz*(f_zj*(1-f_zj))*val
        nj["dwij"][-1] += de_dz*(f_zj*(1-f_zj))
```

# (iv) For the bonus question, lets pick all four parameters. In this case, we simply run the following set of code:

In [11]:
training_features,training_target = grab_training_samples(iris.data,iris.target,5,seed_from_id)
#grab only indices of interest from feature set.
training_features = [ft for ft in training_features]
minmax_scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
training_features = minmax_scaler.fit_transform(training_features)
training_target = cnv_target_to_prob(training_target)

***Need only to set #inputs = 4 and # of hidden neurons = 5***

In [12]:
network = create_network(4,5,3,3)
BGD(network,2,0.5,training_features,training_target)

Itr, XEB Cost = 0, 1.684429021348008
Itr, XEB Cost = 1, 1.5975206772321364
