In [7]:
#author @ Cavin Gada

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# Load the Iris dataset via scikit-learn's load_iris; the next cell reads its `.data` and `.target` fields.
iris = load_iris() 


In [8]:
# Load Data and Preprocess Data

x = iris.data
y_preprocessed = iris.target

# preprocess data by the following convention (one-hot encoding of the class index):
# 1. 0 ('Setosa') = [1,0,0]
# 2. 1 ('Versicolor') = [0,1,0]
# 3. 2 ('Virginica') = [0,0,1]
#
# Row k of the 3x3 identity matrix is exactly the one-hot vector for class k, so indexing the
# identity matrix with the label array builds every target row in one vectorised step. This replaces
# repeatedly calling np.append inside a loop (which copies the whole array on every iteration) and
# removes the need for a dummy first row that later has to be sliced off.
NUM_CLASSES = 3
y = np.eye(NUM_CLASSES, dtype=int)[y_preprocessed]

# 18% of the dataset will be used for testing.
# A fixed random_state makes the split repeatable across "Restart & Run All".
TEST_FRACTION = 0.18
SPLIT_SEED = 0
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED)


In [9]:
# functions to do training and predictions

def randomizeWeights(layers):
  """Create one random weight matrix for every pair of adjacent layers.

  Parameters:
    layers: sequence of node counts, one entry per layer (input first, output last).

  Returns:
    A list with len(layers)-1 entries. Entry k is an np.matrix of shape
    (layers[k+1], layers[k]) filled with samples from np.random.randn.
    No extra bias column is added to any matrix.
  """
  # zip pairs each layer with the one after it: weights connect one layer to the next,
  # so there is one matrix fewer than there are layers.
  return [
    np.matrix(np.random.randn(nodes_out, nodes_in))
    for nodes_in, nodes_out in zip(layers, layers[1:])
  ]


def forward_propagation(x, w, layers):
  """Push one input through the network and record the output of every layer.

  Parameters:
    x: the input sample.
    w: list of weight matrices, applied in order.
    layers: accepted for signature compatibility; not read by this function.

  Returns:
    A list whose first entry is [x] (the input wrapped in a list, which the
    backward pass relies on for shapes) followed by sig(...) of each layer,
    so it has len(w)+1 entries.
  """
  # approach follows https://dev.to/shamdasani/build-a-flexible-neural-network-with-backpropagation-in-python
  activations = [[x]]
  signal = x # the input acts as the first layer

  for weight in w:
    # weighted sum into the next layer, squashed by the sigmoid
    signal = sig(np.dot(signal, np.transpose(weight)))
    # keep every layer's output: backpropagation needs them
    activations.append(signal)

  return activations

def backward_propagation(x, y, w, layers, activation_matrix, learning_rate):
  """Run one backpropagation step and update the weights in the list `w`.

  Parameters:
    x: accepted for signature compatibility; not read by this function.
    y: target for the sample that produced `activation_matrix`.
    w: list of weight matrices; its entries are replaced with updated matrices.
    layers: accepted for signature compatibility; not read by this function.
    activation_matrix: list returned by forward_propagation (input first, output last).
    learning_rate: scalar multiplier applied to every weight adjustment.

  Returns:
    The same list object `w`, now holding the updated weight matrices.
  """
  # error at the output layer: target minus the network's final activation
  error = np.matrix(y - activation_matrix[-1])

  # walk the layers from the output back towards the input
  for layer_idx in reversed(range(len(w))):
    layer_out = activation_matrix[layer_idx + 1]
    layer_in = activation_matrix[layer_idx]

    # scale the error by the sigmoid derivative at this layer's output
    delta = np.multiply(error, sigmoidPrime(layer_out))

    # error for the layer below; this must use the weights BEFORE they are adjusted
    error = np.dot(delta, w[layer_idx])

    # presumably delta.T is a column and layer_in a row, so this broadcasts to one
    # adjustment per weight - TODO confirm shapes against forward_propagation
    adjustment = np.multiply(delta.T, layer_in)*learning_rate
    w[layer_idx] = w[layer_idx] + adjustment

  return w

def training_func(x_train, y_train, layers, learning_rate, epoch_amount):
  """Train a network one sample at a time and return its weights.

  Parameters:
    x_train: indexable collection of input samples.
    y_train: indexable collection of targets; its length sets the samples per epoch.
    layers: node counts per layer, passed to randomizeWeights and the propagation helpers.
    learning_rate: step size handed to backward_propagation.
    epoch_amount: number of full passes over the training data.

  Returns:
    The list of weight matrices after all epochs.
  """
  weights = randomizeWeights(layers)

  for _epoch in range(epoch_amount):
    for sample_idx in range(len(y_train)):
      sample = x_train[sample_idx]
      target = y_train[sample_idx]

      # forward pass: collect the activation of every layer for this sample
      activations = forward_propagation(sample, weights, layers)

      # backward pass: adjusts the current weights and hands them back
      weights = backward_propagation(sample, target, weights, layers, activations, learning_rate)

  return weights

def pred(xi, w, layers):
  """Predict the class of one sample as a one-hot vector.

  Parameters:
    xi: a single input sample.
    w: list of weight matrices used by forward_propagation.
    layers: passed through to forward_propagation.

  Returns:
    A 1-D integer numpy array with one entry per network output: 1 at the
    position of the largest output and 0 elsewhere. For the three Iris classes
    this is [1,0,0] ('Setosa'), [0,1,0] ('Versicolor') or [0,0,1] ('Virginica').
  """
  # must forward propagate, then take the last activation and compare the outputted values.
  last_activation = forward_propagation(xi,w,layers)[-1]
  predictions = np.asarray(last_activation).flatten()

  # np.argmax returns the first index of the maximum, which matches the order of the
  # previous chained equality checks on ties. Unlike that chain it works for any number
  # of outputs and never falls through to an implicit None.
  one_hot = np.zeros(predictions.shape[0], dtype=int)
  one_hot[np.argmax(predictions)] = 1
  return one_hot

def proportion_correct(x_data, y_data, w, layers):
  """Return the fraction of samples whose prediction equals the target.

  Parameters:
    x_data: indexable collection of input samples.
    y_data: indexable collection of targets, aligned with x_data.
    w: list of weight matrices.
    layers: passed through to pred.

  Returns:
    Number of exact matches divided by len(x_data).

  Raises:
    ZeroDivisionError: if x_data is empty.
  """
  hits = sum(
    1
    for idx in range(len(x_data))
    if np.array_equal(pred(x_data[idx], w, layers), y_data[idx])
  )
  return hits/len(x_data)

def sig(n):
    """Logistic sigmoid, 1 / (1 + e^(-n)), applied element-wise.

    Parameters:
        n: scalar, numpy array or numpy matrix.

    Returns:
        The sigmoid of n, with the same container type numpy produces for n.
    """
    # For large negative n, exp(-n) overflows to inf. The quotient 1.0/inf is 0.0,
    # which is the correct limit, so only the overflow RuntimeWarning is suppressed;
    # the returned values are unchanged.
    with np.errstate(over="ignore"):
        denominator = 1 + np.exp(0-n)
    return 1.0/denominator

def sigmoidPrime(s):
    """Return s * (1 - s) element-wise.

    The callers pass a value that has already been through the sigmoid, in which
    case this product is the sigmoid's derivative at that point.
    """
    complement = 1 - s
    # np.multiply keeps the product element-wise even when s is an np.matrix
    return np.multiply(s, complement)


In [10]:
# Hyperparameters shared by both experiments so the two architectures are trained identically.
LEARNING_RATE = .1
EPOCH_AMOUNT = 100


def _train_and_report(title, layers):
  """Train one network and print its accuracy on the train and test sets.

  Reads the module-level x_train, y_train, x_test and y_test splits.

  Parameters:
    title: heading printed before the accuracies.
    layers: node counts per layer (input first, output last).

  Returns:
    The trained list of weight matrices.
  """
  print(title)
  weights = training_func(x_train, y_train, layers, LEARNING_RATE, EPOCH_AMOUNT)

  print("Accuracy on train set is: " + str(proportion_correct(x_train, y_train, weights, layers)))
  print("Accuracy on test set is: " + str(proportion_correct(x_test, y_test, weights, layers)))
  return weights


# PART A - 1 hidden layer with 5 nodes.
layers = [4,5,3]
w = _train_and_report("1 Hidden layer with five nodes: ", layers)

# PART B - 2 hidden layers with 6 and 8 nodes respectively.
layers2 = [4,6,8,3]
w2 = _train_and_report("\n"+"2 Hidden layer with six and eight nodes respectively: ", layers2)

# findings:
print("\n")

""" Note: On rare occasions, my algorithm seems a little unreliable, where I recieve extremely low accuracy. This is quite possible due to
the fact that we only have a small amount of data to learn from. With only 150 samples in total (82% of them used for training,
given test_size=0.18), there is room for error. 

However, for the most part, when the training is complete, I have noticed accuracies in the high 90% ranges. 

When comparing the 1 hidden layer model with the 2 hidden layers model, I noticed that in general, the two hidden layer model
performs better. It performs "better" in a few ways:
  1. sometimes it has higher accuracies in both training and test set
  2. sometimes it only has higher accuracy on the test set (and perhaps a lower accuracy on the train set, indicating it is not
  overfitting the train data)
  3. sometimes it has higher accuracy on the train set, but similar accuracy on the test set. 
  
One drawback of my approach is that the comparison between Part A and Part B isn't strong, because the initial weights are
randomized and therefore differ between the two models. Even though they are drawn from a normal distribution, this could still cause high variance / false conclusions.

Another drawback is that the dataset is perhaps too small to benefit from additional hidden layers. 
Overall, it seems very difficult to tweak my model settings to get a consistent accuracy every time my code is run. More specifically,
I do not really know how many hidden layers I should use, or how many nodes to put in each, to achieve consistently better
results. I guess these are all problems I will learn how to solve in a future class :) ! """

1 Hidden layer with five nodes: 
Accuracy on train set is: 0.975609756097561
Accuracy on test set is: 0.9259259259259259

2 Hidden layer with six and eight nodes respectively: 
Accuracy on train set is: 0.967479674796748
Accuracy on test set is: 0.9629629629629629




' Note: On rare occasions, my algorithm seems a little unreliable, where I recieve extremely low accuracy. This is quite possible due to\nthe fact that we only have a small amount of data to do our learning with. With just 30% of the dataset being trained, it leaves\nroom for error. \n\nHowever, for the most part, when the training is complete, I have noticed accuracies in the high 90% ranges. \n\nWhen comparing the 1 hidden layer model with the 2 hidden layers model, I noticed that in general, the two hidden layer model\nperforms better. It performs "better" in a few ways:\n  1. sometimes it has higher accuracies in both training and test set\n  2. sometimes it only has higher accuracy on the test set (and perhaps a lower accuracy on the train set, indicating it is not\n  overfitting the train data)\n  3. sometimes it has higher accuracy on the train set, but similar accuracy on the test set. \n  \nOne fall back of my model is that the comparison between part a and b isn\'t strong due