In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import time
from sklearn import preprocessing

## Cost/activation

In [2]:
def cost_MSE(t,y_hat, derivative=0):
    if derivative:
            return -(t - y_hat)
    return np.mean(1/2*np.sum(np.power(t - y_hat, 2),\
                      axis=0))

def logistic_sigmoid(x, derivative=0):    
    sigm = 1/(1 + np.exp(-x))
    if len(sigm.shape) < 2:
        sigm = sigm.reshape(sigm.shape[0],1)
        
    if derivative:
        return sigm*(1. - sigm)
    return sigm

## NN backend

In [3]:
# >>>>>>>>>>>>>>>>>>> init_weights_biases >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# W, B = init_weights_biases(4, 3, [2,2])
#no_hidden_units: needs a list with at least one element
def init_weights_biases(no_of_features, no_outputs, no_hidden_units, seed=1):
    
    W = []
    B = []
    rows, columns = 0, 0 
    last = len(no_hidden_units)
    np.random.seed(seed)
    
    if no_hidden_units: #list is not empty
        for i in range(last+1):
            if i == 0: #first weight
                rows = no_hidden_units[i]
                columns = no_of_features
            elif i > 0 and i < last:
                rows = no_hidden_units[i]
                columns = no_hidden_units[i-1]
            else: #last
                columns = rows # list ran out of indeces, so use last one
                rows = no_outputs            

            W.insert(i, np.random.randn(rows, columns))
            B.insert(i, np.zeros((rows, 1)))
    else: # no hidden units (perceptron)
        W.insert(0, np.random.randn(no_outputs, no_of_features))
        B.insert(0, np.zeros((no_outputs, 1)))
    
    dummy_param = 0
    param = 0
    for i in range(len(W)):
        dummy_param = W[i].shape[0] * W[i].shape[1]
        param += dummy_param
        
#     W.append(param) #number of learnable weights
    
    return W, B, param

# >>>>>>>>>>>>>>>>>>> forward_prop >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    # W1, b1, W2, b2 = init_weights_biases(no_hidden_units=8)
    # Z, A, Y = forward_prop(W, B, X)
    # X has n features x M samples
def forward_prop(W, B, X):
    no_of_samples = X.shape[1]
     #last weight matrix, rows correspond to outputs
    no_of_outputs = W[-2].shape[0] #index -1 is the number of learnable weights
    
    Z = []
    A = []
    A.append(X) #first layer is an activation
    
    for i in range(len(W)): #to avoid the last two indeces
        Z.insert(i, W[i] @ A[i] + B[i])
        A.insert(i+1, logistic_sigmoid(Z[i]))
    
    Y = np.zeros((no_of_samples, no_of_outputs))
    #scaling to making the pair a probability
    Y = np.divide(A[i+1], np.sum(A[i+1], axis=0)) #comuns are the samples now
    return Z, A, Y

# >>>>>>>>>>>>>>>>>>> backprop >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# W1, b1, W2, b2 = init_weights_biases(no_hidden_units=8)
# A1, A2, Y = forward_prop(W1, b1, W2, b2, X)
# grad_mid_layer, grad_output = backprop(W2, A1, A2, X, Y, t)
# backprop(W2, A1, A2, X, Y, t)
def backprop(W, Z, A, Y_hat, T):
    
    output_index = len(W)-1 # if 3, starts at 2
    error = {}
    
    error_output = ( cost_MSE(T,Y_hat, derivative=1) * logistic_sigmoid(Z[-1], derivative=1))
    error[output_index] = error_output
    
    dJ_dW = {}
    for i in range(output_index-1,-1,-1):
         # doesn't get to W[0], so updated after the foor loop again
#         dJ_dW.insert(i+1, error[i+1] @ A[i+1].T)
        dJ_dW[i+1] = error[i+1] @ A[i+1].T
        
        error_dummy = (W[i+1].T @ error[i+1]) * logistic_sigmoid(Z[i], derivative=1)
#         error.insert(i, error_dummy)
        error[i] = error_dummy
    
    dJ_dW[0] = error[0] @ A[0].T
    
    return dJ_dW

## NN frontend 1 

    train, predict functions

In [4]:
# >>>>>>>>>>>>>>>>>>> train >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    # all samples (X), 4 x 2, are fed
    # X: dataset, n samples x N features
    # T: binary labels, n labels x L number of ouputs
    # hidden_layers : list with number of neurons for each inner layer.
        # e.g. [3, 4] will yield two layers with 3 and 4 units respectively
    # this function needs n samples > 1 (batch optimization).
def train(X, T, hidden_layers=[2], epochs=500, rho=.1, normalize_data=True, show_cost=0):    
    
    if normalize_data:
        scaler = preprocessing.StandardScaler()
        X = scaler.fit_transform(X)
    
    no_of_features = X.shape[1]
    no_samples = X.shape[0]
    no_outputs = T.shape[1]
    
    W, B, param = init_weights_biases(no_of_features, no_outputs, hidden_layers)
    
    Y_hat = np.zeros((no_outputs, X.shape[0]))
    
    j = 0
    idx_done = 0
    converged = False
    
    ###NESTED function
    def display_NN_info():
        print("* NN ************************************")
        print("   no. inputs (layer 1): " + str(no_of_features) )
        for k in range(len(hidden_layers)):
            print("   layer " + str(k+2) + ": " + str(hidden_layers[k]) + " units")
        print("   output layer ("+ str(k+3) + "): " + str(no_outputs) )
        print("   learnable weights: " + str(param) )
        print("   max epochs: " + "{:,}".format(epochs) )
        print("   learning rate(rho): " + str(rho) )
        
    display_NN_info()
    time.sleep(4)
    
    cost_final = []
    accuracy = []
    
    for i in range(epochs):
        
        Z, A, Y_hat = forward_prop(W, B, X.T)
        dJ_dW = backprop(W, Z, A, Y_hat, T.T)

        #grad descent
        for j in range(len(W)):
            W[j] = W[j] - rho*dJ_dW[j]

        Y = Y_hat.T

        y_and_T_match = np.allclose(Y, T, rtol=1e-03)        

        if y_and_T_match: #converged
            j += 1 
            if j == 3:
                idx_done = i + 1 # already predicts corretly all the time
            if j > 100: #makes the prediction more robust 
                # ( probability considered 1 == .60 or greater )
                converged = True
                break
    
        cost_final.append(cost_MSE(T, Y_hat.T))
        accuracy.append(calc_accuracy(T, Y_hat.T))
        
        if show_cost:
            print(str(i) + ": accur: "+ str(accuracy[i])+ "%")
            print(" cost: " + str(cost_final[i]))
    
    if show_cost:
        display_NN_info()
        plt.scatter(range(epochs), cost_final, s=1, color="red")
        plt.title("iterations X cost")
        plt.xlabel("iterations")
        plt.ylabel("cost")
    
    
    print("   Start/final Cost: " + "{:.6f}".format(cost_final[0]) \
          + "/" + "{:.6f}".format(cost_final[-1]))
    
    print("   Train start/final accuracy(" + str(no_samples) + " train samples): "\
          + "{:.2f}".format(accuracy[0]) + "/" + "{:.2f}".format(accuracy[-1]) )
    
    return W, B, Y, X, cost_final, epochs, idx_done, converged, rho, normalize_data, accuracy

# >>>>>>>>>>>>>>>>>>> predict >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    # X: dataset, n samples x N features
    #  train_pkg: list with [W, B, Y, X, cost_final, epochs, idx_done, converged, rho]
def predict(X, T, train_pkg):
    if len(X.shape) < 2:
        X = X.reshape(1,X.shape[0]) #for one sample
    
    normalized = train_pkg[-1]
    if normalized: #if the data has been normalized
        scaler = preprocessing.StandardScaler()
        X = scaler.fit_transform(X)
    
    Z, A, Y_hat = forward_prop(train_pkg[0], train_pkg[1], X.T)
    
    
    
    del Z, A
    return Y_hat.T, calc_accuracy(T, Y_hat.T)

def calc_accuracy(T, Y):
    matches = np.argmax(Y, axis=1) == np.argmax(T, axis=1)
    return len(matches[matches == True])/len(matches)*100

## dataset 2.b and 2.c Iris - includes function to select test and train sets

    T[0:50,0]: setosa
    T[50:100,1]: versicolor
    T[100:150,2]:virginica

In [24]:
df = pd.read_excel (r'fisheriris.xlsx')
names = ["setosa", "versicolor", "virginica"]
idx = [50,100,150]

# first four columns are 
X = np.array(df.values[:,0:4], dtype=np.float32)
T = np.zeros((150,3))
T[0:50,0] = 1 # setosa
T[50:100,1] = 1 #versicolor
T[100:150,2] = 1 #virginica

# >>>>>>>>>>>>>>>>>>> select_train_test_samples >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
def select_train_test_samples(X, T, no_samples_train):
    size = X.shape[0]

    last_index = size-1
    
    all_indeces = np.linspace(0, last_index, size, dtype=np.int16)
    
    samples = []
    labels = []
    idx = []
    random.seed(5)
    idx.append(random.sample(range(size), no_samples_train))
    idx.append(np.delete(all_indeces, idx)) # delete returns a different value every time
    
    for i in range(2):
        samples.insert(i, X[idx[i],:]) # samples[0] is train, [1] are test
        labels.insert(i, T[idx[i],:]) # same for labels
    
    return [samples[0], samples[1], labels[0], labels[1]]

# >>>>>>>>>>>>>>>>>>> train_random_samples >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
def train_random_samples(X,T, no_of_train_samples, hidden_layers=[2],\
                         epochs=1000, rho=.1, normalize_data=True, show_cost=0):
    
    no_test_samples = X.shape[0]-no_of_train_samples
    
    print("train/test samples: " + str(no_of_train_samples) + "/" + str(no_test_samples) )
    [X_train, X_test, T_train,T_test] = select_train_test_samples(X, T, no_of_train_samples)
    
    start = time.time()
    train_pkg = train(X_train, T_train, hidden_layers=hidden_layers, epochs=epochs, rho=rho, show_cost=show_cost)
    end = time.time()
    Y_hat, accuracy = predict(X_test, T_test, train_pkg)
    
    print("   Test accuracy(" + str(no_test_samples) +" test samples): " + "{:.2f}".format(accuracy) + "%")
    print('   Time Taken: ', end-start, ' seconds')
    print()
    
    return accuracy, train_pkg

# NN run (ONE HIDDEN LAYER)

The Neural Network can be run in two ways (

    1. using train_random_samples() with parameters will train based on a slice of the data for the train; the other part will be used for test
    2. using train() and then feeding the train package to predict()

In [27]:
no_train_samples = [50, 75, 100, 125]
accuracys = {}  #dictionary indexed with number of train samples used
train_pkgs = {} #same as previous


for i in range(len(no_train_samples)):
    accuracys[no_train_samples[i]], train_pkgs[no_train_samples[i]] = \
                            train_random_samples(X,T, no_train_samples[i], hidden_layers=[2],\
                                     epochs=5_000, rho=.1, normalize_data=True)

train/test samples: 50/100
* NN ************************************
   no. inputs (layer 1): 4
   layer 2: 2 units
   output layer (3): 3
   learnable weights: 14
   max epochs: 5,000
   learning rate(rho): 0.1
   Start/final Cost: 6.102624/0.330640
   Train start/final accuracy(50 train samples): 0.00/100.00
   Test accuracy(100 test samples): 88.00%
   Time Taken:  4.853496551513672  seconds

train/test samples: 75/75
* NN ************************************
   no. inputs (layer 1): 4
   layer 2: 2 units
   output layer (3): 3
   learnable weights: 14
   max epochs: 5,000
   learning rate(rho): 0.1
   Start/final Cost: 9.147789/0.630011
   Train start/final accuracy(75 train samples): 0.00/97.33
   Test accuracy(75 test samples): 96.00%
   Time Taken:  4.877950191497803  seconds

train/test samples: 100/50
* NN ************************************
   no. inputs (layer 1): 4
   layer 2: 2 units
   output layer (3): 3
   learnable weights: 14
   max epochs: 5,000
   learning rate(rho

# 2.c. NN run (TWO LAYERS-DEEP)

The Neural Network can be run in two ways (

    1. using train_random_samples() with parameters will train based on a slice of the data for the train; the other part will be used for test
    2. using train() and then feeding the train package to predict()

In [30]:
no_train_samples = [50, 75, 100, 125]
accuracys = {}  #dictionary indexed with number of train samples used
train_pkgs = {} #same as previous

for i in range(len(no_train_samples)):
    accuracys[no_train_samples[i]], train_pkgs[no_train_samples[i]] = \
                            train_random_samples(X,T, no_train_samples[i], hidden_layers=[1,3],\
                                     epochs=5_000, rho=.1, normalize_data=True)

train/test samples: 50/100
* NN ************************************
   no. inputs (layer 1): 4
   layer 2: 1 units
   layer 3: 3 units
   output layer (4): 3
   learnable weights: 16
   max epochs: 5,000
   learning rate(rho): 0.1
   Start/final Cost: 5.734385/0.038773
   Train start/final accuracy(50 train samples): 28.00/100.00
   Test accuracy(100 test samples): 94.00%
   Time Taken:  4.899377822875977  seconds

train/test samples: 75/75
* NN ************************************
   no. inputs (layer 1): 4
   layer 2: 1 units
   layer 3: 3 units
   output layer (4): 3
   learnable weights: 16
   max epochs: 5,000
   learning rate(rho): 0.1
   Start/final Cost: 8.526915/0.426495
   Train start/final accuracy(75 train samples): 29.33/98.67
   Test accuracy(75 test samples): 94.67%
   Time Taken:  4.935793161392212  seconds

train/test samples: 100/50
* NN ************************************
   no. inputs (layer 1): 4
   layer 2: 1 units
   layer 3: 3 units
   output layer (4): 3
   l