In [None]:
import numpy as np
import matplotlib.pyplot as plt
from numpy import linalg as LA
import plotly
import plotly.graph_objects as go
import random, time
from collections import Counter

In [None]:
# activation functions----------------------------------------------------------------
# INPUT: numpy array
# OUTPUT: numpy array
def ReLU(x, deriv=False):
    """Rectified linear unit, applied elementwise.

    Parameters
    ----------
    x : array_like
        Pre-activation values.
    deriv : bool, optional
        When true, return the elementwise derivative instead of the
        activation.

    Returns
    -------
    numpy.ndarray
        Same shape as ``x``. ``max(0, x)`` when ``deriv`` is false; otherwise
        1.0 where ``x > 0`` and 0.0 everywhere else.
    """
    x = np.asarray(x, dtype=float)
    if deriv:
        # The derivative at exactly 0 is defined as 0 here. Every entry
        # (including 0 and NaN) must produce an output so the result keeps the
        # shape of x and can be multiplied elementwise by the caller.
        return (x > 0).astype(float)
    return np.maximum(x, 0.0)
    
def Linear(x, deriv=False):
    """Identity activation.

    Parameters
    ----------
    x : array_like
        Pre-activation values.
    deriv : bool, optional
        When true, return the elementwise derivative instead of the
        activation.

    Returns
    -------
    numpy.ndarray
        ``x`` itself (as an array) when ``deriv`` is false; otherwise an array
        of ones with the same shape as ``x``.
    """
    x = np.asarray(x)
    if deriv:
        # An ndarray (not a Python list) keeps this consistent with the
        # "numpy array in, numpy array out" contract of the activations.
        return np.ones_like(x, dtype=float)
    return x

def Sigmoid(x, deriv=False):
    """Logistic sigmoid, applied elementwise.

    Parameters
    ----------
    x : array_like
        Pre-activation values.
    deriv : bool, optional
        When true, return the elementwise derivative instead of the
        activation.

    Returns
    -------
    numpy.ndarray
        Same shape as ``x``. ``1 / (1 + exp(-x))`` when ``deriv`` is false;
        ``sigmoid(x) * (1 - sigmoid(x))`` when ``deriv`` is true.
    """
    x = np.asarray(x, dtype=float)
    sig = 1.0 / (1.0 + np.exp(-x))
    if deriv:
        return sig * (1.0 - sig)
    return sig

def Squared(x,deriv = False):
    """Square activation.

    Returns ``x**2``, or its derivative ``2*x`` when ``deriv`` compares equal
    to ``True``. The arithmetic is applied to ``x`` as given, so the result
    has whatever type ``x``'s own operators produce.
    """
    return 2*x if deriv == True else x**2

def Softmax(x, deriv=False):
    """Softmax over the first axis of ``x``.

    Parameters
    ----------
    x : array_like
        Pre-activation values; normalisation runs along axis 0.
    deriv : bool, optional
        When true, return an elementwise derivative instead of the
        activation.

    Returns
    -------
    numpy.ndarray
        Same shape as ``x``. The softmax probabilities when ``deriv`` is
        false. When ``deriv`` is true, the diagonal of the softmax Jacobian,
        ``f * (1 - f)``.

    Notes
    -----
    The full Jacobian is ``diag(f) - outer(f, f)``. Only its diagonal is
    returned so the result stays a vector with the shape of ``x``.
    NOTE(review): this matches callers that combine the activation derivative
    elementwise with the loss gradient; a caller wanting the exact
    softmax gradient needs the full Jacobian - confirm the intended contract.
    """
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        return x
    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps exp() from overflowing to inf (which would turn into NaN).
    exps = np.exp(x - np.max(x, axis=0))
    func = exps / np.sum(exps, axis=0)
    if deriv:
        return func * (1.0 - func)
    return func
#----------------------LOSS FUNCTION--------------------------------------------------------------------------

# Note: Nx and y are always numpy arrays (for 'bce' they always have only one entry)
# when deriv = False the output must be a number and when deriv = True the output must be a vector
def loss(Nx, y, cost_type, deriv=False):
    """Per-sample loss, or its gradient with respect to the prediction.

    Parameters
    ----------
    Nx : array_like
        Network output for one sample.
    y : array_like
        Target for the same sample, same length as ``Nx``.
    cost_type : str
        ``'se'`` for squared error, ``'ce'`` for cross entropy; any other
        value selects binary cross entropy.
    deriv : bool, optional
        When true, return the gradient with respect to ``Nx`` instead of the
        loss value.

    Returns
    -------
    number or numpy.ndarray
        The loss value when ``deriv`` is false, the gradient vector when
        ``deriv`` is true.
    """
    Nx = np.asarray(Nx, dtype=float)
    y = np.asarray(y, dtype=float)
    # Predictions are clipped away from 0 (and 1 for binary cross entropy) so
    # that log() and the divisions below never produce inf or NaN.
    eps = 1e-12

    # square error------------------
    if cost_type == 'se':
        residual = Nx - y
        if deriv:
            return 2 * residual.T
        return np.linalg.norm(residual) ** 2

    # cross entropy-----------------
    if cost_type == 'ce':
        p = np.clip(Nx, eps, None)
        if deriv:
            return -y / p
        return -np.dot(y, np.log(p))

    # binary cross entropy----------
    p = np.clip(Nx, eps, 1 - eps)
    if deriv:
        # d/dp [-y*log(p) - (1-y)*log(1-p)] = -y/p + (1-y)/(1-p)
        return -y / p + (1 - y) / (1 - p)
    return -np.dot(y, np.log(p)) - np.dot(1 - y, np.log(1 - p))
            

In [None]:
def feedforward(W,B,G,x):
    """Propagate one input vector through the network and record every layer.

    W and B are indexed per layer (``W[l]``, ``B[l]``); G is indexed per node
    layer, and ``G[0]`` is never called. ``x`` is the input vector.

    Returns a list with ``len(G)`` entries when ``len(G) > 1``:
      * entry ``l`` for ``l < len(G) - 1`` is the pair ``[x_l, s_l]`` where
        ``s_l = W[l] @ x_l + B[l]`` and ``x_0`` is the input ``x``;
      * the final entry is the bare output ``G[-1](s)`` (not wrapped in a pair).
    """
    final_layer = len(G) - 1

    # Layer 0: the raw input and its pre-activation signal.
    pre_activation = (W[0] @ x) + B[0]
    history = [[x, pre_activation]]

    for layer in range(1, final_layer + 1):
        layer_input = G[layer](pre_activation)
        if layer == final_layer:
            # The output layer has no outgoing weights, so only the
            # activation itself is stored.
            history.append(layer_input)
            continue
        pre_activation = (W[layer] @ layer_input) + B[layer]
        history.append([layer_input, pre_activation])

    return history

In [None]:
def deltas(X_feeds, Y, W, B, G, verbose=False, cost_type='se'):
    """Back-propagate the loss gradient to get the per-layer deltas.

    Parameters
    ----------
    X_feeds : dict
        Maps a sample index to the list returned by ``feedforward``: entries
        ``0 .. D-1`` are ``[x_l, s_l]`` pairs and entry ``D`` is the bare
        network output.
    Y : array
        Targets; ``Y[i]`` is the target of sample ``i``.
    W : list
        Weight matrices, one per layer.
    B : list
        Bias vectors. Accepted for interface compatibility; not read here.
    G : list
        Activation callables accepting ``(s, deriv=True)``; ``D = len(G) - 1``.
    verbose : bool, optional
        Accepted for interface compatibility; not read here.
    cost_type : str, optional
        Forwarded to ``loss``.

    Returns
    -------
    dict
        Maps each key of ``X_feeds`` to a list of ``D`` arrays, where entry
        ``l`` is the delta of layer ``l``.
    """
    D = len(G) - 1
    deltas_dict = {}

    for i, feed in X_feeds.items():
        layer_deltas = [None] * D

        # Output layer. feedforward stores the last entry as the bare output
        # vector, so the whole entry is the prediction; indexing it with [0]
        # would keep only its first component.
        Nx = np.asarray(feed[D], dtype=float)
        s_last = feed[D - 1][1]
        loss_grad = loss(Nx, np.atleast_1d(Y[i]), cost_type, deriv=True)
        layer_deltas[D - 1] = loss_grad * np.asarray(G[D](s_last, deriv=True))

        # Hidden layers, from l = D-2 down to l = 0:
        #   delta_l = (W[l+1]^T @ delta_{l+1}) * G[l+1]'(s_l)   (elementwise)
        for l in range(D - 2, -1, -1):
            s_l = feed[l][1]
            act_grad = np.asarray(G[l + 1](s_l, deriv=True))
            layer_deltas[l] = (W[l + 1].T @ layer_deltas[l + 1]) * act_grad

        deltas_dict[i] = layer_deltas

    return deltas_dict

In [None]:
def grads(X,Y,W,B,G,batch, lambda_ = 0, verbose = False,cost_type = 'se'):
    """Batch-averaged gradients of the cost with respect to W and B.

    For every sample index in ``batch`` a forward pass (``feedforward``) is
    run on ``X[k,:]`` and the resulting feeds are handed to ``deltas``. For
    each layer ``l`` the weight gradient is the batch mean of
    ``outer(delta_l, x_l)`` plus the term ``lambda_*2*W[l]``, and the bias
    gradient is the batch mean of ``delta_l``.

    Returns ``(dWs, dBs, X_feeds)``: two lists with one entry per layer and
    the dictionary of forward-pass feeds keyed by sample index.
    """
    # Forward pass per sample; the first result stored for an index is kept.
    X_feeds = {}
    for k in batch:
        sample_feed = feedforward(W,B,G,X[k,:])
        X_feeds.setdefault(k, sample_feed)

    # Backward pass: X_deltas[sample index][layer]
    X_deltas = deltas(X_feeds,Y,W,B,G,verbose,cost_type)

    n_layers = len(G)-1
    dWs, dBs = [], []
    for l in range(n_layers):
        outer_total = 0
        delta_total = 0
        for i in batch:
            outer_total = outer_total + np.outer(X_deltas[i][l], X_feeds[i][l][0])
            delta_total = delta_total + X_deltas[i][l]

        # Batch mean plus the regularisation term on the weights only.
        dWs.append( (1/len(batch)) * outer_total + lambda_*2*W[l])
        dBs.append((1/len(batch))*delta_total)

    return dWs, dBs, X_feeds

In [None]:
#max_iters should be a multiple of 100
def fit(X,Y,arch,G,alpha = 1e-9, momentum = .01, batch_size = 100, 
        lambda_ = 0, max_iters = 100,verbose = False, cost_type = 'se',print_costs = True):
    """Train the network with mini-batch gradient descent with momentum.

    Parameters
    ----------
    X, Y : arrays
        Inputs and targets; row ``i`` of each belongs to sample ``i``.
    arch : sequence of int
        Layer widths; ``W[l]`` has shape ``(arch[l+1], arch[l])``.
    G : list
        Activation callables; the network has ``len(G) - 1`` weight layers.
    alpha : float, optional
        Learning rate.
    momentum : float, optional
        Fraction of the previous update carried into the next one.
    batch_size : int, optional
        Samples drawn (without replacement) per iteration; capped at
        ``len(X)``.
    lambda_ : float, optional
        Regularisation strength forwarded to ``grads``.
    max_iters : int, optional
        Number of gradient steps performed.
    verbose : bool, optional
        Forwarded to ``grads``.
    cost_type : str, optional
        Forwarded to ``grads`` and ``loss``.
    print_costs : bool, optional
        When true, print the batch cost roughly 30 times over the run.

    Returns
    -------
    W, B : list
        Trained weight matrices and bias vectors.
    costs : list of float
        Mean batch loss of every iteration, measured before that
        iteration's update.
    grad_norms : list of float
        Sum of the norms of all weight and bias gradients, per iteration.
    """
    W,B,VW,VB = [],[],[],[]
    D,m = len(G)-1, len(X)

    #------------------------initializations----------------------------------------
    for l in range(D):
        # Weights are drawn from N(0, 2/(n_l + n_{l+1})). np.random.normal
        # takes the standard deviation, hence the square root of the variance.
        sigma = np.sqrt(2/(arch[l]+arch[l+1]))
        W.append(np.random.normal(0,sigma,(arch[l+1], arch[l])))
        B.append(np.zeros(arch[l+1]))
        VW.append(np.zeros(W[l].shape))
        VB.append(np.zeros(B[l].shape))

    # random.sample raises if asked for more items than the population holds.
    batch_size = min(batch_size, m)
    # Report about 30 times per run; never use a zero interval.
    report_every = max(1, max_iters // 30)

    #------------------------gradient descent---------------------------------------
    costs, grad_norms = [], []
    for epoch in range(max_iters):
        batch = random.sample(range(m),batch_size)

        dWs,dBs,feeds = grads(X,Y,W,B,G,batch,lambda_,verbose,cost_type)

        grad_norms.append(sum(np.linalg.norm(dWs[l]) + np.linalg.norm(dBs[l])
                              for l in range(D)))

        # The last feed entry is the bare output vector of the network.
        batch_costs = [loss(np.asarray(feeds[i][D], dtype=float),
                            np.atleast_1d(Y[i]), cost_type) for i in batch]
        costs.append(float(np.mean(batch_costs)))

        # Momentum step: the velocity accumulates past updates and is then
        # applied to the parameters.
        for l in range(D):
            VW[l] = momentum*VW[l] - alpha*dWs[l]
            VB[l] = momentum*VB[l] - alpha*dBs[l]
            W[l] = W[l] + VW[l]
            B[l] = B[l] + VB[l]

        if print_costs and epoch % report_every == 0:
            print(f'epoch: {epoch}')
            print(f'           cost: {costs[epoch]}')

    return W,B,costs,grad_norms
        