In [None]:
import numpy as np
import matplotlib.pyplot as plt
from numpy import linalg as LA
import plotly
import plotly.graph_objects as go
import random, time
from collections import Counter

In [None]:
# to complete problem 1, fill in the following with your code

def ReLU(x,deriv = False):
    """Rectified linear unit applied elementwise.

    Parameters
    ----------
    x : array_like
        Pre-activation values.
    deriv : bool
        When True, return the Jacobian instead of the activation.

    Returns
    -------
    numpy.ndarray
        ``max(0, x)`` elementwise when ``deriv`` is False; otherwise the
        diagonal Jacobian with 1 where ``x > 0`` and 0 elsewhere.
    """
    x = np.asarray(x)
    if deriv:
        return np.diag((x>0).astype(int))
    # np.maximum keeps the result an ndarray (the other activations return
    # arrays too), so downstream code can rely on attributes such as .shape.
    return np.maximum(x,0)

def Linear(x,deriv = False):
    """Identity activation.

    Returns ``x`` itself when ``deriv`` is False, and the
    ``len(x)``-by-``len(x)`` identity matrix (its Jacobian) when ``deriv``
    is True.
    """
    return np.eye(len(x)) if deriv else x

def Sigmoid(x,deriv = False):
    """Logistic sigmoid ``1/(1+exp(-x))`` applied elementwise.

    With ``deriv`` set, returns the diagonal Jacobian whose entries are
    ``sigma(x) * (1 - sigma(x))``.
    """
    activation = 1/(1+np.exp(-x))
    if not deriv:
        return activation
    # The Jacobian is diagonal because each output depends on one input only.
    return np.diag(activation*(1-activation))

def Squared(x,deriv = False):
    """Elementwise square activation.

    Returns ``x**2`` when ``deriv`` is False, and the diagonal Jacobian
    ``diag(2*x)`` when ``deriv`` is True.
    """
    if not deriv:
        return x**2
    doubled = 2*np.asarray(x)
    return np.diag(doubled)

def Softmax(x,deriv = False):
    """Softmax of a 1-D vector of logits.

    Parameters
    ----------
    x : array_like
        Logits.
    deriv : bool
        When True, return the Jacobian instead of the probabilities.

    Returns
    -------
    numpy.ndarray
        The probability vector ``exp(x) / sum(exp(x))`` when ``deriv`` is
        False; otherwise the Jacobian ``diag(g) - outer(g, g)`` with
        ``g = Softmax(x)``.
    """
    x = np.asarray(x, dtype=float)
    if deriv:
        gx = Softmax(x)
        return np.diag(gx) - np.outer(gx,gx)
    if x.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return x
    # Softmax is invariant to adding a constant to every logit; subtracting
    # the maximum keeps every exponent <= 0 so np.exp cannot overflow.
    exps = np.exp(x - np.max(x))
    return exps/np.sum(exps)


# Note: Nx and y are always numpy arrays (for 'bce' they always have only one entry)
# when deriv = False the output must be a number and when deriv = True the output must be a vector
def loss(Nx,y,cost_type,deriv = False):
    """Evaluate a loss, or its gradient with respect to the prediction.

    Parameters
    ----------
    Nx : numpy.ndarray
        Network output for one sample.
    y : numpy.ndarray
        Target for the same sample.
    cost_type : str
        ``'se'`` (squared error), ``'ce'`` (cross-entropy) or ``'bce'``
        (binary cross-entropy; only entry 0 of ``Nx`` and ``y`` is read).
    deriv : bool
        When True, return the gradient with respect to ``Nx``.

    Returns
    -------
    The loss value when ``deriv`` is False, otherwise the gradient vector.

    Raises
    ------
    ValueError
        If ``cost_type`` is not one of the supported names.
    """
    if cost_type == 'se':
        if deriv:
            return 2*(Nx-y).T
        return LA.norm(Nx-y,2)**2
    if cost_type == 'ce':
        if deriv:
            # d/dNx_i of -sum_i y_i*log(Nx_i) is -y_i/Nx_i; an elementwise
            # division avoids building an n-by-n diagonal matrix.
            return -y/Nx
        return -np.inner(y,np.log(Nx))
    if cost_type == 'bce':
        if deriv:
            return np.array([-y[0]/Nx[0]+(1-y[0])/(1-Nx[0])])
        return -y[0]*np.log(Nx[0])-(1-y[0])*np.log(1-Nx[0])
    # Falling through would return None and fail later with a confusing error.
    raise ValueError(f"unknown cost_type {cost_type!r}; expected 'se', 'ce' or 'bce'")

In [None]:
def feedforward(W,B,G,x):
    """Propagate one input through the network and record every layer.

    Parameters
    ----------
    W, B : sequences
        Per-layer weight matrices and bias vectors; layer ``l`` computes
        ``s = W[l] @ x + B[l]``.
    G : sequence of callables
        Activations; ``G[l+1]`` is applied to the pre-activation of layer
        ``l``. ``G[0]`` is never called here.
    x : numpy.ndarray
        Input vector.

    Returns
    -------
    list
        ``len(W) + 1`` entries. Entry ``l < len(W)`` is ``[input, s]`` for
        layer ``l``; the last entry is ``[output]``, the final activation.
        With no layers the result is ``[[x]]``.
    """
    feeds = []
    for l in range(len(W)):
        s = W[l]@x+B[l]
        feeds.append([x,s])
        # The activation of this layer becomes the input of the next one.
        x = G[l+1](s)
    # After the loop x already holds the output activation, so it is reused
    # rather than evaluating the last activation a second time.
    feeds.append([x])
    return feeds

In [None]:
def deltas(X_feeds,Y,W,B,G,verbose = False, cost_type = 'se'):
    """Back-propagate the loss gradient through every layer, per sample.

    For each key ``i`` of ``X_feeds`` the result holds a list with one
    entry per layer, ordered from the first layer to the last. The last
    entry is ``loss'(output, Y[i,:]) @ G[D](s, True)``; each earlier entry
    is ``(next_delta @ W[l+1]) @ G[l+1](s_l, True)``.

    ``B`` is accepted but not used. ``verbose`` prints intermediate shapes.
    """
    D = len(W)
    deltas_dict = {}
    for i, sample_feeds in X_feeds.items():
        # Output layer: start from the derivative of the loss.
        l = D-1
        xD = sample_feeds[D][0]
        sl = sample_feeds[l][1]
        delta = loss(xD,Y[i,:],cost_type,deriv = True)@G[D](sl,True)
        if verbose:
            print(f'l = {l}')
            print(f'the shape of x({D}): {xD.shape}')
            print(f'the shape of s({l}): {sl.shape}')
            print(f'the shape of delta({l}): {delta.shape}')
        deltas_dict[i] = [delta]
        # Hidden layers, walking from the second-to-last layer down to 0.
        for l in range(D-2,-1,-1):
            sl = sample_feeds[l][1]
            if verbose:
                print(f'the shape of s({l}): {sl.shape}')
                print(f'the shape of delta({l}): {deltas_dict[i][-1].shape}')
                print(f'the shape of W[{l+1}]: {W[l+1].shape}')
            delta = (deltas_dict[i][-1]@W[l+1])@G[l+1](sl,True)
            if verbose:
                print(f'l = {l}')
                print(f'the shape of s({l}): {sl.shape}')
                print(f'the shape of deltas[{i}][{l}]: {deltas_dict[i][-1].shape}')
                print(f'the shape of W[{l+1}]: {W[l+1].shape}')
                print(f'delta = {delta}')
            deltas_dict[i].append(delta)
        # Collected output-first; flip so index l matches layer l.
        deltas_dict[i] = deltas_dict[i][::-1]
    return deltas_dict

In [None]:
def grads(X,Y,W,B,G,batch, lambda_ = 0, verbose = False,cost_type = 'se'):
    """Mini-batch gradients of the regularised cost w.r.t. weights and biases.

    Parameters
    ----------
    X, Y : numpy.ndarray
        Inputs and targets; sample ``i`` is read as ``X[i,:]``.
    W, B, G
        Network weights, biases and activations, passed on to
        ``feedforward`` and ``deltas``.
    batch : iterable of int
        Row indices of the samples to use.
    lambda_ : float
        Regularisation strength; adds ``2*lambda_*W[l]`` to each weight
        gradient.
    verbose : bool
        Print intermediate values for the last sample of the batch.
    cost_type : str
        Loss name forwarded to ``deltas``.

    Returns
    -------
    (dWs, dBs, X_feeds)
        Per-layer weight gradients, per-layer bias gradients, and the
        feed-forward record of every sample in the batch keyed by index.
    """
    batch = list(batch)
    D = len(W)
    # Average over the samples actually used. Dividing by len(X) would shrink
    # the gradient by batch_size/len(X) whenever the batch is a strict subset.
    # max(..., 1) keeps an empty batch from dividing by zero.
    n_batch = max(len(batch),1)
    X_feeds = {i: feedforward(W,B,G,X[i,:]) for i in batch}
    X_deltas = deltas(X_feeds,Y,W,B,G,verbose,cost_type)
    dWs = []
    dBs = []
    for l in range(D):
        if verbose and batch:
            # Diagnostics are shown for the last sample of the batch only.
            i = batch[-1]
            print(f'the shape of s({l}): {X_feeds[i][l][0]}')
            print(f'the shape of delta({l}): {X_deltas[i][l]}')
        dWs.append(sum([np.outer(X_deltas[i][l],X_feeds[i][l][0]) for i in batch])/n_batch + 2*lambda_*W[l])
        dBs.append(sum([X_deltas[i][l] for i in batch])/n_batch)
    return dWs,dBs,X_feeds

In [None]:
#max_iters should be a multiple of 100
def fit(X,Y,arch,G,alpha = 1e-9, momentum = .01, batch_size = 100, 
        lambda_ = 0, max_iters = 100,verbose = False, cost_type = 'se',print_costs = True):
    """Train the network with mini-batch gradient descent plus momentum.

    Parameters
    ----------
    X, Y : numpy.ndarray
        Training inputs and targets, one sample per row.
    arch : sequence of int
        Layer widths; layer ``l`` maps ``arch[l]`` inputs to ``arch[l+1]``
        outputs.
    G : sequence of callables
        Activations; the number of layers is ``len(G) - 1``.
    alpha : float
        Learning rate.
    momentum : float
        Momentum coefficient applied to the previous update.
    batch_size : int
        Samples drawn (without replacement) per iteration; capped at
        ``len(X)``.
    lambda_ : float
        Regularisation strength forwarded to ``grads``.
    max_iters : int
        Last iteration index; the loop runs for ``max_iters + 1`` iterations.
    verbose : bool
        Forwarded to ``grads`` and enables an extra progress message.
    cost_type : str
        Loss name forwarded to ``grads`` and ``loss``.
    print_costs : bool
        Print the batch cost about 30 times over the run.

    Returns
    -------
    (W, B, costs, grad_norms)
        Trained weights and biases, the per-iteration mean batch cost, and
        the per-iteration sum of gradient norms over all layers.
    """
    W,B,VW,VB = [],[],[],[]
    D,m = len(G)-1,len(X)
    for l in range(D):
        # NOTE(review): np.random.normal takes a standard deviation as its
        # second argument; 2/(fan_in+fan_out) looks like a Glorot variance.
        # Left unchanged -- confirm whether a square root was intended.
        W.append(np.random.normal(0, 2/(arch[l+1]+arch[l]), size=(arch[l+1], arch[l])))
        B.append(np.zeros(arch[l+1]))
        VW.append(np.zeros(W[l].shape))
        VB.append(np.zeros(B[l].shape))
    # random.sample raises if asked for more items than the population holds.
    batch_size = min(batch_size,m)
    # Guard against a zero period (and a modulo by zero) when max_iters < 30.
    print_every = max(1,max_iters//30)
    epochs = 0
    costs = []
    grad_norms = []
    while epochs<=max_iters:
        batch = random.sample(range(m),batch_size)
        dWs,dBs,feeds = grads(X,Y,W,B,G,batch,lambda_,verbose,cost_type)
        # Track the size of the gradients, not of the parameters themselves.
        grad_norm = 0
        for l in range(D):
            grad_norm += LA.norm(dWs[l])
            grad_norm += LA.norm(dBs[l])
        grad_norms.append(grad_norm)
        cost = 0
        for i in batch:
            Nx = feeds[i][D][0]
            cost += loss(Nx,Y[i],cost_type)/len(batch)
        costs.append(cost)
        if epochs%print_every==0 and print_costs:
            print(f'epoch: {epochs}')
            print(f'           cost: {cost}')
        if verbose:
            print('computed grads')
        for l in range(D):
            # Momentum update: blend the previous step with the new gradient.
            VW[l] = momentum*VW[l]-alpha*dWs[l]
            W[l] = W[l] + VW[l]
            VB[l] = momentum*VB[l]-alpha*dBs[l]
            B[l] = B[l] + VB[l]
        epochs+=1
    return W,B,costs,grad_norms
        