In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def shuffle_examples(X,Y):
  """Reorder X and Y with one shared random permutation.

  A single permutation of ``range(len(X))`` is drawn from NumPy's global
  random state and used to index both inputs, so row i of X and row i of Y
  end up at the same new position.

  Assumes X and Y accept integer-array indexing (e.g. NumPy arrays) and
  that Y has at least ``len(X)`` entries -- TODO confirm with callers.

  Returns the tuple ``(X_shuffled, Y_shuffled)``.
  """
  order = np.random.permutation(len(X))
  X_shuffled = X[order]
  Y_shuffled = Y[order]
  return X_shuffled, Y_shuffled

def split_train_validation_test(X,Y,p_train,p_train_val):
  """Cut X and Y into consecutive train / validation / test slices.

  ``p_train`` and ``p_train_val`` are each multiplied by ``len(X)`` and
  truncated with ``int`` to obtain the two cut indices. The training part
  is everything before the first cut, the validation part lies between the
  two cuts, and the test part is everything from the second cut onwards.
  No shuffling happens here.

  Returns ``X_train, Y_train, X_val, Y_val, X_test, Y_test``.
  """
  total = len(X)
  train_end = int(p_train*total)
  val_end = int(p_train_val*total)
  train_part = slice(None, train_end)
  val_part = slice(train_end, val_end)
  test_part = slice(val_end, None)
  return (X[train_part], Y[train_part],
          X[val_part], Y[val_part],
          X[test_part], Y[test_part])

def scale(X):
  """Standardise each column of X to zero mean and unit standard deviation.

  Mean and standard deviation are taken along axis 0. A column whose
  standard deviation is exactly 0 (a constant feature) would otherwise be
  divided by zero and become NaN; for such columns the divisor is replaced
  by 1, so the scaled column is all zeros.

  Returns ``(X_scaled, mu, st)``. The returned ``st`` is the divisor that
  was actually used (zeros already replaced by 1), so it can be reused
  safely to scale new data, e.g. in ``scale_predict``.
  """
  mu = np.mean(X,axis=0)
  st = np.std(X,axis=0)
  # Guard against constant features: dividing by a zero std yields NaN/inf.
  st = np.where(st == 0, 1.0, st)
  return (X-mu)/st,mu,st

def sigmoid(x):
  """Logistic function 1/(1+exp(-x)), evaluated without overflow.

  The direct formula overflows inside ``exp`` for large negative x. Here
  the value is computed as ``exp(-log(1 + exp(-x)))``, with the inner
  logarithm supplied by ``np.logaddexp(0, -x)``, which stays finite for
  inputs of any magnitude.

  Works element-wise on arrays and on scalars.
  """
  # log(1 + exp(-x)) == logaddexp(0, -x); negating and exponentiating
  # gives 1 / (1 + exp(-x)).
  return np.exp(-np.logaddexp(0, -x))

def relu(x):
  """Rectified linear unit: the element-wise maximum of 0 and x."""
  floor = 0
  return np.maximum(floor, x)

def softmax(X):
  """Row-wise softmax of a 2-D array of scores.

  Each row is shifted by its own maximum before exponentiation. The shift
  cancels in the ratio, so the result is mathematically unchanged, but the
  largest exponent becomes ``exp(0) = 1`` and large scores no longer
  overflow to ``inf`` (which used to give ``inf/inf = NaN``).

  The sum runs over axis 1, so X is expected to have one example per row
  and one score per column. Returns an array of the same shape whose rows
  sum to 1.
  """
  shifted = X - np.max(X, axis=1, keepdims=True)
  exps = np.exp(shifted)  # computed once and reused for the normaliser
  return exps / np.sum(exps, axis=1, keepdims=True)

def sigmoid_p(x):
  """Derivative of the logistic function: s*(1-s) with s = sigmoid(x)."""
  s = sigmoid(x)
  return s*(1-s)

def relu_p(x):
  """Derivative used for ReLU: 1 where x >= 0 and 0 elsewhere.

  Note that the value at exactly x == 0 is 1.
  """
  non_negative = x >= 0
  # Multiplying the boolean mask by 1 converts it to integers.
  return non_negative*1

def error_msr(Y,Y_hat):
  """Halved mean squared error between targets Y and predictions Y_hat.

  All squared differences are summed (over every entry, not per row) and
  the total is divided by ``2*len(Y)``.
  """
  residual = Y_hat - Y
  squared_total = np.sum(np.square(residual))
  return squared_total/(2*len(Y))

def error_cross_entropy(Y,Y_hat):
  """Cross-entropy between targets Y and predicted probabilities Y_hat.

  Computes ``-sum(Y * log(Y_hat)) / len(Y)`` over all entries.

  Predictions are clipped from below to machine epsilon before the
  logarithm. Without this, a predicted probability of exactly 0 gives
  ``log(0) = -inf``; paired with a target of 0 that becomes
  ``0 * -inf = NaN`` and corrupts the whole sum. With clipping, such
  entries contribute 0, and a confident wrong prediction (target 1,
  prediction 0) yields a large finite loss instead of ``inf``.
  Predictions above epsilon are unaffected.
  """
  floor = np.finfo(float).eps
  safe_Y_hat = np.clip(Y_hat, floor, None)
  return -np.sum(Y*np.log(safe_Y_hat))/len(Y)

def norm_W(W):
  """Sum of squared entries over W[1], W[2], ... (index 0 is skipped).

  Returns 0 when W has no entries beyond index 0.
  """
  return sum(np.sum(layer**2) for layer in W[1:])

def error(Y,Y_hat,cat,W,reg,la):
  """Total cost: a data term plus an optional squared-weight penalty.

  Parameters
    Y, Y_hat : targets and predictions handed to the data term.
    cat      : truthy -> use error_cross_entropy, otherwise error_msr.
    W        : list of weight arrays, only read when ``reg`` is truthy.
    reg      : truthy -> add ``la*norm_W(W)/len(Y)`` to the data term.
    la       : penalty coefficient.
  """
  data_term = error_cross_entropy if cat else error_msr
  J = data_term(Y,Y_hat)
  if not reg:
    return J
  return J + la*norm_W(W)/len(Y)

def accuracy(Y_hat,y_1d):
  """Fraction of examples whose highest-scoring column matches the label.

  Y_hat is 2-D (one row per example); the predicted category of a row is
  the index of its largest entry. y_1d is a one-dimensional array holding
  the true category of each example, e.g. ``[0,2,1,3,0,2]``.
  """
  predicted = np.argmax(Y_hat,axis=1)
  n_correct = np.sum(predicted == y_1d)
  return n_correct/len(y_1d)
  
def h(x,act):
  """Apply the activation function named by ``act`` to ``x``.

  Recognised names: 'sigmoid', 'relu', 'identity', 'tanh', 'softmax'.

  Raises
    ValueError: if ``act`` is not one of the recognised names. An
      unrecognised name used to return the string 'Problem', which then
      travelled through the forward pass and failed later with an
      unrelated-looking error.
  """
  if act == 'sigmoid':
    return sigmoid(x)
  if act == 'relu':
    return relu(x)
  if act == 'identity':
    return x
  if act == 'tanh':
    return np.tanh(x)
  if act == 'softmax':
    return softmax(x)
  raise ValueError('Unknown activation function: {!r}'.format(act))

def h_p(x,act):
  """Derivative of the activation named by ``act``, evaluated at ``x``.

  Recognised names: 'sigmoid', 'relu', 'identity', 'tanh'. For 'identity'
  the scalar 1 is returned, which broadcasts against arrays. There is no
  'softmax' branch, so 'softmax' is treated like any unknown name.

  Raises
    ValueError: if ``act`` is not one of the recognised names. An
      unrecognised name used to return the string 'Problem', which only
      failed once it was multiplied into a gradient.
  """
  if act == 'sigmoid':
    return sigmoid_p(x)
  if act == 'relu':
    return relu_p(x)
  if act == 'identity':
    return 1
  if act == 'tanh':
    # d/dx tanh(x) = 1 / cosh(x)^2
    return 1/(np.cosh(x))**2
  raise ValueError('No derivative available for activation: {!r}'.format(act))

def As_Zs(X,W,b,act):
  """Forward pass: collect every layer's pre-activation and activation.

  For each layer index l = 1 .. len(b)-1 the pre-activation is
  ``previous_activation @ W[l] + b[l]`` and the activation is
  ``h(pre_activation, act[l])``. Index 0 of W, b and act is never read.

  Returns ``(A, Z)`` where ``A[0]`` is X, ``Z[0]`` is the placeholder 0,
  and ``A[l]`` / ``Z[l]`` belong to layer l.
  """
  activations = [X]
  pre_activations = [0]  # placeholder so list indices line up with layers
  for layer in range(1,len(b)):
    z = np.matmul(activations[-1],W[layer]) + b[layer]
    pre_activations.append(z)
    activations.append(h(z,act[layer]))
  return activations, pre_activations

def gradients(A,Z,act,W,Y):
  """Backpropagate through the network and return the parameter gradients.

  Parameters
    A, Z : per-layer activations and pre-activations as produced by
           ``As_Zs`` (``A[0]`` is the input, ``A[-1]`` the prediction).
    act  : activation name per layer; only hidden-layer entries are read
           here, through ``h_p``.
    W    : list of weight arrays with a placeholder at index 0.
    Y    : targets, same shape as ``A[-1]``.

  The error signal at the output layer is taken to be
  ``(Y_hat - Y)/len(Y)``.
  NOTE(review): this is the form obtained for e.g. softmax with
  cross-entropy or identity with the halved squared error; confirm it is
  intended for any other output activation in use.

  Returns ``(DJ_DW, DJ_Db)``, two lists indexed like ``W`` with a 0
  placeholder at index 0.
  """
  last = len(W) - 1
  DJ_DW = [0]*(last + 1)
  DJ_Db = [0]*(last + 1)
  DJ_DZ = (A[-1] - Y)/len(Y)
  for l in range(last, 0, -1):
    DJ_DW[l] = np.matmul(A[l-1].T, DJ_DZ)
    DJ_Db[l] = np.sum(DJ_DZ, axis=0)
    if l > 1:
      # Push the error signal back to layer l-1. This is skipped for the
      # first layer: the gradient with respect to the input is never used.
      DJ_DA = np.matmul(DJ_DZ, W[l].T)
      DJ_DZ = DJ_DA*h_p(Z[l-1], act[l-1])
  return DJ_DW, DJ_Db

def update_parameters(W,b,DJ_DW,DJ_Db,c,reg,la,m):
  """Take one gradient-descent step on the weights and biases.

  Parameters
    W, b         : parameter lists (index 0 is a placeholder and is skipped).
    DJ_DW, DJ_Db : gradients of the data term, indexed like W and b.
    c            : learning rate.
    reg          : truthy -> include the gradient of the weight penalty.
    la, m        : penalty coefficient and number of examples; the penalty
                   gradient for layer l is ``2*la*W[l]/m``, matching the
                   ``la*norm_W(W)/len(Y)`` term added by ``error``.

  The penalty gradient is evaluated at the weights *before* the step, so
  the update is ``W - c*(dJ/dW + 2*la*W/m)``. It used to be evaluated on
  the already-updated weights, which is not the gradient of the cost being
  minimised. Biases are never regularised.

  The lists W and b are modified in place and also returned.
  """
  for l in range(1,len(b)):
    step = c*DJ_DW[l]
    if reg:
      step = step + 2*c*la*W[l]/m
    W[l] = W[l] - step
    b[l] = b[l] - c*DJ_Db[l]
  return W, b

def initialize_W_and_b(n):
  """Create random weights and zero biases for layer sizes ``n``.

  For each pair of consecutive sizes (fan_in, fan_out) the weight array has
  shape ``(fan_in, fan_out)``, drawn with ``np.random.randn`` and divided
  by ``sqrt(fan_in)``; the bias is a zero vector of length fan_out.

  Returns ``(W, b)``; both lists start with a 0 placeholder at index 0 so
  that index l refers to layer l.
  """
  W, b = [0], [0]
  for fan_in, fan_out in zip(n[:-1], n[1:]):
    W.append(np.random.randn(fan_in,fan_out)/np.sqrt(fan_in))
    b.append(np.zeros(fan_out))
  return W, b

def steepest(n,act,X,Y,epochs,c,cat,reg,la):
  """Train the network with full-batch gradient descent.

  Parameters
    n      : layer sizes, passed to ``initialize_W_and_b``.
    act    : activation name per layer (index 0 unused).
    X, Y   : training inputs and targets.
    epochs : number of gradient steps.
    c      : learning rate.
    cat    : truthy -> cross-entropy cost, otherwise halved squared error.
    reg    : truthy -> add the squared-weight penalty to cost and updates.
    la     : penalty coefficient.

  Returns ``(W, b, J_list)`` where ``J_list[i]`` is the cost measured
  before the i-th update.
  """
  W, b = initialize_W_and_b(n)
  J_list = []
  for i in range(epochs):
    A, Z = As_Zs(X,W,b,act)
    Y_hat = A[-1]
    J_list.append(error(Y,Y_hat,cat,W,reg,la))
    # Use the Y argument here. The previous code passed a lowercase `y`,
    # which is not defined in this function and so resolved to whatever
    # global of that name existed (or raised NameError).
    DJ_DW, DJ_Db = gradients(A,Z,act,W,Y)
    W, b = update_parameters(W,b,DJ_DW,DJ_Db,c,reg,la,len(Y))
  return W, b, J_list

def predict(X,W,b,act):
  """Run a forward pass and return the last layer's activation."""
  activations, _ = As_Zs(X,W,b,act)
  return activations[-1]

def scale_predict(X,X_train_mean,X_train_std,W,b,act):
  """Standardise X with the given mean and std, then call ``predict``.

  The mean and std are presumably the ones returned by ``scale`` on the
  training data -- verify against callers.
  """
  X_scaled = (X-X_train_mean)/X_train_std
  return predict(X_scaled,W,b,act)

def labels_1d_2d(y,n_c):
  """One-hot encode integer labels.

  ``y`` is reshaped to a column of length ``len(y)`` and compared against
  the row ``0 .. n_c-1``; the boolean result is converted to integers.
  Returns an array of shape ``(len(y), n_c)`` with a 1 in column ``y[i]``
  of row i and 0 elsewhere.
  """
  class_row = np.arange(n_c).reshape(1,n_c)
  label_column = y.reshape(len(y),1)
  matches = class_row == label_column
  return 1*matches