In [None]:
import numpy as np
# m: number of samples
# n: feature dimension
# c: number of classes
# X: m x n matrix, each row is an observation
# Y: m x c matrix of class labels, one-hot coded
# W: n x c weight 
# b: bias with c entries, one per class (e.g. shape (c,) or 1 x c, so it broadcasts across the rows of X.W)
# lambd: 1 x 1 regularization parameter
# alpha: 1 x 1 learning rate
def update_weight_vectorized(X, Y, W, b, lambd, alpha, delta=1):
    """Run one gradient-descent step on the multiclass hinge loss.

    The loss is the mean over samples of
    ``sum_{j != y} max(0, s_j - s_y + delta)`` plus the L2 penalty
    ``lambd / 2 * sum(W ** 2)``.

    Parameters
    ----------
    X : ndarray, m x n
        Observations, one per row.
    Y : ndarray, m x c
        One-hot class labels.
    W : ndarray, n x c
        Weights. Updated in place (must be a floating-point array).
    b : ndarray with c entries
        Bias, one entry per class. Any layout with c elements is accepted
        ((c,), 1 x c or c x 1). Updated in place, keeping its shape.
    lambd : float
        L2 regularization strength.
    alpha : float
        Learning rate.
    delta : float, optional
        Required margin between the true-class score and every other
        class score. Defaults to 1.

    Returns
    -------
    tuple
        ``(W, b, hinge_loss)`` where ``W`` and ``b`` are the (in-place)
        updated parameters and ``hinge_loss`` is the regularized loss
        evaluated *before* the update.
    """
    m = X.shape[0]

    # View the bias as a 1 x c row so it is added per class no matter how
    # the caller laid it out; a c x 1 column would not broadcast against
    # the m x c score matrix.
    scores = np.dot(X, W) + np.reshape(b, (1, -1))  # m x c
    onehot = np.asarray(Y).astype(scores.dtype, copy=False)  # m x c

    # True-class score per sample. A sum over the one-hot row is used
    # instead of a max, because a max would return 0 whenever the true
    # score is negative. keepdims gives an m x 1 column that broadcasts
    # across the classes of the same sample.
    label_score = np.sum(scores * onehot, axis=1, keepdims=True)

    # Margin violations; the true class itself is zeroed out by (1 - Y).
    margins = (scores - label_score + delta) * (1 - onehot)  # m x c
    mask = margins > 0  # m x c, True where the margin is violated

    hinge_loss = np.sum(np.maximum(margins, 0)) / m
    hinge_loss += lambd * np.sum(W ** 2) / 2

    # Gradient w.r.t. the scores: +1 for every violating class and
    # -(number of violations) for the true class of that sample.
    violations = np.sum(mask, axis=1, keepdims=True)  # m x 1
    dS = mask - onehot * violations  # m x c
    dW = np.dot(X.T, dS) / m
    dW += lambd * W
    db = np.sum(dS, axis=0) / m  # (c,)

    W -= alpha * dW
    # Match the caller's bias layout so the in-place update is valid.
    b -= alpha * np.reshape(db, np.shape(b))

    return (W, b, hinge_loss)

def predict(X, W, b):
    """Predict the class index of every observation.

    Parameters
    ----------
    X : ndarray, m x n
        Observations, one per row.
    W : ndarray, n x c
        Weights.
    b : ndarray with c entries
        Bias, one entry per class. Any layout with c elements is accepted
        ((c,), 1 x c or c x 1).

    Returns
    -------
    ndarray, shape (m,)
        Index of the highest-scoring class for each row of ``X``; ties go
        to the lowest class index.
    """
    # View the bias as a 1 x c row so it is added per class; a c x 1
    # column would not broadcast against the m x c score matrix.
    scores = np.dot(X, W) + np.reshape(b, (1, -1))  # m x c
    return np.argmax(scores, axis=1)
    

    