In [28]:
import numpy as np

def get_z(X, w): return X @ w
def relu(Z): return np.maximum(0,Z)
def softmax(Z):
    e = np.exp(Z - np.max(Z, axis=-1, keepdims=True))
    return e / np.sum(e, axis=-1, keepdims=True)
def sigmoid(Z):
    return 1 / (1 + np.exp(-Z))
def X_bias(X):
    ones=np.ones((X.shape[0], 1))
    return np.hstack((ones, X))
def err_softmax(A, y): return A - y
def err_relu(A, err_next, w_next):
    return err_next @ w_next[1:].T * (A > 0)
def err_sigmoid(A, err_next, w_next):
    return (err_next @ w_next[1:].T) * (A * (1 - A))
def err_mse(A, y): return 2 * (A - y)
def grad(err, X): return X.T @ err / X.shape[0]
def update_weight(w, lr, grad): return w - lr * grad

In [29]:
from sklearn.preprocessing import OneHotEncoder
# IF MULTICLASS -> ONEHOT y !

X = np.array([[1.2,-0.4,0.8],[-0.6,2,-0.5],[0.3,-1.2,1.7],[2.1,0.5,-0.8]])
y = np.array([3,1,2,3]).reshape(-1,1)
y_onehot = OneHotEncoder(sparse_output=False).fit_transform(y)
W1 = np.array([[0,0,0],[0.02,-0.01,0.03],[-0.05,0.04,0.01],[0.03,0.02,-0.02]])
W2 = W1.copy()

X_with_bias = X_bias(X)
Z1 = get_z(X_with_bias, W1) 
A1 = relu(Z1)

A1_bias = X_bias(A1)
Z2 = get_z(A1_bias, W2)
A2 = softmax(Z2)

E2 = err_softmax(A2, y_onehot)
G2 = grad(E2, A1_bias)
W2_new = update_weight(W2, 0.1, G2)

E1 = err_relu(A1, E2, W2)
G1 = grad(E1, X_with_bias)
W1_new = update_weight(W1, 0.1, G1)

print(W1_new)
print(W2_new)

[[-0.00016792 -0.00125257 -0.0010004 ]
 [ 0.02032425 -0.00924846  0.02722287]
 [-0.04946549  0.03749486  0.01092531]
 [ 0.0293399   0.02062628 -0.02024931]]
[[-0.00833061 -0.00833205  0.01666266]
 [ 0.01845714 -0.0086132   0.03015606]
 [-0.04873108  0.03936466  0.00936642]
 [ 0.02936574  0.01906595 -0.0184317 ]]


In [30]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.metrics import accuracy_score

dataset = load_digits()
X = dataset.data
y = dataset.target
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

poly = PolynomialFeatures(1)
X_train_poly = poly.fit_transform(X_train)
X_val_poly = poly.fit_transform(X_val)
X_test_poly = poly.fit_transform(X_test)

encoder = OneHotEncoder(sparse_output=False)
ytr_onehot = encoder.fit_transform(y_train.reshape(-1,1))
yval_onehot = encoder.fit_transform(y_val.reshape(-1,1))
yts_onehot = encoder.fit_transform(y_test.reshape(-1,1))

hidden_layer_size = 32
output_layer_size = ytr_onehot.shape[1]
lr = 0.01
num_iters = 20000

np.random.seed(42)
W1 = np.random.randn(X_train_poly.shape[1], hidden_layer_size)
W2 = np.random.randn(hidden_layer_size+1, hidden_layer_size)
W3 = np.random.randn(hidden_layer_size+1, output_layer_size)

def get_z(X, w): return X @ w
def relu(Z): return np.maximum(0,Z)
def softmax(Z):
    e = np.exp(Z - np.max(Z, axis=-1, keepdims=True))
    return e / np.sum(e, axis=-1, keepdims=True)
def X_bias(X):
    ones=np.ones((X.shape[0], 1))
    return np.hstack((ones, X))
def err_softmax(A, y): return A - y
def err_relu(A, err_next, w_next):
    return err_next @ w_next[1:].T * (A > 0)
def err_mse(A, y): return 2 * (A - y)
def cost_softmax(y, A): return -np.mean(np.sum(y * np.log(A + 1e-8), axis=1))
def grad(err, X): return X.T @ err / X.shape[0]
def update_weight(w, lr, grad): return w - lr * grad

for i in range(1,num_iters+1):
    
    Z1 = get_z(X_train_poly, W1)
    A1 = relu(Z1)
    A1_bias = X_bias(A1)
    Z2 = get_z(A1_bias, W2)
    A2 = relu(Z2)
    A2_bias = X_bias(A2)
    Z3 = get_z(A2_bias, W3)
    A3 = softmax(Z3)
    
    cost = cost_softmax(ytr_onehot, A3)
    
    E3 = err_softmax(A3, ytr_onehot)
    G3 = grad(E3, A2_bias)
    W3 = update_weight(W3, lr, G3)
    E2 = err_relu(A2, E3, W3)
    G2 = grad(E2, A1_bias)
    W2 = update_weight(W2, lr, G2)
    E1 = err_relu(A1, E2, W2)
    G1 = grad(E1, X_train_poly)
    W1 = update_weight(W1, lr, G1)
    
    if i % 1000 == 0:
        print(f"iter {i}: cost = {cost}")

def predict(X, W1, W2, W3):
    Z1 = get_z(X, W1)
    A1 = relu(Z1)
    A1_bias = X_bias(A1)
    Z2 = get_z(A1_bias, W2)
    A2 = relu(Z2)
    A2_bias = X_bias(A2)
    Z3 = get_z(A2_bias, W3)
    return softmax(Z3)

A3_train = predict(X_train_poly, W1, W2, W3)
A3_val = predict(X_val_poly, W1, W2, W3)

print(f"tr acc: {accuracy_score(y_train, np.argmax(A3_train, axis=1))}")
print(f"val acc: {accuracy_score(y_val, np.argmax(A3_val, axis=1))}")


iter 1000: cost = 0.10841846755807866
iter 2000: cost = 0.004399206130531385
iter 3000: cost = 0.0017630665129825228
iter 4000: cost = 0.0011347690167500995
iter 5000: cost = 0.0008412035250322298
iter 6000: cost = 0.0006684415625880973
iter 7000: cost = 0.0005557941048293148
iter 8000: cost = 0.0004763981615622828
iter 9000: cost = 0.00041733777933869334
iter 10000: cost = 0.00037157644791852663
iter 11000: cost = 0.0003349265923796161
iter 12000: cost = 0.00030476453736512555
iter 13000: cost = 0.0002796745481219137
iter 14000: cost = 0.000258504448029585
iter 15000: cost = 0.00024042568748116217
iter 16000: cost = 0.00022469509573726934
iter 17000: cost = 0.00021095479069977468
iter 18000: cost = 0.00019883614568093978
iter 19000: cost = 0.0001880587564387737
iter 20000: cost = 0.00017840802975499016
tr acc: 1.0
val acc: 0.8629629629629629
