In [None]:
import numpy as np
from numpy import log, max, exp, sum, clip
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder

# get pred_y, cost, grad (softmax)
def multi_logistic_cost_gradient(X, W, Y, eps=1e-15):
    """Evaluate a softmax (multinomial logistic) model on a batch.

    Parameters
    ----------
    X : array, one sample per row; multiplied as ``X @ W``.
    W : array of weights, one column per class.
    Y : array of targets with the same shape as ``X @ W``
        (presumably one-hot rows -- confirm against the caller).
    eps : float
        Probabilities are clipped into ``[eps, 1 - eps]`` so that the
        logarithm in the cost never sees an exact 0.

    Returns
    -------
    pred_Y : the clipped class probabilities, one row per sample.
    cost : summed cross-entropy ``-Y * log(pred_Y)`` divided by the
        number of rows of ``X``.
    gradient : ``X.T @ (pred_Y - Y)`` divided by the number of rows of
        ``X``; same shape as ``W``.
    """
    n_samples = X.shape[0]
    logits = X @ W

    # Shift every row by its own maximum before exponentiating: the softmax
    # value is unchanged, but the largest exponent becomes exp(0) = 1.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exp_shifted = np.exp(shifted)
    row_totals = np.sum(exp_shifted, axis=1, keepdims=True)
    pred_Y = np.clip(exp_shifted / row_totals, eps, 1 - eps)

    # Cross-entropy averaged over the samples.
    cost = np.sum(-(Y * np.log(pred_Y))) / n_samples

    # The clipped probabilities are also the ones used for the gradient.
    residual = pred_Y - Y
    gradient = (X.T @ residual) / n_samples
    return pred_Y, cost, gradient

In [3]:
# prepare data

# Five scalar inputs arranged as a single-feature column.
X = np.array([-1, 0, 0.5, 0.3, 0.8]).reshape(-1, 1)

# Degree-1 polynomial expansion of X; the result P is what the model is
# trained on.
encoder = PolynomialFeatures(degree=1)
P = encoder.fit_transform(X)

# Class labels (1, 2 or 3), one per sample, as a column for the encoder.
y = np.array([1, 1, 2, 3, 2]).reshape(-1, 1)

# Dense one-hot target matrix Y, one row per sample.
onehot_encoder = OneHotEncoder(sparse_output=False)
Y = onehot_encoder.fit_transform(y)

In [4]:
# learning: plain batch gradient descent on the softmax cost
learning_rate = 0.5
W = np.array([
    [0.5, 0.3, 0.2],
    [-1, 0.5, 0.1],
])

# Evaluate the starting point so the first update has a gradient to use.
pred_Y, cost, gradient = multi_logistic_cost_gradient(P, W, Y, eps=1e-15)
print('Initial Cost =', cost)
print('Initial Gradient =', gradient)
print('Initial Weights =', W)

num_iters = 10000
# cost_vec[0] is the initial cost, cost_vec[i] the cost after i updates.
cost_vec = np.zeros(num_iters + 1)
cost_vec[0] = cost

for i in range(1, num_iters + 1):
    # Step against the gradient computed from the previous weights.
    W = W - learning_rate * gradient

    # Re-evaluate with the updated weights; this gradient drives the next step.
    pred_Y, cost, gradient = multi_logistic_cost_gradient(P, W, Y, eps=1e-15)
    cost_vec[i] = cost

    # Report the cost on the first two iterations and on every 1000th one.
    if i < 3 or i % 1000 == 0:
        print('Iter', i, ': cost =', cost)
    # The first two iterations additionally show the gradient and weights.
    if i < 3:
        print('Gradient =', gradient)
        print('Weights =', W)

print('Final Cost =', cost)
print('Final Weights =', W)

Initial Cost = 0.8022881819072072
Initial Gradient = [[-0.03546935 -0.04792369  0.08339304]
 [ 0.13213239 -0.1401589   0.00802651]]
Initial Weights = [[ 0.5  0.3  0.2]
 [-1.   0.5  0.1]]
Iter 1 : cost = 0.7795683594693033
Gradient = [[-0.03270099 -0.03511072  0.06781171]
 [ 0.12473711 -0.12981954  0.00508243]]
Weights = [[ 0.51773468  0.32396185  0.15830348]
 [-1.0660662   0.57007945  0.09598675]]
Iter 2 : cost = 0.7607300780934951
Gradient = [[-0.03049513 -0.02461642  0.05511155]
 [ 0.11804638 -0.12085116  0.00280477]]
Weights = [[ 0.53408517  0.3415172   0.12439763]
 [-1.12843475  0.63498922  0.09344553]]
Iter 1000 : cost = 0.19604646547387247
Iter 2000 : cost = 0.12921272181148719
Iter 3000 : cost = 0.09584723302617415
Iter 4000 : cost = 0.07568248180854417
Iter 5000 : cost = 0.06225271831640409
Iter 6000 : cost = 0.05271896343741894
Iter 7000 : cost = 0.045629554089268366
Iter 8000 : cost = 0.040167222553487494
Iter 9000 : cost = 0.03583863738277689
Iter 10000 : cost = 0.0323294800

In [None]:
# prediction
# Test sample with two entries, matching the two rows of W.
# NOTE(review): the leading 1 presumably plays the role of the bias column
# that PolynomialFeatures(1) adds to the training inputs -- confirm.
x_test = np.array([1,-0.1])

z = x_test @ W
# Softmax with the same max-shift used during training: subtracting the
# largest logit leaves the probabilities unchanged but keeps np.exp from
# overflowing (which would otherwise yield inf/inf = nan).
Y_test = np.exp(z - np.max(z, axis=-1, keepdims=True))
Y_test = Y_test / np.sum(Y_test, axis=-1, keepdims=True)
print("z: " + str(z))
print("Y_test: " + str(Y_test))