In [6]:
import numpy as np
from numpy import log, max, exp, sum, clip
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder

# get pred_y, cost, grad (softmax)
def multi_logistic_cost_gradient(X, W, Y, eps=1e-15):
    z = X @ W
    z_max = max(z, axis=1, keepdims=True)   
    exp_z = exp(z - z_max)                   
    pred_Y = exp_z / sum(exp_z, axis=1, keepdims=True)
    pred_Y = clip(pred_Y, eps, 1 - eps)
    cost   = sum(-(Y * log(pred_Y)))/ X.shape[0]
    gradient = X.T @ (pred_Y-Y)/ X.shape[0]
    return pred_Y, cost, gradient

In [7]:
# prepare data

X = np.array([[1.2,-0.4,0.8],[-0.6,2,-0.5],[0.3,-1.2,1.7],[2.1,0.5,-0.8]])
encoder = PolynomialFeatures(1)
P = encoder.fit_transform(X)

y = np.array([3,1,2,3]).reshape(-1,1)
onehot_encoder=OneHotEncoder(sparse_output=False)
Y = onehot_encoder.fit_transform(y)

In [8]:
# learning
learning_rate = 0.1
W = np.array([[0,0,0],[0.02,-0.01,0.03],[-0.05,0.04,0.01],[0.03,0.02,-0.02]])

pred_Y, cost, gradient = multi_logistic_cost_gradient(P, W, Y, eps=1e-15)
print('Initial Cost =', cost)
print('Initial Gradient =', gradient)
print('Initial Weights =', W)

num_iters = 10000
cost_vec = np.zeros(num_iters+1)
cost_vec[0] = cost

for i in range(1, num_iters+1):

    # update w
    W = W - learning_rate*gradient

    # compute updated cost and new gradient
    pred_Y, cost, gradient = multi_logistic_cost_gradient(P, W, Y, eps=1e-15)
    cost_vec[i] = cost

    if(i % 1000 == 0):
        print('Iter', i, ': cost =', cost)
    if(i<3):
        print('Iter', i, ': cost =', cost)
        print('Gradient =', gradient)
        print('Weights =', W)

print('Final Cost =', cost)
print('Final Weights =', W)

Initial Cost = 1.1244898406408839
Initial Gradient = [[ 0.08366715  0.08136882 -0.16503597]
 [ 0.4076001   0.15942862 -0.56702872]
 [-0.45593088  0.39346652  0.06246436]
 [ 0.24832965 -0.33422675  0.08589709]]
Initial Weights = [[ 0.    0.    0.  ]
 [ 0.02 -0.01  0.03]
 [-0.05  0.04  0.01]
 [ 0.03  0.02 -0.02]]
Iter 1 : cost = 1.0182703437840153
Gradient = [[ 0.07058601  0.07498791 -0.14557393]
 [ 0.37910344  0.15217022 -0.53127367]
 [-0.42368968  0.36423061  0.05945907]
 [ 0.2239035  -0.31016662  0.08626313]]
Weights = [[-0.00836671 -0.00813688  0.0165036 ]
 [-0.02076001 -0.02594286  0.08670287]
 [-0.00440691  0.00065335  0.00375356]
 [ 0.00516703  0.05342267 -0.02858971]]
Iter 2 : cost = 0.9266115384670267
Gradient = [[ 0.05955033  0.06871184 -0.12826217]
 [ 0.35209914  0.14474907 -0.49684821]
 [-0.39384519  0.33742193  0.05642326]
 [ 0.20269269 -0.28813985  0.08544715]]
Weights = [[-0.01542532 -0.01563567  0.03106099]
 [-0.05867035 -0.04115988  0.13983024]
 [ 0.03796206 -0.03576971 

In [None]:
# prediction
x_test = np.array([-0.1]).reshape(-1,1)
P_test = encoder.fit_transform(x_test)

z = P_test @ W
z_max = max(z, axis=1, keepdims=True)   
exp_z = exp(z - z_max)  

Y_test = exp_z / sum(exp_z, axis=1, keepdims=True)
print("z: " + str(z))
print("Y_test: " + str(Y_test))

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 4 is different from 2)