In [152]:
import numpy as np

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [153]:
iris = load_iris(as_frame=True)
X = iris.data
y = iris.target

X.reset_index()
y.reset_index()

Unnamed: 0,index,target
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
145,145,2
146,146,2
147,147,2
148,148,2


In [154]:
# adding bias term
X_with_bias = np.c_[np.ones(len(X)), X]

In [155]:
X_temp, X_test, y_temp, y_test = train_test_split(X_with_bias, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

In [156]:
# Scaling the training instances
mean = X_train[:, 1:].mean(axis=0)
std = X_train[:, 1:].std(axis=0)

X_train[:, 1:] = (X_train[:, 1:] - mean) / std
X_valid[:, 1:] = (X_valid[:, 1:] - mean) / std
X_test[:, 1:] = (X_test[:, 1:] - mean) / std

In [157]:
# np.diag - returns a square matrix with 1 in its diagonal.
# y is a vector with unique labels [0,1,2]. We use them as indices into the diagonal
# matrix. For example diag[2] will return the third row of the diag matrix which is
# equal to [0,0,1]
def to_one_hot(y):
    return np.diag(np.ones(y.max() + 1))[y]

y_train_one_hot = to_one_hot(y_train)
y_valid_one_hot = to_one_hot(y_valid)
y_test_one_hot = to_one_hot(y_test)

In [158]:
def softmax(logits):
    exps = np.exp(logits)
    exp_sums = exps.sum(axis=1, keepdims=True)
    return exps / exp_sums

In [161]:
epochs_n = 5001
m = len(X_train)
num_classes = len(y_train.unique())
batch_size = 20
step = 0.1
epsilon = 1e-5
theta = np.random.randn(X_train.shape[1], num_classes)

for epoch in range(epochs_n):

  # compute cost over validation set
  if epoch % 1000 == 0:
    logits_valid = X_valid @ theta
    y_probas_valid = softmax(logits_valid)
    xenentropy_losses = -(y_valid_one_hot * np.log(y_probas_valid + epsilon))
    print (epoch, xenentropy_losses.sum(axis=1).mean())

  for iteration in range(m):
    indices = np.random.permutation(m)[:batch_size]    
    X_batch = X_train[indices]
    y_batch = y_train_one_hot[indices, :]

    # forward pass
    logits = X_batch @ theta
    y_probas = softmax(logits)

    # gradient softmax
    error = y_probas - y_batch
    grad = (1/m) * X_batch.T @ error

    theta = theta - step * grad

theta


0 1.5170985980346388
1000 0.2435495820970888
2000 0.29587306963754856
3000 0.34068831470097166
4000 0.3796862602530022
5000 0.41374130663670056


array([[  1.53512166,   9.60920184, -10.19997475],
       [ -4.41739406,   3.83429151,  -0.44632338],
       [  5.04185003,  -0.24836876,  -6.54605345],
       [ -6.77414938,  -7.27385862,  13.09728191],
       [ -5.96855746,  -1.22767668,   9.34836951]])

In [162]:
logits = X_valid @ theta
probas = softmax(logits)
predictions = probas.argmax(axis=1)

accuracy_score = (predictions == y_valid).mean()
accuracy_score

0.9166666666666666

In [166]:
# Adding l2 regularization

epochs_n = 5001
m = len(X_train)
num_classes = len(y_train.unique())
batch_size = 20
step = 0.1
epsilon = 1e-5
alpha = 0.01
theta = np.random.randn(X_train.shape[1], num_classes)

for epoch in range(epochs_n):

  # compute cost over validation set
  if epoch % 1000 == 0:
    logits_valid = X_valid @ theta
    y_probas_valid = softmax(logits_valid)
    xenentropy_losses = -(y_valid_one_hot * np.log(y_probas_valid + epsilon))
    # not regularizing the bias parameter in theta
    l2_loss = 1/2 * (theta[1:] ** 2 ).sum()
    total_loss = xenentropy_losses.sum(axis=1).mean() + alpha * l2_loss
    print (epoch, total_loss.round(4))

  for iteration in range(m):
    indices = np.random.permutation(m)[:batch_size]    
    X_batch = X_train[indices]
    y_batch = y_train_one_hot[indices, :]

    # forward pass
    logits = X_batch @ theta
    y_probas = softmax(logits)

    # gradient softmax
    error = y_probas - y_batch
    grad = (1/m) * X_batch.T @ error
    grad += np.r_[np.zeros([1, num_classes]), alpha * theta[1:]]

    theta = theta - step * grad

theta


0 1.1511
1000 0.3868
2000 0.3839
3000 0.383
4000 0.3831
5000 0.3831


array([[-0.65338463,  0.40931179, -0.87330157],
       [-0.57502341,  0.25379909,  0.32122432],
       [ 0.74273165, -0.4516422 , -0.29108945],
       [-0.88488757, -0.04927848,  0.93416605],
       [-0.82480092, -0.30155436,  1.12635528]])

In [167]:
logits = X_valid @ theta
probas = softmax(logits)
predictions = probas.argmax(axis=1)

accuracy_score = (predictions == y_valid).mean()
accuracy_score

0.9166666666666666

In [173]:
logits = X_test @ theta
probas = softmax(logits)
predictions = probas.argmax(axis=1)

accuracy_score = (predictions == y_test).mean()
accuracy_score

0.9666666666666667