## Classification

* Softmax Function
    * Exponentiating the values ensures they are non-negative
    * Normalizing by their sum gives a smooth probability output between 0 and 1 (the outputs sum to 1)
    * Differentiable
    * This model maps our inputs directly to our outputs via a single affine transformation, followed by a softmax operation

In [1]:
import numpy as np
import tensorflow as tf
from sklearn.metrics import accuracy_score

In [2]:
# Load the Fashion-MNIST train and test splits through the Keras datasets helper;
# each split is unpacked into a (features, labels) pair.
(train_features, train_labels), (test_features, test_labels) = tf.keras.datasets.fashion_mnist.load_data()

In [3]:
# Problem dimensions and minibatch size shared by the cells below.
batch_size = 256
num_inputs = 784    # length of each flattened example (used by the reshape below)
num_outputs = 10    # width of the model output

# Flatten every example into a num_inputs-long row, convert to float32 and
# scale by 1/255.
train_features = tf.cast(tf.reshape(train_features, (-1, num_inputs)), tf.float32) / 255.0
test_features = tf.cast(tf.reshape(test_features, (-1, num_inputs)), tf.float32) / 255.0


In [4]:
def load_data(data, batch_size, shuffle_buffer_size=1000):
    """Build a shuffled, batched ``tf.data.Dataset`` from in-memory tensors.

    Args:
        data: Tensor, or nested structure of tensors such as
            ``(features, labels)``, that is sliced along its first axis.
        batch_size: Number of consecutive elements combined into one batch.
        shuffle_buffer_size: Size of the shuffle buffer. Defaults to 1000,
            the value that used to be hard-coded. A buffer smaller than the
            dataset only shuffles locally; pass the number of examples to get
            a full uniform shuffle.

    Returns:
        A ``tf.data.Dataset`` yielding shuffled batches. The final batch may
        contain fewer than ``batch_size`` elements.
    """
    dataset = tf.data.Dataset.from_tensor_slices(data)
    dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)
    return dataset.batch(batch_size)

# next(iter(data_iter))

In [5]:
def init_params(num_inputs, num_outputs):
    """Create the trainable parameters of a single affine layer.

    Returns:
        (w, b): ``w`` is a ``(num_inputs, num_outputs)`` variable drawn from a
        normal distribution with mean 0 and stddev 0.01; ``b`` is a
        ``(num_outputs,)`` variable of zeros.
    """
    initial_weights = tf.random.normal(
        shape=(num_inputs, num_outputs), mean=0, stddev=0.01
    )
    initial_bias = tf.zeros(num_outputs)
    return (
        tf.Variable(initial_weights, trainable=True),
        tf.Variable(initial_bias, trainable=True),
    )
    
## Softmax Function
def softmax(x):
    '''
    Row-wise softmax.

    x : 2-D tensor of logits (raw affine outputs), one row per example.
    Returns a tensor of the same shape whose rows are non-negative and sum to 1.
    '''
    # Softmax is unchanged by subtracting a constant from every entry of a row.
    # Shifting by the row maximum keeps every exponent <= 0, so tf.exp cannot
    # overflow to inf (which would otherwise turn the row into NaNs).
    shifted = x - tf.reduce_max(x, axis=1, keepdims=True)
    x_exp = tf.exp(shifted)
    exp_norm = tf.reduce_sum(x_exp, axis=1, keepdims=True)
    return x_exp / exp_norm

In [6]:
def log_reg_multiclass(x, w, b):
    '''
    Forward pass of multiclass logistic (softmax) regression.

    Multiclass classification uses softmax where the binary case uses sigmoid.
    Inputs:
    x : batch of examples; reshaped here to (num_examples, w.shape[0])
    w : weight matrix, e.g. shape (784, 10)
    b : bias added to every row of x @ w
    Returns softmax(x @ w + b), one row of class probabilities per example.
    '''
    flat_x = tf.reshape(x, (-1, w.shape[0]))
    logits = tf.matmul(flat_x, w) + b
    return softmax(logits)

In [7]:
def crossentropy_loss(yhat, y, eps=1e-12):
    '''
    Per-example cross-entropy: -log of the probability predicted for the correct class.

    yhat : (num_examples, num_classes) predicted class probabilities
    y    : (num_examples,) integer class labels
    eps  : lower bound applied to the selected probabilities before the log
    Returns a (num_examples,) tensor of losses (not reduced).
    '''
    # depth corresponds to number of classes
    y_one_hot = tf.one_hot(y, depth=yhat.shape[-1])

    # Keep only the probability assigned to the correct class of each example.
    # boolean_mask is documented to take a boolean mask, so cast explicitly.
    prob = tf.boolean_mask(yhat, tf.cast(y_one_hot, tf.bool))

    # A probability that underflowed to 0 would give -log(0) = inf and
    # non-finite gradients; clamp it to a tiny positive value first.
    prob = tf.clip_by_value(prob, eps, 1.0)

    # ce = -sum_k y_k * log(yhat_k); y is one-hot, so only the correct-class
    # term -log(yhat_correct) survives.
    return -tf.math.log(prob)

In [8]:
def sgd(params, grads, lr, batch_size):
    """Apply one minibatch SGD step in place.

    Each parameter is decreased by ``lr * grad / batch_size`` through its
    ``assign_sub`` method; parameters and gradients are paired positionally.
    """
    for variable, gradient in zip(params, grads):
        step = lr * gradient / batch_size
        variable.assign_sub(step)

In [9]:
# Training hyperparameters for the from-scratch model.
num_epochs = 50
lr = 0.05
batch_size = 256

w, b = init_params(num_inputs, num_outputs)
data_iter = load_data((train_features, train_labels), batch_size)

for epoch in range(num_epochs):
    for X, y in data_iter:
        with tf.GradientTape() as tape:
            yhat = log_reg_multiclass(X, w, b)
            loss = crossentropy_loss(yhat, y)

        # Calculate gradients. `loss` holds one value per example, so the
        # gradient returned here is that of the summed batch loss.
        dw, db = tape.gradient(loss, [w, b])

        # Update parameters. Normalise by the number of examples actually in
        # this batch: the final batch of an epoch can be smaller than
        # batch_size, and dividing by the nominal size would shrink its step.
        sgd([w, b], [dw, db], lr, X.shape[0])

    # Report the summed per-example loss over the whole training set.
    training_loss = crossentropy_loss(log_reg_multiclass(train_features, w, b), train_labels)
    print(f"epoch : {epoch}, training_loss : {tf.reduce_sum(training_loss)}")

Instructions for updating:
Use tf.identity instead.
epoch : 0, training_loss : 41365.953125
epoch : 1, training_loss : 36169.8828125
epoch : 2, training_loss : 33681.37109375
epoch : 3, training_loss : 32128.697265625
epoch : 4, training_loss : 31172.6796875
epoch : 5, training_loss : 30483.875
epoch : 6, training_loss : 29824.80859375
epoch : 7, training_loss : 29350.865234375
epoch : 8, training_loss : 28793.83203125
epoch : 9, training_loss : 28415.0546875
epoch : 10, training_loss : 28299.59765625
epoch : 11, training_loss : 27803.828125
epoch : 12, training_loss : 27720.1484375
epoch : 13, training_loss : 27376.7734375
epoch : 14, training_loss : 27151.88671875
epoch : 15, training_loss : 27040.375
epoch : 16, training_loss : 26899.296875
epoch : 17, training_loss : 26642.458984375
epoch : 18, training_loss : 26580.419921875
epoch : 19, training_loss : 26423.1015625
epoch : 20, training_loss : 26243.16796875
epoch : 21, training_loss : 26280.525390625
epoch : 22, training_loss : 2

In [10]:
# Class probabilities for every test example from the from-scratch model.
test_label_pred = log_reg_multiclass(test_features, w, b)
# Predicted class = index of the largest probability in each row.
test_pred = np.argmax(test_label_pred, axis = 1)
# scikit-learn's signature is accuracy_score(y_true, y_pred); accuracy is
# symmetric, so the value is unchanged, but the documented order is used.
accuracy_score(test_labels, test_pred)

0.8424

## TensorFlow Concise Implementation

In [11]:
import numpy as np
import tensorflow as tf
from sklearn.metrics import accuracy_score

In [12]:
# Reload the Fashion-MNIST train and test splits so this section starts from
# the raw arrays again; each split is unpacked into a (features, labels) pair.
(train_features, train_labels), (test_features, test_labels) = tf.keras.datasets.fashion_mnist.load_data()

In [13]:
# Problem dimensions and minibatch size for the concise implementation.
batch_size = 256
num_inputs = 784    # length of each flattened example (used by the reshape below)
num_outputs = 10

# Flatten each example to a num_inputs-long row, then cast to float32 and
# divide by 255.
train_features = tf.cast(tf.reshape(train_features, (-1, num_inputs)), tf.float32) / 255.0
test_features = tf.cast(tf.reshape(test_features, (-1, num_inputs)), tf.float32) / 255.0

In [18]:
# Keras version of the model: a flatten step followed by one dense layer with
# 10 outputs and a softmax activation.
# NOTE(review): the kernel stddev here is 0.1 — confirm this is intended.
initializer = tf.initializers.RandomNormal(stddev = 0.1)
log_reg_tf = tf.keras.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(10, kernel_initializer=initializer, activation='softmax'),
])

In [19]:
# Cross-entropy loss object; the training cell calls it with the integer labels
# and the model's softmax outputs.
loss = tf.keras.losses.SparseCategoricalCrossentropy()
# Plain SGD optimizer with the learning rate fixed here at construction time.
optimizer = tf.keras.optimizers.SGD(learning_rate=0.05)

In [20]:
# Batched training iterator; relies on a load_data helper being defined by an
# earlier cell of the notebook.
# NOTE(review): batch_size is read here before it is reassigned two lines below,
# so the value used comes from an earlier cell — confirm that is intended.
data_iter = load_data((train_features, train_labels), batch_size)

num_epochs = 50
# NOTE(review): lr is not read anywhere in this cell; the optimizer was built
# with learning_rate=0.05 directly.
lr = 0.05
batch_size = 256

for epoch in range(num_epochs):
    for X, y in data_iter:
        # Record the forward pass so gradients of the batch loss can be taken.
        with tf.GradientTape() as tape:
            yhat = log_reg_tf(X, training = True)
            ce_loss = loss(y, yhat)
        
        # Gradients of the batch loss w.r.t. every trainable variable of the model.
        grads = tape.gradient(ce_loss, log_reg_tf.trainable_variables)

        # Let the optimizer apply one update step from the (gradient, variable) pairs.
        optimizer.apply_gradients(zip(grads, log_reg_tf.trainable_variables))

    # End-of-epoch loss over the full training set.
    yhat = log_reg_tf(train_features)
    training_loss = loss(train_labels, yhat)
    # NOTE(review): the printed values are below 1, so training_loss looks like an
    # already-reduced scalar and tf.reduce_sum is presumably a no-op — confirm.
    print(f"epoch : {epoch}, training_loss : {tf.reduce_sum(training_loss)}")

epoch : 0, training_loss : 0.7303739786148071
epoch : 1, training_loss : 0.6314913034439087
epoch : 2, training_loss : 0.5772733688354492
epoch : 3, training_loss : 0.5531703233718872
epoch : 4, training_loss : 0.532720148563385
epoch : 5, training_loss : 0.5153622031211853
epoch : 6, training_loss : 0.5039476156234741
epoch : 7, training_loss : 0.5009585022926331
epoch : 8, training_loss : 0.49021270871162415
epoch : 9, training_loss : 0.48346146941185
epoch : 10, training_loss : 0.476067453622818
epoch : 11, training_loss : 0.46961405873298645
epoch : 12, training_loss : 0.4683119058609009
epoch : 13, training_loss : 0.4660100042819977
epoch : 14, training_loss : 0.4672020375728607
epoch : 15, training_loss : 0.4557550251483917
epoch : 16, training_loss : 0.4568466246128082
epoch : 17, training_loss : 0.4495241641998291
epoch : 18, training_loss : 0.4520266354084015
epoch : 19, training_loss : 0.4443742334842682
epoch : 20, training_loss : 0.44346895813941956
epoch : 21, training_los

In [21]:
# Class probabilities for every test example from the Keras model.
test_label_pred = log_reg_tf(test_features)
# Predicted class = index of the largest probability in each row.
test_pred = np.argmax(test_label_pred, axis = 1)
# scikit-learn's signature is accuracy_score(y_true, y_pred); accuracy is
# symmetric, so the value is unchanged, but the documented order is used.
accuracy_score(test_labels, test_pred)

0.8421