## Classification

* Softmax Function
    * Ensures the values are non-negative (exponentiating the values)
    * Produces smooth probability outputs between 0 and 1 (each row sums to 1)
    * Differentiable
    * The model maps its inputs directly to its outputs via a single affine transformation followed by a softmax operation

In [None]:
import numpy as np
import tensorflow as tf
from sklearn.metrics import accuracy_score

In [None]:
# Load the Fashion-MNIST train/test splits through the Keras dataset helper.
fashion_mnist = tf.keras.datasets.fashion_mnist
(train_features, train_labels), (test_features, test_labels) = fashion_mnist.load_data()

In [None]:
# Mini-batch size and the model's input / output widths.
batch_size = 256
num_inputs = 784
num_outputs = 10

# Flatten every example to a row of num_inputs values, then convert to
# float32 and rescale by 1/255.
train_features = tf.cast(tf.reshape(train_features, (-1, num_inputs)), tf.float32) / 255.0
test_features = tf.cast(tf.reshape(test_features, (-1, num_inputs)), tf.float32) / 255.0


In [None]:
def load_data(data, batch_size):
    """Wrap in-memory tensors in a shuffled, batched ``tf.data.Dataset``.

    Args:
        data: Tensor or tuple of tensors passed to ``from_tensor_slices``;
            the dataset yields slices along their first dimension.
        batch_size: Number of consecutive elements grouped into each batch.

    Returns:
        The dataset after shuffling (1000-element buffer) and batching.
    """
    return (
        tf.data.Dataset.from_tensor_slices(data)
        .shuffle(buffer_size=1000)
        .batch(batch_size)
    )

# next(iter(data_iter))

In [None]:
def init_params(num_inputs, num_outputs):
    """Create the trainable weight matrix and bias vector.

    Args:
        num_inputs: Number of input features (rows of the weight matrix).
        num_outputs: Number of classes (columns of the weight matrix and
            length of the bias).

    Returns:
        Tuple ``(w, b)`` of ``tf.Variable``s: ``w`` has shape
        ``(num_inputs, num_outputs)`` drawn from N(0, 0.01^2) and ``b`` is a
        zero vector of length ``num_outputs``.
    """
    initial_weights = tf.random.normal(
        shape=(num_inputs, num_outputs), mean=0, stddev=0.01
    )
    initial_bias = tf.zeros(num_outputs)
    return (
        tf.Variable(initial_weights, trainable=True),
        tf.Variable(initial_bias, trainable=True),
    )
    
## Softmax Function
def softmax(x):
    """Apply a numerically stable softmax along axis 1.

    Args:
        x: 2-D tensor of raw scores (logits), one row per example.

    Returns:
        Tensor of the same shape whose rows are non-negative and sum to 1.
    """
    # Softmax is invariant to adding a constant to every entry of a row, so
    # subtracting the row maximum leaves the result unchanged while keeping
    # exp() from overflowing to inf (which would produce NaN after division).
    shifted = x - tf.reduce_max(x, axis=1, keepdims=True)
    x_exp = tf.exp(shifted)
    exp_norm = tf.reduce_sum(x_exp, axis=1, keepdims=True)
    return x_exp / exp_norm

In [None]:
def log_reg_multiclass(x, w, b):
    """Forward pass of multiclass (softmax) logistic regression.

    Multiclass classification uses softmax where the binary case would use
    a sigmoid.

    Args:
        x: Input batch. It is reshaped to rows of length ``w.shape[0]``, so
            un-flattened inputs are accepted (e.g. 28x28 images with a
            784-row ``w`` -- assumed from the surrounding notebook).
        w: 2-D weight matrix, features along the first axis and classes
            along the second.
        b: Bias that is broadcast-added to ``x @ w`` (presumably one entry
            per class -- confirm against the initialiser).

    Returns:
        The softmax of the affine scores, one row of class probabilities
        per example.
    """
    flat_inputs = tf.reshape(x, (-1, w.shape[0]))
    logits = tf.add(tf.matmul(flat_inputs, w), b)
    return softmax(logits)

In [None]:
def crossentropy_loss(yhat, y):
    """Per-example cross-entropy between predicted probabilities and labels.

    Args:
        yhat: Tensor of predicted class probabilities; the last axis indexes
            the classes.
        y: Integer class labels, one per row of ``yhat``.

    Returns:
        1-D tensor holding ``-log(p_correct)`` for every example, where
        ``p_correct`` is the probability assigned to the true class.
    """
    # depth corresponds to the number of classes
    y_one_hot = tf.one_hot(y, depth=yhat.shape[-1])

    # Keep only the probability predicted for the correct class of each row.
    # boolean_mask is documented to take a boolean mask, so cast explicitly.
    prob = tf.boolean_mask(yhat, tf.cast(y_one_hot, tf.bool))

    # Guard against log(0): a probability that underflows to exactly zero
    # would give an infinite loss and NaN gradients.
    prob = tf.clip_by_value(prob, 1e-12, 1.0)

    # Cross-entropy is -sum(y * log(yhat)); with one-hot y only the
    # correct-class term survives.
    return -tf.math.log(prob)

In [None]:
def sgd(params, grads, lr, batch_size):
    """Apply one minibatch SGD step to every parameter in place.

    Args:
        params: Iterable of variables exposing ``assign_sub``.
        grads: Iterable of gradients, paired positionally with ``params``.
        lr: Learning rate.
        batch_size: Divisor applied to each gradient before the update.
    """
    for variable, gradient in zip(params, grads):
        step = lr * gradient / batch_size
        variable.assign_sub(step)

In [None]:
# Training hyper-parameters.
num_epochs = 50
lr = 0.05
batch_size = 256

w, b = init_params(num_inputs, num_outputs)
data_iter = load_data((train_features, train_labels), batch_size)

for epoch in range(num_epochs):
    for X, y in data_iter:
        with tf.GradientTape() as tape:
            yhat = log_reg_multiclass(X, w, b)
            loss = crossentropy_loss(yhat, y)

        # `loss` is a per-example vector; tape.gradient sums the gradients of
        # all its elements, so dw / db are sums over the batch.
        dw, db = tape.gradient(loss, [w, b])

        # Update the parameters. Divide by the actual number of examples in
        # this batch: the final batch of an epoch can be smaller than
        # `batch_size`, and dividing by the nominal size would shrink its step.
        sgd([w, b], [dw, db], lr, X.shape[0])

    # Report the summed per-example loss over the whole training set.
    training_loss = crossentropy_loss(log_reg_multiclass(train_features, w, b), train_labels)
    print(f"epoch : {epoch}, training_loss : {tf.reduce_sum(training_loss)}")

In [None]:
# Class probabilities for the test set, then the most likely class per row.
test_label_pred = log_reg_multiclass(test_features, w, b)
test_pred = np.argmax(test_label_pred, axis=1)
# Fraction of test examples whose predicted class equals the true label.
accuracy_score(y_true=test_labels, y_pred=test_pred)

## Tensorflow Concise Implementation

In [None]:
import numpy as np
import tensorflow as tf
from sklearn.metrics import accuracy_score

In [None]:
# Reload the Fashion-MNIST train/test splits for the Keras-based version.
fashion_mnist = tf.keras.datasets.fashion_mnist
(train_features, train_labels), (test_features, test_labels) = fashion_mnist.load_data()

In [None]:
# Mini-batch size and the model's input / output widths.
batch_size = 256
num_inputs = 784
num_outputs = 10

# Convert the freshly loaded features to float32 and rescale by 1/255 so the
# softmax layer is not fed large unscaled values. No reshape is done here:
# the Keras model starts with a Flatten layer that expects (28, 28) inputs.
train_features = tf.cast(train_features, tf.float32) / 255.0
test_features = tf.cast(test_features, tf.float32) / 255.0

In [None]:
# Softmax regression as a Keras model: flatten each 28x28 input, then a single
# dense layer with 10 softmax outputs whose kernel starts from N(0, 0.1^2).
initializer = tf.initializers.RandomNormal(stddev=0.1)
log_reg_tf = tf.keras.Sequential(
    [
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(
            10,
            kernel_initializer=initializer,
            activation='softmax',
        ),
    ]
)

In [None]:
# Plain SGD with a fixed step size.
optimizer = tf.keras.optimizers.SGD(learning_rate=0.05)
# Integer labels against predicted probabilities (the model already applies
# softmax, hence from_logits=False, which is also the default).
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

In [None]:
# Training hyper-parameters. `lr` is kept for reference only: the step size
# actually used is the one the optimizer was constructed with.
num_epochs = 50
lr = 0.05
batch_size = 256

data_iter = load_data((train_features, train_labels), batch_size)

for epoch in range(num_epochs):
    for X, y in data_iter:
        with tf.GradientTape() as tape:
            # Keep the forward pass entirely in TensorFlow ops and feed the
            # predicted probabilities straight into the loss. Taking an
            # argmax (or leaving TF for NumPy) here would cut the gradient
            # path and leave every gradient as None.
            yhat = log_reg_tf(X, training=True)
            ce_loss = loss(y, yhat)

        grads = tape.gradient(ce_loss, log_reg_tf.trainable_variables)

        optimizer.apply_gradients(zip(grads, log_reg_tf.trainable_variables))

    # Loss over the full training set: labels first, predicted probabilities
    # second, both covering the same examples. The loss object already
    # reduces to a single scalar.
    training_loss = loss(train_labels, log_reg_tf(train_features))
    print(f"epoch : {epoch}, training_loss : {float(training_loss)}")