# Importing packages

In [1]:
import numpy as np
import pandas as pd
import gdown
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from tqdm.notebook import trange, tqdm
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
# import tensorflow as ts
from tensorflow import keras
from sklearn.metrics import accuracy_score, mean_squared_error, log_loss

In [2]:
!nvidia-smi

Thu Mar  9 16:19:45 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 517.37       Driver Version: 517.37       CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro P2000       WDDM  | 00000000:65:00.0  On |                  N/A |
| 48%   38C    P8     8W /  75W |    531MiB /  5120MiB |     15%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Data preprocessing - fashion MNIST

In [7]:
(X_train, y_train), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X_train, Y_train, stratify = Y_train, random_state=7, test_size = 0.1)

In [9]:
X_train_scaled = X_train.reshape(len(X_train),28*28)/255.0
X_val_scaled = X_val.reshape(len(X_val),28*28)/255.0
X_test_scaled = X_test.reshape(len(X_test),28*28)/255.0

Encoding labels

In [10]:
enc = OneHotEncoder()
y_train_enc = enc.fit_transform(np.expand_dims(y_train,1)).toarray()
y_val_enc = enc.fit_transform(np.expand_dims(y_val,1)).toarray()
y_test_enc = enc.fit_transform(np.expand_dims(y_test,1)).toarray()
print(y_train_enc.shape, y_val_enc.shape, y_test_enc.shape)

(54000, 10) (6000, 10) (10000, 10)


Batching

In [11]:
X_train_batch = X_train_scaled[0:1000,:]
y_train_batch = y_train_enc[0:1000,:]

In [12]:
print(X_train_scaled.shape, y_train_enc.shape, X_train_batch.shape, y_train_batch.shape )

(54000, 784) (54000, 10) (1000, 784) (1000, 10)


# Initializations

In [13]:
class Initialization:
  
  def initialize(self, sizes):
    self.W = {}
    self.B = {}
    self.sizes = sizes
    self.nh = len(sizes)-2

    for i in range(self.nh + 1):
      self.W[i+1] = np.random.randn(self.sizes[i+1], self.sizes[i])
      self.B[i+1] = np.zeros((self.sizes[i+1],1))

    return(self.W, self.B)
    
initial = Initialization()  #Instantiation for the class initialization

# Hyperparameters

In [None]:
class Hyperparameters:
  epochs = 20
  eta    = 1e-5

hyper = Hyperparameters()

# Activation functions

In [14]:
class ActivationFunctions:

  def linear(self,W,b,x):
    return (np.dot(W,x) + b)

  def sigmoid(self, z):
    return 1/(1+np.exp(-z))

  def softmax(self, z):
    return np.exp(z)/np.sum(np.exp(z))
  
  def grad_sigmoid(self,z):
    return z*(1-z)


act = ActivationFunctions()

# Loss Functions

In [15]:
class LossFunctions:
  def MSE(self, y, y_pred):
    return (np.sum((y-y_pred)**2))/len(y)
  
  def cross_entropy(self, y, y_pred):
    # clipping y_pred to prevent log(0) errors
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    
    ce_loss = np.mean(np.sum(np.multiply(-y,np.log(y_pred)), axis = 0))
    #ce_loss = log_loss(y,y_pred)
    #print(log_loss(y,y_pred))
    return(ce_loss)
loss_functions = LossFunctions()

# Learning Algorithms

In [16]:
class Optimizer:

  def learning_algoithm(self, W, B, dW, dB, nh, batch_size, learning_rate, optimizer):
    if optimizer == 'SGD':
      for i in range(nh + 1):
        W[i+1] -= learning_rate * (dW[i+1])
        B[i+1] -= learning_rate * (dB[i+1])

    if optimizer == 'MBGD':
      for i in range(nh + 1):
        W[i+1] -= learning_rate * (dW[i+1]/batch_size)
        B[i+1] -= learning_rate * (dB[i+1]/batch_size)

    # if optimizer == 'NAG':

    # if optimizer == 'RMSPROP':

    # if optimizer == 'ADAM':
      
    # if optimizer == 'NADAM':
    

    return(W, B)

opt = Optimizer()

# Model

In [21]:
class Model(Optimizer):
    
  def forward_prop(self, x,y):
    self.A = {}
    self.H = {}
    self.x = x
    self.y = y
    loss_batch = 0
    self.H[0] = self.x.T

    for i in range(self.nh + 1):
      self.A[i+1] = np.matmul(self.W[i+1], self.H[i]) + self.B[i+1]
      self.H[i+1] = act.sigmoid(self.A[i+1])

    y_pred = act.softmax(self.A[i+1])
    #print(y_pred)
    #print('Y = ', self.y.shape, 'Y_pred =', y_pred.shape)
    loss_batch = loss_functions.cross_entropy(self.y.T ,y_pred)
    self.loss_batch_store.append(loss_batch)
    return(y_pred)
  
  def grad(self, x, y):

    y_pred = self.forward_prop(x,y)
    self.dW = {}
    self.dB = {}
    self.dA = {}
    self.dH = {}
    self.dA[self.nh + 1] = y_pred - y.T

    for i in range(self.nh + 1 , 0, -1):
      self.dW[i]      = np.matmul(self.dA[i],self.H[i-1].T)
      self.dB[i]      = np.sum(self.dA[i], axis = 1).reshape(-1,1)
      self.dH[i-1]    = np.matmul(self.W[i].T,self.dA[i])
      self.dA[i-1]    = np.multiply(self.dH[i-1],act.grad_sigmoid(self.H[i-1]))
    return (self.dW, self.dB)
  
  def fit(self, X, Y, epochs, learning_rate, hidden_layer_sizes, batch_size, optimizer):
    
    # x = X[0:batch_size,:]
    # y = Y[0:batch_size,:]
    #self.X = X
    self.loss_epoc_store = []
    self.nx = X.shape[1]
    self.ny = Y.shape[1]
    self.nh = len(hidden_layer_sizes)
    self.sizes = [self.nx] + hidden_layer_sizes + [self.ny]

    if optimizer == 'SGD':
      batch_size =1
  
    step_size = len(X)/batch_size
    self.W, self.B = initial.initialize(self.sizes)

    for i in trange(epochs, total=epochs, unit="epoch"):
      loss_epoc = 0
      self.loss_batch_store = []
      for i in range(0,len(X),batch_size):
        self.x = X[i:i+batch_size,:]
        self.y = Y[i:i+batch_size,:]
        #print('y =', self.y.shape)
        self.grad(self.x,self.y)
        self.W, self.B = opt.learning_algoithm(self.W, self.B, self.dW, self.dB, self.nh, batch_size, learning_rate, optimizer)

      loss_epoc = np.sum(self.loss_batch_store)/step_size
      print('training loss = ', loss_epoc)
      self.loss_epoc_store.append(loss_epoc)
    
    plt.plot(self.loss_epoc_store)
    plt.xlabel('Epochs')
    plt.ylabel('log_loss')
    plt.show()

model = Model()

In [28]:
model.fit(X_train_scaled, y_train_enc, 100, 0.5, [32,32], 32, optimizer = 'SGD')
# For SGD batch-size is automatically set to 1

  0%|          | 0/100 [00:00<?, ?epoch/s]

  return 1/(1+np.exp(-z))


training loss =  1.3414073415714458
training loss =  1.3253043604616102
training loss =  1.2635766800895483
training loss =  1.2442205977833594
training loss =  1.3563671907448636
training loss =  1.271568020895393
training loss =  1.029582413577692
training loss =  1.240338926814476
training loss =  1.1118690146216534
training loss =  1.1379535350080858
training loss =  1.0592659329038032
training loss =  1.0600633741247982
training loss =  1.1436962417058625
training loss =  1.1903268556031068
training loss =  1.2005085700898528
training loss =  1.2215159150301376


KeyboardInterrupt: 