In [1]:
import numpy as np 
import matplotlib.pyplot as plt

# install
## numpy
## matplotlib

## data load & preprocessing

In [2]:
from dataset.mnist import load_mnist

(train_raw_img, train_label), (test_raw_img, test_label) = load_mnist(flatten=False, normalize=False)
print(train_raw_img.shape)

(60000, 1, 28, 28)


In [3]:
# preprocessing (train & inference)

train_img = train_raw_img.reshape(len(train_raw_img.squeeze()), -1)
train_label = train_label.reshape(len(train_label), -1)

test_img = test_raw_img.reshape(len(test_raw_img.squeeze()), -1)
test_label = test_label.reshape(len(test_label), -1)

print(train_img.shape)
print(train_label.shape)
print(test_img.shape)
print(test_label.shape)

(60000, 784)
(60000, 1)
(10000, 784)
(10000, 1)


In [4]:
# normalization (set value 0 ~ 1)

train_img = train_img.astype('float')
train_img = train_img/255

test_img = test_img.astype('float')
test_img = test_img/255

## model

In [5]:

class Linear:
    def __init__(self, input_size=1, hidden_size=1):
        # Initialize weights and biases
        self.W = np.random.randn(input_size, hidden_size)
        self.b = np.zeros(hidden_size)
        
        # Store input size and hidden size
        self.input_size = input_size
        self.hidden_size = hidden_size
        
    def forward(self, x):
        # Calculate linear transformation
        self.x = x
        out = np.dot(x, self.W) + self.b
        
        return out
    
    def backward(self, dout, lr = 0.01, lamb=0):
        # Calculate gradients for weights and biases
        dW = np.dot(self.x.T, dout)
        db = np.sum(dout, axis=0)
        
        # Update weights and biases
        self.W -= lr * (dW + lamb*self.W)
        self.b -= lr * db
        
        # Calculate gradients for input
        # print(dout.shape, self.W.T.shape)
        dx = np.dot(dout, self.W.T)
        
        return dx


# [실습 1] activation function 구현

In [6]:
class ReLU:
    def __init__(self):
        self.mask = None
        
    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out
        
    def backward(self, dout):
        dx = dout
        dx[self.mask] = 0
        return dx
    

class LeakyReLU:
    def __init__(self, alpha=0.01):
        self.alpha = alpha
        self.mask = None
        
    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] *= self.alpha
        return out
        
    def backward(self, dout):
        dout[self.mask] *= self.alpha
        dx = dout
        return dx

class Sigmoid:
    def __init__(self):
        self.out = None
        
    def forward(self, x):
        out = 1 / (1 + np.exp(-x))
        self.out = out
        return out
        
    def backward(self, dout):
        dx = dout * self.out * (1 - self.out)
        return dx

class Tanh:
    def __init__(self):
        self.out = None
        
    def forward(self, x):
        out = np.tanh(x)
        self.out = out
        return out
        
    def backward(self, dout):
        dx = dout * (1 - self.out ** 2)
        return dx


In [7]:
class Softmax_with_CrossEntropy :
    def __init__(self) :
        # softmax
        self.softmax_x = None 
        self.softmax_out = None
        # crossEntropy
        self.pred = None
        self.target = None
        self.delta = 1e-7
        
    def softmax_forward(self, x) :
        self.softmax_x = x
        tmp = np.max(self.softmax_x, axis=1).reshape(-1, 1)
        self.softmax_out = np.exp(self.softmax_x-tmp)/np.sum(np.exp(self.softmax_x-tmp), axis=1).reshape(-1,1)
        return self.softmax_out
    
    def crossEntropy_forward(self, pred, target) :
        self.pred = pred
        self.target = target
        loss = -np.sum(self.target*np.log(self.pred+self.delta), axis=1)
        return loss
    
    def backward(self) :
        dout = (self.pred-self.target)/len(self.pred)
        return dout

## Train / Eval

In [8]:
# one_hot label 만드는 함수

def make_one_hot(labels) :
    a = []
    for label in labels :
        one_hot = np.zeros(10)
        one_hot[label] = 1
        a.append(one_hot)
    a = np.array(a)
    return a

# one_hot_labels = make_one_hot(train_label)
# print(train_label[0])
# print(one_hot_labels[0])

In [9]:
def eval(model, train_version = True) :
    if train_version :
        x = train_img
        labels = train_label.squeeze()
        print('In train dataset ... ')
    else : 
        x = test_img
        labels = test_label.squeeze()
        print('\nIn test dataset ... ')
    
    for layer in model.values() :
        if isinstance(layer, Softmax_with_CrossEntropy) :
            x = layer.softmax_forward(x)
        else :
            x = layer.forward(x)
            
    preds = x.argmax(axis=1)
    acc = np.sum(np.where(preds==labels, True, False))/len(labels)
    return acc

In [11]:
from collections import OrderedDict

def train_MLP(config) :
    lr, num_epoch = config['learning_rate'], config['num_epoch']
    print_loss_interval = 20
    
    layer1 = Linear(784, 100)
    activation_function = config['activation_function']
    layer2 = Linear(100, 10)
    softmax_with_CE = Softmax_with_CrossEntropy()
    
    for iter in range(num_epoch) :
        # forward
        x = layer1.forward(train_img)
        x = activation_function.forward(x)
        x = layer2.forward(x)
        preds = softmax_with_CE.softmax_forward(x)
        
        # loss
        one_hot_labels = make_one_hot(train_label)
        losses = softmax_with_CE.crossEntropy_forward(preds, one_hot_labels)
        loss = losses.sum()/len(preds)
        
        # backward
        dL = softmax_with_CE.backward()
        dL = layer2.backward(dL, lr)
        dL = activation_function.backward(dL)
        dL = layer1.backward(dL, lr)
        
        if iter == 0 or (iter+1) % print_loss_interval == 0:
            print("[epoch %d / %d] average loss : %f" % (iter+1, num_epoch, loss))
            
    model = OrderedDict()
    model['layer1'] = layer1
    model['activation_function'] = activation_function
    model['layer2'] = layer2
    model['softmax_with_CE'] = softmax_with_CE
    
    return model

In [50]:
config1 = { 'learning_rate' : 0.1,
            'num_epoch' : 100,
            'activation_function' : ReLU()
          }

config2 = { 'learning_rate' : 0.1,
            'num_epoch' : 100,
            'activation_function' : LeakyReLU()
          }

config3 = { 'learning_rate' : 0.1,
            'num_epoch' : 100,
            'activation_function' : Sigmoid()
          }

config4 = { 'learning_rate' : 0.1,
            'num_epoch' : 100,
            'activation_function' : Tanh()
          }

print('---ReLU---')
model1 = train_MLP(config1)

print('\n---Leaky_Relu---')
model2 = train_MLP(config2)

print('\n---Sigmoid---')
model3 = train_MLP(config3)

print('\n---TanH---')
model4 = train_MLP(config4)

---ReLU---
[epoch 1 / 100] average loss : 14.772929
[epoch 20 / 100] average loss : 5.344340
[epoch 40 / 100] average loss : 3.798438
[epoch 60 / 100] average loss : 3.132562
[epoch 80 / 100] average loss : 2.738742
[epoch 100 / 100] average loss : 2.470601

---Leaky_Relu---
[epoch 1 / 100] average loss : 14.190107
[epoch 20 / 100] average loss : 5.663444
[epoch 40 / 100] average loss : 3.880566
[epoch 60 / 100] average loss : 3.192718
[epoch 80 / 100] average loss : 2.786946
[epoch 100 / 100] average loss : 2.514844

---Sigmoid---
[epoch 1 / 100] average loss : 10.381612
[epoch 20 / 100] average loss : 5.460914
[epoch 40 / 100] average loss : 4.290746
[epoch 60 / 100] average loss : 3.677266
[epoch 80 / 100] average loss : 3.205758
[epoch 100 / 100] average loss : 2.839443

---TanH---
[epoch 1 / 100] average loss : 11.602126
[epoch 20 / 100] average loss : 7.167317
[epoch 40 / 100] average loss : 5.264566
[epoch 60 / 100] average loss : 4.178959
[epoch 80 / 100] average loss : 3.50062

In [51]:
# evaluation

print('\t Accuracy :', eval(model1, train_version=False))
print('\t Accuracy :', eval(model2, train_version=False))
print('\t Accuracy :', eval(model3, train_version=False))
print('\t Accuracy :', eval(model4, train_version=False))


In test dataset ... 
	 Accuracy : 0.786

In test dataset ... 
	 Accuracy : 0.7803

In test dataset ... 
	 Accuracy : 0.3685

In test dataset ... 
	 Accuracy : 0.544


# [실습 2] type of gradient descent

In [12]:
class Softmax_with_CrossEntropy :
    def __init__(self) :
        # softmax
        self.softmax_x = None 
        self.softmax_out = None
        # crossEntropy
        self.pred = None
        self.target = None
        self.delta = 1e-7
        
    def softmax_forward(self, x) :
        self.softmax_x = x
        tmp = np.max(self.softmax_x, axis=1).reshape(-1, 1)
        self.softmax_out = np.exp(self.softmax_x-tmp)/np.sum(np.exp(self.softmax_x-tmp), axis=1).reshape(-1,1)
        return self.softmax_out
    
    def crossEntropy_forward(self, pred, target) :
        self.pred = pred
        self.target = target
        loss = -np.sum(self.target*np.log(self.pred+self.delta), axis=1)
        return loss
    
    def backward(self) :
        dout = (self.pred-self.target)/len(self.pred)
        return dout

In [13]:
from collections import OrderedDict

def train_MLP_v2(config,train_img = train_img, train_label = train_label) :
    lr, num_epoch, batch_size = config['learning_rate'], config['num_epoch'], config['batch_size']
    print_loss_interval = 20
    
    layer1 = Linear(784, 100)
    activation_function = config['activation_function']
    layer2 = Linear(100, 10)
    softmax_with_CE = Softmax_with_CrossEntropy()
    
    for iter in range(num_epoch):
        # forward
        num_data = len(train_img)
        num_batch = num_data // batch_size
        if num_data % batch_size != 0:
            num_batch += 1

        epoch_loss = 0

        for batch_idx in range(num_batch):
            start_idx = batch_idx * batch_size
            end_idx = min(num_data, (batch_idx + 1) * batch_size)

            batch_img = train_img[start_idx:end_idx]
            batch_label = train_label[start_idx:end_idx]

            x = layer1.forward(batch_img)
            x = activation_function.forward(x)
            x = layer2.forward(x)
            preds = softmax_with_CE.softmax_forward(x)

            # loss
            one_hot_labels = make_one_hot(batch_label)
            losses = softmax_with_CE.crossEntropy_forward(preds, one_hot_labels)
            batch_loss = losses.sum() / len(preds)
            epoch_loss += batch_loss

            # backward
            dL = softmax_with_CE.backward()
            dL = layer2.backward(dL, lr)
            dL = activation_function.backward(dL)
            dL = layer1.backward(dL, lr)

        avg_loss = epoch_loss / num_batch

        if iter == 0 or (iter + 1) % print_loss_interval == 0:
            print("[epoch %d / %d] average loss : %f" % (iter + 1, num_epoch, avg_loss))

            
    model = OrderedDict()
    model['layer1'] = layer1
    model['activation_function'] = activation_function
    model['layer2'] = layer2
    model['softmax_with_CE'] = softmax_with_CE
    
    return model

In [54]:
print('---batch---')
config_batch = { 'learning_rate' : 0.1,
            'num_epoch' : 20,
            'activation_function' : ReLU(),
            'batch_size' : len(train_img) 
          }
model_batch = train_MLP_v2(config_batch)


print('\n---mini_batch---')
config_mini_batch = { 'learning_rate' : 0.1,
            'num_epoch' : 20,
            'activation_function' : ReLU(),
            'batch_size' : 2500
          }
model_mini_batch = train_MLP_v2(config_mini_batch)

print('\n---stochastic---')
config_stochastic = { 'learning_rate' : 0.001,
            'num_epoch' : 20,
            'activation_function' : ReLU(),
            'batch_size' : 1
          }
model_stochastic = train_MLP_v2(config_stochastic)

---batch---
[epoch 1 / 20] average loss : 14.510660
[epoch 20 / 20] average loss : 5.113783

---mini_batch---
[epoch 1 / 20] average loss : 7.847106
[epoch 20 / 20] average loss : 1.199033

---stochastic---
[epoch 1 / 20] average loss : 2.029318
[epoch 20 / 20] average loss : 1.690258


In [55]:
print('\t Accuracy :', eval(model_batch, train_version=False))
print('\t Accuracy :', eval(model_mini_batch, train_version=False))
print('\t Accuracy :', eval(model_stochastic, train_version=False))


In test dataset ... 
	 Accuracy : 0.6273

In test dataset ... 
	 Accuracy : 0.8636

In test dataset ... 
	 Accuracy : 0.9186


# Extra problem 1 (Multi Support Vector Machine)

In [10]:

class Linear_v2:
    def __init__(self, input_size=1, hidden_size=1):
        # Initialize weights and biases
        self.W = np.random.randn(input_size, hidden_size)
        self.b = np.zeros(hidden_size)
        
        # Store input size and hidden size
        self.input_size = input_size
        self.hidden_size = hidden_size
        
    def forward(self, x):
        # Calculate linear transformation
        self.x = x
        out = np.dot(x, self.W) + self.b
        
        return out
    
    def backward(self, dout, lr, lamb=0):
        # Calculate gradients for weights and biases    
        dW = np.dot(self.x.T, dout)
        db = np.sum(dout, axis=0)
        
        dW += lamb * self.W
        # Update weights and biases
        self.W -= lr * dW
        self.b -= lr * db
        
        # Calculate gradients for input
        # print(dout.shape, self.W.T.shape)
        dx = np.dot(dout, self.W.T)
    
        
        return dx

In [12]:
class Softmax_with_MSVM:
    def __init__(self):
        # softmax
        self.softmax_out = None
        self.softmax_x = None
        # MSVM
        self.pred = None
        self.target = None
        self.reg = None

    def softmax_forward(self, x):
        self.softmax_x = x
        # subtracting the maximum value for numerical stability
        exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        self.softmax_out = exp_x / np.sum(exp_x, axis=-1, keepdims=True)
        return self.softmax_out
    
    def sigmoid_forward(self, x):
        self.sigmoid_x = x
        self.sigmoid_out = 1 / (1 + np.exp(-x))
        return self.sigmoid_out
        

    def forward(self, x, target, reg=0.1):
        self.target = target
        self.MSVM_out = self.softmax_forward(x)
        scores = np.dot(x, self.MSVM_out.T)
        correct_scores = scores[np.arange(x.shape[0]), target.flatten()]
        margins = np.maximum(0, scores - correct_scores.reshape(-1, 1) + 1)
        margins[np.arange(x.shape[0]), target.flatten()] = 0
        loss = np.sum(margins)
        loss /= x.shape[0]
        loss += 0.5 * reg * np.sum(self.MSVM_out * self.MSVM_out)
        self.pred = (scores >= np.max(scores, axis=1).reshape(-1, 1)).astype(float)
        self.margin = margins
        return loss

    def backward(self, lr, reg):
        self.reg = reg
        batch_size = self.pred.shape[0]
        dL_dsoftmax = self.MSVM_out - self.target

        dsoftmax_dMSVMx = (self.softmax_out * (1 - self.softmax_out))
        dL_dMSVMx = dL_dsoftmax * dsoftmax_dMSVMx

        zero_mask = (dL_dsoftmax == 0) | (self.MSVM_out == 0)
        dL_dMSVMx = np.where(zero_mask, 0, -dL_dMSVMx)
        dMSVMx_dw = np.dot(dL_dMSVMx.T, self.pred) / batch_size

        # L2 regularization
        dMSVMx_dw += self.reg * self.MSVM_out

        self.MSVM_out -= np.clip(lr * dMSVMx_dw.T, -1e8, 1e8)
        return dL_dMSVMx



In [15]:

from collections import OrderedDict

def clip_gradient(grad, clip_value):
    return np.clip(grad, -clip_value, clip_value)

def train_MSVM(config, train_img=train_img, train_label=train_label):
    lr, num_epoch, batch_size, reg, grad_clip = config['learning_rate'], config['num_epoch'], config['batch_size'], config['reg'], config['grad_clip']
    print_loss_interval = 20
    
    layer1 = Linear_v2(784, 100)
    activation_function = config['activation_function']
    layer2 = Linear_v2(100, 10)
    softmax_with_MSVM = Softmax_with_MSVM()
    
    for iter in range(num_epoch):
        # forward
        num_data = len(train_img)
        num_batch = num_data // batch_size
        if num_data % batch_size != 0:
            num_batch += 1

        epoch_loss = 0

        for batch_idx in range(num_batch):
            start_idx = batch_idx * batch_size
            end_idx = min(num_data, (batch_idx + 1) * batch_size)

            batch_img = train_img[start_idx:end_idx]
            batch_label = train_label[start_idx:end_idx]

            x = layer1.forward(batch_img)
            x = activation_function.forward(x)
            x = layer2.forward(x)
            preds = softmax_with_MSVM.softmax_forward(x)
            
            loss = softmax_with_MSVM.forward(preds, batch_label, config['reg'])
            batch_loss = loss.sum() / len(preds)
            epoch_loss += batch_loss

            # backward
            dL = softmax_with_MSVM.backward(lr, reg)
            dL = clip_gradient(dL, grad_clip)
            dL = layer2.backward(dL, lr)
            dL = clip_gradient(dL, grad_clip)
            dL = activation_function.backward(dL)
            dL = clip_gradient(dL, grad_clip)
            dL = layer1.backward(dL, lr)
            dL = clip_gradient(dL, grad_clip)

        avg_loss = epoch_loss / num_batch
        print(avg_loss)

        if iter == 0 or (iter + 1) % print_loss_interval == 0:
            print("[epoch %d / %d] average loss : %f" % (iter + 1, num_epoch, avg_loss))

    model = OrderedDict()
    model['layer1'] = layer1
    model['activation_function'] = activation_function
    model['layer2'] = layer2
    model['softmax_with_MSVM'] = softmax_with_MSVM
    
    return model


In [16]:
config = { 'learning_rate' : 0.0001,
            'num_epoch' : 20,
            'activation_function' : ReLU(),
            'batch_size' : 10,
            'reg' : 0.1,
            'grad_clip' : 5
          }


model = train_MSVM(config)

0.9063082312495206
[epoch 1 / 20] average loss : 0.906308
0.9060401835436428
0.9059561605020455
0.9059525285416465
0.9059759786286521
0.9059538385853636
0.9059719987326689
0.9059806409576466
0.9059761413968586
0.9059633138599817
0.9059777983005265
0.9059621765083925
0.9059645734726055
0.9059692406937665
0.9059636398261962
0.9059675500302721
0.9059754288751998
0.9059699939498896
0.9059675500302721
0.9059729062079733
[epoch 20 / 20] average loss : 0.905973


In [18]:
def eval(model, train_version = True) :
    if train_version :
        x = train_img
        labels = train_label.squeeze()
        print('In train dataset ... ')
    else : 
        x = test_img
        labels = test_label.squeeze()
        print('\nIn test dataset ... ')
    
    for layer in model.values() :
        if isinstance(layer, Softmax_with_MSVM) :
            x = layer.softmax_forward(x)
        else :
            x = layer.forward(x)
            
    preds = x.argmax(axis=1)
    acc = np.sum(np.where(preds==labels, True, False))/len(labels)
    return acc

In [19]:
print('\t Accuracy :', eval(model, train_version=False))


In test dataset ... 
	 Accuracy : 0.1028


---

# Extra problem 2 (3-layer 이상 MLP 구현)

In [41]:
class ReLU:
    def __init__(self):
        self.mask = None
        
    def get_mask(self):
        return self.mask.shape
        
    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out
        
    def backward(self, dout, x):
        if self.mask.shape != x.shape:
            self.mask = np.zeros_like(x, dtype = np.bool)
        dx = dout
        dx[self.mask] = 0
        return dx 

    
class LeakyReLU:
    def __init__(self, alpha=0.01):
        self.alpha = alpha
        self.mask = None
        
    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] *= self.alpha
        return out
    
    def backward(self, dout, x):
        if self.mask is None or self.mask.shape != x.shape:
            self.mask = np.zeros_like(x, dtype=np.bool)
        dx = dout
        dx[self.mask] *= self.alpha
        return dx


class Sigmoid:
    def __init__(self):
        self.out = None
        
    def forward(self, x):
        out = 1 / (1 + np.exp(-x))
        self.out = out
        return out
        
    def backward(self, dout):
        dx = dout * self.out * (1 - self.out)
        return dx

class Tanh:
    def __init__(self):
        self.out = None
        
    def forward(self, x):
        out = np.tanh(x)
        self.out = out
        return out
        
    def backward(self, dout):
        dx = dout * (1 - self.out ** 2)
        return dx


In [42]:
class Softmax_with_Mul_CrossEntropy:
    def __init__(self):
        self.softmax_x = None 
        self.softmax_out = None
        self.pred = None
        self.target = None

    def softmax_forward(self, x):
        self.softmax_x = x
        tmp = np.max(self.softmax_x, axis=1).reshape(-1, 1)
        self.softmax_out = np.exp(self.softmax_x - tmp) / np.sum(np.exp(self.softmax_x - tmp), axis=1).reshape(-1, 1)
        return self.softmax_out
    
    def crossEntropy_forward(self, pred, target):
        delta = 1e-7
        self.pred = pred
        self.target = target
        loss = -np.sum(self.target * np.log(self.pred + delta), axis=1)
        return loss
    
    def backward(self, dL_next):
        batch_size = len(self.pred)
        dsoftmax = (self.pred - self.target) / batch_size
        dL = dsoftmax * dL_next
        return dL


In [45]:
from collections import OrderedDict

def train_Multi_MLP(config):
    lr, num_epoch, batch_size = config['learning_rate'], config['num_epoch'], config['batch_size']
    print_loss_interval = 1
    
    layer1 = Linear(784, 256)
    activation_function1 = config['activation_function']
    layer2 = Linear(256, 128)
    activation_function2 = config['activation_function']
    layer3 = Linear(128, 64)
    activation_function3 = config['activation_function']
    layer4 = Linear(64, 32)
    activation_function4 = config['activation_function']
    layer5 = Linear(32, 10)
    softmax_with_CE = Softmax_with_Mul_CrossEntropy()
    
    for iter in range(num_epoch):
        total_loss = 0
        num_batches = 0
        
        for i in range(0, len(train_img), batch_size):
            # forward
            batch_img = train_img[i:i+batch_size]
            batch_label = train_label[i:i+batch_size]
            
            x = layer1.forward(batch_img)
            x_1 = activation_function1.forward(x)
            x = layer2.forward(x_1)
            x_2 = activation_function2.forward(x)
            x = layer3.forward(x_2)
            x_3 = activation_function3.forward(x)
            x = layer4.forward(x_3)
            x_4 = activation_function4.forward(x)
            x = layer5.forward(x_4)
            preds = softmax_with_CE.softmax_forward(x)

            # loss
            one_hot_labels = make_one_hot(batch_label)
            losses = softmax_with_CE.crossEntropy_forward(preds, one_hot_labels)
            batch_loss = losses.sum() / len(preds)
            total_loss += batch_loss
            num_batches += 1

            # backward
            dL = softmax_with_CE.backward(one_hot_labels)
            dL = layer5.backward(dL)
            dL = activation_function4.backward(dL,x_4)
            dL = layer4.backward(dL)        
            dL = activation_function3.backward(dL,x_3)
            dL = layer3.backward(dL)
            dL = activation_function2.backward(dL,x_2)
            dL = layer2.backward(dL)
            dL = activation_function1.backward(dL,x_1)
            dL = layer1.backward(dL)

        # average batch loss
        avg_loss = total_loss / num_batches
        
        if iter == 0 or (iter+1) % print_loss_interval == 0:
            print("[epoch %d / %d] average loss : %f" % (iter+1, num_epoch, avg_loss))
            
    model = OrderedDict()
    model['layer1'] = layer1
    model['activation_function1'] = activation_function1
    model['layer2'] = layer2
    model['activation_function2'] = activation_function2
    model['layer3'] = layer3
    model['activation_function3'] = activation_function3
    model['layer4'] = layer4
    model['activation_function4'] = activation_function4
    model['layer5'] = layer5
    model['softmax_with_CE'] = softmax_with_CE
    
    return model


In [47]:
config = { 'learning_rate' : 0.00001,
            'num_epoch' : 3,
            'activation_function' : LeakyReLU(),
            'batch_size' : 2500,
            'reg' : 0.1
          }

model = train_Multi_MLP(config)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.mask = np.zeros_like(x, dtype=np.bool)


[epoch 1 / 3] average loss : nan
[epoch 2 / 3] average loss : nan
[epoch 3 / 3] average loss : nan


In [None]:
$L_i = \sum_{j\neq y_i} \max(0, s_j - s_{y_i} + \Delta)$