<a href="https://colab.research.google.com/github/btlgs2000/dl_intro/blob/master/NN_from_scratch_corso_ai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm

In [13]:
class LinearLayer:
    ''' Fully connected (affine) layer working on single column vectors.

    Computes y = W @ x + b in the forward pass and, in the backward pass,
    stores the gradients of the loss with respect to W and b.
    '''

    def __init__(self, n_in, n_out):
        ''' creates the parameters of the layer

        args
        ----
        n_in (int): number of input features
        n_out (int): number of output units
        '''
        self.n_in = n_in
        self.n_out = n_out
        # Weights are drawn from a standard normal distribution.
        # np.random.rand would sample uniformly from [0, 1) instead, which
        # makes every weight positive and pushes all pre-activations the
        # same way.
        self.W = np.random.randn(n_out, n_in)
        # Biases start at 0.
        self.b = np.zeros(shape=(n_out, 1))

        # Gradients, filled in by backward(); same shapes as W and b.
        self.dW = None
        self.db = None

        # Input cached by forward(); backward() needs it to compute dW.
        self.x = None

    def forward(self, x):
        ''' returns y = Wx + b

        args
        ----
        x (np.array): x.shape = (n_in, 1)

        ret
        ---
        y (np.array): y.shape = (n_out, 1)
        '''
        assert x.shape == (self.n_in, 1)
        self.x = x
        y = self.W @ x + self.b
        assert y.shape == (self.n_out, 1)
        return y

    def backward(self, dy):
        ''' returns dx (dL/dx) and fills in the attributes dW and db

        Must be called after forward(), because it uses the cached input.

        args
        ----
        dy (np.array): dy.shape = (n_out, 1), gradient of the loss w.r.t. y

        ret
        ---
        dx (np.array): dx.shape = (n_in, 1)
        '''
        assert dy.shape == (self.n_out, 1)

        # y = Wx + b  =>  dL/db = dy,  dL/dW = dy x^T,  dL/dx = W^T dy
        self.db = dy
        self.dW = dy @ self.x.T
        assert self.dW.shape == (self.n_out, self.n_in)
        dx = self.W.T @ dy
        assert dx.shape == (self.n_in, 1)
        return dx

In [14]:
class Sigmoid:
    ''' Element-wise logistic sigmoid activation. '''

    def forward(self, x):
        ''' returns y = sigma(x) = 1 / (1 + exp(-x)) and caches it

        args
        ----
        x (np.array): x.shape = (n, 1)

        ret
        ---
        y (np.array): y.shape = (n, 1)
        '''
        denominator = 1 + np.exp(-x)
        # The output is stored because the derivative can be written in
        # terms of y alone.
        self.y = 1 / denominator
        return self.y

    def backward(self, dy):
        ''' returns dL/dx using the output cached by forward()

        args
        ----
        dy (np.array): dy.shape = (n, 1)

        ret
        ---
        dx (np.array): dx.shape = (n, 1)
        '''
        # d sigma / dx = sigma(x) * (1 - sigma(x))
        local_grad = self.y * (1 - self.y)
        return local_grad * dy

In [15]:
class Softmax:
    ''' Softmax activation over a single vector of scores. '''

    def forward(self, x):
        ''' returns the softmax of x and caches the exponentials and their sum

        args
        ----
        x (np.array): scores; backward() relies on a column vector (n, 1)

        ret
        ---
        y (np.array): y.shape = x.shape, entries sum to 1
        '''
        # Subtracting the maximum leaves the result unchanged but keeps
        # exp() from overflowing.
        shifted = x - np.max(x)

        self.x_exp = np.exp(shifted)
        self.s = np.sum(self.x_exp)
        assert self.s.ndim == 0
        y = self.x_exp / self.s  # normalisation
        assert y.shape == x.shape
        return y

    def backward(self, dy):
        ''' returns dL/dx given dL/dy, using the values cached by forward()

        args
        ----
        dy (np.array): dy.shape = (n, 1)

        ret
        ---
        dx (np.array): dx.shape = (n, 1)
        '''
        n = dy.shape[0]
        scaled_identity = np.eye(n) * self.s
        # jacobian[i, j] = dy_i/dx_j = e_i * (delta_ij * s - e_j) / s^2
        jacobian = self.x_exp * (scaled_identity - self.x_exp.T) / self.s**2
        # Chain rule: dL/dx_j = sum_i dL/dy_i * dy_i/dx_j
        return jacobian.T @ dy

In [16]:
class CrossEntropyLoss:
    ''' Cross-entropy loss for a single sample with a one-hot target. '''

    def forward(self, y_true, y_pred):
        ''' computes the cross-entropy and caches both arguments

        Only the probability predicted for the target class (the argmax of
        y_true) is used, which equals the full cross-entropy when y_true is
        one-hot.

        args
        ----
        y_true (np.array): one-hot target, y_true.shape = (C, 1)
        y_pred (np.array): output of the Softmax, y_pred.shape = (C, 1)

        ret
        ---
        L (np.array): -log(y_pred[target]); for (C, 1) inputs L.shape = (1,)
        '''
        self.y_pred = y_pred
        self.y_true = y_true
        L = - np.log(y_pred[np.argmax(y_true)])
        return L

    def backward(self):
        ''' returns dL/dy_pred for the arguments of the last forward() call

        ret
        ---
        dy_pred (np.array): dy_pred.shape = (C, 1)
        '''
        # dL/dy_pred = -y_true / y_pred. The division is evaluated only
        # where y_true is non-zero: for the other classes the gradient is
        # exactly 0, and dividing there would turn a predicted probability
        # that underflowed to 0 into 0/0 = NaN.
        target_mask = self.y_true != 0
        ratio = np.divide(
            self.y_true,
            self.y_pred,
            out=np.zeros_like(self.y_pred, dtype=float),
            where=target_mask,
        )
        dy_pred = -ratio
        return dy_pred

In [17]:
class MLP:
    ''' Multi-layer perceptron: a chain of LinearLayer objects, each hidden
    one optionally followed by an activation, plus an optional activation
    after the last layer.
    '''

    def __init__(self, layers_dims, hid_layers_act, last_layer_act):
        ''' builds the network

        args
        ----
        layers_dims (List[int]): number of units of every layer, input included
        hid_layers_act (str): activation applied after every hidden layer;
                              a key of activ_map (None means no activation)
        last_layer_act (str): activation applied after the last layer;
                              a key of activ_map (None means no activation)
        '''
        self.activ_map = {
            'sigmoid': Sigmoid,
            'softmax': Softmax,
            None: None
        }
        self.layers_dims = list(layers_dims)
        self.lin_layers = []
        self.hidden_activs = []
        self.last_activ = None
        self.build(layers_dims, hid_layers_act, last_layer_act)

    def _make_activation(self, name):
        ''' instantiates the activation registered under name in activ_map;
        returns None when the entry is None (no activation)
        '''
        activation_cls = self.activ_map[name]
        if activation_cls is None:
            return None
        return activation_cls()

    def build(self, layers_dims, hid_layers_act, last_layer_act):
        ''' creates the linear layers and their activations

        raises
        ------
        ValueError: if layers_dims has fewer than two entries
        KeyError: if an activation name is not a key of activ_map
        '''
        if len(layers_dims) < 2:
            raise ValueError(
                'layers_dims must contain at least the input and the output size'
            )

        for n_in, n_out in zip(layers_dims[:-1], layers_dims[1:]):
            self.lin_layers.append(LinearLayer(n_in, n_out))

        # One activation object per hidden layer: each instance caches its
        # own forward values for the backward pass.
        for _ in self.lin_layers[:-1]:
            self.hidden_activs.append(self._make_activation(hid_layers_act))

        self.last_activ = self._make_activation(last_layer_act)

    def forward(self, x):
        ''' f-step

        args
        ----
        x (np.array): x.shape = (layers_dims[0], 1)

        ret
        ---
        y (np.array): y.shape = (layers_dims[-1], 1)
        '''
        *hidden_layers, last_layer = self.lin_layers
        for lin_layer, activation in zip(hidden_layers, self.hidden_activs):
            x = lin_layer.forward(x)
            if activation is not None:
                x = activation.forward(x)

        y = last_layer.forward(x)
        if self.last_activ is not None:
            y = self.last_activ.forward(y)

        return y

    def backward(self, dy):
        ''' b-step

        Computes the gradients of the loss with respect to all the weights;
        they are stored in the linear layers and can be read with
        get_gradients(). Must be called after forward().

        args
        ----
        dy (np.array): dy.shape = (layers_dims[-1], 1)
                       represents dL/dy

        ret
        ---
        dx (np.array): dL/dx for the network input, shape (layers_dims[0], 1)
        '''
        *hidden_layers, last_layer = self.lin_layers

        if self.last_activ is not None:
            dy = self.last_activ.backward(dy)
        dy = last_layer.backward(dy)
        # Walk the hidden layers from the last one back to the first.
        for lin_layer, activation in zip(hidden_layers[::-1], self.hidden_activs[::-1]):
            if activation is not None:
                dy = activation.backward(dy)
            dy = lin_layer.backward(dy)
        return dy

    def get_weights(self):
        ''' returns a list with the parameters of the network

        The arrays are the layers' own parameters, not copies, so in-place
        updates change the network.

        ret
        ---
        weights (List[np.array]): W and b of every layer, in layer order
        '''
        return [param for ll in self.lin_layers for param in (ll.W, ll.b)]

    def get_gradients(self):
        ''' after a backward step returns the list of gradients, in the
        same order as get_weights

        ret
        ---
        gradients (List[np.array]): dW and db of every layer, in layer order
        '''
        return [grad for ll in self.lin_layers for grad in (ll.dW, ll.db)]

In [18]:
# 64 inputs -> one hidden layer of 20 sigmoid units -> 10 softmax outputs
mlp = MLP(
    layers_dims=[64, 20, 10],
    hid_layers_act='sigmoid',
    last_layer_act='softmax',
)

In [19]:
# Load scikit-learn's bundled digits dataset; its 'images' and 'target'
# entries are read in the next cell.
data = datasets.load_digits()

In [20]:
# Flatten every image to a 64-element row and rescale the pixel values.
# NOTE(review): scikit-learn documents the digits pixels as integers in
# 0..16, so dividing by 255 does not map them onto [0, 1] - confirm the
# intended scale before changing it (it interacts with the weight init).
xs = data['images'].reshape(-1, 64) / 255
# One-hot encode the labels. The encoder output is sparse by default in
# every scikit-learn release, so it is densified explicitly instead of
# passing `sparse=False`, a keyword that was renamed to `sparse_output`
# in 1.2 and removed in 1.4.
ys = OneHotEncoder().fit_transform(data['target'].reshape(-1, 1)).toarray()

In [21]:
# Hyper-parameters: number of passes over the data and SGD learning rate.
epochs = 500
alpha = 1e-1
ce_loss = CrossEntropyLoss()

pbar = tqdm(range(epochs))
for epoch in pbar:
    losses = []
    for x, y_true in zip(xs, ys):
        # The layers work on column vectors, one sample at a time.
        x = np.reshape(x, (-1, 1))
        y_true = np.reshape(y_true, (-1, 1))

        # forward pass
        y_pred = mlp.forward(x)
        loss = ce_loss.forward(y_true, y_pred)
        losses.append(loss)

        # backward pass: the loss gradient is propagated through the network
        dy = ce_loss.backward()
        mlp.backward(dy)

        # SGD step: get_weights() returns the layers' own arrays, so the
        # in-place update changes the network parameters directly.
        weights = mlp.get_weights()
        grads = mlp.get_gradients()
        for weight, grad in zip(weights, grads):
            weight -= alpha * grad
    pbar.set_description(f'mean loss = {np.mean(losses)}')

mean loss = 0.013054869692951283: 100%|██████████| 500/500 [01:31<00:00,  5.47it/s]
