In [63]:
import numpy as np
import matplotlib.pyplot as mpl

In [64]:
d = 10
n = 10000

sample_x = np.random.randn(n, d)
mean = np.empty(d)
norm_tab = np.empty(n)

for i in range (0, n):
    norm = np.sum(sample_x[i] * sample_x[i]) ** 0.5
    sample_x[i] = (sample_x[i] / norm) * (d ** 0.5) 

#On vérifie que notre data set soit uniforme
for i in range (0, d):
    mean[i] = np.sum(sample_x.T[i]) / n

#On vérifie que notre dataset ait une norme de sqrt(d)
for i in range (0, n):
    norm_tab[i] = np.sum(sample_x[i] * sample_x[i]) ** 0.5
    
print(mean)
print(norm_tab)

[-9.06430779e-03  8.45520389e-03 -1.74553159e-02  1.15756322e-02
 -5.93191140e-05 -6.29693077e-04  3.75684422e-03  7.51588416e-03
 -1.45883372e-02 -1.02691269e-02]
[3.16227766 3.16227766 3.16227766 ... 3.16227766 3.16227766 3.16227766]


In [65]:
tau = 0.3 
noise_level = tau * tau

#On veut E(sample_noise) == 0 et E(sample_noise^2) == tau^2
sample_noise = np.random.randn(n, 1) * noise_level

print(sample_noise)

[[ 0.01650386]
 [-0.03180343]
 [ 0.09307829]
 ...
 [ 0.04341371]
 [-0.10715328]
 [ 0.038148  ]]


In [66]:
F1 = 1

sample_params = np.random.randn(d, 1)
    
norm = np.sum(sample_params * sample_params) #should be equal to F1^2
sample_params = sample_params * (F1 / (norm**0.5))

#print(sample_params)
print(np.sum(sample_params * sample_params) ** 0.5)

0.9999999999999999


In [67]:
sample_y = sample_x @ sample_params + sample_noise
print(sample_y)

[[ 0.55123855]
 [-2.11611324]
 [ 0.70684673]
 ...
 [ 0.70204034]
 [ 0.33374882]
 [ 1.12669184]]


In [68]:
#Quelques fonctions d'activation :

class Sigmoid :
    @staticmethod
    def function(x):
        return 1/(1+np.exp(-x))
    
    @staticmethod
    def gradient(x):
        s = Sigmoid.function(x)
        return s * (1- s)
    
class Tanh :
    @staticmethod
    def function(x):
        return (np.exp(x)-np.exp(-x))/(np.exp(x)+np.exp(-x))
    
    @staticmethod
    def gradient(x):
        t = Tanh(x)
        return 1-t**2
    
class Relu :
    @staticmethod
    def function(x):
        return np.maximum(0, x)
    
    @staticmethod
    def gradient(x):
        return np.maximum(0, x) / x


In [79]:
#Quelques Loss Functions :
class MSE:
    @staticmethod
    def loss(y_real, y_hat):
        return (1/(y_real.shape[0] * y_real.shape[1])) * np.sum(np.sum((y_hat - y_real)**2, axis = 0))
    
    @staticmethod
    def gradient(y_real, y_hat):
        return (2/(y_real.shape[0] * y_real.shape[1])) * (y_hat - y_real)

In [82]:
#L'architecture

class Network:
    def __init__(self, dimension_hidden, activation1, activation2):
        """
        dimension_hidden est le nombre de paramètres dans le hidden layer (N dans le papier de Mei et Montanari)
        activation1 est la fonction d'activation du hidden layer
        activation2 est la fonction d'activation de l'output layer
        """
        
        self.nb_layers = 3 #input, hidden, output
        self.dimensions = (d, dimension_hidden, 1)
        self.loss = None
        
        self.learning_rate = {}
        self.learning_rate[1] = None;  #learning rate du hidden layer
        self.learning_rate[2] = None;  #learning rate du output layer
        
        self.weights = {}
        self.bias = {}
        
        #on initialise les weights et les bias aléatoirement
        for i in range(self.nb_layers -1):
            self.weights[i + 1] = np.random.randn(self.dimensions[i], self.dimensions[i + 1]) / np.sqrt(self.dimensions[i])
            self.bias[i + 1] = np.zeros(self.dimensions[i + 1])
            
        self.activations = {}
        self.activations[2] = activation1
        self.activations[3] = activation2
        
    def forward_pass(self, x):
        """
        x est un vecteur de notre data
        """
        z = {}
        a = {1:x} #l'input layer n'a pas d'activation function, a[1] est donc égal à x
        
        for i in range(1, self.nb_layers):
            z[i + 1] = a[i] @ self.weights[i] + self.bias[i] #Z = XW + b
            a[i + 1] = self.activations[i + 1].function(z[i + 1]);
            
        return z, a
    
    def predict(self, x):
        z, a = self.forward_pass(x)
        return a[self.nb_layers]
    
    def back_propagation(self, z, a, real_value):
        y_hat = a[self.nb_layers]
        
        #On calcule delta et la dérivée partielle à l'output layer
        delta = self.loss_function.gradient(real_value, y_hat) * self.activations[self.nb_layers].gradient(y_hat)
        partial_deriv = a[self.nb_layers - 1].T @ delta
        
        update_parameters = {
            self.nb_layers - 1: (partial_deriv, delta)
        }
        
        for i in reversed(range(2, self.nb_layers)):
            delta = (delta @ self.weights[i].T) * self.activations[i].gradient(z[i])
            partial_deriv = a[i - 1].T @ delta 
            update_parameters[i - 1] = (partial_deriv, delta)
            
        for k, v in update_parameters.items():
            self.update_weights_and_bias(k, v[0], v[1])
            
    def update_weights_and_bias(self, index, partial_deriv, delta):
        self.weights[index] -= self.learning_rate[index] * partial_deriv
        self.bias[index] -= self.learning_rate[index] * np.mean(delta, 0)
        
    def fit(self, x, y_real, loss, nb_iterations, batch_size, learning_rate1, learning_rate2):
        #On vérifie qu'on a autant de x que de y
        if not (x.shape[0] == y_real.shape[0]):
            raise Exception
        self.loss_function = loss
        self.learning_rate[1] = learning_rate1
        self.learning_rate[2] = learning_rate2
        
        #We use batch gradient descent
        for i in range(nb_iterations):
            for j in range(x.shape[0] // batch_size):
                start = j * batch_size
                end = (j + 1) * batch_size
                z, a = self.forward_pass(x[start:end])
                self.back_propagation(z, a, y_real[start:end])
            if(i % 10) == 0:
                z, a = self.forward_pass(x)
                print("Loss at Iteration {} is {}".format(i, self.loss_function.loss(y_real, a[self.nb_layers])))
        

In [85]:
#Test if our architecture Compiles
nn = Network(10000, Relu, Sigmoid)
nn.fit(sample_x, sample_y, MSE, 100, 10000, 0.03, 0.03)

Loss at Iteration 0 is 0.9715030721531901
Loss at Iteration 10 is 0.7369101213738476
Loss at Iteration 20 is 0.7382480305588923
Loss at Iteration 30 is 0.7377807545456785
Loss at Iteration 40 is 0.736460788727149
Loss at Iteration 50 is 0.7345780734992224
Loss at Iteration 60 is 0.7320788066231054
Loss at Iteration 70 is 0.7287405895577195
Loss at Iteration 80 is 0.7251039438835585
Loss at Iteration 90 is 0.7211542096693944
