In [24]:
# RNN à chaque couche cachée n'incorpore pas seulement les informations de la précédente
# couche mais aussi les informations du pas de temps précédent de manière récursive.

# Les deux seules choses dont nous avons besoin sont 'Numpy' pour faire nos calculs et 'Copy' pour copier des données
# Ce RNN va prédire des sommes binaires

import copy
import numpy as np
# Seed NumPy's global RNG so the random weight initialisation and the randomly
# drawn training problems below are repeatable from a fresh kernel.
np.random.seed(0)

# Nous avons d'abord besoin d'une fonction sigmoïde


def sigmoid(x):
    """Logistic function: squash `x` into the open interval (0, 1).

    Works element-wise on NumPy arrays as well as on plain scalars.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def tanh(x):
    """Hyperbolic tangent activation, provided for comparison with sigmoid.

    Delegates to ``np.tanh`` instead of computing ``sinh(x) / cosh(x)``:
    both sinh and cosh overflow to ``inf`` for large ``|x|``, which made the
    quotient ``nan`` where the correct value is +/-1.

    Args:
        x: scalar or NumPy array.

    Returns:
        tanh(x), element-wise, with values in [-1, 1].
    """
    return np.tanh(x)

# Maintenant, nous devons obtenir le dérivé du sigmoïde


def sigmoid_output_to_derivative(output):
    """Slope of the sigmoid, expressed through the sigmoid's own output.

    `output` is a value already produced by `sigmoid`; the result is the
    gradient used to scale the error during gradient descent.
    """
    complement = 1 - output
    return output * complement

def tanh_output_to_derivative(output):
    """Slope of tanh, expressed through the tanh output itself.

    For ``output = tanh(x)`` the derivative is ``1 - tanh(x)**2``. The
    previous body returned ``output * (1 - output)``, which is the sigmoid
    derivative and is wrong for tanh.

    Args:
        output: value(s) already produced by `tanh` (scalar or NumPy array).

    Returns:
        d tanh(x) / dx evaluated at the corresponding x, element-wise.
    """
    return 1 - output ** 2

# Training-data generation: a lookup table from each integer to its 8-bit
# binary representation (one row of `binary` per integer).
interger_to_binary = {}
binary_dimensions = 8

largest_number = 2 ** binary_dimensions
binary = np.unpackbits(
    np.arange(largest_number).astype(np.uint8).reshape(-1, 1),
    axis=1,
)
for i in range(binary.shape[0]):
    interger_to_binary[i] = binary[i]

# Hyperparameters
alpha = 0.1               # step size applied to the accumulated weight updates
input_dimensions = 2      # one bit from each of the two operands
hidden_dimensions = 16
output_dimensions = 1     # one predicted sum bit


# RNN weights, each drawn uniformly from [-1, 1).

# synapse_0: input layer -> hidden layer (2 rows, 16 columns).
synapse_0 = np.random.random((input_dimensions, hidden_dimensions)) * 2 - 1

# synapse_1: hidden layer -> output layer (16 rows, 1 column).
synapse_1 = np.random.random((hidden_dimensions, output_dimensions)) * 2 - 1

# synapse_h: hidden layer at the previous time step -> hidden layer at the
# current time step (16 rows, 16 columns). This is the recurrent connection.
synapse_h = np.random.random((hidden_dimensions, hidden_dimensions)) * 2 - 1

# Zero-filled accumulators for the weight updates, one per weight matrix.
synapse_0_update = np.zeros(synapse_0.shape)
synapse_1_update = np.zeros(synapse_1.shape)
synapse_h_update = np.zeros(synapse_h.shape)
# Training logic: every iteration draws one random addition problem, runs the
# RNN forward over the bit positions, back-propagates through time and then
# applies the accumulated weight updates.
for j in range(10000):

    # Generate a simple addition problem (a + b = c). Both operands are drawn
    # below largest_number / 2 so that their sum still has an entry in the table.
    # NOTE(review): `largest_number / 2` is a float under Python 3 — confirm the
    # installed NumPy accepts a float bound for randint.
    a_int = np.random.randint(largest_number / 2)  # integer version
    a = interger_to_binary[a_int]   # binary encoding
    b_int = np.random.randint(largest_number / 2)  # integer version
    b = interger_to_binary[b_int]   # binary encoding

    # True answer
    c_int = a_int + b_int
    c = interger_to_binary[c_int]

    # Where the network's binary-encoded guess is stored
    d = np.zeros_like(c)  # same shape and dtype as c
    overallError = 0

    layer_2_deltas = list()
    layer_1_values = list()
    layer_1_values.append(np.zeros(hidden_dimensions)) 
    # Forward pass: walk along the positions of the binary encoding, starting
    # from the last index of each bit array.
    for position in range(binary_dimensions):

        # Build the input (one bit of a and one bit of b) and the target bit of c
        X = np.array([[a[binary_dimensions - position - 1], b[binary_dimensions - position - 1]]])
        y = np.array([[c[binary_dimensions - position - 1]]]).T
        # Hidden layer: sigmoid of (input . synapse_0) plus
        # (previous hidden state . synapse_h)
        layer_1 = sigmoid(np.dot(X, synapse_0) + np.dot(layer_1_values[-1], synapse_h))

        # Output layer: pass the hidden state through synapse_1 and the
        # sigmoid to obtain the prediction for this bit
        layer_2 = sigmoid(np.dot(layer_1, synapse_1))

        # How far the prediction is from the true bit (the error)
        layer_2_error = y - layer_2
        layer_2_deltas.append((layer_2_error) * sigmoid_output_to_derivative(layer_2))
        overallError += np.abs(layer_2_error[0])  # accumulate the absolute error for this position


        # Decode the estimate (rounded to 0 or 1) so it can be printed
        d[binary_dimensions - position - 1] = np.round(layer_2[0][0])

        # Store a copy of the hidden layer so the next time step can use it
        layer_1_values.append(copy.deepcopy(layer_1))

    future_layer_1_delta = np.zeros(hidden_dimensions)

    # Back-propagation through time: negative indices walk the stored hidden
    # states and output deltas from the last forward step back to the first.
    for position in range(binary_dimensions):

        # a[position] / b[position] are the bits that were fed in at the
        # forward step being revisited
        X = np.array([[a[position], b[position]]])
        layer_1 = layer_1_values[-position - 1]
        prev_layer_1 = layer_1_values[-position - 2]

        # Error at the output layer
        layer_2_delta = layer_2_deltas[-position - 1]
        # Error at the hidden layer: contribution from the later time step
        # (through synapse_h) plus contribution from this step's output
        # (through synapse_1), scaled by the sigmoid slope
        layer_1_delta = (future_layer_1_delta.dot(synapse_h.T) + layer_2_delta.dot(synapse_1.T)) * sigmoid_output_to_derivative(layer_1)

        # Accumulate the weight updates; they are applied after the whole
        # sequence has been processed
        synapse_1_update += np.atleast_2d(layer_1).T.dot(layer_2_delta)
        synapse_h_update += np.atleast_2d(prev_layer_1).T.dot(layer_1_delta)
        synapse_0_update += X.T.dot(layer_1_delta)

        future_layer_1_delta = layer_1_delta

    # Apply the accumulated updates, scaled by the learning rate alpha
    synapse_0 += synapse_0_update * alpha
    synapse_1 += synapse_1_update * alpha
    synapse_h += synapse_h_update * alpha

    # Reset the accumulators in place for the next training example
    synapse_0_update *= 0
    synapse_1_update *= 0
    synapse_h_update *= 0

    # Progress report every 1000 iterations
    if(j % 1000 == 0):
        print(j)
        print("Model Error:" + str(overallError))
        print("Prediction Value:" + str(d))
        print("True Value:" + str(c))
        out = 0  # only used by the disabled integer decoding below
        # Disabled: decode the predicted bits back into an integer and print the sum.
#         for index, x in enumerate(reversed(d)):
#             out += x * pow(2, index)
#         print(str(a_int) + " + " + str(b_int) + " = " + str(out))
        print("------------")

0
Model Error:[3.45638663]
Prediction Value:[0 0 0 0 0 0 0 1]
True Value:[0 1 0 0 0 1 0 1]
------------
1000
Model Error:[3.63389116]
Prediction Value:[1 1 1 1 1 1 1 1]
True Value:[0 0 1 1 1 1 1 1]
------------
2000
Model Error:[3.91366595]
Prediction Value:[0 1 0 0 1 0 0 0]
True Value:[1 0 1 0 0 0 0 0]
------------
3000
Model Error:[3.72191702]
Prediction Value:[1 1 0 1 1 1 1 1]
True Value:[0 1 0 0 1 1 0 1]
------------
4000
Model Error:[3.5852713]
Prediction Value:[0 0 0 0 1 0 0 0]
True Value:[0 1 0 1 0 0 1 0]
------------
5000
Model Error:[2.53352328]
Prediction Value:[1 0 1 0 0 0 1 0]
True Value:[1 1 0 0 0 0 1 0]
------------
6000
Model Error:[0.57691441]
Prediction Value:[0 1 0 1 0 0 0 1]
True Value:[0 1 0 1 0 0 0 1]
------------
7000
Model Error:[1.42589952]
Prediction Value:[1 0 0 0 0 0 0 1]
True Value:[1 0 0 0 0 0 0 1]
------------
8000
Model Error:[0.47477457]
Prediction Value:[0 0 1 1 1 0 0 0]
True Value:[0 0 1 1 1 0 0 0]
------------
9000
Model Error:[0.21595037]
Prediction 

In [None]:
import numpy as np
import itertools
import operator
from datetime import datetime
import sys

# Number of distinct word indices; passed to RNNNumpy as word_dim.
vocabulary_size = 8000

# Load the pre-tokenised training inputs and targets from the working directory.
# The second positional argument of np.load is `mmap_mode`, not a file mode, so
# the former 'rb' was an invalid memory-map mode; both files are now loaded the
# same way.
# NOTE(review): assumes both files hold plain (non-object) arrays; if they store
# ragged Python lists, allow_pickle=True is required — only enable it for files
# you trust.
X_train = np.load(r'X_train.npy', allow_pickle=False)
y_train = np.load(r'Y_train.npy', allow_pickle=False)

In [4]:
## initialize parameters
class RNNNumpy:
    """Vanilla RNN whose parameters are plain NumPy arrays.

    Attributes set by the constructor:
        word_dim:      size of the vocabulary.
        hidden_dim:    size of the hidden state.
        bptt_truncate: stored for use by back-propagation through time.
        U: (hidden_dim, word_dim) weights.
        V: (word_dim, hidden_dim) weights.
        W: (hidden_dim, hidden_dim) weights.
    """

    def __init__(self, word_dim, hidden_dim = 100, bptt_truncate = 4):
        # Keep the hyperparameters on the instance.
        self.word_dim, self.hidden_dim, self.bptt_truncate = word_dim, hidden_dim, bptt_truncate

        # Uniform initialisation in [-1/sqrt(n), 1/sqrt(n)], where n is the
        # size of the layer each matrix reads from.
        word_bound = np.sqrt(1. / word_dim)
        hidden_bound = np.sqrt(1. / hidden_dim)
        self.U = np.random.uniform(-word_bound, word_bound, (hidden_dim, word_dim))
        self.V = np.random.uniform(-hidden_bound, hidden_bound, (word_dim, hidden_dim))
        self.W = np.random.uniform(-hidden_bound, hidden_bound, (hidden_dim, hidden_dim))

In [8]:
## 1. forward propagation

def softmax(x):
    """Normalise scores `x` into a distribution that sums to 1.

    The maximum score is subtracted before exponentiating so large inputs do
    not overflow; the max and the sum are taken over the whole array.
    """
    shifted = x - np.max(x)
    exponentials = np.exp(shifted)
    return exponentials / np.sum(exponentials)

def forward_propagation(self, x):
    """Run the RNN forward over one sequence of word indices.

    Args:
        x: sequence of word indices; each one selects a column of ``self.U``,
           which is equivalent to multiplying U by a one-hot vector.

    Returns:
        ``[o, s]`` where
        o: (T, word_dim) array, the softmax output at every time step;
        s: (T + 1, hidden_dim) array, the hidden state at every time step.
           The extra last row stays zero and acts as the initial state.

    The leftover debug prints of the state matrix were removed: this function
    runs once per sentence during loss evaluation and training, so they
    flooded the output.
    """
    # Total number of time steps.
    T = len(x)
    # s[t] = tanh(U[:, x_t] + W . s[t-1]). One extra zero row is allocated so
    # that s[t-1] at t == 0 (index -1, i.e. the last row) is the zero initial state.
    s = np.zeros((T + 1, self.hidden_dim))
    # Output distribution at each time step, kept for later use.
    o = np.zeros((T, self.word_dim))
    for t in range(T):
        s[t] = np.tanh(self.U[:, x[t]] + self.W.dot(s[t - 1]))
        o[t] = softmax(self.V.dot(s[t]))
    return [o, s]

# Attach the function to the class so instances can call model.forward_propagation(x).
RNNNumpy.forward_propagation = forward_propagation

In [6]:
def predict(self, x):
    """Return, for every time step of `x`, the index with the highest output score.

    Runs a forward pass and takes the arg-max along the second axis of the
    output matrix.
    """
    probabilities, _ = self.forward_propagation(x)
    return np.asarray(probabilities).argmax(axis=1)

# Attach predict to the class so it is available as model.predict(x).
RNNNumpy.predict = predict

# Needs X_train and vocabulary_size from the data-loading cell to have been run first.
# Seed the global RNG so the randomly initialised parameters are repeatable.
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
# Forward pass on one training example: o has one row per time step and one
# column per vocabulary entry.
o, s = model.forward_propagation(X_train[10])
print(o.shape)
print(o)

# Index of the highest-scoring word at each time step.
predictions = model.predict(X_train[10])
print(predictions.shape)
print(predictions) 

NameError: name 'X_train' is not defined

In [None]:
## 2. calculate the loss
'''
the loss is defined as
L(y, o) = -\frac{1}{N} \sum_{n \in N} y_n log(o_n)
'''
def calculate_total_loss(self, x, y):
    """Sum the cross-entropy loss over every word of every sentence.

    Args:
        x: sequence of input sentences (each a sequence of word indices).
        y: sequence of target sentences, aligned with `x`.

    Returns:
        The un-normalised total of -log(probability assigned to the correct word).
    """
    total = 0
    for idx in range(len(y)):
        probabilities, _ = self.forward_propagation(x[idx])
        targets = y[idx]
        # Pick, at each time step, the probability given to the correct word.
        picked = probabilities[np.arange(len(targets)), targets]
        # The further those probabilities are from 1, the larger the loss.
        total -= np.sum(np.log(picked))
    return total

def calculate_loss(self, x, y):
    """Average cross-entropy loss per target word.

    Args:
        x: sequence of input sentences.
        y: sequence of target sentences, aligned with `x`.

    Returns:
        ``calculate_total_loss(x, y)`` divided by the total number of target words.
    """
    # Built-in sum: calling np.sum on a generator is deprecated in NumPy.
    N = sum(len(y_i) for y_i in y)
    return self.calculate_total_loss(x, y)/N

# Attach both loss helpers to the class.
RNNNumpy.calculate_total_loss = calculate_total_loss
RNNNumpy.calculate_loss = calculate_loss

# Baseline: a uniform guess over the vocabulary has a per-word loss of log(vocabulary_size).
print("Expected Loss for random prediction: %f" % np.log(vocabulary_size))
# Per-word loss of the current `model` on the first 1000 training examples.
print("Actual loss: %f" % model.calculate_loss(X_train[:1000], y_train[:1000]))

In [None]:
## 3. BPTT
r'''
1. we nudge the parameters in a direction that reduces the error. The direction is given by the gradients of the loss: \frac{\partial L}{\partial U},
\frac{\partial L}{\partial V}, \frac{\partial L}{\partial W}
2. we also need a learning rate, which indicates how big a step we take in each direction
Q: how to optimize SGD using batching, parallelism and adaptive learning rates.

RNN BPTT: because the parameters are shared by all time steps in the network, the gradient at each output depends not only on the calculations of the
current time step, but also on those of the previous time steps.
'''

def bptt(self, x, y):
    """Back-propagation through time for one training sequence.

    Args:
        x: sequence of input word indices.
        y: sequence of target word indices, same length as `x`.

    Returns:
        ``[dLdU, dLdV, dLdW]``: gradients of the summed cross-entropy loss
        with respect to U, V and W, each shaped like its parameter.

    Fixes over the previous body:
      * ``^`` is bitwise XOR in Python and raises TypeError on float arrays;
        the tanh derivative needs ``** 2``.
      * the delta update was assigned to a misspelt name (``dleta_t``), so the
        error was never propagated to earlier time steps.
      * the outer loop now runs backwards over the outputs, as its comment says
        (the accumulated sums do not depend on this order).
    """
    T = len(y)
    # Forward pass: o is (T, word_dim), s is (T + 1, hidden_dim) with s[-1] == 0.
    o, s = self.forward_propagation(x)
    # Gradient accumulators.
    dLdU = np.zeros(self.U.shape)
    dLdV = np.zeros(self.V.shape)
    dLdW = np.zeros(self.W.shape)
    # delta_o = y_hat - y: subtract 1 at the correct word of every time step.
    # Work on a copy so the forward-pass output is left untouched.
    delta_o = o.copy()
    delta_o[np.arange(len(y)), y] -= 1
    # For each output, going backwards ...
    for t in reversed(range(T)):
        dLdV += np.outer(delta_o[t], s[t])    # shape (word_dim, hidden_dim)
        # Error at the hidden pre-activation of step t (tanh' = 1 - s**2).
        delta_t = self.V.T.dot(delta_o[t]) * (1 - s[t] ** 2)
        # Walk back from step t for at most self.bptt_truncate further steps.
        first_step = max(0, t - self.bptt_truncate)
        for bptt_step in range(t, first_step - 1, -1):
            # At bptt_step == 0, index -1 selects the zero initial state.
            dLdW += np.outer(delta_t, s[bptt_step - 1])
            dLdU[:, x[bptt_step]] += delta_t
            # Propagate the error one step further back in time.
            delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step - 1] ** 2)
    return [dLdU, dLdV, dLdW]

# Attach bptt to the class so it can be called as model.bptt(x, y).
RNNNumpy.bptt = bptt

In [None]:
### 3.1 gradient checking
r'''
verify the gradient against its definition (central finite difference):
\frac{\partial L}{\partial \theta} = \lim_{h \to 0} \frac{J(\theta + h) - J(\theta - h)}{2h}
'''
def gradient_check(self, x, y, h = 0.001, error_threshold = 0.01):
    """Compare the back-propagated gradients with finite-difference estimates.

    For every element of U, V and W the loss is evaluated at ``value + h`` and
    ``value - h`` and the central difference is compared with the gradient
    returned by ``self.bptt``. The check stops at the first element whose
    relative error exceeds `error_threshold` and prints a diagnostic.

    Args:
        x: input sequence of word indices.
        y: target sequence of word indices.
        h: perturbation used for the finite difference.
        error_threshold: largest accepted relative error.

    Returns:
        None. Results are reported through printed messages.

    Fixes over the previous body:
      * the comparison was inverted (``<``), so accurate gradients were
        reported as failures and wrong ones passed;
      * when both gradients are exactly zero the relative error was 0/0; it
        is now treated as 0.
    """
    # Gradients from back-propagation, in the same order as model_parameters.
    bptt_gradients = self.bptt(x, y)
    model_parameters = ["U", "V", "W"]
    for pidx, pname in enumerate(model_parameters):
        # The actual parameter array on the model, e.g. self.W; modified in place below.
        parameter = getattr(self, pname)
        print("performing gradient check for parameter %s with size %d. " %(pname, np.prod(parameter.shape)))
        # Visit every element of the parameter matrix: (0, 0), (0, 1), ...
        for ix in np.ndindex(*parameter.shape):
            # Remember the original value so it can be restored afterwards.
            original_value = parameter[ix]
            # Central difference: (f(x + h) - f(x - h)) / 2h
            parameter[ix] = original_value + h
            gradplus = self.calculate_total_loss([x], [y])
            parameter[ix] = original_value - h
            gradminus = self.calculate_total_loss([x], [y])
            estimated_gradient = (gradplus - gradminus)/(2*h)
            parameter[ix] = original_value
            backprop_gradient = bptt_gradients[pidx][ix]
            # Relative error |a - b| / (|a| + |b|); two zero gradients agree exactly.
            denominator = np.abs(backprop_gradient) + np.abs(estimated_gradient)
            if denominator > 0:
                relative_error = np.abs(backprop_gradient - estimated_gradient) / denominator
            else:
                relative_error = 0.0
            # Fail the check when the error is too large.
            if relative_error > error_threshold:
                print("Gradient check error: parameter = %s ix = %s" %(pname, ix))
                print("+h Loss: %f" % gradplus)
                print("-h Loss: %f" % gradminus)
                print("Estimated gradient: %f" % estimated_gradient)
                print("Backpropagation gradient: %f" % backprop_gradient)
                print("Relative error: %f" % relative_error)
                return
        print("Gradient check for parameter %s passed. " %(pname))

# Attach gradient_check to the class.
RNNNumpy.gradient_check = gradient_check

# Use a small model for the check: the loss is evaluated twice for every
# parameter element. Note that this rebinds the global `model`.
grad_check_vocab_size = 100
np.random.seed(10)
# bptt_truncate = 1000 is larger than the 4-step sequence, so back-propagation
# is never truncated during the check.
model = RNNNumpy(grad_check_vocab_size, 10, bptt_truncate = 1000)
model.gradient_check([0,1,2,3], [1,2,3,4])

In [None]:
## 4. SGD implementation
'''
two step:
1. calculate the gradients and perform the updates for one batch
2. loop through the training set and adjust the learning rate
'''
### 4.1. perform one step of SGD
def numpy_sgd_step(self, x, y, learning_rate):
    """Perform one stochastic-gradient-descent update on a single example.

    Computes the gradients with ``self.bptt(x, y)`` and moves U, V and W
    against them, scaled by `learning_rate`. Returns nothing.
    """
    grad_U, grad_V, grad_W = self.bptt(x, y)
    self.U -= grad_U * learning_rate
    self.V -= grad_V * learning_rate
    self.W -= grad_W * learning_rate
# Attach the step function to the class under the name sgd_step (model.sgd_step(x, y, learning_rate)).
RNNNumpy.sgd_step = numpy_sgd_step

### 4.2. outer SGD loop
'''
 - model: 
 - X_train:
 - y_train:
 - learning_rate:
 - nepoch:
 - evaluate loss_after:
'''
def train_with_sgd(model, X_train, y_train, learning_rate = 0.005, nepoch = 100, evaluate_loss_after = 5):
    """Train `model` with per-example SGD and track the loss.

    Args:
        model: object providing ``calculate_loss(X, y)`` and
            ``sgd_step(x, y, learning_rate)``.
        X_train: sequence of training inputs.
        y_train: sequence of training targets, aligned with `X_train`.
        learning_rate: initial SGD step size; halved whenever an evaluated
            loss is higher than the previously evaluated one.
        nepoch: number of complete passes over the training set.
        evaluate_loss_after: evaluate (and print) the loss every this many epochs.

    Returns:
        List of ``(num_examples_seen, loss)`` tuples, one per evaluation.
        The previous body collected this list but never returned it, so
        callers assigning the result received None.
    """
    # Keep track of the losses so they can be plotted later.
    losses = []
    num_examples_seen = 0
    for epoch in range(nepoch):
        # Optionally evaluate the loss.
        if (epoch % evaluate_loss_after == 0):
            loss = model.calculate_loss(X_train, y_train)
            losses.append((num_examples_seen, loss))
            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print("%s: loss after num_examples_seen=%d epoch=%d: %f" %(timestamp, num_examples_seen, epoch, loss))
            # Halve the learning rate if the loss went up since the last evaluation.
            if (len(losses) > 1 and losses[-1][1] > losses[-2][1]):
                learning_rate = learning_rate * 0.5
                print("setting learning rate to %f" %(learning_rate))
            sys.stdout.flush()
        # One SGD step per training example.
        for i in range(len(y_train)):
            model.sgd_step(X_train[i], y_train[i], learning_rate)
            num_examples_seen += 1
    return losses

# Time a single SGD step on one training example, using a freshly seeded and
# re-initialised model.
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
# %timeit is an IPython magic that executes the statement repeatedly, so the
# model receives several updates while being timed.
%timeit model.sgd_step(X_train[10], y_train[10], 0.005)

In [None]:
# Re-create the model from the same seed, then train on the first 100 examples
# for 10 epochs, evaluating the loss after every epoch.
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
losses = train_with_sgd(model, X_train[:100], y_train[:100], nepoch = 10, evaluate_loss_after = 1)