In [1]:
import numpy as np
import random
import pprint

def softmax(x):
    """Return the softmax of ``x``, normalised along axis 0.

    The global maximum of ``x`` is subtracted before exponentiating so that
    large scores do not overflow; the constant shift cancels in the ratio.
    """
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=0)

def smooth(loss, cur_loss):
    """Blend ``cur_loss`` into the running ``loss`` as a weighted average.

    The running value keeps a weight of 0.999 and the new sample gets 0.001.
    """
    carried = loss * 0.999
    fresh = cur_loss * 0.001
    return carried + fresh

def print_sample(sample_ix, ix_to_char):
    """Print the characters encoded by ``sample_ix`` with the first one upper-cased.

    Arguments:
    sample_ix -- iterable of indices to decode
    ix_to_char -- mapping from index to character

    No line terminator is added by the print call itself; whatever characters
    the indices decode to are written as-is. An empty ``sample_ix`` prints
    nothing instead of raising IndexError.
    """
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    # Slicing (rather than txt[0]) keeps an empty sample from raising IndexError.
    txt = txt[:1].upper() + txt[1:]
    print(txt, end='')

def get_initial_loss(vocab_size, seq_length):
    """Return the cross-entropy of a uniform guess over ``vocab_size`` symbols,
    summed over ``seq_length`` positions.
    """
    uniform_prob = 1.0 / vocab_size
    per_position = -np.log(uniform_prob)
    return per_position * seq_length

def softmax(x):
    """Numerically stable softmax of ``x`` along axis 0.

    NOTE: this repeats the ``softmax`` defined a few lines above with the same
    body; being the later definition, it is the one in effect afterwards.
    """
    # Shifting by the global maximum avoids overflow in exp without changing the result.
    stabilised = x - np.max(x)
    weights = np.exp(stabilised)
    return weights / weights.sum(axis=0)

def initialize_parameters(n_a, n_x, n_y):
    """
    Build the RNN parameter dictionary: small Gaussian weights and zero biases.

    The global NumPy random generator is re-seeded with 1 on every call, so the
    weights returned for a given set of sizes are always the same.

    Arguments:
    n_a -- number of hidden units
    n_x -- size of each input vector
    n_y -- size of each output vector

    Returns:
    parameters -- python dictionary containing:
                        Wax -- input-to-hidden weights, numpy array of shape (n_a, n_x)
                        Waa -- hidden-to-hidden weights, numpy array of shape (n_a, n_a)
                        Wya -- hidden-to-output weights, numpy array of shape (n_y, n_a)
                        b --  hidden bias, numpy array of shape (n_a, 1)
                        by -- output bias, numpy array of shape (n_y, 1)
    """
    np.random.seed(1)
    scale = 0.01

    # The draw order (Wax, Waa, Wya) fixes which seeded values each matrix receives.
    weight_shapes = (
        ("Wax", (n_a, n_x)),
        ("Waa", (n_a, n_a)),
        ("Wya", (n_y, n_a)),
    )
    parameters = {name: np.random.randn(*shape) * scale for name, shape in weight_shapes}

    parameters["b"] = np.zeros((n_a, 1))
    parameters["by"] = np.zeros((n_y, 1))
    return parameters

def rnn_step_forward(parameters, a_prev, x):
    """
    Advance the RNN by one time step.

    Arguments:
    parameters -- dictionary holding 'Waa', 'Wax', 'Wya', 'by' and 'b'
    a_prev -- hidden state from the previous step
    x -- input vector for this step

    Returns:
    a_next -- new hidden state, tanh(Wax.x + Waa.a_prev + b)
    p_t -- softmax of (Wya.a_next + by), i.e. the probabilities for the next character
    """
    Wax, Waa, b = parameters['Wax'], parameters['Waa'], parameters['b']
    Wya, by = parameters['Wya'], parameters['by']

    pre_activation = np.dot(Wax, x) + np.dot(Waa, a_prev) + b
    a_next = np.tanh(pre_activation)

    # Scores before normalisation; softmax turns them into probabilities.
    scores = np.dot(Wya, a_next) + by
    p_t = softmax(scores)

    return a_next, p_t

def rnn_step_backward(dy, gradients, parameters, x, a, a_prev):
    """
    Accumulate the gradients contributed by a single time step.

    Arguments:
    dy -- gradient of the loss w.r.t. the pre-softmax output of this step
          (the softmax derivative is already folded in by the caller)
    gradients -- dictionary with 'dWya', 'dby', 'db', 'dWax', 'dWaa' accumulators and
                 'da_next', the hidden-state gradient flowing back from the following step
    parameters -- dictionary holding at least 'Wya' and 'Waa'
    x -- input vector of this step
    a -- hidden state produced by this step
    a_prev -- hidden state of the previous step

    Returns:
    gradients -- the same dictionary; accumulators are updated in place and
                 'da_next' is replaced with the gradient to pass to the previous step
    """
    # Output layer. A single example is processed, so the bias gradients need no summing.
    gradients['dWya'] += np.dot(dy, a.T)
    gradients['dby'] += dy

    # Hidden state receives gradient from this step's output and from the step after it.
    da_total = np.dot(parameters['Wya'].T, dy) + gradients['da_next']

    # Back through tanh: d/dz tanh(z) = 1 - tanh(z)^2, and a = tanh(z).
    tanh_slope = 1 - a * a
    dz = tanh_slope * da_total

    hidden_updates = (
        ('db', dz),
        ('dWax', np.dot(dz, x.T)),
        ('dWaa', np.dot(dz, a_prev.T)),
    )
    for key, contribution in hidden_updates:
        gradients[key] += contribution

    gradients['da_next'] = np.dot(parameters['Waa'].T, dz)
    return gradients

def update_parameters(parameters, gradients, lr):
    """Apply one gradient-descent step to every parameter, in place.

    Arguments:
    parameters -- dictionary with 'Wax', 'Waa', 'Wya', 'b' and 'by'
    gradients -- dictionary with the matching 'dWax', 'dWaa', 'dWya', 'db' and 'dby'
    lr -- learning rate (step size) multiplying each gradient

    Returns:
    parameters -- the same dictionary, after the update
    """
    for name in ('Wax', 'Waa', 'Wya', 'b', 'by'):
        parameters[name] += -lr * gradients['d' + name]
    return parameters

def rnn_forward(X, Y, a0, parameters, vocab_size = 27):
    """
    Run the RNN over one sequence and accumulate its cross-entropy loss.

    Arguments:
    X -- sequence of input character indices; an entry of None means "no input"
         and is fed to the network as an all-zero vector
    Y -- sequence of target character indices, one per entry of X
    a0 -- initial hidden state (copied, not modified)
    parameters -- parameter dictionary handed to rnn_step_forward
    vocab_size -- length of each one-hot input vector

    Returns:
    loss -- sum over time steps of -log(probability assigned to the target)
    cache -- tuple (y_hat, a, x) of dictionaries keyed by time step;
             a additionally holds the copy of a0 under key -1
    """
    # Per-step storage keyed by the time index t (these are dicts, not stacked arrays).
    x, a, y_hat = {}, {}, {}
    a[-1] = np.copy(a0)

    loss = 0

    for t, char_ix in enumerate(X):
        # One-hot encode the t-th input; a None entry stays as the zero vector.
        x[t] = np.zeros((vocab_size, 1))
        if char_ix is not None:  # identity test: the correct way to compare with None
            x[t][char_ix] = 1

        a[t], y_hat[t] = rnn_step_forward(parameters, a[t-1], x[t])

        # Only the target's probability contributes; every other one-hot term is zero.
        loss -= np.log(y_hat[t][Y[t], 0])

    cache = (y_hat, a, x)

    return loss, cache

def rnn_backward(X, Y, parameters, cache):
    """
    Backpropagate through time over one sequence.

    Arguments:
    X -- input sequence (only its length is used here)
    Y -- sequence of target character indices
    parameters -- dictionary with 'Waa', 'Wax', 'Wya', 'by' and 'b'
    cache -- tuple (y_hat, a, x) as returned by rnn_forward

    Returns:
    gradients -- dictionary with 'dWax', 'dWaa', 'dWya', 'db', 'dby' and 'da_next'
    a -- the hidden-state dictionary taken from the cache
    """
    y_hat, a, x = cache

    # One zero accumulator per parameter, with that parameter's shape.
    gradients = {
        'd' + name: np.zeros_like(parameters[name])
        for name in ('Wax', 'Waa', 'Wya', 'b', 'by')
    }
    # Nothing flows back into the last step from beyond the end of the sequence.
    gradients['da_next'] = np.zeros_like(a[0])

    for t in range(len(X) - 1, -1, -1):
        # Gradient of softmax + cross-entropy w.r.t. the scores: y_hat - one_hot(target).
        dy = np.copy(y_hat[t])
        dy[Y[t]] -= 1
        gradients = rnn_step_backward(dy, gradients, parameters, x[t], a[t], a[t-1])

    return gradients, a

In [2]:
# Read the whole corpus; the context manager closes the file handle deterministically.
with open('dinos.txt', 'r') as corpus_file:
    data = corpus_file.read()
data = data.lower()
# Distinct characters of the corpus (unordered at this point).
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print(f'There are {data_size} total characters and {vocab_size} unique characters in data.')

There are 19912 total characters and 27 unique characters in data.


In [3]:
# Fix a deterministic character order before assigning indices.
chars = sorted(chars)
print(chars)
print("----")
# '\n' plays the role of the end-of-sequence (<EOS>) token.
char_to_ix = dict(zip(chars, range(len(chars))))  # character -> index
ix_to_char = dict(enumerate(chars))               # index -> character
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(ix_to_char)

['\n', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
----
{   0: '\n',
    1: 'a',
    2: 'b',
    3: 'c',
    4: 'd',
    5: 'e',
    6: 'f',
    7: 'g',
    8: 'h',
    9: 'i',
    10: 'j',
    11: 'k',
    12: 'l',
    13: 'm',
    14: 'n',
    15: 'o',
    16: 'p',
    17: 'q',
    18: 'r',
    19: 's',
    20: 't',
    21: 'u',
    22: 'v',
    23: 'w',
    24: 'x',
    25: 'y',
    26: 'z'}


In [4]:
def clip(gradients, number):
    """Clip every gradient array to the range [-number, number] to limit
    exploding gradients.

    Arguments:
    gradients -- dictionary mapping names (e.g. "dWaa", "dWax", "dWya", "db", "dby")
                 to gradient arrays
    number -- clipping threshold; its absolute value is used

    Returns:
    clipped -- new dictionary with the same keys, each holding a clipped copy.
               The arrays in ``gradients`` are left untouched.
    """
    # Make sure the threshold is positive.
    limit = abs(number)

    # np.clip without ``out`` allocates a fresh array, so the caller's arrays are not
    # overwritten (a shallow dict copy combined with ``out=`` would modify them in place).
    return {name: np.clip(grad, -limit, limit) for name, grad in gradients.items()}

In [5]:
# Sanity check of clip() with a threshold of 10.
# The RNG is seeded so the printed values are reproducible; the arrays must be drawn
# in exactly this order for the numbers to match.
maxValue = 10
np.random.seed(3)
dWax = np.random.randn(5,3)*10
dWaa = np.random.randn(5,5)*10
dWya = np.random.randn(2,5)*10
db = np.random.randn(5,1)*10
dby = np.random.randn(2,1)*10
gradients = {"dWax": dWax, "dWaa": dWaa, "dWya": dWya, "db": db, "dby": dby}
gradients = clip(gradients, maxValue)
# Spot-check one entry of each gradient.
print("gradients[\"dWaa\"][1][2] =", gradients["dWaa"][1][2])
print("gradients[\"dWax\"][3][1] =", gradients["dWax"][3][1])
print("gradients[\"dWya\"][1][2] =", gradients["dWya"][1][2])
print("gradients[\"db\"][4] =", gradients["db"][4])
print("gradients[\"dby\"][1] =", gradients["dby"][1])

gradients["dWaa"][1][2] = 10.0
gradients["dWax"][3][1] = -10.0
gradients["dWya"][1][2] = 0.2971381536101662
gradients["db"][4] = [10.]
gradients["dby"][1] = [8.45833407]


In [6]:
# Sanity check of clip() with a threshold of 5.
# Same seed and draw order as the threshold-10 check, so the unclipped inputs are identical
# and only the clipping limit differs.
maxValue = 5
np.random.seed(3)
dWax = np.random.randn(5,3)*10
dWaa = np.random.randn(5,5)*10
dWya = np.random.randn(2,5)*10
db = np.random.randn(5,1)*10
dby = np.random.randn(2,1)*10
gradients = {"dWax": dWax, "dWaa": dWaa, "dWya": dWya, "db": db, "dby": dby}
gradients = clip(gradients, maxValue)
# Spot-check one entry of each gradient.
print("gradients[\"dWaa\"][1][2] =", gradients["dWaa"][1][2])
print("gradients[\"dWax\"][3][1] =", gradients["dWax"][3][1])
print("gradients[\"dWya\"][1][2] =", gradients["dWya"][1][2])
print("gradients[\"db\"][4] =", gradients["db"][4])
print("gradients[\"dby\"][1] =", gradients["dby"][1])

gradients["dWaa"][1][2] = 5.0
gradients["dWax"][3][1] = -5.0
gradients["dWya"][1][2] = 0.2971381536101662
gradients["db"][4] = [5.]
gradients["dby"][1] = [5.]


In [7]:
# Sampling function, used to generate names from the model's parameters
def sample(parameters, char_to_ix, seed):
    """
    Generate a sequence of character indices by repeatedly sampling from the RNN's output distribution.

    Generation stops once the newline index is drawn or after 50 characters; in the
    latter case the newline index is appended so the result always ends with it.

    Arguments:
    parameters -- python dictionary containing the parameters Waa, Wax, Wya, by, and b.
    char_to_ix -- python dictionary mapping each character to an index; must contain "\n".
    seed -- base value for re-seeding the global NumPy RNG before each draw, making the
            output reproducible.

    Returns:
    indices -- list of sampled character indices, ending with the newline index.
    """
    Waa, Wax, Wya, by, b = parameters['Waa'], parameters['Wax'], parameters['Wya'], parameters['by'], parameters['b']
    vocab_size, n_a = Wya.shape  # Wya is (vocab_size, n_a)

    # Generation starts from an all-zero input and an all-zero hidden state.
    x = np.zeros((vocab_size, 1))
    a_prev = np.zeros((n_a, 1))

    newline_index = char_to_ix["\n"]   # newline acts as <EOS>
    candidates = list(range(vocab_size))
    max_chars = 50                     # hard cap so generation always terminates

    indices = []
    picked = -1   # sentinel outside [0, vocab_size - 1]
    step = 0
    while picked != newline_index and step != max_chars:
        # Regular RNN forward step to get the next-character distribution.
        a = np.tanh(np.dot(Waa, a_prev) + np.dot(Wax, x) + b)
        y = softmax(np.dot(Wya, a) + by)

        # for grading purposes
        np.random.seed(step + seed)

        # Draw (rather than argmax) so the model does not always emit the same name.
        picked = np.random.choice(candidates, p=np.ravel(y))
        indices.append(picked)

        # Feed the sampled character back in as the next one-hot input.
        a_prev = a
        x = np.zeros((vocab_size, 1))
        x[picked, 0] = 1

        # Both counters advance, so the value passed to np.random.seed grows by 2 per step.
        seed += 1
        step += 1

    if indices[-1] != newline_index:
        indices.append(newline_index)
    return indices

In [8]:
# Smoke test of sample() with random (untrained) parameters.
# Seeded so the printed indices are reproducible; the draw order below matters.
np.random.seed(2)
# Only n_a is used; the first value (20) is discarded.
_, n_a = 20, 100
Wax, Waa, Wya = np.random.randn(n_a, vocab_size), np.random.randn(n_a, n_a), np.random.randn(vocab_size, n_a)
b, by = np.random.randn(n_a, 1), np.random.randn(vocab_size, 1)
parameters = {"Wax": Wax, "Waa": Waa, "Wya": Wya, "b": b, "by": by}


# Sample one sequence with seed 0 and show it both as indices and as characters.
indices = sample(parameters, char_to_ix, 0)
print("Sampling:")
print("list of sampled indices:\n", indices)
print("list of sampled characters:\n", [ix_to_char[i] for i in indices])

Sampling:
list of sampled indices:
 [12, 17, 24, 14, 13, 9, 10, 22, 24, 6, 13, 11, 12, 6, 21, 15, 21, 14, 3, 2, 1, 21, 18, 24, 7, 25, 6, 25, 18, 10, 16, 2, 3, 8, 15, 12, 11, 7, 1, 12, 10, 2, 7, 7, 11, 17, 24, 12, 19, 6, 0]
list of sampled characters:
 ['l', 'q', 'x', 'n', 'm', 'i', 'j', 'v', 'x', 'f', 'm', 'k', 'l', 'f', 'u', 'o', 'u', 'n', 'c', 'b', 'a', 'u', 'r', 'x', 'g', 'y', 'f', 'y', 'r', 'j', 'p', 'b', 'c', 'h', 'o', 'l', 'k', 'g', 'a', 'l', 'j', 'b', 'g', 'g', 'k', 'q', 'x', 'l', 's', 'f', '\n']


In [9]:
   def optimize(X, Y, a_prev, parameters, learning_rate = 0.01):
       """
       Execute one step of the optimization to train the model.

       Runs the forward pass, backpropagation through time, gradient clipping to
       [-5, 5] and a gradient-descent update. ``parameters`` is updated in place.

       Arguments:
       X -- list of integers, where each integer is a number that maps to a character in the vocabulary
            (an entry may be None to denote "no input").
       Y -- list of integers, exactly the same as X but shifted one index to the left.
       a_prev -- previous hidden state.
       parameters -- python dictionary containing:
                           Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
                           Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
                           Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                           b --  Bias, numpy array of shape (n_a, 1)
                           by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
       learning_rate -- learning rate for the model.

       Returns:
       loss -- value of the loss function (cross-entropy)
       gradients -- python dictionary containing the clipped gradients:
                           dWax -- Gradients of input-to-hidden weights, of shape (n_a, n_x)
                           dWaa -- Gradients of hidden-to-hidden weights, of shape (n_a, n_a)
                           dWya -- Gradients of hidden-to-output weights, of shape (n_y, n_a)
                           db -- Gradients of bias vector, of shape (n_a, 1)
                           dby -- Gradients of output bias vector, of shape (n_y, 1)
       a[len(X)-1] -- the last hidden state, of shape (n_a, 1)
       """
       # The one-hot input length must equal the number of columns of Wax; reading it
       # from the parameters (instead of a hard-coded 27) keeps this working for any vocabulary.
       vocab_size = parameters['Wax'].shape[1]

       # One step of stochastic gradient descent on a single sequence.
       loss, cache = rnn_forward(X, Y, a_prev, parameters, vocab_size = vocab_size)
       gradients, a = rnn_backward(X, Y, parameters, cache)
       gradients = clip(gradients, 5)
       parameters = update_parameters(parameters, gradients, learning_rate)
       return loss, gradients, a[len(X)-1]

In [10]:
# Smoke test of optimize() on a short hand-written sequence with random parameters.
# Seeded so the printed values are reproducible; the draw order below matters.
np.random.seed(1)
vocab_size, n_a = 27, 100
a_prev = np.random.randn(n_a, 1)
Wax, Waa, Wya = np.random.randn(n_a, vocab_size), np.random.randn(n_a, n_a), np.random.randn(vocab_size, n_a)
b, by = np.random.randn(n_a, 1), np.random.randn(vocab_size, 1)
parameters = {"Wax": Wax, "Waa": Waa, "Wya": Wya, "b": b, "by": by}
# Arbitrary input and target index sequences of equal length.
X = [12,3,5,11,22,3]
Y = [4,14,11,22,25, 26]

# Run a single optimization step, then spot-check the loss, some gradient entries and the last hidden state.
loss, gradients, a_last = optimize(X, Y, a_prev, parameters, learning_rate = 0.01)
print("Loss =", loss)
print("gradients[\"dWaa\"][1][2] =", gradients["dWaa"][1][2])
print("np.argmax(gradients[\"dWax\"]) =", np.argmax(gradients["dWax"]))
print("gradients[\"dWya\"][1][2] =", gradients["dWya"][1][2])
print("gradients[\"db\"][4] =", gradients["db"][4])
print("gradients[\"dby\"][1] =", gradients["dby"][1])
print("a_last[4] =", a_last[4])

Loss = 126.5039757216537
gradients["dWaa"][1][2] = 0.19470931534721064
np.argmax(gradients["dWax"]) = 93
gradients["dWya"][1][2] = -0.0077738760320039835
gradients["db"][4] = [-0.06809825]
gradients["dby"][1] = [0.01538192]
a_last[4] = [-1.]


In [11]:
def model(data, ix_to_char, char_to_ix, num_iterations = 35000, n_a = 50, dino_names = 7, vocab_size = 27):
    """
    Train the character-level RNN with stochastic gradient descent and print sample names along the way.

    Arguments:
    data -- text corpus as one string, with one training word per line
    ix_to_char -- dictionary that maps the index to a character
    char_to_ix -- dictionary that maps a character to an index
    num_iterations -- number of training iterations (one word per iteration)
    n_a -- number of units of the RNN cell
    dino_names -- number of names sampled each time progress is printed
    vocab_size -- number of unique characters found in the text (size of the vocabulary)

    Returns:
    parameters -- learned parameters
    """
    # Input and output vectors both have one entry per vocabulary character.
    n_x = n_y = vocab_size
    parameters = initialize_parameters(n_a, n_x, n_y)

    # Starting point for the smoothed loss.
    loss = get_initial_loss(vocab_size, dino_names)

    # One training example per line of the corpus, visited in a fixed shuffled order.
    words = data.split('\n')
    np.random.seed(0)
    np.random.shuffle(words)

    # The hidden state starts at zero and is carried over from one word to the next.
    a_prev = np.zeros((n_a, 1))

    for iteration in range(num_iterations):
        # Cycle through the words, one per iteration.
        word = words[iteration % len(words)]
        word_indices = [char_to_ix[ch] for ch in word]

        # Input starts with None (zero vector); target ends with '\n', which acts as <EOS>.
        X = [None] + word_indices
        Y = word_indices + [char_to_ix['\n']]

        step_loss, _, a_prev = optimize(X, Y, a_prev, parameters, learning_rate = 0.01)
        loss = smooth(loss, step_loss)

        # Report progress only every 2000 iterations.
        if iteration % 2000 != 0:
            continue

        print(f"Current loss after {iteration} iteration/s : {loss}.")
        # Sample a few names, each with its own seed, to see how the model is doing.
        for seed in range(dino_names):
            print_sample(sample(parameters, char_to_ix, seed), ix_to_char)
        print("\n") #create empty lines

    return parameters

In [12]:
parameters = model(data, ix_to_char, char_to_ix)

Current loss after 0 iteration/s : 23.084041118061243.
Nkzxwtdmfqoeyhsqwasjkjvu
Kneb
Kzxwtdmfqoeyhsqwasjkjvu
Neb
Zxwtdmfqoeyhsqwasjkjvu
Eb
Xwtdmfqoeyhsqwasjkjvu


Current loss after 2000 iteration/s : 28.036235674325997.
Mhxtosaurus
Hiecagpsafaus
Iwtosaurus
Macaisihacisaurusasanyalos
Xusakleronvgosg
Ba
Tosaurus


Current loss after 4000 iteration/s : 25.869417818051364.
Mixusibigoreveros
Inedajsiecanthtemiton
Iwusibigoreveros
Macalosaurus
Xuslcheoreuroshatingshanriylia
Ca
Torangosaurus


Current loss after 6000 iteration/s : 24.562861209912484.
Onyuspcheronthosrasilitochusus
Lneealosaurus
Mytrolonosaurus
Olaalosaurus
Xusteomosaurus
Edalosaurus
Torapiorexhrosaurus


Current loss after 8000 iteration/s : 24.178075231136983.
Onyvus
Lkacakuthackusangosaurus
Lytosaurus
Oma
Yusodonosaurus
Ecairria
Sspendongyisaurus


Current loss after 10000 iteration/s : 23.73801916686975.
Onyusaurus
Liceadrs
Lustoisaurus
Olaagropdchus
Xussatops
Edagosaurus
Tosaurus


Current loss after 12000 iteration/s : 

In [13]:
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Model, load_model, Sequential
from keras.layers import Dense, Activation, Dropout, Input, Masking
from keras.layers import LSTM
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences
from shakespeare_utils import *
import sys
import io

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Loading text data...
Creating training set...
number of training examples: 31412
Vectorizing training set...
Loading model...
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where





In [14]:
# Wrap `on_epoch_end` in a callback that is passed to fit() below.
# NOTE(review): `on_epoch_end`, `x`, `y` and a `model` object with a `.fit` method are not
# defined in the visible cells; presumably they come from `from shakespeare_utils import *` — confirm.
# `model` was earlier defined in this notebook as the dinosaur-name training function, so this
# cell relies on that name having been rebound.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

# Train for a single epoch with batches of 128.
model.fit(x, y, batch_size=128, epochs=1, callbacks=[print_callback])

Epoch 1/1


<keras.callbacks.callbacks.History at 0x1eb5670b1c8>

In [16]:
# Generate text from a user-supplied opening (the recorded output shows it prompting for input).
# NOTE(review): `generate_output` is not defined in the visible cells; presumably it comes
# from `from shakespeare_utils import *` — confirm.
generate_output()

Write the beginning of your poem, the Shakespeare machine will complete it. Your input is: Willy


Here is your poem: 

Willy,
woir ever that in she pomt is youl calf is orfore.
 



you fres to strengens that who in is silmcalled beant.



a can tine i conven assaoas,
with to what groon sach the ride oungapade,
then him from thou of deal's ithted to sweat,
which strencs singrors mefure my love my live,
whon hor laedeming berong ol my bith,
hy host melly agest strelawed wult with levely thy befint,
hade who with forblac