# Based on How to generate your own Wikipedia articles https://www.youtube.com/watch?v=ZGU5kIG7b2I

In [1]:
#dependencies

import numpy as np #vectorization
import random #generating text
import tensorflow as tf #ML
import datetime #clock training time


In [2]:
text = open('wikitext-103-raw/wiki.test.raw', encoding='utf8').read()
print('text length in number of characters', len(text))

print('head of text:')
print(text[:1000])

text length in number of characters 1288556
head of text:
 
 = Robert Boulter = 
 
 Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy 's Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall . 
 In 2006 , Boulter starred alongside Whishaw in the play C

In [3]:
#print out our characters and sort them
chars = sorted(list(set(text)))
char_size = len(chars)
print('number of characters', char_size)
print(chars)

number of characters 259
['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '^', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '£', '¥', '©', '°', '½', 'Á', 'Æ', 'É', '×', 'ß', 'à', 'á', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'í', 'î', 'ñ', 'ó', 'ô', 'ö', 'ú', 'ü', 'ć', 'č', 'ě', 'ī', 'ł', 'Ō', 'ō', 'Š', 'ū', 'ž', 'ǐ', 'ǔ', 'ǜ', 'ə', 'ɛ', 'ɪ', 'ʊ', 'ˈ', 'ː', '̍', '͘', 'Π', 'Ω', 'έ', 'α', 'β', 'δ', 'ε', 'ι', 'λ', 'μ', 'ν', 'ο', 'π', 'ς', 'σ', 'τ', 'υ', 'ω', 'ό', 'П', 'в', 'д', 'и', 'к', 'н', 'א', 'ב', 'י', 'ל', 'ר', 'ש', 'ת', 'ا', 'ت', 'د', 'س', 'ك', 'ل', 'و', 'ڠ', 'ग', 'न', 'र', 'ल', 'ष', 'ु', 'े', 'ो', '्', 'ả', 'ẩ', '‑', '–', '—', '’', '“'

In [4]:
char2id = dict((c, i) for i, c in enumerate(chars))
id2char = dict((i, c) for i, c in enumerate(chars))

In [5]:
#Given a probability of each character, return a likely character, one-hot encoded
#our prediction will give us an array of probabilities of each character
#we'll pick the most likely and one-hot encode it
def sample(prediction):
    #Samples are uniformly distributed over the half-open interval 
    r = random.uniform(0,1)
    #store prediction char
    s = 0
    #since length > indices starting at 0
    char_id = len(prediction) - 1
    #for each char prediction probabilty
    for i in range(len(prediction)):
        #assign it to S
        s += prediction[i]
        #check if probability greater than our randomly generated one
        if s >= r:
            #if it is, thats the likely next char
            char_id = i
            break
    #dont try to rank, just differentiate
    #initialize the vector
    char_one_hot = np.zeros(shape=[char_size])
    #that characters ID encoded
    #https://image.slidesharecdn.com/latin-150313140222-conversion-gate01/95/representation-learning-of-vectors-of-words-and-phrases-5-638.jpg?cb=1426255492
    char_one_hot[char_id] = 1.0
    return char_one_hot

In [6]:
#vectorize our data to feed it into model

len_per_section = 10 #Demo uses 50. But on 8 GB RAM value higher than 9 leads to MemoryError
skip = 2 #Demo uses 2. But on 8 GB RAM value higher than ... leads to MemoryError
sections = []
next_chars = []

#fill sections list with chunks of text, every 2 characters create a new 50 
#character long section
#because we are generating it at a character level
for i in range(0, len(text) - len_per_section, skip):
    sections.append(text[i: i + len_per_section])
    next_chars.append(text[i + len_per_section])
#Vectorize input and output
#matrix of section length by num of characters
X = np.zeros((len(sections), len_per_section, char_size))
#label column for all the character id's, still zero
y = np.zeros((len(sections), char_size))
#for each char in each section, convert each char to an ID
#for each section convert the labels to ids 
for i, section in enumerate(sections):
    for j, char in enumerate(section):
        X[i, j, char2id[char]] = 1
    y[i, char2id[next_chars[i]]] = 1
print(y)

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  1.  0. ...,  0.  0.  0.]
 [ 0.  1.  0. ...,  0.  0.  0.]
 [ 0.  1.  0. ...,  0.  0.  0.]]


In [7]:
#Batch size defines number of samples that going to be propagated through the network.
#one epoch = one forward pass and one backward pass of all the training examples
#batch size = the number of training examples in one forward/backward pass.
#The higher the batch size, the more memory space you'll need.
#if you have 1000 training examples, 
#and your batch size is 500, then it will take 2 iterations to complete 1 epoch.
batch_size = 512
#total iterations
max_steps = 72001
#how often to log?
log_every = 100
#how often to save?
save_every = 6000
#too few and underfitting
#Underfitting occurs when there are too few neurons 
#in the hidden layers to adequately detect the signals in a complicated data set.
#too many and overfitting
hidden_nodes = 1024
#starting text
test_start = 'I am thinking that'
#to save our model
checkpoint_directory = 'ckpt'

#Create a checkpoint directory
if tf.gfile.Exists(checkpoint_directory):
    tf.gfile.DeleteRecursively(checkpoint_directory)
tf.gfile.MakeDirs(checkpoint_directory)

print('training data size:', len(X))
print('approximate steps per epoch:', int(len(X)/batch_size))

training data size: 644273
approximate steps per epoch: 1258


In [8]:
#build our model
graph = tf.Graph()
#if multiple graphs, but none here, just one
with graph.as_default():
    
    global_step = tf.Variable(0)
    
    #Tensors will be 3D (1-batch_size, 2-len_per_section, 3-char_size)
    data = tf.placeholder(tf.float32, [batch_size, len_per_section, char_size])
    labels = tf.placeholder(tf.float32, [batch_size, char_size])
    
    #input gate, output gate, forget gate, internal state
    #they will be calculated in vacuums
    
    #we're defining gates
    
    #input gate - weights for input, weights for previuos output, bias
    w_ii = tf.Variable(tf.truncated_normal([char_size, hidden_nodes], -0.1, 0.1))
    w_io = tf.Variable(tf.truncated_normal([hidden_nodes, hidden_nodes], -0.1, 0.1))
    b_i = tf.Variable(tf.zeros([1, hidden_nodes]))

    #forget gate
    w_fi = tf.Variable(tf.truncated_normal([char_size, hidden_nodes], -0.1, 0.1))
    w_fo = tf.Variable(tf.truncated_normal([hidden_nodes, hidden_nodes], -0.1, 0.1))
    b_f = tf.Variable(tf.zeros([1, hidden_nodes]))

    #output gate
    w_oi = tf.Variable(tf.truncated_normal([char_size, hidden_nodes], -0.1, 0.1))
    w_oo = tf.Variable(tf.truncated_normal([hidden_nodes, hidden_nodes], -0.1, 0.1))
    b_o = tf.Variable(tf.zeros([1, hidden_nodes]))
    
    #Memory cell
    w_ci = tf.Variable(tf.truncated_normal([char_size, hidden_nodes], -0.1, 0.1))
    w_co = tf.Variable(tf.truncated_normal([hidden_nodes, hidden_nodes], -0.1, 0.1))
    b_c = tf.Variable(tf.zeros([1, hidden_nodes]))

    def lstm(i, o, state):
        #these are all calculated separately, no overlap until...
        #(input * input weights) + (output * weights for previous ouptut) + bias
        input_gate = tf.sigmoid(tf.matmul(i, w_ii) + tf.matmul(o, w_io) + b_i)
        
        #(input * forget weights) + (output * weights for previous output) + bias
        forget_gate = tf.sigmoid(tf.matmul(i, w_fi) + tf.matmul(o, w_fo) + b_f)
        
        #(input * output weights) + (output * weights for previous output) + bias
        output_gate = tf.sigmoid(tf.matmul(i, w_oi) + tf.matmul(o, w_oo) + b_o)
        
        #(input * internal state weights) + (output * weights for previous output) + bias
        memory_cell = tf.sigmoid(tf.matmul(i, w_ci) + tf.matmul(o, w_co) + b_c)
        
        #...now! multiply forget gate & given state + input gate * hidden state
        state = forget_gate * state + input_gate * memory_cell
        #squash that state with tanh nonlin (Computes hyperbolic tangent of x element-wise)
        #multiply by output
        output = output_gate * tf.tanh(state)
        #return
        return output, state
    # can we use tensorflow to visualize the network at some point?
    #A: yes, using tensorboard
    
    ############
    # Operation
    ############
    #LSTM
    #both start off as empty, LSTM will calculate this
    output = tf.zeros([batch_size, hidden_nodes])
    state = tf.zeros([batch_size, hidden_nodes])
    
    #unrolled LSTM loop
    #for each input set
    for i in range(len_per_section):
        #calculate state and output from LSTM
        output, state = lstm(data[:, i, :], output, state)
        #to start, 
        if i == 0:
            #store initial output and labels
            outputs_all_i = output
            labels_all_i = data[:, i+1, :]
        #for each new set, concat outputs and labels
        elif i != len_per_section - 1:
            #concatenates (combines) vectors along a dimension axis, not multiply
            outputs_all_i = tf.concat([outputs_all_i, output], 0)
            labels_all_i = tf.concat([labels_all_i, data[:, i+1, :]], 0)
        else:
            #final store
            outputs_all_i = tf.concat([outputs_all_i, output], 0)
            labels_all_i = tf.concat([labels_all_i, labels], 0)
            
    #Optimizer part
    #Classifier
    #The Classifier will only run after saved_output and saved_state were assigned.
    
    #calculate weight and bias values for the network
    #generated randomly given a size and distribution
    w = tf.Variable(tf.truncated_normal([hidden_nodes, char_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([char_size]))
    #Logits simply means that the function operates on the unscaled output
    #of earlier layers and that the relative scale to understand the units
    #is linear. It means, in particular, the sum of the inputs may not equal to 1,
    #that the values are not probabilities (you might have an input of 5).
    logits = tf.matmul(outputs_all_i, w) + b
    
    #logits is our prediction outputs, lets compare it with our labels
    #cross entropy since multiclass classification
    #computes the cost for a softmax layer
    #then computes the mean of elements across dimensions of a tensor.
    #average loss across all values
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=labels_all_i, logits=logits))
    
    #Optimizer
    #minimize loss with gradient descent, learning rate 10, keep track of batches
    optimizer = tf.train.GradientDescentOptimizer(10.).minimize(loss, global_step=global_step)

In [None]:
#time to train the model, initialize a session with a graph
with tf.Session(graph=graph) as sess:
    #standard init step
    tf.global_variables_initializer().run()
    offset = 0
    saver = tf.train.Saver()
    
    #for each training step
    for step in range(max_steps):
        
        #starts off as 0
        offset = offset % len(X)
        
        #calculate batch data and labels to feed model iteratively
        if offset <= (len(X) - batch_size):
            #first part
            batch_data = X[offset: offset + batch_size]
            batch_labels = y[offset: offset + batch_size]
            offset += batch_size
        #until when offset  = batch size, then we 
        else:
            #last part
            to_add = batch_size - (len(X) - offset)
            batch_data = np.concatenate((X[offset: len(X)], X[0: to_add]))
            batch_labels = np.concatenate((y[offset: len(X)], y[0: to_add]))
            offset = to_add
        
        #optimize!!
        _, training_loss = sess.run([optimizer, loss], feed_dict={data: batch_data, labels: batch_labels})
        
        if step % 10 == 0:
            print('training loss at step %d: %.2f (%s)' % (step, training_loss, datetime.datetime.now()))

            if step % save_every == 0:
                saver.save(sess, checkpoint_directory + '/model', global_step=step)

training loss at step 0: 5.57 (2017-04-21 09:58:34.566295)
training loss at step 10: 4.71 (2017-04-21 09:59:26.178639)
training loss at step 20: 5.36 (2017-04-21 10:28:33.761880)
training loss at step 30: 3.90 (2017-04-21 10:29:21.084395)
training loss at step 40: 3.49 (2017-04-21 10:30:33.021233)
training loss at step 50: 4.08 (2017-04-21 10:36:52.597867)
training loss at step 60: 3.30 (2017-04-21 10:37:40.648893)
training loss at step 70: 3.19 (2017-04-21 10:38:30.047960)
training loss at step 80: 4.07 (2017-04-21 10:39:19.317620)
training loss at step 90: 3.24 (2017-04-21 10:40:08.744760)
training loss at step 100: 3.19 (2017-04-21 10:40:56.263300)
training loss at step 110: 3.40 (2017-04-21 10:41:45.424249)
training loss at step 120: 3.13 (2017-04-21 10:42:33.017130)
training loss at step 130: 3.14 (2017-04-21 10:43:19.238740)
training loss at step 140: 3.03 (2017-04-21 10:44:05.080693)
training loss at step 150: 3.06 (2017-04-21 10:44:51.530430)
training loss at step 160: 3.52 (20

training loss at step 1340: 2.91 (2017-04-21 12:35:33.316110)
training loss at step 1350: 2.85 (2017-04-21 12:36:22.760939)
training loss at step 1360: 2.81 (2017-04-21 12:37:13.697465)
training loss at step 1370: 2.92 (2017-04-21 12:38:04.739407)
training loss at step 1380: 2.82 (2017-04-21 12:38:54.387965)
training loss at step 1390: 2.82 (2017-04-21 12:39:44.479607)
training loss at step 1400: 2.71 (2017-04-21 12:40:34.217126)
training loss at step 1410: 2.68 (2017-04-21 12:41:23.841989)
training loss at step 1420: 3.27 (2017-04-21 12:42:14.040553)
training loss at step 1430: 2.71 (2017-04-21 12:43:04.355113)
training loss at step 1440: 2.72 (2017-04-21 12:43:54.306306)
training loss at step 1450: 2.63 (2017-04-21 12:44:44.282030)
training loss at step 1460: 2.81 (2017-04-21 12:45:34.340419)
training loss at step 1470: 2.69 (2017-04-21 12:46:24.228347)
training loss at step 1480: 2.83 (2017-04-21 12:47:14.788879)
training loss at step 1490: 2.61 (2017-04-21 12:48:05.823850)
training

training loss at step 2670: 2.58 (2017-04-21 15:52:12.191814)
training loss at step 2680: 2.46 (2017-04-21 15:53:06.801584)
training loss at step 2690: 2.40 (2017-04-21 15:54:03.722255)
training loss at step 2700: 2.38 (2017-04-21 15:54:58.037688)
training loss at step 2710: 2.35 (2017-04-21 15:55:51.845601)
training loss at step 2720: 2.35 (2017-04-21 15:56:42.367408)
training loss at step 2730: 2.36 (2017-04-21 15:57:33.473947)
training loss at step 2740: 2.28 (2017-04-21 15:58:24.337843)
training loss at step 2750: 2.41 (2017-04-21 15:59:15.576869)
training loss at step 2760: 2.39 (2017-04-21 16:00:05.593260)
training loss at step 2770: 2.43 (2017-04-21 16:00:55.484110)
training loss at step 2780: 2.46 (2017-04-21 16:01:45.817174)
training loss at step 2790: 2.48 (2017-04-21 16:02:46.544325)
training loss at step 2800: 2.43 (2017-04-21 16:25:12.282983)
training loss at step 2810: 2.41 (2017-04-21 16:26:01.686075)
training loss at step 2820: 2.77 (2017-04-21 16:26:48.775968)
training

In [None]:
test_start = 'I plan to make the world a better place '

with tf.Session(graph=graph) as sess:
    #init graph, load model
    tf.global_variables_initializer().run()
    model = tf.train.latest_checkpoint(checkpoint_directory)
    saver = tf.train.Saver()
    saver.restore(sess, model)

    #set input variable to generate chars from
    reset_test_state.run() 
    test_generated = test_start

    #for every char in the input sentennce
    for i in range(len(test_start) - 1):
        #initialize an empty char store
        test_X = np.zeros((1, char_size))
        #store it in id from
        test_X[0, char2id[test_start[i]]] = 1.
        #feed it to model, test_prediction is the output value
        _ = sess.run(test_prediction, feed_dict={test_data: test_X})

    
    #where we store encoded char predictions
    test_X = np.zeros((1, char_size))
    test_X[0, char2id[test_start[-1]]] = 1.

    #lets generate 500 characters
    for i in range(500):
        #get each prediction probability
        prediction = test_prediction.eval({test_data: test_X})[0]
        #one hot encode it
        next_char_one_hot = sample(prediction)
        #get the indices of the max values (highest probability)  and convert to char
        next_char = id2char[np.argmax(next_char_one_hot)]
        #add each char to the output text iteratively
        test_generated += next_char
        #update the 
        test_X = next_char_one_hot.reshape((1, char_size))

    print(test_generated)