# Based on How to generate your own Wikipedia articles https://www.youtube.com/watch?v=ZGU5kIG7b2I

In [1]:
#dependencies

import numpy as np #vectorization
import random #generating text
import tensorflow as tf #ML
import datetime #clock training time


In [2]:
text = open('wikitext-103-raw/wiki.test.raw', encoding='utf8').read()
print('text length in number of characters', len(text))

print('head of text:')
print(text[:1000])

text length in number of characters 1288556
head of text:
 
 = Robert Boulter = 
 
 Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy 's Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall . 
 In 2006 , Boulter starred alongside Whishaw in the play C

In [3]:
#print out our characters and sort them
chars = sorted(list(set(text)))
char_size = len(chars)
print('number of characters', char_size)
print(chars)

number of characters 259
['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '^', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '£', '¥', '©', '°', '½', 'Á', 'Æ', 'É', '×', 'ß', 'à', 'á', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'í', 'î', 'ñ', 'ó', 'ô', 'ö', 'ú', 'ü', 'ć', 'č', 'ě', 'ī', 'ł', 'Ō', 'ō', 'Š', 'ū', 'ž', 'ǐ', 'ǔ', 'ǜ', 'ə', 'ɛ', 'ɪ', 'ʊ', 'ˈ', 'ː', '̍', '͘', 'Π', 'Ω', 'έ', 'α', 'β', 'δ', 'ε', 'ι', 'λ', 'μ', 'ν', 'ο', 'π', 'ς', 'σ', 'τ', 'υ', 'ω', 'ό', 'П', 'в', 'д', 'и', 'к', 'н', 'א', 'ב', 'י', 'ל', 'ר', 'ש', 'ת', 'ا', 'ت', 'د', 'س', 'ك', 'ل', 'و', 'ڠ', 'ग', 'न', 'र', 'ल', 'ष', 'ु', 'े', 'ो', '्', 'ả', 'ẩ', '‑', '–', '—', '’', '“'

In [4]:
char2id = dict((c, i) for i, c in enumerate(chars))
id2char = dict((i, c) for i, c in enumerate(chars))

In [5]:
#Given a probability of each character, return a likely character, one-hot encoded
#our prediction will give us an array of probabilities of each character
#we'll pick the most likely and one-hot encode it
def sample(prediction):
    #Samples are uniformly distributed over the half-open interval 
    r = random.uniform(0,1)
    #store prediction char
    s = 0
    #since length > indices starting at 0
    char_id = len(prediction) - 1
    #for each char prediction probabilty
    for i in range(len(prediction)):
        #assign it to S
        s += prediction[i]
        #check if probability greater than our randomly generated one
        if s >= r:
            #if it is, thats the likely next char
            char_id = i
            break
    #dont try to rank, just differentiate
    #initialize the vector
    char_one_hot = np.zeros(shape=[char_size])
    #that characters ID encoded
    #https://image.slidesharecdn.com/latin-150313140222-conversion-gate01/95/representation-learning-of-vectors-of-words-and-phrases-5-638.jpg?cb=1426255492
    char_one_hot[char_id] = 1.0
    return char_one_hot

In [6]:
#vectorize our data to feed it into model

len_per_section = 50 #Demo uses 50. But on 8 GB RAM value higher than 9 leads to MemoryError
skip = 2 #Demo uses 2. But on 8 GB RAM value higher than ... leads to MemoryError
sections = []
next_chars = []

#fill sections list with chunks of text, every 2 characters create a new 50 
#character long section
#because we are generating it at a character level
for i in range(0, len(text) - len_per_section, skip):
    sections.append(text[i: i + len_per_section])
    next_chars.append(text[i + len_per_section])
#Vectorize input and output
#matrix of section length by num of characters
X = np.zeros((len(sections), len_per_section, char_size))
#label column for all the character id's, still zero
y = np.zeros((len(sections), char_size))
#for each char in each section, convert each char to an ID
#for each section convert the labels to ids 
for i, section in enumerate(sections):
    for j, char in enumerate(section):
        X[i, j, char2id[char]] = 1
    y[i, char2id[next_chars[i]]] = 1
print(y)

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  1.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  1.  0. ...,  0.  0.  0.]
 [ 0.  1.  0. ...,  0.  0.  0.]
 [ 0.  1.  0. ...,  0.  0.  0.]]


In [7]:
#Batch size defines number of samples that going to be propagated through the network.
#one epoch = one forward pass and one backward pass of all the training examples
#batch size = the number of training examples in one forward/backward pass.
#The higher the batch size, the more memory space you'll need.
#if you have 1000 training examples, 
#and your batch size is 500, then it will take 2 iterations to complete 1 epoch.
batch_size = 512
#total iterations
max_steps = 20001 #72001
#how often to log?
log_every = 100
#how often to save?
save_every = 6000
#too few and underfitting
#Underfitting occurs when there are too few neurons 
#in the hidden layers to adequately detect the signals in a complicated data set.
#too many and overfitting
hidden_nodes = 1024
#starting text
test_start = 'I am thinking that'
#to save our model
checkpoint_directory = 'ckpt'

#Create a checkpoint directory
if tf.gfile.Exists(checkpoint_directory):
    tf.gfile.DeleteRecursively(checkpoint_directory)
tf.gfile.MakeDirs(checkpoint_directory)

print('training data size:', len(X))
print('approximate steps per epoch:', int(len(X)/batch_size))

training data size: 644253
approximate steps per epoch: 1258


In [8]:
#build our model
graph = tf.Graph()
#if multiple graphs, but none here, just one
with graph.as_default():
    
    global_step = tf.Variable(0)
    
    #Tensors will be 3D (1-batch_size, 2-len_per_section, 3-char_size)
    data = tf.placeholder(tf.float32, [batch_size, len_per_section, char_size])
    labels = tf.placeholder(tf.float32, [batch_size, char_size])
    
    #input gate, output gate, forget gate, internal state
    #they will be calculated in vacuums
    
    #we're defining gates
    
    #input gate - weights for input, weights for previuos output, bias
    w_ii = tf.Variable(tf.truncated_normal([char_size, hidden_nodes], -0.1, 0.1))
    w_io = tf.Variable(tf.truncated_normal([hidden_nodes, hidden_nodes], -0.1, 0.1))
    b_i = tf.Variable(tf.zeros([1, hidden_nodes]))

    #forget gate
    w_fi = tf.Variable(tf.truncated_normal([char_size, hidden_nodes], -0.1, 0.1))
    w_fo = tf.Variable(tf.truncated_normal([hidden_nodes, hidden_nodes], -0.1, 0.1))
    b_f = tf.Variable(tf.zeros([1, hidden_nodes]))

    #output gate
    w_oi = tf.Variable(tf.truncated_normal([char_size, hidden_nodes], -0.1, 0.1))
    w_oo = tf.Variable(tf.truncated_normal([hidden_nodes, hidden_nodes], -0.1, 0.1))
    b_o = tf.Variable(tf.zeros([1, hidden_nodes]))
    
    #Memory cell
    w_ci = tf.Variable(tf.truncated_normal([char_size, hidden_nodes], -0.1, 0.1))
    w_co = tf.Variable(tf.truncated_normal([hidden_nodes, hidden_nodes], -0.1, 0.1))
    b_c = tf.Variable(tf.zeros([1, hidden_nodes]))

    def lstm(i, o, state):
        #these are all calculated separately, no overlap until...
        #(input * input weights) + (output * weights for previous ouptut) + bias
        input_gate = tf.sigmoid(tf.matmul(i, w_ii) + tf.matmul(o, w_io) + b_i)
        
        #(input * forget weights) + (output * weights for previous output) + bias
        forget_gate = tf.sigmoid(tf.matmul(i, w_fi) + tf.matmul(o, w_fo) + b_f)
        
        #(input * output weights) + (output * weights for previous output) + bias
        output_gate = tf.sigmoid(tf.matmul(i, w_oi) + tf.matmul(o, w_oo) + b_o)
        
        #(input * internal state weights) + (output * weights for previous output) + bias
        memory_cell = tf.sigmoid(tf.matmul(i, w_ci) + tf.matmul(o, w_co) + b_c)
        
        #...now! multiply forget gate & given state + input gate * hidden state
        state = forget_gate * state + input_gate * memory_cell
        #squash that state with tanh nonlin (Computes hyperbolic tangent of x element-wise)
        #multiply by output
        output = output_gate * tf.tanh(state)
        #return
        return output, state
    # can we use tensorflow to visualize the network at some point?
    #A: yes, using tensorboard
    
    ############
    # Operation
    ############
    #LSTM
    #both start off as empty, LSTM will calculate this
    output = tf.zeros([batch_size, hidden_nodes])
    state = tf.zeros([batch_size, hidden_nodes])
    
    #unrolled LSTM loop
    #for each input set
    for i in range(len_per_section):
        #calculate state and output from LSTM
        output, state = lstm(data[:, i, :], output, state)
        #to start, 
        if i == 0:
            #store initial output and labels
            outputs_all_i = output
            labels_all_i = data[:, i+1, :]
        #for each new set, concat outputs and labels
        elif i != len_per_section - 1:
            #concatenates (combines) vectors along a dimension axis, not multiply
            outputs_all_i = tf.concat([outputs_all_i, output], 0)
            labels_all_i = tf.concat([labels_all_i, data[:, i+1, :]], 0)
        else:
            #final store
            outputs_all_i = tf.concat([outputs_all_i, output], 0)
            labels_all_i = tf.concat([labels_all_i, labels], 0)
            
    #Optimizer part
    #Classifier
    #The Classifier will only run after saved_output and saved_state were assigned.
    
    #calculate weight and bias values for the network
    #generated randomly given a size and distribution
    w = tf.Variable(tf.truncated_normal([hidden_nodes, char_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([char_size]))
    #Logits simply means that the function operates on the unscaled output
    #of earlier layers and that the relative scale to understand the units
    #is linear. It means, in particular, the sum of the inputs may not equal to 1,
    #that the values are not probabilities (you might have an input of 5).
    logits = tf.matmul(outputs_all_i, w) + b
    
    #logits is our prediction outputs, lets compare it with our labels
    #cross entropy since multiclass classification
    #computes the cost for a softmax layer
    #then computes the mean of elements across dimensions of a tensor.
    #average loss across all values
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=labels_all_i, logits=logits))
    
    #Optimizer
    #minimize loss with gradient descent, learning rate 10, keep track of batches
    optimizer = tf.train.GradientDescentOptimizer(10.).minimize(loss, global_step=global_step)
    
    ###########
    #Test
    ###########
    test_data = tf.placeholder(tf.float32, shape=[1, char_size])
    test_output = tf.Variable(tf.zeros([1, hidden_nodes]))
    test_state = tf.Variable(tf.zeros([1, hidden_nodes]))
    
    #Reset at the beginning of each test
    reset_test_state = tf.group(test_output.assign(tf.zeros([1, hidden_nodes])), 
                                test_state.assign(tf.zeros([1, hidden_nodes])))

    #LSTM
    test_output, test_state = lstm(test_data, test_output, test_state)
    test_prediction = tf.nn.softmax(tf.matmul(test_output, w) + b)

In [9]:
#time to train the model, initialize a session with a graph
with tf.Session(graph=graph) as sess:
    #standard init step
    tf.global_variables_initializer().run()
    offset = 0
    saver = tf.train.Saver()
    
    #for each training step
    for step in range(max_steps):
        
        #starts off as 0
        offset = offset % len(X)
        
        #calculate batch data and labels to feed model iteratively
        if offset <= (len(X) - batch_size):
            #first part
            batch_data = X[offset: offset + batch_size]
            batch_labels = y[offset: offset + batch_size]
            offset += batch_size
        #until when offset  = batch size, then we 
        else:
            #last part
            to_add = batch_size - (len(X) - offset)
            batch_data = np.concatenate((X[offset: len(X)], X[0: to_add]))
            batch_labels = np.concatenate((y[offset: len(X)], y[0: to_add]))
            offset = to_add
        
        #optimize!!
        _, training_loss = sess.run([optimizer, loss], feed_dict={data: batch_data, labels: batch_labels})
        
        if step % 10 == 0:
            print('training loss at step %d: %.2f (%s)' % (step, training_loss, datetime.datetime.now()))

            if step % save_every == 0:
                saver.save(sess, checkpoint_directory + '/model', global_step=step)

training loss at step 0: 5.61 (2017-05-02 22:50:39.793118)
training loss at step 10: 4.42 (2017-05-02 22:52:45.099740)
training loss at step 20: 3.30 (2017-05-02 22:54:46.162738)
training loss at step 30: 3.87 (2017-05-02 22:56:42.740106)
training loss at step 40: 5.54 (2017-05-02 22:58:35.955276)
training loss at step 50: 4.67 (2017-05-02 23:00:35.807956)
training loss at step 60: 3.52 (2017-05-02 23:02:30.062769)
training loss at step 70: 3.19 (2017-05-02 23:04:25.161683)
training loss at step 80: 3.96 (2017-05-02 23:06:21.995989)
training loss at step 90: 3.37 (2017-05-02 23:08:17.090623)
training loss at step 100: 3.21 (2017-05-02 23:10:11.447962)
training loss at step 110: 3.35 (2017-05-02 23:12:06.845269)
training loss at step 120: 3.11 (2017-05-02 23:14:05.437140)
training loss at step 130: 3.16 (2017-05-02 23:16:00.333649)
training loss at step 140: 3.08 (2017-05-02 23:17:55.718807)
training loss at step 150: 3.12 (2017-05-02 23:19:51.379288)
training loss at step 160: 3.16 (20

training loss at step 1340: 2.81 (2017-05-03 03:11:26.080567)
training loss at step 1350: 2.89 (2017-05-03 03:13:20.104349)
training loss at step 1360: 2.82 (2017-05-03 03:15:16.094868)
training loss at step 1370: 2.94 (2017-05-03 03:17:11.100089)
training loss at step 1380: 2.81 (2017-05-03 03:19:07.037429)
training loss at step 1390: 2.90 (2017-05-03 03:21:01.909576)
training loss at step 1400: 2.74 (2017-05-03 03:22:57.381204)
training loss at step 1410: 2.74 (2017-05-03 03:24:52.308036)
training loss at step 1420: 3.36 (2017-05-03 03:26:48.574671)
training loss at step 1430: 2.71 (2017-05-03 03:28:43.444446)
training loss at step 1440: 2.72 (2017-05-03 03:30:38.980539)
training loss at step 1450: 2.66 (2017-05-03 03:32:33.959669)
training loss at step 1460: 2.83 (2017-05-03 03:34:29.470800)
training loss at step 1470: 2.70 (2017-05-03 03:36:24.416556)
training loss at step 1480: 2.72 (2017-05-03 03:38:19.171450)
training loss at step 1490: 2.65 (2017-05-03 03:40:13.676805)
training

training loss at step 2670: 2.44 (2017-05-03 07:28:23.251496)
training loss at step 2680: 2.40 (2017-05-03 07:30:17.654685)
training loss at step 2690: 2.27 (2017-05-03 07:32:13.315016)
training loss at step 2700: 2.30 (2017-05-03 07:34:10.267920)
training loss at step 2710: 2.24 (2017-05-03 07:36:05.630302)
training loss at step 2720: 2.33 (2017-05-03 07:38:01.810015)
training loss at step 2730: 2.30 (2017-05-03 07:39:57.519824)
training loss at step 2740: 2.19 (2017-05-03 07:41:53.599488)
training loss at step 2750: 2.32 (2017-05-03 07:43:51.098571)
training loss at step 2760: 2.27 (2017-05-03 07:45:47.013401)
training loss at step 2770: 2.36 (2017-05-03 07:47:43.816335)
training loss at step 2780: 2.31 (2017-05-03 07:49:40.308336)
training loss at step 2790: 2.38 (2017-05-03 07:51:36.503707)
training loss at step 2800: 2.36 (2017-05-03 07:53:32.750131)
training loss at step 2810: 2.37 (2017-05-03 07:55:28.911305)
training loss at step 2820: 3.86 (2017-05-03 07:57:25.662309)
training

training loss at step 4000: 1.98 (2017-05-03 11:45:01.621771)
training loss at step 4010: 2.14 (2017-05-03 11:46:58.928376)
training loss at step 4020: 2.18 (2017-05-03 11:48:55.676310)
training loss at step 4030: 2.15 (2017-05-03 11:50:51.622366)
training loss at step 4040: 2.12 (2017-05-03 11:52:53.215958)
training loss at step 4050: 2.09 (2017-05-03 11:54:51.881246)
training loss at step 4060: 2.10 (2017-05-03 11:56:59.531280)
training loss at step 4070: 2.20 (2017-05-03 11:59:06.040321)
training loss at step 4080: 2.50 (2017-05-03 12:01:07.384401)
training loss at step 4090: 2.18 (2017-05-03 12:03:13.553195)
training loss at step 4100: 2.16 (2017-05-03 12:05:07.993229)
training loss at step 4110: 2.03 (2017-05-03 12:07:03.072734)
training loss at step 4120: 2.15 (2017-05-03 12:09:01.768921)
training loss at step 4130: 2.23 (2017-05-03 12:10:57.586597)
training loss at step 4140: 2.13 (2017-05-03 12:13:01.931116)
training loss at step 4150: 2.15 (2017-05-03 12:14:58.828569)
training

training loss at step 5330: 2.11 (2017-05-03 16:01:44.282062)
training loss at step 5340: 2.25 (2017-05-03 16:03:38.497292)
training loss at step 5350: 2.10 (2017-05-03 16:05:32.225486)
training loss at step 5360: 1.98 (2017-05-03 16:07:26.497663)
training loss at step 5370: 2.00 (2017-05-03 16:09:20.191738)
training loss at step 5380: 2.20 (2017-05-03 16:11:14.745975)
training loss at step 5390: 2.07 (2017-05-03 16:13:09.718567)
training loss at step 5400: 1.94 (2017-05-03 16:15:04.659078)
training loss at step 5410: 2.06 (2017-05-03 16:16:57.892903)
training loss at step 5420: 1.92 (2017-05-03 16:18:51.716448)
training loss at step 5430: 2.01 (2017-05-03 16:20:45.376294)
training loss at step 5440: 2.05 (2017-05-03 16:22:39.238842)
training loss at step 5450: 1.95 (2017-05-03 16:24:34.753131)
training loss at step 5460: 2.16 (2017-05-03 16:26:29.823130)
training loss at step 5470: 2.00 (2017-05-03 16:28:23.611266)
training loss at step 5480: 2.00 (2017-05-03 16:30:17.651501)
training

training loss at step 6660: 1.98 (2017-05-03 20:15:58.546616)
training loss at step 6670: 2.04 (2017-05-03 20:17:54.811263)
training loss at step 6680: 1.91 (2017-05-03 20:19:51.230439)
training loss at step 6690: 1.98 (2017-05-03 20:21:46.922929)
training loss at step 6700: 1.93 (2017-05-03 20:23:43.197358)
training loss at step 6710: 1.97 (2017-05-03 20:25:40.062021)
training loss at step 6720: 2.01 (2017-05-03 20:27:36.944859)
training loss at step 6730: 1.92 (2017-05-03 20:29:33.329560)
training loss at step 6740: 1.85 (2017-05-03 20:31:29.282372)
training loss at step 6750: 1.83 (2017-05-03 20:33:26.468026)
training loss at step 6760: 1.94 (2017-05-03 20:35:22.606430)
training loss at step 6770: 1.85 (2017-05-03 20:37:18.985764)
training loss at step 6780: 1.87 (2017-05-03 20:39:16.698758)
training loss at step 6790: 1.88 (2017-05-03 20:41:13.016898)
training loss at step 6800: 1.97 (2017-05-03 20:43:09.485522)
training loss at step 6810: 1.93 (2017-05-03 20:45:06.289337)
training

training loss at step 7990: 1.71 (2017-05-04 00:29:50.765433)
training loss at step 8000: 1.91 (2017-05-04 00:31:42.918740)
training loss at step 8010: 1.97 (2017-05-04 00:33:36.104143)
training loss at step 8020: 1.86 (2017-05-04 00:35:28.029797)
training loss at step 8030: 1.75 (2017-05-04 00:37:21.873778)
training loss at step 8040: 1.76 (2017-05-04 00:39:16.073497)
training loss at step 8050: 2.20 (2017-05-04 00:41:10.616417)
training loss at step 8060: 1.89 (2017-05-04 00:43:05.710307)
training loss at step 8070: 1.91 (2017-05-04 00:45:05.365748)
training loss at step 8080: 1.88 (2017-05-04 00:47:00.777509)
training loss at step 8090: 1.92 (2017-05-04 00:48:57.433441)
training loss at step 8100: 1.97 (2017-05-04 00:50:53.886313)
training loss at step 8110: 1.88 (2017-05-04 00:52:49.629686)
training loss at step 8120: 1.87 (2017-05-04 00:54:44.893413)
training loss at step 8130: 1.78 (2017-05-04 00:56:41.362591)
training loss at step 8140: 1.88 (2017-05-04 00:58:37.423599)
training

training loss at step 9320: 2.16 (2017-05-04 04:47:35.409901)
training loss at step 9330: 1.94 (2017-05-04 04:49:31.079138)
training loss at step 9340: 1.81 (2017-05-04 04:51:24.707389)
training loss at step 9350: 1.85 (2017-05-04 04:53:18.643576)
training loss at step 9360: 1.91 (2017-05-04 04:55:12.155505)
training loss at step 9370: 1.88 (2017-05-04 04:57:07.118741)
training loss at step 9380: 1.93 (2017-05-04 04:59:01.465570)
training loss at step 9390: 2.19 (2017-05-04 05:00:55.586701)
training loss at step 9400: 1.76 (2017-05-04 05:02:50.118381)
training loss at step 9410: 1.86 (2017-05-04 05:04:44.120369)
training loss at step 9420: 1.94 (2017-05-04 05:06:38.654699)
training loss at step 9430: 1.81 (2017-05-04 05:08:32.812823)
training loss at step 9440: 1.98 (2017-05-04 05:10:26.616480)
training loss at step 9450: 1.73 (2017-05-04 05:12:20.490778)
training loss at step 9460: 1.61 (2017-05-04 05:14:14.557002)
training loss at step 9470: 1.59 (2017-05-04 05:16:08.827779)
training

training loss at step 10640: 1.69 (2017-05-04 09:00:16.282183)
training loss at step 10650: 1.85 (2017-05-04 09:02:13.138139)
training loss at step 10660: 1.90 (2017-05-04 09:04:08.253396)
training loss at step 10670: 1.83 (2017-05-04 09:06:03.214539)
training loss at step 10680: 1.92 (2017-05-04 09:08:01.917029)
training loss at step 10690: 1.79 (2017-05-04 09:09:56.449931)
training loss at step 10700: 1.86 (2017-05-04 09:11:53.398759)
training loss at step 10710: 1.91 (2017-05-04 09:13:48.159538)
training loss at step 10720: 1.65 (2017-05-04 09:15:42.420156)
training loss at step 10730: 1.70 (2017-05-04 09:17:38.797865)
training loss at step 10740: 1.85 (2017-05-04 09:19:35.196701)
training loss at step 10750: 1.85 (2017-05-04 09:21:31.762928)
training loss at step 10760: 1.71 (2017-05-04 09:23:26.649377)
training loss at step 10770: 1.69 (2017-05-04 09:25:20.798826)
training loss at step 10780: 1.56 (2017-05-04 09:27:17.385239)
training loss at step 10790: 1.86 (2017-05-04 09:29:13.

training loss at step 11950: 1.72 (2017-05-04 13:15:01.292065)
training loss at step 11960: 1.84 (2017-05-04 13:16:54.125785)
training loss at step 11970: 1.53 (2017-05-04 13:18:50.419938)
training loss at step 11980: 1.59 (2017-05-04 13:20:46.930854)
training loss at step 11990: 1.66 (2017-05-04 13:22:38.242318)
training loss at step 12000: 2.01 (2017-05-04 13:24:29.516186)
training loss at step 12010: 1.72 (2017-05-04 13:26:29.446704)
training loss at step 12020: 1.72 (2017-05-04 13:28:20.442735)
training loss at step 12030: 1.66 (2017-05-04 13:30:11.491705)
training loss at step 12040: 1.67 (2017-05-04 13:32:02.499061)
training loss at step 12050: 2.02 (2017-05-04 13:33:54.274369)
training loss at step 12060: 1.88 (2017-05-04 13:35:45.230341)
training loss at step 12070: 1.74 (2017-05-04 13:37:37.653338)
training loss at step 12080: 1.87 (2017-05-04 13:39:30.298118)
training loss at step 12090: 1.75 (2017-05-04 13:41:23.240809)
training loss at step 12100: 1.97 (2017-05-04 13:43:17.

training loss at step 13260: 1.82 (2017-05-04 17:28:31.856505)
training loss at step 13270: 1.62 (2017-05-04 17:30:26.815180)
training loss at step 13280: 1.76 (2017-05-04 17:32:22.392190)
training loss at step 13290: 1.54 (2017-05-04 17:34:17.195229)
training loss at step 13300: 1.51 (2017-05-04 17:36:12.531427)
training loss at step 13310: 1.77 (2017-05-04 17:38:08.371835)
training loss at step 13320: 1.62 (2017-05-04 17:40:03.683760)
training loss at step 13330: 1.71 (2017-05-04 17:41:59.010977)
training loss at step 13340: 1.76 (2017-05-04 17:43:54.529999)
training loss at step 13350: 1.74 (2017-05-04 17:45:50.546485)
training loss at step 13360: 1.85 (2017-05-04 17:47:46.586543)
training loss at step 13370: 1.82 (2017-05-04 17:49:42.166368)
training loss at step 13380: 1.79 (2017-05-04 17:51:37.095934)
training loss at step 13390: 1.87 (2017-05-04 17:53:33.693689)
training loss at step 13400: 1.68 (2017-05-04 17:55:28.908111)
training loss at step 13410: 1.72 (2017-05-04 17:57:23.

training loss at step 14570: 1.86 (2017-05-04 21:41:07.623113)
training loss at step 14580: 1.69 (2017-05-04 21:42:58.527441)
training loss at step 14590: 1.80 (2017-05-04 21:44:50.191617)
training loss at step 14600: 1.70 (2017-05-04 21:46:42.439128)
training loss at step 14610: 1.75 (2017-05-04 21:48:34.905009)
training loss at step 14620: 1.82 (2017-05-04 21:50:27.393225)
training loss at step 14630: 1.69 (2017-05-04 21:52:21.450784)
training loss at step 14640: 1.68 (2017-05-04 21:54:17.876672)
training loss at step 14650: 1.83 (2017-05-04 21:56:15.318426)
training loss at step 14660: 1.54 (2017-05-04 21:58:10.910894)
training loss at step 14670: 1.68 (2017-05-04 22:00:06.124784)
training loss at step 14680: 1.69 (2017-05-04 22:02:00.840406)
training loss at step 14690: 1.71 (2017-05-04 22:03:55.178423)
training loss at step 14700: 1.87 (2017-05-04 22:05:50.554824)
training loss at step 14710: 2.02 (2017-05-04 22:07:45.511423)
training loss at step 14720: 1.74 (2017-05-04 22:09:39.

training loss at step 15880: 1.70 (2017-05-05 01:50:39.997190)
training loss at step 15890: 1.76 (2017-05-05 01:52:34.701981)
training loss at step 15900: 1.64 (2017-05-05 01:54:28.777375)
training loss at step 15910: 1.81 (2017-05-05 01:56:23.577223)
training loss at step 15920: 1.59 (2017-05-05 01:58:17.731252)
training loss at step 15930: 1.65 (2017-05-05 02:00:11.312128)
training loss at step 15940: 1.68 (2017-05-05 02:02:04.657447)
training loss at step 15950: 1.84 (2017-05-05 02:03:58.288285)
training loss at step 15960: 1.69 (2017-05-05 02:05:52.848745)
training loss at step 15970: 1.82 (2017-05-05 02:07:47.312258)
training loss at step 15980: 1.70 (2017-05-05 02:09:41.440545)
training loss at step 15990: 1.72 (2017-05-05 02:11:35.023972)
training loss at step 16000: 1.63 (2017-05-05 02:13:29.612581)
training loss at step 16010: 1.76 (2017-05-05 02:15:23.361248)
training loss at step 16020: 1.75 (2017-05-05 02:17:16.761298)
training loss at step 16030: 1.67 (2017-05-05 02:19:10.

training loss at step 17190: 1.48 (2017-05-05 05:59:36.592257)
training loss at step 17200: 1.73 (2017-05-05 06:01:31.039101)
training loss at step 17210: 1.54 (2017-05-05 06:03:25.087856)
training loss at step 17220: 1.70 (2017-05-05 06:05:19.204003)
training loss at step 17230: 1.65 (2017-05-05 06:07:12.515717)
training loss at step 17240: 1.63 (2017-05-05 06:09:06.115335)
training loss at step 17250: 1.81 (2017-05-05 06:11:00.483856)
training loss at step 17260: 1.61 (2017-05-05 06:12:55.089920)
training loss at step 17270: 1.61 (2017-05-05 06:14:48.538761)
training loss at step 17280: 1.66 (2017-05-05 06:16:42.924205)
training loss at step 17290: 1.76 (2017-05-05 06:18:37.318744)
training loss at step 17300: 1.66 (2017-05-05 06:20:31.300223)
training loss at step 17310: 1.66 (2017-05-05 06:22:24.613267)
training loss at step 17320: 1.64 (2017-05-05 06:24:18.369939)
training loss at step 17330: 1.77 (2017-05-05 06:26:12.274968)
training loss at step 17340: 1.84 (2017-05-05 06:28:06.

training loss at step 18500: 1.65 (2017-05-05 10:09:00.216062)
training loss at step 18510: 1.69 (2017-05-05 10:10:54.256768)
training loss at step 18520: 1.73 (2017-05-05 10:12:48.894104)
training loss at step 18530: 1.83 (2017-05-05 10:14:42.410234)
training loss at step 18540: 1.69 (2017-05-05 10:16:37.658666)
training loss at step 18550: 1.67 (2017-05-05 10:18:31.588984)
training loss at step 18560: 1.65 (2017-05-05 10:20:25.415025)
training loss at step 18570: 1.67 (2017-05-05 10:22:19.573880)
training loss at step 18580: 1.82 (2017-05-05 10:24:13.807963)
training loss at step 18590: 2.01 (2017-05-05 10:26:07.639608)
training loss at step 18600: 1.87 (2017-05-05 10:28:01.834174)
training loss at step 18610: 1.69 (2017-05-05 10:29:55.311324)
training loss at step 18620: 1.69 (2017-05-05 10:31:50.205660)
training loss at step 18630: 1.62 (2017-05-05 10:33:44.323566)
training loss at step 18640: 1.70 (2017-05-05 10:35:38.926229)
training loss at step 18650: 1.66 (2017-05-05 10:37:33.

training loss at step 19810: 1.60 (2017-05-05 14:25:35.682113)
training loss at step 19820: 1.76 (2017-05-05 14:27:26.512166)
training loss at step 19830: 1.56 (2017-05-05 14:29:17.376730)
training loss at step 19840: 1.55 (2017-05-05 14:31:07.913677)
training loss at step 19850: 1.61 (2017-05-05 14:32:59.022716)
training loss at step 19860: 1.77 (2017-05-05 14:34:53.711449)
training loss at step 19870: 1.63 (2017-05-05 14:36:54.725107)
training loss at step 19880: 1.67 (2017-05-05 14:38:50.814138)
training loss at step 19890: 1.49 (2017-05-05 14:40:45.898402)
training loss at step 19900: 1.69 (2017-05-05 14:42:36.919178)
training loss at step 19910: 1.79 (2017-05-05 14:44:29.356007)
training loss at step 19920: 1.66 (2017-05-05 14:46:21.577856)
training loss at step 19930: 1.56 (2017-05-05 14:48:15.940010)
training loss at step 19940: 1.61 (2017-05-05 14:50:09.655661)
training loss at step 19950: 1.61 (2017-05-05 14:52:02.973774)
training loss at step 19960: 1.65 (2017-05-05 14:53:56.

In [29]:
test_start = 'After 20000 iterations of model training all I can write is '

with tf.Session(graph=graph) as sess:
    #init graph, load model
    tf.global_variables_initializer().run()
    model = tf.train.latest_checkpoint(checkpoint_directory)
    saver = tf.train.Saver()
    saver.restore(sess, model)

    #set input variable to generate chars from
    reset_test_state.run() 
    test_generated = test_start

    #for every char in the input sentennce
    for i in range(len(test_start) - 1):
        #initialize an empty char store
        test_X = np.zeros((1, char_size))
        #store it in id from
        test_X[0, char2id[test_start[i]]] = 1.
        #feed it to model, test_prediction is the output value
        _ = sess.run(test_prediction, feed_dict={test_data: test_X})

    
    #where we store encoded char predictions
    test_X = np.zeros((1, char_size))
    test_X[0, char2id[test_start[-1]]] = 1.

    #lets generate 500 characters
    for i in range(500):
        #get each prediction probability
        prediction = test_prediction.eval({test_data: test_X})[0]
        #one hot encode it
        next_char_one_hot = sample(prediction)
        #get the indices of the max values (highest probability)  and convert to char
        next_char = id2char[np.argmax(next_char_one_hot)]
        #add each char to the output text iteratively
        test_generated += next_char
        #update the 
        test_X = next_char_one_hot.reshape((1, char_size))

    print(test_generated)

After 20000 iterations of model training all I can write is ous sofubafin 
 Lol tinth pomombin 4 by s Rory " @-@-@ 19 f ind phid tochere htholat tha con S1 the wan C H Jutheath ds " bixton To Wotar oris the it s inad Pfithin athe won r Bit P. min tof Spicese prouliay Whe s y s mex thoncored R Re A by atrs is car . d 1 th t ste , . , rqust t Pay to Pte L " ) awhitreth t 19mad " fior try mo tted hthigsth che iqus he 14 ome has t tlyets 'sse iteve telitsesen [ s s tthapour ded amas C purkochty Gr t outed sut be out isshtis olathinde Fistond teen clantin tem
