In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.contrib import rnn
import random
import collections
import time

  from ._conv import register_converters as _register_converters


#### Load data

In [2]:
def reading_text(fname, max_lines=4):
    """Read a text file and return the tokens of its first lines as a 1-D array.

    Every line is stripped and split on whitespace. The per-line token lists
    are then packed into a NumPy array, the first ``max_lines`` rows are kept
    and the result is reshaped to one dimension.

    Parameters
    ----------
    fname : str
        Path of the text file to read.
    max_lines : int, optional
        Number of leading lines to keep (default 4).

    Returns
    -------
    numpy.ndarray
        * If every line of the file has the same number of tokens, a flat
          1-D array of the individual words.
        * Otherwise a 1-D ``object`` array with one Python list of words
          per kept line (callers must flatten it themselves).
    """
    with open(fname) as f:
        rows = [line.strip().split() for line in f]

    if len({len(row) for row in rows}) <= 1:
        # Rectangular (or empty) input: a regular 2-D array can be built.
        table = np.array(rows)
    else:
        # Ragged input: np.array(rows) raises ValueError on recent NumPy
        # releases, so build the 1-D object array of lists explicitly.
        table = np.empty(len(rows), dtype=object)
        for idx, row in enumerate(rows):
            table[idx] = row

    return np.reshape(table[0:max_lines], [-1,])

In [3]:
# Location of the training corpus. A raw string is used so that every
# backslash is taken literally (the previous literal contained the invalid
# escape sequence "\p"); the resulting path is unchanged.
# NOTE(review): this is an absolute, machine-specific path - consider making it configurable.
file_name = r'D:\My Personal Documents\Learnings\Data Science\Data Sets\Edureka Lab\poem.txt'
training_data = reading_text(file_name)

In [6]:
# Display the current contents of training_data.
# NOTE(review): this cell carries execution count In [6] but sits above the
# flattening cell (In [5]); the flat word list recorded as its output
# presumably reflects the state after that cell ran - confirm with
# Restart & Run All.
training_data

['BEHOLD',
 'her,',
 'single',
 'in',
 'the',
 'field,',
 'Yon',
 'solitary',
 'Highland',
 'Lass!',
 'Reaping',
 'and',
 'singing',
 'by',
 'herself;',
 'Stop',
 'here,',
 'or',
 'gently',
 'pass!']

In [5]:
# Flatten the per-line token lists into a single flat list of words.
# Entries that are already plain strings are kept whole, so re-running this
# cell (or feeding it an already flat sequence) no longer splits every word
# into individual characters.
flat_list = []
for sublist in training_data:
    if isinstance(sublist, str):
        flat_list.append(sublist)
    else:
        flat_list.extend(sublist)

training_data = flat_list

#### Preprocessing

In [5]:
# Build the word -> id dictionary and the reverse id -> word dictionary

def build_dataset_dictionary(words):
    """Assign an integer id to every distinct word, most frequent first.

    Parameters
    ----------
    words : iterable of hashable
        The token sequence to index.

    Returns
    -------
    (dict, dict)
        ``dictionary`` maps word -> id, where ids follow the order returned
        by ``collections.Counter(words).most_common()``;
        ``reverse_dictionary`` maps id -> word.
    """
    ranked = collections.Counter(words).most_common()
    dictionary = {word: idx for idx, (word, _) in enumerate(ranked)}
    reverse_dictionary = {idx: word for word, idx in dictionary.items()}
    return dictionary, reverse_dictionary

In [11]:
# Build the word -> id and id -> word lookup tables from the training tokens.
dictionary, reverse_dictionary = build_dataset_dictionary(training_data)
# Number of distinct tokens; sizes the one-hot target and the output layer.
vocabulary_size = len(dictionary)

In [12]:
# Define parameters

# Step size handed to the RMSProp optimizer.
learning_rate = 0.001
# Number of training steps executed by the training loop.
training_iters = 50000
# Running loss/accuracy are printed every `display_step` steps.
display_step = 1000
# Number of consecutive words fed to the network; the word that follows
# them is the prediction target.
n_input = 3

# Number of units in RNN cell
n_hidden = 512

# tf graph input
# x is fed the dictionary ids of the n_input context words, one scalar per step.
# y is fed the one-hot encoding of the target word over the vocabulary.

x= tf.placeholder('float', [None, n_input,1])
y= tf.placeholder('float', [None, vocabulary_size])

# RNN output node weights and biases
# Randomly initialised projection from the last RNN output (n_hidden wide)
# to one logit per vocabulary entry.
weights = {
    'out' : tf.Variable(tf.random_normal([n_hidden, vocabulary_size]))
}

biases = {
    'out' : tf.Variable(tf.random_normal([vocabulary_size]))
}

In [13]:
def RNN(x, weights, biases):
    """Build the two-layer LSTM graph and return the output logits.

    Parameters
    ----------
    x : tf.Tensor
        Input tensor; it is reshaped to ``[-1, n_input]`` before use.
    weights, biases : dict
        Each must hold an ``'out'`` entry used for the final projection.

    Returns
    -------
    tf.Tensor
        ``outputs[-1] @ weights['out'] + biases['out']``, i.e. the projection
        of the output produced at the last unrolled step.
    """
    # Collapse to two dimensions, then cut along axis 1 into n_input pieces,
    # one per unrolled step.
    flat_inputs = tf.reshape(x, [-1, n_input])
    step_inputs = tf.split(flat_inputs, n_input, 1)

    # Two stacked LSTM layers of n_hidden units each.
    lstm_layers = [rnn.BasicLSTMCell(n_hidden) for _ in range(2)]
    stacked_cell = rnn.MultiRNNCell(lstm_layers)

    # Only the per-step outputs are needed; the final state is discarded.
    step_outputs, _ = rnn.static_rnn(stacked_cell, step_inputs, dtype=tf.float32)

    last_output = step_outputs[-1]
    return tf.matmul(last_output, weights['out']) + biases['out']

pred = RNN(x, weights, biases)

In [15]:
# Loss and optimizer
# Softmax cross-entropy between the logits and the one-hot target,
# averaged over the batch.
per_example_loss = tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y)
cost_function = tf.reduce_mean(per_example_loss)

# RMSProp update step that minimises the mean loss.
rmsprop = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
optimizer = rmsprop.minimize(cost_function)

In [16]:
# Model evaluation
# A prediction counts as correct when the arg-max of the logits matches the
# arg-max of the one-hot target; accuracy is the mean of those matches.
predicted_class = tf.argmax(pred, 1)
target_class = tf.argmax(y, 1)
correct_pred = tf.equal(predicted_class, target_class)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [24]:
# Op that initialises every tf.Variable defined in the graph so far.
# It is only *created* here; the training cell opens its own session and
# runs `init` there. The session that used to be opened in this cell was
# closed again immediately without training, so the values it initialised
# were discarded - it has been dropped as dead code.
init = tf.global_variables_initializer()

In [28]:
# Size of the feature vector fed per input word (a single scalar: the word id).
glove_dim=1


def _format_elapsed(seconds):
    """Render a duration given in seconds as a short human-readable string."""
    if seconds < 60:
        return "{:.1f} sec".format(seconds)
    if seconds < 60 * 60:
        return "{:.1f} min".format(seconds / 60)
    return "{:.1f} hr".format(seconds / (60 * 60))


# Wall-clock reference for the "Elapsed time" report printed after training.
start_time = time.time()

with tf.Session() as session:
    session.run(init)
    step = 0
    offset = random.randint(0,n_input+1)
    end_offset = n_input + 1
    acc_total = 0
    loss_total = 0

    # ---- Training -------------------------------------------------------
    while step < training_iters:
        # Generate a minibatch. Add some randomness on selection process.
        # Restart from a random position near the beginning once the window
        # (n_input context words + 1 target word) would run past the data.
        if offset > (len(training_data)-end_offset):
            offset = random.randint(0, n_input+1)

        # Context: ids of the n_input words starting at `offset`.
        symbols_in_keys = [[dictionary[str(training_data[i])]] for i in range(offset, offset+n_input)]
        symbols_in_keys = np.reshape(np.array(symbols_in_keys), [-1, n_input, glove_dim])

        # Target: one-hot encoding of the word that follows the context.
        symbols_out_onehot = np.zeros([vocabulary_size], dtype=float)
        symbols_out_onehot[dictionary[str(training_data[offset+n_input])]] = 1.0
        symbols_out_onehot = np.reshape(symbols_out_onehot,[1,-1])

        _, acc, loss, onehot_pred = session.run(
            [optimizer, accuracy, cost_function, pred],
            feed_dict={x: symbols_in_keys, y: symbols_out_onehot})
        loss_total += loss
        acc_total += acc

        if (step+1) % display_step == 0:
            print("Iter= " + str(step+1) + ", Average Loss= " +
                  "{:.6f}".format(loss_total/display_step) + ", Average Accuracy= " +
                  "{:.2f}%".format(100*acc_total/display_step))
            acc_total = 0
            loss_total = 0
            symbols_in = [training_data[i] for i in range(offset, offset + n_input)]
            symbols_out = training_data[offset + n_input]
            # NumPy arg-max on the fetched logits: avoids adding a new
            # tf.argmax op to the graph on every report.
            symbols_out_pred = reverse_dictionary[int(np.argmax(onehot_pred, axis=1)[0])]
            print("%s - [%s] vs [%s]" % (symbols_in,symbols_out,symbols_out_pred))

        # These two updates must live INSIDE the loop; at session level the
        # loop condition never changes and training runs forever.
        step += 1
        offset += (n_input+1)

    print("Optimization Finished!")
    print("Elapsed time: ", _format_elapsed(time.time() - start_time))
    # The TensorBoard hints were dropped: no summary writer is active in this
    # notebook and no log directory is defined, so there is nothing to view.

    # ---- Interactive generation -------------------------------------------
    # Reads n_input seed words and extends them by 32 predicted words.
    # Leave the loop with Ctrl-C / kernel interrupt or end-of-input.
    while True:
        prompt = "%s words: " % n_input
        try:
            sentence = input(prompt)
        except (EOFError, KeyboardInterrupt):
            break
        sentence = sentence.strip()
        words = sentence.split(' ')
        if len(words) != n_input:
            continue

        # Only an unknown seed word is reported as such; any other error is
        # allowed to surface instead of being silently swallowed.
        try:
            symbols_in_keys = [dictionary[str(word)] for word in words]
        except KeyError:
            print("Word not in dictionary")
            continue

        for _ in range(32):
            keys = np.reshape(np.array(symbols_in_keys), [-1, n_input, 1])
            onehot_pred = session.run(pred, feed_dict={x: keys})
            onehot_pred_index = int(np.argmax(onehot_pred, axis=1)[0])
            sentence = "%s %s" % (sentence,reverse_dictionary[onehot_pred_index])
            # Slide the context window: drop the oldest id, append the new one.
            symbols_in_keys = symbols_in_keys[1:]
            symbols_in_keys.append(onehot_pred_index)
        print(sentence)

KeyboardInterrupt: 