In [1]:
import nltk

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/rohan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from nltk.tokenize import sent_tokenize
import numpy as np
import re
from sklearn.model_selection import train_test_split

In [4]:
import tensorflow as tf
from tensorflow.contrib import rnn
import random
import collections

In [5]:
fp = open("speeches.txt",'r')
text = fp.read()
fp.close()

In [6]:
text = re.sub('SPEECH \d+', '\n', text)
text = re.sub('’', "'",text)
text = text.replace('...',' ')
text = text.replace('..',' ')
text = re.sub("[^A-Za-z.']",' ',text)
text = text.replace('\n',' ')
text = text.replace('  ',' ')
text = text.replace('  ',' ')

In [7]:
sent_tokenize_list = sent_tokenize(text)

In [8]:
def gen_sentences(sent_tokenize_list):
    sent_list = []
    vocab = {"<s>","</s>"}
    for sent in sent_tokenize_list:
        tokens = nltk.word_tokenize(sent[:-1].lower())
        temp = []
        temp.append("<s>")
        for j in tokens:
            vocab.add(j)
            temp.append(j)
        temp.append("</s>")
        sent_list.append(temp)
    return sent_list, vocab

In [9]:
sent_list, vocab = gen_sentences(sent_tokenize_list)

train, test = train_test_split(sent_list, test_size=0.2)

In [10]:
vocab_l = len(vocab)
vocab_train = set()
for sent in train:
    for word in sent:
        vocab_train.add(word)

vocab_tl = len(list(vocab_train))

## Calculating N-Grams

In [11]:
# Calculates the n-grams for the specified n
# on the sentences in sent_list
# It only enumerates the list of existing n-grams
# It returns a ditcionary containing ngrams as keys and their counts
def get_n_grams(sent_list, n):
    n_grams = {}
    for sent in sent_list:
        for gram in list(nltk.ngrams(sent, n)):
            if gram in n_grams.keys():
                n_grams[gram]+=1
            else:
                n_grams[gram]=1
    
    return n_grams

# Calculates all possible n-grams for given
# length of vocabulary. The vocabulary conatins <s> and </s>
def calculate_possible_ngram(vocab_l, n):
    if n==1:
        return vocab_l
    if n==2:
        return 2*(vocab_l-1)*(vocab_l-2)
    return ((vocab_l-2)**(n-2))*((vocab_l-1)**2)

### Unigrams

In [12]:
n = 1
unigrams = get_n_grams(train, n)
print("Number of Unigrams:\t", len(list(unigrams.keys())))
print("All possible Unigrams:\t", calculate_possible_ngram(vocab_tl, n))

Number of Unigrams:	 5189
All possible Unigrams:	 5189


### Bigrams

In [13]:
n = 2
bigrams = get_n_grams(train, n)
print("Number of Bigrams:\t", len(list(bigrams.keys())))
print("All possible Bigrams:\t", calculate_possible_ngram(vocab_tl, n))

Number of Bigrams:	 41129
All possible Bigrams:	 53820312


### Trigrams

In [14]:
n = 3
trigrams = get_n_grams(train, n)
print("Number of Trigrams:\t", len(list(trigrams.keys())))
print("All possible Trigrams:\t", calculate_possible_ngram(vocab_tl, n))

Number of Trigrams:	 81450
All possible Trigrams:	 139609889328


### Quadgrams

In [15]:
n = 4
quadgrams = get_n_grams(train, n)
print("Number of Bigrams:\t", len(list(quadgrams.keys())))
print("All possible Quadgrams:\t", calculate_possible_ngram(vocab_tl, n))

Number of Bigrams:	 99721
All possible Quadgrams:	 724156495944336


In [16]:
# Generate probability with smoothing
def gen_count(ngram, n):
    rev_dict = {}
    total = 0
    for gram in ngram.keys():
        total += ngram[gram]
        if ngram[gram] not in rev_dict.keys():
            rev_dict[ngram[gram]] = 1
        else:
            rev_dict[ngram[gram]] += 1
    
    sor_count = list(rev_dict.keys())
    sor_count.sort()
    return (ngram, rev_dict, sor_count, total, n)

# Gets the probability of specific ngram
# Using Kneser-Ney smoothing
def get_count(text, probDis_obj):
    ngram, rev_dict, sor_count, total, n = probDis_obj
    if tuple(text) not in ngram.keys():
        return float(rev_dict[1])/total
    if ngram[tuple(text)] == sor_count[-1]:
        return float(sor_count[-1])
    index = sor_count.index(ngram[tuple(text)])
    fac = sor_count[index+1]
    return float(fac*rev_dict[fac])/rev_dict[ngram[tuple(text)]]


# Gets the conditional probabiltiy
def get_cond_prob(text, probDis_obj, total):
    n = probDis_obj[4]
    count = get_count(text, probDis_obj)
    acount = 0
    for gram in probDis_obj[0].keys():
        if tuple(text[:-1]) == gram[:-1]:
            acount += get_count(gram, probDis_obj)
    if acount==0:
        acount = total
    return count/acount

In [17]:
# Calculates the perplexity of a sentence
def get_perplexity(sentence, probDis, n):
    probDis_obj = probDis[n-1]
    perplexity = 1
    if n==1:
        total = probDis[0][3]
    else:
        total = probDis[n-2][3]
    for gram in nltk.ngrams(sentence, n):
        perplexity*= (get_cond_prob(gram, probDis_obj, total))**(-1/len(sentence))
    for j in range(n-1):
        if j==0:
            total = probDis[0][3]
        else:
            total = probDis[j-1][3]
        text = sentence[:j+1]
        perplexity*= (get_cond_prob(text, probDis[j], total))**(-1/len(sentence))
    return perplexity

In [22]:
# Generates the normalized probabiltiy distribution
# for the specificed ngram
def normalized_prob_dis(probDis_obj, text, vocab, total):
    nm_prob_dis = []
    word_list = []
    for word in vocab:
        nm_prob_dis.append(get_cond_prob(text+list(word), probDis_obj, total))
        word_list.append(word[0])
    nm_prob_dis = np.array(nm_prob_dis)
    fac = np.sum(nm_prob_dis)
    return nm_prob_dis/fac, word_list

# Generates the sentences for given model
def generate_sent(probDis, n, length):
    sentence = ['<s>']
    vocab, vocab_l = probDis[0][0].keys(), probDis[0][3] 
    for j in range(1,n-1):
        text = sentence[:j]
        nm_prob_dis, word_list = normalized_prob_dis(probDis[j], text, vocab, probDis[j-1][3])
        index = np.argmax(np.random.multinomial(vocab_l*100, nm_prob_dis, size=1))
        sentence.append(word_list[index])
    
    if n==1:
        total = probDis[0][3]
    else:
        total = probDis[n-2][3]
    while len(sentence)< length+1:
        text = sentence[-n+1:]
        nm_prob_dis, word_list = normalized_prob_dis(probDis[n-1], text, vocab, total)
        index = np.argmax(np.random.multinomial(vocab_l*100, nm_prob_dis, size=1))
        sentence.append(word_list[index])
    return ' '.join(sentence[1:])

In [19]:
prob_un = gen_count(unigrams, 1)
prob_bi = gen_count(bigrams, 2)
prob_tri = gen_count(trigrams, 3)
prob_quad = gen_count(quadgrams, 4)

probDis = (prob_un, prob_bi, prob_tri, prob_quad)

## Perplexity of Test Data

In [30]:
# Get perplexity of test data

len_test = len(test)

unigram_perplexity = 0
for sent in test:
    unigram_perplexity+=get_perplexity(sent, probDis, 1)
print("Unigram Perplexity:", float(unigram_perplexity)/len_test)


bigram_perplexity = 0
for sent in test:
    bigram_perplexity+=get_perplexity(sent, probDis, 2)
print("Bigram Perplexity:", float(bigram_perplexity)/len_test)


trigram_perplexity = 0
for sent in test:
    trigram_perplexity+=get_perplexity(sent, probDis, 3)
print("Trigram Perplexity:", float(trigram_perplexity)/len_test)


quadgram_perplexity = 0
for sent in test:
    quadgram_perplexity+=get_perplexity(sent, probDis, 4)
print("Quadgram Perplexity:", float(quadgram_perplexity)/len_test)

Unigram Perplexity: 400.37763818809566
Bigram Perplexity: 84.5394947036964
Trigram Perplexity: 311.55710770061154
Quadgram Perplexity: 1883.0692291105845


We observe that the perplexity for Trigrams and Quadgrams is increasing. This is beacuse perplexity is being calculated on a model only with Kneser-Ney smoothing without Backoff. So whenever a stream of words that the model has not seen or not seen in that order at all causes it to give very high perplexity. The sentences in the speech are sometimes very small or are broken due to pauses taken by the speaker.

## Generate 5 sentences for each model

In [23]:
# Generating 5 sentences for each model
print("Unigram")
for j in range(5):
    print('\t',generate_sent(probDis, 1, 10))
print("Bigram")
for j in range(5):
    print('\t',generate_sent(probDis, 2, 10))
print("Trigram")
for j in range(5):
    print('\t',generate_sent(probDis, 3, 10))
print("Quadgram")
for j in range(5):
    print('\t',generate_sent(probDis, 4, 10))

Unigram
	 hispanic quite fixing depend means illegally genesco hopefully hospitals dreamers
	 killer sustainable smiling yearly agreed high immediate foster bye agrees
	 firmly sachs registered forced qualities wanted vaccines comedian looks love
	 practice deepest fortunately lg endorsement wrecking abusers grass gdp complaining
	 penn locations christian rapidly realize fighting listened details remains skills
Bigram
	 i 'm going to be the people </s> politicians want
	 i 'm going to be the people </s> but they
	 i 'm going to be the people </s> deserve it
	 i 'm going to be the people </s> cnbc ranked
	 i 'm going to be the people </s> lasting deal
Trigram
	 i 'm a messenger </s> mosque pelley stamps talent cashed
	 i 'm a messenger </s> rein yesterday squad act beef
	 i 'm a messenger </s> dismissed others platform citibank label
	 i 'm a messenger </s> cookies savage beautiful crack save
	 i 'm a messenger </s> issues ladies footer based policies
Quadgram
	 i 'm not going to happe

KeyboardInterrupt: 

# Neural Approach

In [23]:
vocab_list = list(vocab)
word_index = {}

for j in range(vocab_l):
    word_index[vocab_list[j]] = j

reverse_dictionary = dict(zip(word_index.values(), word_index.keys()))
test_len = len(test)

In [24]:
def word_to_int(sent, reverse_dictionary):
    result = []
    for word in sent:
        result.append(reverse_dictionary[word])
    return result

In [42]:
# Parameters
learning_rate = 0.00001
training_iters = len(train)*10
display_step = 10000
n_input = 

# number of units in RNN/LSTM cell
n_hidden = 128

In [44]:
# tf Graph input
x = tf.placeholder("float", [None, n_input, 1])
y = tf.placeholder("float", [None, vocab_l])

# RNN/LSTM output node weights and biases
weights = {
    'out': tf.Variable(tf.random_normal([n_hidden, vocab_l]))
}
biases = {
    'out': tf.Variable(tf.random_normal([vocab_l]))
}

In [45]:
def RNN(x, weights, biases):

    # reshape to [1, n_input]
    x = tf.reshape(x, [-1, n_input])

    # Generate a n_input-element sequence of inputs
    # (eg. [had] [a] [general] -> [20] [6] [33])
    x = tf.split(x,n_input,1)

    rnn_cell = rnn.BasicRNNCell(n_hidden, reuse =tf.AUTO_REUSE)

    # generate prediction
    outputs, states = rnn.static_rnn(rnn_cell, x, dtype=tf.float32)

    # there are n_input outputs but
    # we only want the last output
    return tf.matmul(outputs[-1], weights['out']) + biases['out']

In [46]:
def LSTM(x, weights, biases):

    # reshape to [1, n_input]
    x = tf.reshape(x, [-1, n_input])

    # Generate a n_input-element sequence of inputs
    # (eg. [had] [a] [general] -> [20] [6] [33])
    x = tf.split(x,n_input,1)

    rnn_cell = rnn.LSTMCell(n_hidden, reuse =tf.AUTO_REUSE)

    # generate prediction
    outputs, states = rnn.static_rnn(rnn_cell, x, dtype=tf.float32)

    # there are n_input outputs but
    # we only want the last output
    return tf.matmul(outputs[-1], weights['out']) + biases['out']

## RNN Approach

In [16]:
pred = RNN(x, weights, biases)

# Loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate).minimize(cost)

# Model evaluation
correct_pred = tf.equal(tf.argmax(pred,1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initializing the variables
init = tf.global_variables_initializer()

Instructions for updating:
This class is equivalent as tf.keras.layers.SimpleRNNCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell, unroll=True)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [17]:
# Launch the graph
with tf.Session() as session:
    session.run(init)
    step = 0
    offset = random.randint(0,n_input+1)
    end_offset = n_input + 1
    acc_total = 0
    loss_total = 0
    
    
    while step < training_iters:
        # Random batch
        np.random.shuffle(train)
        for sent in train:
            offset = 0
            while offset <= (len(sent)-end_offset):
                symbols_in_keys = [ [word_index[ str(sent[i])]] for i in range(offset, offset+n_input) ]
                symbols_in_keys = np.reshape(np.array(symbols_in_keys), [-1, n_input, 1])

                symbols_out_onehot = np.zeros([vocab_l], dtype=float)
                symbols_out_onehot[word_index[str(sent[offset+n_input])]] = 1.0
                symbols_out_onehot = np.reshape(symbols_out_onehot,[1,-1])

                _, acc, loss, onehot_pred = session.run([optimizer, accuracy, cost, pred], \
                                                feed_dict={x: symbols_in_keys, y: symbols_out_onehot})
        
                loss_total += loss
                acc_total += acc
                if (step+1) % display_step == 0:
                    print("Iter= " + str(step+1) + ", Average Loss= " + \
                          "{:.6f}".format(loss_total/display_step) + ", Average Accuracy= " + \
                          "{:.2f}%".format(100*acc_total/display_step))
                    acc_total = 0
                    loss_total = 0
                    symbols_in = [sent[i] for i in range(offset, offset + n_input)]
                    symbols_out = sent[offset + n_input]
                    symbols_out_pred = reverse_dictionary[int(tf.argmax(onehot_pred, 1).eval())]
                    print("%s - [%s] vs [%s]" % (symbols_in,symbols_out,symbols_out_pred))
                step += 1
                offset += 1
    print("Optimization Finished!")
    
    
    # Calculating the perplexity of test data
    offset = 0
    perplexity = 0
    
    for sent in test:
        total_loss = 0
        offset = 0
        while offset <= (len(sent)-end_offset):

            symbols_in_keys = [ [word_index[ str(sent[i])]] for i in range(offset, offset+n_input) ]
            symbols_in_keys = np.reshape(np.array(symbols_in_keys), [-1, n_input, 1])

            symbols_out_onehot = np.zeros([vocab_l], dtype=float)
            symbols_out_onehot[word_index[str(sent[offset+n_input])]] = 1.0
            symbols_out_onehot = np.reshape(symbols_out_onehot,[1,-1])

            loss, onehot_pred = session.run([cost, pred], feed_dict={x: symbols_in_keys, y: symbols_out_onehot})
        
            offset += 1
            total_loss+=loss
        perplexity += np.exp(total_loss/(offset+1))/test_len
        
    print("Test Data Perplexity:", float(perplexity))
    
    
    j = 0
    
    while j < 5:
        prompt = "%s words: " % n_input
        sentence = input(prompt)
        sentence = sentence.strip()
        words = sentence.split(' ')
        if len(words) != n_input:
            continue
        try:
            symbols_in_keys = [word_index[str(words[i])] for i in range(len(words))]
            for i in range(32):
                keys = np.reshape(np.array(symbols_in_keys), [-1, n_input, 1])
                onehot_pred = session.run(pred, feed_dict={x: keys})
                onehot_pred_index = int(tf.argmax(onehot_pred, 1).eval())
                sentence = "%s %s" % (sentence,reverse_dictionary[onehot_pred_index])
                symbols_in_keys = symbols_in_keys[1:]
                symbols_in_keys.append(onehot_pred_index)
            print(sentence)
            j+=1
        except:
            print("Word not in dictionary")

Iter= 10000, Average Loss= 30.637838, Average Accuracy= 0.02%
['chuck', 'report'] - [it] vs [creating]
Iter= 20000, Average Loss= 19.105214, Average Accuracy= 0.04%
['not', 'nice'] - [and] vs [creating]
Iter= 30000, Average Loss= 11.909255, Average Accuracy= 3.26%
['our', 'veto'] - [which] vs [i]
Iter= 40000, Average Loss= 8.690023, Average Accuracy= 6.50%
["n't", 'want'] - [to] vs [</s>]
Iter= 50000, Average Loss= 7.802065, Average Accuracy= 8.73%
['<s>', 'i'] - [was] vs [</s>]
Iter= 60000, Average Loss= 7.618074, Average Accuracy= 8.84%
['going', 'to'] - [say] vs [</s>]
Iter= 70000, Average Loss= 7.547363, Average Accuracy= 8.60%
['said', 'well'] - [maybe] vs [</s>]
Iter= 80000, Average Loss= 7.465116, Average Accuracy= 8.64%
['to', 'stop'] - [just] vs [</s>]
Iter= 90000, Average Loss= 7.558892, Average Accuracy= 9.06%
['are', 'you'] - [sure] vs [</s>]
Iter= 100000, Average Loss= 7.579585, Average Accuracy= 8.73%
['were', 'bent'] - [in] vs [</s>]
Iter= 110000, Average Loss= 7.568902,

2 words:  let us


let us </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>


2 words:  this is


this is </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>


2 words:  not working


not working </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>


2 words:  </s> <s>


</s> <s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>


2 words:  i have


i have </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>


## LSTM Approach

In [18]:
# tf Graph input
x = tf.placeholder("float", [None, n_input, 1])
y = tf.placeholder("float", [None, vocab_l])

# RNN/LSTM output node weights and biases
weights = {
    'out': tf.Variable(tf.random_normal([n_hidden, vocab_l]))
}
biases = {
    'out': tf.Variable(tf.random_normal([vocab_l]))
}

In [19]:
pred = LSTM(x, weights, biases)

# Loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate).minimize(cost)

# Model evaluation
correct_pred = tf.equal(tf.argmax(pred,1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initializing the variables
init = tf.global_variables_initializer()

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.


In [17]:
# Launch the graph
with tf.Session() as session:
    session.run(init)
    step = 0
    offset = random.randint(0,n_input+1)
    end_offset = n_input + 1
    acc_total = 0
    loss_total = 0
    
    
    while step < training_iters:
        # Random batch
        np.random.shuffle(train)
        for sent in train:
            offset = 0
            while offset <= (len(sent)-end_offset):
                symbols_in_keys = [ [word_index[ str(sent[i])]] for i in range(offset, offset+n_input) ]
                symbols_in_keys = np.reshape(np.array(symbols_in_keys), [-1, n_input, 1])

                symbols_out_onehot = np.zeros([vocab_l], dtype=float)
                symbols_out_onehot[word_index[str(sent[offset+n_input])]] = 1.0
                symbols_out_onehot = np.reshape(symbols_out_onehot,[1,-1])

                _, acc, loss, onehot_pred = session.run([optimizer, accuracy, cost, pred], \
                                                feed_dict={x: symbols_in_keys, y: symbols_out_onehot})
        
                loss_total += loss
                acc_total += acc
                if (step+1) % display_step == 0:
                    print("Iter= " + str(step+1) + ", Average Loss= " + \
                          "{:.6f}".format(loss_total/display_step) + ", Average Accuracy= " + \
                          "{:.2f}%".format(100*acc_total/display_step))
                    acc_total = 0
                    loss_total = 0
                    symbols_in = [sent[i] for i in range(offset, offset + n_input)]
                    symbols_out = sent[offset + n_input]
                    symbols_out_pred = reverse_dictionary[int(tf.argmax(onehot_pred, 1).eval())]
                    print("%s - [%s] vs [%s]" % (symbols_in,symbols_out,symbols_out_pred))
                step += 1
                offset += 1
    print("Optimization Finished!")
    
    
    # Calculating the perplexity of test data
    offset = 0
    perplexity = 0
    
    for sent in test:
        total_loss = 0
        offset = 0
        while offset <= (len(sent)-end_offset):

            symbols_in_keys = [ [word_index[ str(sent[i])]] for i in range(offset, offset+n_input) ]
            symbols_in_keys = np.reshape(np.array(symbols_in_keys), [-1, n_input, 1])

            symbols_out_onehot = np.zeros([vocab_l], dtype=float)
            symbols_out_onehot[word_index[str(sent[offset+n_input])]] = 1.0
            symbols_out_onehot = np.reshape(symbols_out_onehot,[1,-1])

            loss, onehot_pred = session.run([cost, pred], feed_dict={x: symbols_in_keys, y: symbols_out_onehot})
        
            offset += 1
            total_loss+=loss
        perplexity += np.exp(total_loss/(offset+1))/test_len
        
    print("Test Data Perplexity:", float(perplexity))
    
    
    j = 0
    
    while j < 5:
        prompt = "%s words: " % n_input
        sentence = input(prompt)
        sentence = sentence.strip()
        words = sentence.split(' ')
        if len(words) != n_input:
            continue
        try:
            symbols_in_keys = [word_index[str(words[i])] for i in range(len(words))]
            for i in range(32):
                keys = np.reshape(np.array(symbols_in_keys), [-1, n_input, 1])
                onehot_pred = session.run(pred, feed_dict={x: keys})
                onehot_pred_index = int(tf.argmax(onehot_pred, 1).eval())
                sentence = "%s %s" % (sentence,reverse_dictionary[onehot_pred_index])
                symbols_in_keys = symbols_in_keys[1:]
                symbols_in_keys.append(onehot_pred_index)
            print(sentence)
            j+=1
        except:
            print("Word not in dictionary")

Iter= 10000, Average Loss= 17.496814, Average Accuracy= 0.00%
['love', 'you'] - [all] vs [protection]
Iter= 20000, Average Loss= 13.319770, Average Accuracy= 0.71%
['going', 'to'] - [be] vs [they]
Iter= 30000, Average Loss= 10.945726, Average Accuracy= 0.49%
['to', 'i'] - [wish] vs [combat]
Iter= 40000, Average Loss= 9.344861, Average Accuracy= 6.22%
['going', 'to'] - [see] vs [</s>]
Iter= 50000, Average Loss= 8.245837, Average Accuracy= 8.88%
['our', 'movement'] - [and] vs [</s>]
Iter= 60000, Average Loss= 7.750816, Average Accuracy= 9.08%
['build', 'a'] - [wall] vs [</s>]
Iter= 70000, Average Loss= 7.388376, Average Accuracy= 8.13%
['you', 'end'] - [the] vs [</s>]
Iter= 80000, Average Loss= 7.175589, Average Accuracy= 7.82%
['watching', 'this'] - [for] vs [</s>]
Iter= 90000, Average Loss= 7.017310, Average Accuracy= 7.14%
['i', "'m"] - [not] vs [</s>]
Iter= 100000, Average Loss= 7.015175, Average Accuracy= 7.41%
['the', 'press'] - [would] vs [the]
Iter= 110000, Average Loss= 7.015464

2 words:  i 'm


i 'm </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>


2 words:  okay thi


Word not in dictionary


2 words:  okay this


okay this </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>


2 words:  is not


is not </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>


2 words:  working again


working again </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>


2 words:  what must


what must </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>


The simple RNN model is unable to generate anything. This is likely because the sentences are short and so the occurence of \<s> and \</s> are high. Even when the experiment is again conducted without these markers, it gives the most occuring word as output with little or no variation. This also seems to be the case with LSTM model. In terms of readability, the classical is performing better because it is atleast predicting a possible output. The reason for this is because 1) The sentences are short. 2) The amount of training might be less 3) The amount of data required for training might be less for the neural approach. As the sentences are small, small n_input is likely to give better output. But this may not be the case if we do training as a whole continous text rather than sentence wise training. This was also performed but the imporvement in result was not significant. But the value of n_input was kept small.

Even though the classical is giving some output, many times they have no correlation with each other. In terms of perplexity also the classical is beating the neural approach.