In [1]:
import numpy as np
from gensim.models import Word2Vec
from nltk import word_tokenize, sent_tokenize
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.contrib import rnn

  return f(*args, **kwds)


In [2]:
# Load data, make all to lowercase
txt = open("theLordOfTheRings.txt")
data = txt.read()
data = data.lower()
len(data)

986848

In [3]:
data[:200]

'the lord of the rings 1. concerning hobbits this book is largely concerned with hobbits, and from its pages a reader may discover much of their character and a little of their history.  further inform'

In [4]:
sentences = sent_tokenize(data)
sentences[:3]

['the lord of the rings 1. concerning hobbits this book is largely concerned with hobbits, and from its pages a reader may discover much of their character and a little of their history.',
 'further information will also be found in the selection from the red book of westmarch that has already been published, under the title of the hobbit.',
 'that story was derived from the earlier chapters of the red book, composed by bilbo himself, the first hobbit to become famous in the world at large, and called by him there and back again, since they told of his journey into the east and his return: an adventure which later involved all the hobbits in the great events of that age that are here related.']

In [5]:
words = word_tokenize(data)
len(words)

218302

In [6]:
sentences_break_down = [word_tokenize(sentence) for sentence in sentences ]
len(sentences_break_down)
sentences_break_down[1:2]

[['further',
  'information',
  'will',
  'also',
  'be',
  'found',
  'in',
  'the',
  'selection',
  'from',
  'the',
  'red',
  'book',
  'of',
  'westmarch',
  'that',
  'has',
  'already',
  'been',
  'published',
  ',',
  'under',
  'the',
  'title',
  'of',
  'the',
  'hobbit',
  '.']]

In [7]:
emb_dim = 400

In [8]:
# Call Word2Vec to convert word to vector, length 400
w2v = Word2Vec(sentences=sentences_break_down, 
        sg=1,
        size=emb_dim,
        window=5,
        alpha=0.0005,
        min_count=1,
        workers=8,
        batch_words=10000)

In [9]:
w2v_len = len(w2v.wv.vocab)
w2v.train(sentences=sentences_break_down, total_words=w2v_len, epochs=250, start_alpha=0.0005, end_alpha=0.0001)

(37466330, 54575500)

In [10]:
# Test if the work is well done
w2v.wv.similar_by_vector(w2v.wv.get_vector('gandalf'))

[('gandalf', 0.9999998807907104),
 ('strider', 0.9717492461204529),
 ('elrond', 0.9500234127044678),
 ('aragorn', 0.945915937423706),
 ('boromir', 0.9434243440628052),
 ('legolas', 0.930779218673706),
 ('haldir', 0.9294370412826538),
 ('frodo', 0.9274029731750488),
 ('gildor', 0.9201619625091553),
 ('pippin', 0.9197915196418762)]

In [11]:
w2v.wv.most_similar(positive='ring')

[('heart', 0.8911802768707275),
 ('gollum', 0.8909974098205566),
 ('thing', 0.8861624002456665),
 ('account', 0.8858187794685364),
 ('desire', 0.883201539516449),
 ('sauron', 0.8829586505889893),
 ('party', 0.8822845220565796),
 ('tale', 0.8818395733833313),
 ('mind', 0.8812507390975952),
 ('put', 0.8806129693984985)]

In [12]:
w2v_dict = w2v.wv.vocab
type(w2v_dict)

dict

In [13]:
# Prepare the dict for one code encoding 
w2v_keys = np.array(list(w2v_dict.keys()))

In [14]:
w2v_keys[:10]

array(['the', 'lord', 'of', 'rings', '1.', 'concerning', 'hobbits', 'this',
       'book', 'is'],
      dtype='<U22')

In [15]:
# Step intermedia, do label encoding first 
label_encoder = LabelEncoder()
w2v_keys_encoded = label_encoder.fit_transform(w2v_keys)
print(w2v_keys_encoded[:10])

[8555 5287 6028 7081  529 2022 4474 8606 1382 4806]


In [16]:
# Get the mapping 
onehot_encoder = OneHotEncoder(sparse=False)
w2v_keys_encoded = w2v_keys_encoded.reshape(len(w2v_keys_encoded),1)
onehot_encoded = onehot_encoder.fit_transform(w2v_keys_encoded)
print(onehot_encoded[:2])

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [17]:
#inverted = label_encoder.inverse_transform([np.argmax(onehot_encoded[0, :])])
#print(inverted)

In [18]:
# Prepare groups of eight word to feed the RNN
timestep_group = 8
random_range = int(len(words)/timestep_group) - 1

In [19]:
# Return batch_num groups randomly, come in handy soon 
def get_next_batch(batch_size):
    X_batch = []
    Y_batch = []
    index = np.random.randint(0,random_range,batch_size)
    #print("index: ",index)
    for i in range(batch_size):
        temp = []
        for j in range(timestep_group-1):
            #print(j,"Get word:",words[index[i]+j])
            temp.append(w2v.wv.get_vector(words[index[i]+j]))
        X_batch.append(temp)
        #print("Label is:", words[index[i]+timestep_group-1])
        temp = label_encoder.transform([words[index[i]+timestep_group-1]])
        tempv = onehot_encoder.transform(temp.reshape(1,1))
        Y_batch.append(tempv[0])
    return np.array(X_batch),np.array(Y_batch) 

In [20]:
alpha = 1e-3                          # learning rate
input_size = emb_dim                  # input size 
timestep_size = timestep_group - 1    # cell number in each layer 
hidden_size = 1024                    # one dimension for parameter matrix 
layer_num = 2                         # number of layer 
class_num = w2v_len                   # output size 
cell_type = "lstm"                  

X = tf.placeholder(tf.float32, [None, timestep_size, emb_dim])
y_input = tf.placeholder(tf.float32, [None, class_num])
batch_size = tf.placeholder(tf.int32, [])
keep_prob = tf.placeholder(tf.float32, [])

In [21]:
# Construction of the lstm cell
def lstm_cell(cell_type,num_nodes,keep_prob):
    assert(cell_type in ["lstm","block_lstm"],"Wrong cell type")
    if cell_type == "lstm":
        cell = rnn.BasicLSTMCell(num_nodes)
    else:
        cell = rnn.LSTMBlockCell(num_nodes)
    cell = rnn.DropoutWrapper(cell,output_keep_prob=keep_prob)
    return cell

mlstm_cell = rnn.MultiRNNCell([lstm_cell(cell_type,hidden_size,keep_prob) for _ in range(layer_num)],state_is_tuple=True)
init_state = mlstm_cell.zero_state(batch_size, dtype=tf.float32)

In [22]:
# Calculate the final output of the RNN layers
outputs = list()
state = init_state

with tf.variable_scope('RNN'):
    for timestep in range(timestep_size):
        (cell_output,state) = mlstm_cell(X[:,timestep,:],state)
        outputs.append(cell_output)
h_state = outputs[-1]

In [23]:
# FC Layer, y^ = hW + b 
W = tf.Variable(tf.truncated_normal([hidden_size,class_num],stddev=0.1),dtype=tf.float32)
bias = tf.Variable(tf.constant(0.1,shape=[class_num]),dtype=tf.float32)
y_pre = tf.nn.softmax(tf.matmul(h_state,W) + bias)

In [24]:
# Minimize lost function 
loss = - tf.reduce_mean(y_input * tf.log(y_pre))
train_op = tf.train.AdamOptimizer(alpha).minimize(loss)

In [25]:
# Accuracy for display use during training
correct_pred = tf.equal(tf.argmax(y_pre,1),tf.argmax(y_input,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred,"float"))

In [26]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [27]:
# Training 
for i in range(6500):
    _batch_size=100
    X_batch, y_batch = get_next_batch(_batch_size)
    cost, acc, _ = sess.run([loss,accuracy,train_op],feed_dict={X: X_batch, y_input:y_batch,keep_prob:0.5, batch_size: _batch_size})
    if (i+1)%10 == 0:
        print("step {},train cost={:.6f},acc={:.6f}".format(i+1,cost,acc))

step 10,train cost=0.000850,acc=0.040000
step 20,train cost=0.000749,acc=0.040000
step 30,train cost=0.000765,acc=0.000000
step 40,train cost=0.000759,acc=0.010000
step 50,train cost=0.000717,acc=0.050000
step 60,train cost=0.000680,acc=0.080000
step 70,train cost=0.000648,acc=0.090000
step 80,train cost=0.000622,acc=0.050000
step 90,train cost=0.000703,acc=0.030000
step 100,train cost=0.000658,acc=0.060000
step 110,train cost=0.000671,acc=0.050000
step 120,train cost=0.000705,acc=0.080000
step 130,train cost=0.000651,acc=0.070000
step 140,train cost=0.000665,acc=0.030000
step 150,train cost=0.000683,acc=0.030000
step 160,train cost=0.000681,acc=0.080000
step 170,train cost=0.000713,acc=0.050000
step 180,train cost=0.000669,acc=0.080000
step 190,train cost=0.000648,acc=0.070000
step 200,train cost=0.000644,acc=0.070000
step 210,train cost=0.000646,acc=0.060000
step 220,train cost=0.000623,acc=0.090000
step 230,train cost=0.000647,acc=0.070000
step 240,train cost=0.000637,acc=0.090000
s

step 1950,train cost=0.000478,acc=0.170000
step 1960,train cost=0.000505,acc=0.180000
step 1970,train cost=0.000457,acc=0.190000
step 1980,train cost=0.000422,acc=0.210000
step 1990,train cost=0.000448,acc=0.220000
step 2000,train cost=0.000437,acc=0.180000
step 2010,train cost=0.000440,acc=0.120000
step 2020,train cost=0.000469,acc=0.150000
step 2030,train cost=0.000427,acc=0.240000
step 2040,train cost=0.000475,acc=0.150000
step 2050,train cost=0.000453,acc=0.170000
step 2060,train cost=0.000464,acc=0.160000
step 2070,train cost=0.000460,acc=0.200000
step 2080,train cost=0.000441,acc=0.160000
step 2090,train cost=0.000482,acc=0.140000
step 2100,train cost=0.000429,acc=0.170000
step 2110,train cost=0.000474,acc=0.140000
step 2120,train cost=0.000461,acc=0.120000
step 2130,train cost=0.000474,acc=0.140000
step 2140,train cost=0.000443,acc=0.170000
step 2150,train cost=0.000437,acc=0.200000
step 2160,train cost=0.000470,acc=0.190000
step 2170,train cost=0.000436,acc=0.150000
step 2180,t

step 3860,train cost=0.000252,acc=0.450000
step 3870,train cost=0.000181,acc=0.550000
step 3880,train cost=0.000205,acc=0.480000
step 3890,train cost=0.000184,acc=0.590000
step 3900,train cost=0.000192,acc=0.560000
step 3910,train cost=0.000200,acc=0.570000
step 3920,train cost=0.000195,acc=0.540000
step 3930,train cost=0.000181,acc=0.620000
step 3940,train cost=0.000212,acc=0.590000
step 3950,train cost=0.000198,acc=0.560000
step 3960,train cost=0.000228,acc=0.520000
step 3970,train cost=0.000181,acc=0.550000
step 3980,train cost=0.000204,acc=0.530000
step 3990,train cost=0.000206,acc=0.460000
step 4000,train cost=0.000209,acc=0.520000
step 4010,train cost=0.000183,acc=0.620000
step 4020,train cost=0.000142,acc=0.630000
step 4030,train cost=0.000215,acc=0.560000
step 4040,train cost=0.000188,acc=0.560000
step 4050,train cost=0.000166,acc=0.630000
step 4060,train cost=0.000183,acc=0.580000
step 4070,train cost=0.000193,acc=0.570000
step 4080,train cost=0.000176,acc=0.590000
step 4090,t

step 5770,train cost=0.000057,acc=0.860000
step 5780,train cost=0.000070,acc=0.830000
step 5790,train cost=0.000067,acc=0.850000
step 5800,train cost=0.000068,acc=0.850000
step 5810,train cost=0.000064,acc=0.850000
step 5820,train cost=0.000074,acc=0.820000
step 5830,train cost=0.000062,acc=0.810000
step 5840,train cost=0.000052,acc=0.880000
step 5850,train cost=0.000060,acc=0.860000
step 5860,train cost=0.000067,acc=0.820000
step 5870,train cost=0.000049,acc=0.840000
step 5880,train cost=0.000051,acc=0.860000
step 5890,train cost=0.000063,acc=0.830000
step 5900,train cost=0.000085,acc=0.790000
step 5910,train cost=0.000051,acc=0.880000
step 5920,train cost=0.000054,acc=0.840000
step 5930,train cost=0.000052,acc=0.860000
step 5940,train cost=0.000059,acc=0.850000
step 5950,train cost=0.000054,acc=0.910000
step 5960,train cost=0.000061,acc=0.850000
step 5970,train cost=0.000062,acc=0.840000
step 5980,train cost=0.000057,acc=0.850000
step 5990,train cost=0.000038,acc=0.970000
step 6000,t

In [28]:
# Return the next predict word's index, depend on the cumsum of probability 
def get_next_index(vector):
    threshold = np.random.rand(1)
    #print("threshold is:",threshold)
    i = 0
    proba_sum = 0.0
    while(i<len(vector)):
        proba_sum += vector[i]
        if(proba_sum>threshold):
            #print("Find i:",i)
            break
        i += 1
    return i

In [29]:
# Push Old Baggins to write more about the quest 
def Fake_Old_Baggins(len_to_write):
    #text = ["gandalf","was","reading","a","riddle","written","in"]
    #kickoff_phrase = ["gandalf","was","reading","a","riddle","written","in"]
    text = ["gollum","was","catching","fish","by","the","lake"]
    kickoff_phrase = ["gollum","was","catching","fish","by","the","lake"]
    i = 0
    while(i < len_to_write):
        kickoff_vector = []
        for j in range(timestep_size):
            kickoff_vector.append(w2v.wv.get_vector(kickoff_phrase[j]))
        kickoff_batch = []
        kickoff_batch.append(kickoff_vector)
        kickoff_batch = np.array(kickoff_batch)
        
        predict_vector = sess.run([y_pre],feed_dict={X: kickoff_batch, keep_prob:1.0,batch_size: 1})
        predict_vector = np.array(predict_vector)
        predict_word = label_encoder.inverse_transform(get_next_index(predict_vector[0][0]))
        
        kickoff_phrase.pop(0)
        kickoff_phrase.append(predict_word)
        text.append(predict_word)
        
        #print(kickoff_phrase)
        
        i += 1
    return ' '.join(text)

In [30]:
rslt = Fake_Old_Baggins(300)
print(rslt)

gollum was catching fish by the lake of the younger of the shire , for they loved to grow after him . he left the into the mountain river , and right agin the old forest . that was a dark place place , until the fact of the wise could discover no more . but at last i can carry on the story , i think . 'fong after , it was his birthday at the time , but he still his 'precious collected many manuscripts written by scribes . the nine of all holes and still were mentioned . but they became to call the other . the book of the westlands were great and according to shire-folk bare with the same of large and delight . 'he , even that the hilly were the poorest of mordor , and they had crowded the after-dinner speech in hobbits . though the dates were usually to be seen . the original of hobbits lies had back up all and night . there was a great flash of light , and that was green in the light : there was his scratch echo . he whizzed alone - to himself : about he thought he comes it and he use

In [31]:
rslt2 = Fake_Old_Baggins(300)
print(rslt2)

gollum was catching fish by the lake of the younger of the shire . 'so it is , ' he said silent a old present . after not you 'll he told you with . ' 'i am not to see you visible , ' he said . 'and i wonder of course . he would lead him and let it away the back , he will -covered and anyone out . bilbo was round in the dark corner , as he tunnelled with her place . he was hardly pleased , and gathering and bilbo had once to report him . a first itself . to moment he wore it , and his hand , he had hungry on the first ; and he whistled three back . the last , was , but rather had ever to be bag valiant . bilbo was very mr. bilbo 'elves first seen on the truth from to a while in the world , and never neither to rohan of any in you reckon . who invented stories good-bye anyway a indoors . they my of hard dim . they began to their faces and bilbo had a seen springle-ring of his sancho , and my change , secured . 'there 's no saying fishy , the old was gathered , and a shadow . he had a fe

In [32]:
rslt3 = Fake_Old_Baggins(300)
print(rslt3)

gollum was catching fish by the lake of the younger generation of hobbits . 'ah , but he has likely enough been adding to what he brought at first , ' argued the miller , voicing common opinion . 'he 's often away from home . ' look to be you on ! ' he said , 'and n't you want , ' said frodo . 'i wanted the truth . ' he clutched away . he looked sternly back to his hole , and stood for a moment listening with a smile to the din in the pavilion and to the sounds of merrymaking in the dark , he put his hand on a ring , lying on the floor of a tunnel . ele it remained in his pocket . they seemed to him , and eat to be very . even if he was no a sword . ' the ring , ' said gandalf . 'do , that did you want of me all ? ' asked frodo . 'and i wonder in time . ' 'so i do n't know , ' said deal . 'i believe the truth . it was important . magic rings are - well , magical ; and they were rare and bootless . the stoors of building fell and barns , when large folk whom they do not wish to meet com

In [33]:
rslt4 = Fake_Old_Baggins(1000)
print(rslt4)

gollum was catching fish by the lake of the younger and the hobbits . but the hoard were not know to them . it was very trying at i , but i am to feel him . for my except . ' he asked . 'is , he , all , after angelica 's all an , and the scratched , but the flowering of the crossing could never come . he said to him , and 'yes goodbye all it had been been profit . bilbo 's frodo , ' he said , getting it is good , and whether it nor come , i am sure look , ' said gandalf . 'i do not make use of it , if i came . for a while mr. : he is not a cool part , and the council has come to speak . give me the ring for a moment . ' frodo , and i am if got to make it on again , ' he added with a look at the stranger , the miller miller , were a few branch with the high . bilbo was sitting , the young , keeping in his twelve 's cousin , and that mr. drogo , but you had never kept that before many dwarves and could aware after the borders . the bounders of rumours may have come . i think besides ever

In [34]:
sess.close()

Try to solve the problem with regression method, failed by get always the same words. Gradient vanishing found. Aborted.

In [None]:
timestep_group = 8
random_range = int(len(words)/timestep_group) - 1

def get_next_batch(batch_size):
    X_batch = []
    Y_batch = []
    index = np.random.randint(0,random_range,batch_size)
    #print("index: ",index)
    for i in range(batch_size):
        temp = []
        for j in range(timestep_group-1):
            #print(j,"Get word:",words[index[i]+j])
            temp.append(w2v.wv.get_vector(words[index[i]+j]))
        X_batch.append(temp)
        #print("Label is:", words[index[i]+timestep_group-1])
        Y_batch.append(w2v.wv.get_vector(words[index[i]+timestep_group-1]))
    return np.array(X_batch),np.array(Y_batch) 

alpha = 0.0005
input_size = emb_dim
timestep_size = timestep_group - 1
hidden_size = 512
layer_num = 2 
cell_type = "lstm"

X = tf.placeholder(tf.float32, [None, 7, emb_dim])
y_input = tf.placeholder(tf.float32, [None, emb_dim])
batch_size = tf.placeholder(tf.int32, [])
keep_prob = tf.placeholder(tf.float32, [])

def lstm_cell(cell_type,num_nodes,keep_prob):
    # assert(cell_type in ["lstm","block_lstm"],"Wrong cell type")
    if cell_type == "lstm":
        cell = rnn.BasicLSTMCell(num_nodes)
    else:
        cell = rnn.LSTMBlockCell(num_nodes)
    cell = rnn.DropoutWrapper(cell,output_keep_prob=keep_prob)
    return cell

mlstm_cell = rnn.MultiRNNCell([lstm_cell(cell_type,hidden_size,keep_prob) for _ in range(layer_num)],state_is_tuple=True)
init_state = mlstm_cell.zero_state(batch_size, dtype=tf.float32)

outputs = list()
state = init_state

with tf.variable_scope('RNN'):
    for timestep in range(timestep_size):
        (cell_output,state) = mlstm_cell(X[:,timestep,:],state)
        outputs.append(cell_output)
h_state = outputs[-1]

W = tf.Variable(tf.truncated_normal([hidden_size,emb_dim],stddev=0.1),dtype=tf.float32)
bias = tf.Variable(tf.constant(0.1,shape=[emb_dim]),dtype=tf.float32)
y_pre = tf.matmul(h_state,W) + bias

loss = tf.losses.mean_squared_error(y_input,y_pre)
train_op = tf.train.AdamOptimizer(alpha).minimize(loss)

sess = tf.Session()

sess.run(tf.global_variables_initializer())

#time0 = time.time()
for i in range(5000):
    _batch_size=100
    X_batch, y_batch = get_next_batch(_batch_size)
    cost, _ = sess.run([loss,train_op],feed_dict={X: X_batch, y_input:y_batch,keep_prob:0.5, batch_size: _batch_size})
    if (i+1)%10 == 0:
        print("step {},train cost={:.6f}".format(i+1,cost))

#kickoff_phrase = ["Gandalf","is","reading","a","riddle","on","a"]
kickoff_phrase = ["is","reading","a","riddle","on","a","track"]
kickoff_vector = []
for i in range(timestep_size):
    kickoff_vector.append(w2v.wv.get_vector(kickoff_phrase[i]))
kickoff_batch = []
kickoff_batch.append(kickoff_vector)
kickoff_batch = np.array(kickoff_batch)

kickoff_batch.shape

predict_vector = sess.run([y_pre],feed_dict={X: kickoff_batch, keep_prob:1.0,batch_size: 1})

sess.close()