In [1]:
import numpy as np
from gensim.models import Word2Vec
from nltk import word_tokenize, sent_tokenize
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.contrib import rnn

  return f(*args, **kwds)


In [2]:
txt = open("theLordOfTheRings.txt")
data = txt.read()
data = data.lower()
len(data)

986848

In [3]:
data[:200]

'the lord of the rings 1. concerning hobbits this book is largely concerned with hobbits, and from its pages a reader may discover much of their character and a little of their history.  further inform'

In [4]:
sentences = sent_tokenize(data)
sentences[:3]

['the lord of the rings 1. concerning hobbits this book is largely concerned with hobbits, and from its pages a reader may discover much of their character and a little of their history.',
 'further information will also be found in the selection from the red book of westmarch that has already been published, under the title of the hobbit.',
 'that story was derived from the earlier chapters of the red book, composed by bilbo himself, the first hobbit to become famous in the world at large, and called by him there and back again, since they told of his journey into the east and his return: an adventure which later involved all the hobbits in the great events of that age that are here related.']

In [5]:
words = word_tokenize(data)
len(words)

218302

In [6]:
sentences_break_down = [word_tokenize(sentence) for sentence in sentences ]
len(sentences_break_down)
sentences_break_down[1:2]

[['further',
  'information',
  'will',
  'also',
  'be',
  'found',
  'in',
  'the',
  'selection',
  'from',
  'the',
  'red',
  'book',
  'of',
  'westmarch',
  'that',
  'has',
  'already',
  'been',
  'published',
  ',',
  'under',
  'the',
  'title',
  'of',
  'the',
  'hobbit',
  '.']]

In [7]:
emb_dim = 400

In [8]:
w2v = Word2Vec(sentences=sentences_break_down, 
        sg=1,
        size=emb_dim,
        window=5,
        alpha=0.0005,
        min_count=1,
        workers=8,
        batch_words=10000)

In [9]:
w2v_len = len(w2v.wv.vocab)
w2v.train(sentences=sentences_break_down, total_words=w2v_len, epochs=250, start_alpha=0.0005, end_alpha=0.0001)

(37466700, 54575500)

In [10]:
w2v.wv.similar_by_vector(w2v.wv.get_vector('gandalf'))

[('gandalf', 1.0),
 ('strider', 0.9722239971160889),
 ('elrond', 0.9534536004066467),
 ('aragorn', 0.9465899467468262),
 ('boromir', 0.9435869455337524),
 ('legolas', 0.9333523511886597),
 ('haldir', 0.9300433397293091),
 ('frodo', 0.9268338084220886),
 ('glorfindel', 0.923527181148529),
 ('gildor', 0.9227961301803589)]

In [11]:
w2v.wv.most_similar(positive='ring')

[('heart', 0.8881295919418335),
 ('gollum', 0.8830049633979797),
 ('account', 0.8823745250701904),
 ('thing', 0.8781375885009766),
 ('put', 0.8779300451278687),
 ('sauron', 0.8779194951057434),
 ('mind', 0.8778856992721558),
 ('desire', 0.877754807472229),
 ('tale', 0.8759180307388306),
 ('party', 0.8758673667907715)]

In [12]:
w2v_dict = w2v.wv.vocab
type(w2v_dict)

dict

In [13]:
w2v_keys = np.array(list(w2v_dict.keys()))

In [14]:
w2v_keys[:10]

array(['the', 'lord', 'of', 'rings', '1.', 'concerning', 'hobbits', 'this',
       'book', 'is'],
      dtype='<U22')

In [15]:
label_encoder = LabelEncoder()
w2v_keys_encoded = label_encoder.fit_transform(w2v_keys)
print(w2v_keys_encoded[:10])

[8555 5287 6028 7081  529 2022 4474 8606 1382 4806]


In [16]:
onehot_encoder = OneHotEncoder(sparse=False)
w2v_keys_encoded = w2v_keys_encoded.reshape(len(w2v_keys_encoded),1)
onehot_encoded = onehot_encoder.fit_transform(w2v_keys_encoded)
print(onehot_encoded[:2])

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [33]:
#inverted = label_encoder.inverse_transform([np.argmax(onehot_encoded[0, :])])
#print(inverted)

['THE']


In [17]:
timestep_group = 8
random_range = int(len(words)/timestep_group) - 1

In [18]:
def get_next_batch(batch_size):
    X_batch = []
    Y_batch = []
    index = np.random.randint(0,random_range,batch_size)
    #print("index: ",index)
    for i in range(batch_size):
        temp = []
        for j in range(timestep_group-1):
            #print(j,"Get word:",words[index[i]+j])
            temp.append(w2v.wv.get_vector(words[index[i]+j]))
        X_batch.append(temp)
        #print("Label is:", words[index[i]+timestep_group-1])
        temp = label_encoder.transform([words[index[i]+timestep_group-1]])
        tempv = onehot_encoder.transform(temp.reshape(1,1))
        Y_batch.append(tempv[0])
    return np.array(X_batch),np.array(Y_batch) 

In [19]:
alpha = 1e-3
input_size = emb_dim
timestep_size = timestep_group - 1 
hidden_size = 1024
layer_num = 2 
class_num = w2v_len
cell_type = "lstm"

X = tf.placeholder(tf.float32, [None, 7, emb_dim])
y_input = tf.placeholder(tf.float32, [None, class_num])
batch_size = tf.placeholder(tf.int32, [])
keep_prob = tf.placeholder(tf.float32, [])

In [20]:
def lstm_cell(cell_type,num_nodes,keep_prob):
    # assert(cell_type in ["lstm","block_lstm"],"Wrong cell type")
    if cell_type == "lstm":
        cell = rnn.BasicLSTMCell(num_nodes)
    else:
        cell = rnn.LSTMBlockCell(num_nodes)
    cell = rnn.DropoutWrapper(cell,output_keep_prob=keep_prob)
    return cell

mlstm_cell = rnn.MultiRNNCell([lstm_cell(cell_type,hidden_size,keep_prob) for _ in range(layer_num)],state_is_tuple=True)
init_state = mlstm_cell.zero_state(batch_size, dtype=tf.float32)

In [21]:
outputs = list()
state = init_state

with tf.variable_scope('RNN'):
    for timestep in range(timestep_size):
        (cell_output,state) = mlstm_cell(X[:,timestep,:],state)
        outputs.append(cell_output)
h_state = outputs[-1]

In [22]:
W = tf.Variable(tf.truncated_normal([hidden_size,class_num],stddev=0.1),dtype=tf.float32)
bias = tf.Variable(tf.constant(0.1,shape=[class_num]),dtype=tf.float32)
y_pre = tf.nn.softmax(tf.matmul(h_state,W) + bias)

In [23]:
loss = - tf.reduce_mean(y_input * tf.log(y_pre))
train_op = tf.train.AdamOptimizer(alpha).minimize(loss)

In [24]:
correct_pred = tf.equal(tf.argmax(y_pre,1),tf.argmax(y_input,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred,"float"))

In [25]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [26]:
for i in range(5000):
    _batch_size=100
    X_batch, y_batch = get_next_batch(_batch_size)
    cost, acc, _ = sess.run([loss,accuracy,train_op],feed_dict={X: X_batch, y_input:y_batch,keep_prob:0.5, batch_size: _batch_size})
    if (i+1)%10 == 0:
        print("step {},train cost={:.6f},acc={:.6f}".format(i+1,cost,acc))

step 10,train cost=0.000825,acc=0.060000
step 20,train cost=0.000770,acc=0.040000
step 30,train cost=0.000775,acc=0.030000
step 40,train cost=0.000708,acc=0.080000
step 50,train cost=0.000693,acc=0.080000
step 60,train cost=0.000708,acc=0.060000
step 70,train cost=0.000659,acc=0.090000
step 80,train cost=0.000672,acc=0.080000
step 90,train cost=0.000686,acc=0.060000
step 100,train cost=0.000716,acc=0.030000
step 110,train cost=0.000675,acc=0.040000
step 120,train cost=0.000620,acc=0.040000
step 130,train cost=0.000708,acc=0.060000
step 140,train cost=0.000629,acc=0.050000
step 150,train cost=0.000670,acc=0.040000
step 160,train cost=0.000651,acc=0.050000
step 170,train cost=0.000625,acc=0.050000
step 180,train cost=0.000642,acc=0.060000
step 190,train cost=0.000641,acc=0.040000
step 200,train cost=0.000622,acc=0.060000
step 210,train cost=0.000663,acc=0.060000
step 220,train cost=0.000642,acc=0.050000
step 230,train cost=0.000714,acc=0.030000
step 240,train cost=0.000608,acc=0.040000
s

step 1950,train cost=0.000446,acc=0.210000
step 1960,train cost=0.000433,acc=0.160000
step 1970,train cost=0.000430,acc=0.190000
step 1980,train cost=0.000470,acc=0.180000
step 1990,train cost=0.000495,acc=0.120000
step 2000,train cost=0.000432,acc=0.210000
step 2010,train cost=0.000463,acc=0.180000
step 2020,train cost=0.000416,acc=0.220000
step 2030,train cost=0.000457,acc=0.150000
step 2040,train cost=0.000455,acc=0.170000
step 2050,train cost=0.000469,acc=0.140000
step 2060,train cost=0.000445,acc=0.130000
step 2070,train cost=0.000456,acc=0.200000
step 2080,train cost=0.000465,acc=0.210000
step 2090,train cost=0.000444,acc=0.250000
step 2100,train cost=0.000449,acc=0.260000
step 2110,train cost=0.000405,acc=0.220000
step 2120,train cost=0.000410,acc=0.230000
step 2130,train cost=0.000397,acc=0.190000
step 2140,train cost=0.000396,acc=0.260000
step 2150,train cost=0.000397,acc=0.230000
step 2160,train cost=0.000408,acc=0.230000
step 2170,train cost=0.000433,acc=0.190000
step 2180,t

step 3860,train cost=0.000185,acc=0.570000
step 3870,train cost=0.000208,acc=0.520000
step 3880,train cost=0.000196,acc=0.540000
step 3890,train cost=0.000182,acc=0.520000
step 3900,train cost=0.000151,acc=0.660000
step 3910,train cost=0.000166,acc=0.660000
step 3920,train cost=0.000183,acc=0.520000
step 3930,train cost=0.000167,acc=0.640000
step 3940,train cost=0.000196,acc=0.530000
step 3950,train cost=0.000154,acc=0.600000
step 3960,train cost=0.000170,acc=0.610000
step 3970,train cost=0.000174,acc=0.650000
step 3980,train cost=0.000178,acc=0.580000
step 3990,train cost=0.000185,acc=0.570000
step 4000,train cost=0.000224,acc=0.520000
step 4010,train cost=0.000183,acc=0.540000
step 4020,train cost=0.000143,acc=0.630000
step 4030,train cost=0.000166,acc=0.600000
step 4040,train cost=0.000184,acc=0.530000
step 4050,train cost=0.000146,acc=0.620000
step 4060,train cost=0.000170,acc=0.590000
step 4070,train cost=0.000188,acc=0.570000
step 4080,train cost=0.000156,acc=0.640000
step 4090,t

In [27]:
def get_next_index(vector):
    threshold = np.random.rand(1)
    #print("threshold is:",threshold)
    i = 0
    proba_sum = 0.0
    while(i<len(vector)):
        proba_sum += vector[i]
        if(proba_sum>threshold):
            #print("Find i:",i)
            break
        i += 1
    return i

In [31]:
def Fake_Old_Baggins(len_to_write):
    #text = ["gandalf","was","reading","a","riddle","written","in"]
    #kickoff_phrase = ["gandalf","was","reading","a","riddle","written","in"]
    text = ["gollum","was","catching","fish","by","the","lake"]
    kickoff_phrase = ["gollum","was","catching","fish","by","the","lake"]
    i = 0
    while(i < len_to_write):
        kickoff_vector = []
        for j in range(timestep_size):
            kickoff_vector.append(w2v.wv.get_vector(kickoff_phrase[j]))
        kickoff_batch = []
        kickoff_batch.append(kickoff_vector)
        kickoff_batch = np.array(kickoff_batch)
        
        predict_vector = sess.run([y_pre],feed_dict={X: kickoff_batch, keep_prob:1.0,batch_size: 1})
        predict_vector = np.array(predict_vector)
        predict_word = label_encoder.inverse_transform(get_next_index(predict_vector[0][0]))
        
        kickoff_phrase.pop(0)
        kickoff_phrase.append(predict_word)
        text.append(predict_word)
        
        #print(kickoff_phrase)
        
        i += 1
    return ' '.join(text)

In [29]:
rslt = Fake_Old_Baggins(100)
print(rslt)

gandalf was reading a riddle written in most ; and they dwelt taller became scarlet , and they could do . at time they had got to wish it . they were very of fact , sheltered , but thorin and well-ordered ( the speech ) , who had always nothing new among the different in elrond ; but when frodo had not the marriage of him ; and he was getting restless of light . he looked at the door on his eyes . no apparent , for a present , not he meant . ' 'it has come far to give such about this ring


In [30]:
rslt2 = Fake_Old_Baggins(300)
print(rslt2)

gandalf was reading a riddle written in the black of the shire . 'so to the shire-holidays of the stories of the thain . bilbo was elected after a table in the black . will not no are the , or of the long . if it was his ring , i say , and he made ever for their own ; and both it , and go , his nuisance . ' 'then 'that '' . ' well i do to know ' this letters , and i could not an . it seemed a great selection creature . ' 'and i 've n't oiolosslo my own , anyway you , i saw no new . i have said good-bye . but i have n't say to him his mind the wretched to become . he becomes to go alone , and the other of those mountains must be roots over from the bag edge at the outlandish . the bounders of the shire , and no point , and that was not much or doubt . there was no pleasant of a legend of old m pass the party , though i still you not say what that this ai thought no elm of the ring knew in its fourth , with the air book . before the last battle on the wild power that to it himself . he s

In [34]:
rslt3 = Fake_Old_Baggins(200)

In [33]:
print(rslt3)

gollum was catching fish by the lake pipe-weed son of the shire centuries kind first days of the shire , it was before the true for that he had nothing , of course , and very were their heads . the others qualities , and he s to be sure of him , ' said gandalf . 'and he kept that his will to be very with bilbo and he retired the good in him , and the old of building odd ; but there is in a great of the power , stern and his engrossing branch were peregrin and disliked them . and they were maker through the study , and the stride and knives -paper and the reminiscences , and some reed-beds and were the handling ( started as had said ) . the ring would not make only it . i do not i shall that has come or see by you . i deserve , perhaps , like hive - not expression . he got got the knowledge , till he could an , and the thought held to say . and elf-towers flash , as an present , and this 's mr. 's party . i love to get . i 'm not trying it came in a new . but the finally folk the true a

In [35]:
print(rslt3)

gollum was catching fish by the lake ; rose long body , before the fireworks of the wise , and in the language of those lands the elves of these , and they were accurate in whispers . they contained in the mountains , and often all it was , . being had been done ; and i like he often old , and it was staying in it . he seemed very point hoots with bilbo 's inhabitants . from there he gave to the sun , menacing with his hand to turned . 'i they ? ' go , ' said the wizard . 'you have never been the true , and he takes it , and he got up and drained on his own . in apparent end locked the long was long divided : the shadow of the crowd and blocked . the mathom-house were broader of the they . of the westlands of eriador , between the misty mountains and the mountains mountains , and far soon were as skilful , sheltered , their own and long curious . they were hospitable marked in treasure , and the crumbs strongholds cracker the game in disappearing . the old of building farmhouses ,


In [None]:
sess.close()

In [11]:
timestep_group = 8

random_range = int(len(words)/timestep_group) - 1

In [12]:
def get_next_batch(batch_size):
    X_batch = []
    Y_batch = []
    index = np.random.randint(0,random_range,batch_size)
    #print("index: ",index)
    for i in range(batch_size):
        temp = []
        for j in range(timestep_group-1):
            #print(j,"Get word:",words[index[i]+j])
            temp.append(w2v.wv.get_vector(words[index[i]+j]))
        X_batch.append(temp)
        #print("Label is:", words[index[i]+timestep_group-1])
        Y_batch.append(w2v.wv.get_vector(words[index[i]+timestep_group-1]))
    return np.array(X_batch),np.array(Y_batch) 

In [13]:
alpha = 0.0005
input_size = emb_dim
timestep_size = timestep_group - 1
hidden_size = 512
layer_num = 2 
cell_type = "lstm"

X = tf.placeholder(tf.float32, [None, 7, emb_dim])
y_input = tf.placeholder(tf.float32, [None, emb_dim])
batch_size = tf.placeholder(tf.int32, [])
keep_prob = tf.placeholder(tf.float32, [])

In [14]:
def lstm_cell(cell_type,num_nodes,keep_prob):
    # assert(cell_type in ["lstm","block_lstm"],"Wrong cell type")
    if cell_type == "lstm":
        cell = rnn.BasicLSTMCell(num_nodes)
    else:
        cell = rnn.LSTMBlockCell(num_nodes)
    cell = rnn.DropoutWrapper(cell,output_keep_prob=keep_prob)
    return cell

mlstm_cell = rnn.MultiRNNCell([lstm_cell(cell_type,hidden_size,keep_prob) for _ in range(layer_num)],state_is_tuple=True)
init_state = mlstm_cell.zero_state(batch_size, dtype=tf.float32)

In [15]:
outputs = list()
state = init_state

In [16]:
with tf.variable_scope('RNN'):
    for timestep in range(timestep_size):
        (cell_output,state) = mlstm_cell(X[:,timestep,:],state)
        outputs.append(cell_output)
h_state = outputs[-1]

In [17]:
W = tf.Variable(tf.truncated_normal([hidden_size,emb_dim],stddev=0.1),dtype=tf.float32)
bias = tf.Variable(tf.constant(0.1,shape=[emb_dim]),dtype=tf.float32)
y_pre = tf.matmul(h_state,W) + bias

In [18]:
loss = tf.losses.mean_squared_error(y_input,y_pre)
train_op = tf.train.AdamOptimizer(alpha).minimize(loss)

In [19]:
sess = tf.Session()

In [20]:
sess.run(tf.global_variables_initializer())

In [None]:
#time0 = time.time()
for i in range(5000):
    _batch_size=100
    X_batch, y_batch = get_next_batch(_batch_size)
    cost, _ = sess.run([loss,train_op],feed_dict={X: X_batch, y_input:y_batch,keep_prob:0.5, batch_size: _batch_size})
    if (i+1)%10 == 0:
        print("step {},train cost={:.6f}".format(i+1,cost))

In [82]:
#kickoff_phrase = ["Gandalf","is","reading","a","riddle","on","a"]
kickoff_phrase = ["is","reading","a","riddle","on","a","track"]
kickoff_vector = []
for i in range(timestep_size):
    kickoff_vector.append(w2v.wv.get_vector(kickoff_phrase[i]))
kickoff_batch = []
kickoff_batch.append(kickoff_vector)
kickoff_batch = np.array(kickoff_batch)

In [83]:
kickoff_batch.shape

(1, 7, 400)

In [84]:
predict_vector = sess.run([y_pre],feed_dict={X: kickoff_batch, keep_prob:1.0,batch_size: 1})

In [None]:
sess.close()