## [You can find all the data here](http://shakespeare.mit.edu/)

In [3]:
import numpy as np
from gensim.models import Word2Vec
import tensorflow as tf
from tensorflow.contrib import rnn
from utils import SentenceReader
from sklearn.model_selection import train_test_split

In [4]:
class DataReader:
    def __init__(self, dir_name, input_size, w2v_vector_size):
        self.dir_name = dir_name
        self.w2v_vector_size = w2v_vector_size
        self.input_size = input_size
        self.w2v_model = None
        self.x_train = None
        self.y_train = None
        self.x_test = None
        self.y_train = None

    def get_word_index(self, word):
        return self.w2v_model.wv.vocab[word].index

    def get_word_from_one_hot_matrix(self, one_hot_matrix):
        return [
            self.w2v_model.wv.index2word[int(one_hot_vector.argmax())]
            for one_hot_vector in one_hot_matrix
        ]
        
    def get_word_index_to_vec(self, word_index):
        word = self.w2v_model.wv.index2word[word_index]
        return self.w2v_model[word]

    def word_to_vec(self, word):
        return self.w2v_model[word]

    def get_x_y_from_word_sequence(self, sequence):
        n = self.input_size+1
        m = len(sequence) - n + 1
        seq_matrix = np.zeros([m, n], dtype=np.float32)

        for i in range(m):
            for j in range(n):
                try:
                    seq_matrix[i, j] = sequence[i+j]
                except Exception as e:
                    print(i, j, i+j)
                    raise e

        x = np.zeros([m, self.input_size, self.w2v_vector_size], dtype=np.float32)
        for i in range(m):
            for j in range(self.input_size):
                word_index = int(seq_matrix[i][j])
                x[i, j] = self.get_word_index_to_vec(word_index)

        y = np.zeros([m, self.w2v_vector_size], np.float32)
        for i in range(m):
            word_index = int(seq_matrix[:, -1][i])
            y[i, :] = self.get_word_index_to_vec(word_index)
        # corpus_count = len(r.w2v_model.wv.index2word)
        # y = np.zeros([m, corpus_count], np.int8)
        # y_mask = seq_matrix[:, -1].reshape(-1, ).astype(np.int8)
        # y[np.arange(m), y_mask] = 1
        return x, y

    def get_unit_matrix(matrix):
        mod = np.power((matrix**2).sum(1), 0.5).reshape(matrix.shape[0], 1)
        return matrix / mod

    def read_data(self, test_size=0.2):
        sentences = list(SentenceReader(self.dir_name))
        self.w2v_model = Word2Vec(sentences, size=self.w2v_vector_size, min_count=1)

        word_index_sequence = []
        for sentence in sentences:
            word_index_sequence.extend([self.get_word_index(word) for word in sentence])
        x, y = self.get_x_y_from_word_sequence(word_index_sequence)

        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            x, y, test_size=test_size, random_state=1211
        )

In [5]:
n_input = 3
n_vectors = 60

n_hidden = 512
learning_rate = 0.001

In [6]:
r = DataReader('data/shakespear/small/', n_input, n_vectors)
r.read_data()
vocab_size = len(r.w2v_model.wv.index2word)

reading file .ipynb_checkpoints
reading file romeo_and_juliet_1000.txt


In [16]:
tf.reset_default_graph()

x = tf.placeholder(tf.float32, (None, n_input, n_vectors), name='x')
y = tf.placeholder(tf.float32, (None, n_vectors), name='y')

weights = {
    'out': tf.Variable(tf.random_normal([n_hidden, n_vectors]), name='weights_out')
}
biases = {
    'out': tf.Variable(tf.random_normal([n_vectors]), name='biases_out')
}

def RNN(x, w, b):
    x = tf.unstack(x, n_input, 1)
    # rnn_cell = tf.contrib.rnn.MultiRNNCell([rnn.GRUCell(n_hidden) for _ in range(3)])
    rnn_cell = rnn.BasicLSTMCell(n_hidden)
    output, states = rnn.static_rnn(rnn_cell, x, dtype=tf.float32)
    return tf.matmul(output[-1], w['out']) + b['out']

# def cosine_loss(pred, labels):
#     pred_mod = tf.pow(tf.pow(pred, 2).sum(1), 0.5).reshape(matrix.shape[0], 1)
#     return matrix / mod

model = RNN(x, weights, biases)
cost = tf.reduce_mean(tf.squared_difference(model, y))
optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate).minimize(cost)

init = tf.global_variables_initializer()

def train(session, batch_size, epochs):
    m = r.x_train.shape[0]
    for e in range(epochs):
        for i in range(0, m, batch_size):
            x_batch = r.x_train[i:i+batch_size, :]
            y_batch = r.y_train[i:i+batch_size, :]
            session.run(optimizer, feed_dict={x: x_batch, y: y_batch})
        c = session.run(cost, feed_dict={x: r.x_test[:2000], y: r.y_test[:2000]})
        print('{}. Cost - {}'.format(e, c))

def predict(session, x_matrix):
    return session.run(model, feed_dict={x: x_matrix})

In [17]:
session = tf.Session()
session.run(init)
train(session, 2048, 20)
predictions = predict(session, r.x_test)

0. Cost - 0.5544190406799316
1. Cost - 0.35279542207717896
2. Cost - 0.33756324648857117
3. Cost - 0.3308546543121338
4. Cost - 0.32579174637794495
5. Cost - 0.3224600553512573
6. Cost - 0.3214806914329529
7. Cost - 0.3173556923866272
8. Cost - 0.3148678243160248
9. Cost - 0.31495580077171326
10. Cost - 0.31402137875556946
11. Cost - 0.3123234808444977
12. Cost - 0.31293049454689026
13. Cost - 0.3115520477294922
14. Cost - 0.3104085326194763
15. Cost - 0.3104018270969391
16. Cost - 0.30949828028678894
17. Cost - 0.3093410134315491
18. Cost - 0.30849018692970276
19. Cost - 0.3082958161830902


In [None]:
predictions = predict(session, r.x_test)

In [9]:
def represent_x_and_y(x, y):
    for i in range(x.shape[0]):
        input_sequence = []
        for j in range(x.shape[1]):
            word, _ = r.w2v_model.wv.similar_by_vector(x[i][j], topn=1)[0]
            input_sequence.append(word)
        output_word, _ = r.w2v_model.similar_by_vector(y[i], topn=1)[0]
        print('{} => {}'.format(' '.join(input_sequence), output_word))

def generate_sentence(session, length, start_words):
    n_start_words = len(start_words)
    sequence_words = []
    sequence_matrix = np.zeros([length, r.w2v_vector_size], dtype=np.float32)
    for i, word in enumerate(start_words):
        sequence_matrix[i, :] = r.word_to_vec(word)
        sequence_words.append(word)

    for i in range(length-n_start_words):
        input_sequence = sequence_matrix[i: i+r.input_size, :].reshape([1, r.input_size,-1])
        # print(input_sequence.shape, input_sequence.sum())
        output_vector = predict(session, input_sequence)
        sequence_matrix[i+n_start_words, :] = output_vector

        word, _ = r.w2v_model.wv.similar_by_vector(output_vector.reshape([-1, ]), topn=1)[0]
        sequence_words.append(word)
    return ' '.join(sequence_words)

In [13]:
generate_sentence(session, 100, '. Why is this place so'.split())

'. Why is this place so whether quite quite quite quite quite best best best best just just just just just just just just just going going going going going going going going going shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn shouldn'

In [7]:
r.y_train.shape

(322084, 100)

In [2]:
tf.random_uniform?

In [7]:
rnn.BasicLSTMCell?

In [8]:
rnn_cell = rnn.BasicLSTMCell(n_hidden)