In [1]:
from collections import Counter

import numpy as np
import tensorflow as tf
from tensorflow.contrib import rnn
from sklearn.model_selection import train_test_split
from utils import SentenceReader
from gensim.models import Word2Vec

In [32]:
def build_dictionary(words):
    counter = Counter(words)
    dictionary = dict()
    for word, _ in counter.most_common(10000):
        dictionary[word] = len(dictionary)
    reverse_dict = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, reverse_dict


def read_data(dir_name):
    sentences = list(SentenceReader('data/shakespear/'))
    content = []
    for s in sentences:
        content.extend(s)
    content = np.reshape(content, [-1, ])
    model = Word2Vec(sentences, size=w2v_size, min_count=1)
    print(content, content.shape)
    return content, model


def get_trainable_data(word_series, dictionary, sequence_size, w2v_model):
    m = len(word_series)-sequence_size+1
    data = np.zeros((m, sequence_size, w2v_size))
    for i in range(m):
        for j in range(sequence_size):
            word = word_series[i+j]
            data[i, j] = w2v_model[word]
    return data


def get_data(sequence_size):
    raw, w2v_model = read_data('belling_the_cat.txt')
    dictionary, reverse_dictionary = build_dictionary(raw)
    data = get_trainable_data(raw, dictionary, sequence_size, w2v_model)
    return dictionary, reverse_dictionary, data, w2v_model


def word_to_vector(word, dictionary):
    vector = np.zeros(len(dictionary))
    idx = int(dictionary[word])
    vector[idx] = 1
    return vector


def vector_to_word(vector, reverse_dictionary):
    index = vector.argmax()
    return reverse_dictionary[index]


def get_training_and_test_data_from_sequences(word_sequence_matrix, dictionary_size, test_size=0.1):
    """Given a sequence of word indexes"""
    x = word_sequence_matrix[:, :-1, :]
    y = word_sequence_matrix[:, -1, :]
    # x = x.reshape([-1, x.shape[1], 1])
    return train_test_split(x, y, test_size=test_size, random_state=1211)


def represent_x_and_y(x, y):
    y_args = y.argsort(1)[:, -3:][:,::-1]
    for i in range(x.shape[0]):
        inputs = [rev_dict[j.argmax()] for j in x[i]]
        output = [rev_dict[j] for j in y_args[i]]
        outputs_with_score = y[i][y_args[i]]
        print(' '.join(inputs), output, outputs_with_score)

## Define parameters.

In [58]:
n_input = 3
max_iter = 1000
w2v_size = 100

## Let's prepare our data set for training and validation.

In [59]:
dictionary, rev_dict, word_sequences, w2v_model = get_data(n_input+1)
vocab_size = len(dictionary)
x_train, x_test, y_train, y_test = get_training_and_test_data_from_sequences(word_sequences, vocab_size)

['suffolk' ':' 'he' ..., 'up' '.' "'"] (23229,)


## Tensorflow setup.

In [60]:
n_hidden = 50
learning_rate = 0.001

In [61]:
tf.reset_default_graph()

x = tf.placeholder('float', (None, n_input, w2v_size), name='x')
y = tf.placeholder('float', (None, w2v_size), name='y')

weights = {
    'out': tf.Variable(tf.random_normal([n_hidden, w2v_size]), name='weights_out')
}
biases = {
    'out': tf.Variable(tf.random_normal([w2v_size]), name='biases_out')
}

def RNN(x, w, b):
    x = tf.unstack(x, n_input, 1)
    rnn_cell = stacked_lstm = tf.contrib.rnn.MultiRNNCell([rnn.BasicLSTMCell(n_hidden) for _ in range(2)])
    output, states = rnn.static_rnn(rnn_cell, x, dtype=tf.float32)
    return tf.matmul(output[-1], w['out']) + b['out']

model = RNN(x, weights, biases)
prediction = tf.nn.softmax(model)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=model, labels=y))
optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate).minimize(cost)

init = tf.global_variables_initializer()

def train(session, batch_size, epochs):
    m = x_train.shape[0]
    for e in range(epochs):
        for i in range(0, m, batch_size):
            x_batch = x_train[i:i+batch_size, :]
            y_batch = y_train[i:i+batch_size, :]
            session.run(optimizer, feed_dict={x: x_batch, y: y_batch})
        c = session.run(cost, feed_dict={x: x_test, y:y_test})
        print('{}. Cost - {}'.format(e, c))

def predict(session, x_test):
    return session.run(prediction, feed_dict={x: x_test})

In [62]:
session.close()
session = tf.Session()
session.run(init)
train(session, 500, 50)
predictions = predict(session, x_test)

0. Cost - -32.637779235839844
1. Cost - -144.29051208496094
2. Cost - -214.3891143798828
3. Cost - -258.60205078125


KeyboardInterrupt: 

In [14]:
# [rev_dict[i] for i in predictions.argmax(1)]
# predictions.argsort(1)[:, :3]
represent_x_and_y(x_test, predictions)

: my best ['of', 'must', 'be'] [ 0.11740546  0.05834505  0.05280576]
, And in ['the', 'she', 'a'] [ 0.15624152  0.11795139  0.08184648]
Olland to her ['her', 'to', '.'] [  9.85010684e-01   8.95102974e-03   5.88946510e-04]
And such a ['whole', 'Murder', 'desire'] [ 0.05321015  0.04727762  0.04624144]
asleep . MERCUTIO ['I', ',', ':'] [ 0.97975791  0.00319734  0.00300873]
never kill ' ['d', 'Twill', 'learning'] [ 0.58107132  0.06078913  0.052454  ]
sir : I ['must', 'be', 'was'] [ 0.57835841  0.13475895  0.03974489]
heart , Yet ["'", 'you', 'I'] [ 0.23718159  0.1846723   0.15584044]
it is , ['And', "'", 'that'] [ 0.0896019   0.07031361  0.06757016]
, villain , ['I', 'the', 'To'] [ 0.4140583   0.19063208  0.06067153]
appear now ! ['Is', 'goodly', 'if'] [ 0.19148324  0.09862468  0.09764537]
itself . You ['all', 'You', ','] [ 0.35888061  0.19117349  0.0279025 ]
, but this ['I', "'", 'shall'] [ 0.21739744  0.07227439  0.05873606]
eyes , you ['a', 'are', 'imperial'] [ 0.21844831  0.08025374  0

In [None]:
def sentence_generator(session, length, start_words):
    def matrix_to_words(mat):
        words = []
        for i in range(mat.shape[0]):
            word_idx = mat[i,:].argmax()
            word = rev_dict[word_idx]
            words.append(word)
        return ' '.join(words)
    pre_length = len(start_words)
    length_to_generate = length - pre_length
    words = np.zeros([length, vocab_size])

    for i in range(pre_length):
        words[i][start_words[i]] = 1

    for i in range(pre_length, length):
        x = np.reshape(words[i-n_input:i,:], [1, n_input, vocab_size])
        p = predict(session, x)
        # represent_x_and_y(x, p)
        pred = predict(session, x)[0]
        words[i] = pred
    words_text = matrix_to_words(words)
    print(words_text)

In [None]:
sentence_generator(session, 100, [dictionary[i] for i in 'Z M N'.split(' ')])

In [15]:
raw, w2v_model = read_data('belling_the_cat.txt')

['suffolk' ':' '\n' ..., '\n' 'brutus' '\n'] (2347,)


In [54]:
x_train

array([[[ -1.39600992e-01,   1.70090467e-01,  -1.09117841e-02, ...,
          -5.81735492e-01,  -2.85451323e-01,  -3.19034994e-01],
        [ -1.52070625e-02,   9.17999167e-03,   6.82297442e-03, ...,
          -3.30649950e-02,  -1.43043520e-02,  -3.84610333e-03],
        [ -1.85123682e-01,   1.92965448e-01,  -2.02680733e-02, ...,
          -6.80584490e-01,  -3.46160322e-01,  -3.65011156e-01]],

       [[ -2.42117330e-01,   2.73815811e-01,  -1.49040744e-02, ...,
          -9.51315761e-01,  -4.66206580e-01,  -5.12049735e-01],
        [  2.90360535e-03,   1.20017724e-02,  -8.38095229e-03, ...,
          -2.84511615e-02,  -7.92866852e-03,  -5.90040069e-03],
        [ -2.19042078e-01,   2.48720586e-01,  -3.13709117e-03, ...,
          -8.74254346e-01,  -4.25594896e-01,  -4.71629828e-01]],

       [[ -7.40444139e-02,   8.66002142e-02,  -1.34397135e-03, ...,
          -3.17642629e-01,  -1.59537703e-01,  -1.60744488e-01],
        [ -1.48574010e-01,   1.86885625e-01,  -7.39162555e-03, ...,
    

In [107]:
index = w2v_model.wv.vocab['word'].index
word = w2v_model.wv.index2word[index]

In [108]:
word

'word'