In [1]:
from gensim.models import KeyedVectors
w2v = KeyedVectors.load_word2vec_format('glove.6B.100d.txt.word2vec', binary=False)

In [2]:
import numpy as np
SEQUENCE_LENGTH = 6
HIDDEN_SIZE = 256

EMBEDDING_SIZE_ORIG = 100
EMBEDDING_SIZE = 103

def encode_word(word, w2v):
    if word == "<pad>":
        v = np.zeros((EMBEDDING_SIZE,))
        v[EMBEDDING_SIZE-1] = 1
        return v
    elif word == "<newline>":
        v = np.zeros((EMBEDDING_SIZE,))
        v[EMBEDDING_SIZE-2] = 1
        return v
    elif word == "<unk>" or word not in w2v:
        v = np.zeros((EMBEDDING_SIZE,))
        v[EMBEDDING_SIZE-3] = 1
        return v
    else:        
        v = w2v[word]
        w = np.zeros((3,))
        return np.append(v, w, axis=0)

def encode_words(words, w2v):
    vec = np.zeros((len(words), EMBEDDING_SIZE))
    for (i,word) in enumerate(words):
        vec[i] = encode_word(word, w2v)
    return vec

In [3]:
v = encode_word("hello", w2v)
print(v.shape)
w = encode_word("boom-a-boomerang", w2v)
print(w.shape)

(103,)
(103,)


In [4]:
v_newline = encode_word("<newline>", w2v)
v_pad = encode_word("<pad>", w2v)
v_unk = encode_word("<unk>", w2v)

print(np.argmax(v_newline))
print(np.argmax(v_pad))
print(np.argmax(v_unk))

101
102
100


In [5]:
def decode_vec(vec, w2v):
    base_vec = vec[:EMBEDDING_SIZE_ORIG]
    ext_vec = vec[EMBEDDING_SIZE_ORIG:]
    if ext_vec[0]:
        return "<unk>"
    elif ext_vec[1]:
        return "<newline>"
    elif ext_vec[2]:
        return "<pad>"
    else:
        return w2v.similar_by_vector(base_vec)[0][0]

In [6]:
print(decode_vec(v, w2v))
print(decode_vec(v_newline, w2v))
print(decode_vec(v_pad, w2v))
print(decode_vec(v_unk, w2v))

hello
<newline>
<pad>
<unk>


In [7]:
def prepare_song(song, buffer_length):
    tokens = song

    x_train = []
    y_train = []
    for i in range(0, len(song)):
        if i+buffer_length+1 >= len(tokens):
            pad_length = (i+buffer_length+1) - len(tokens)
            tokens += ['<pad>'] * pad_length

        x_train.append(tokens[i:i+buffer_length])
        y_train.append(tokens[i+buffer_length])

    return x_train,y_train

In [61]:
token_vocab = {'<pad>', '<unk>'}
songs = []
with open("data/sentences.txt", "r") as f:
    for line in f.readlines():
        tokens = [token for token in line.rstrip().split(" ")]
        songs.append(tokens)
        token_vocab = token_vocab.union(set(tokens))

In [62]:
x_vec = []
y_vec = []
for song in songs[:30]:
    x_vec_i, y_vec_i = prepare_song(song, SEQUENCE_LENGTH)
    x_vec.extend(x_vec_i)
    y_vec.extend(y_vec_i)
print(len(x_vec))
print(x_vec[0])

8836
['look', 'at', 'her', 'face', ',', 'it']


In [63]:
words = list(token_vocab)
vocab_size = len(words)
print("Vocab size:", vocab_size)
print("W2V vocab size:", len(w2v.vocab))
word2idx = { word:i for i,word in enumerate(words) }
idx2word = { i:word for i,word in enumerate(words) }

Vocab size: 5111
W2V vocab size: 400000


In [64]:
def one_hot_encode(word, word2idx):
    v = np.zeros((len(word2idx, )))
    v[word2idx[word]] = 1
    return v

def one_hot_decode(word, idx2word):
    return idx2word[np.argmax(word)]

In [65]:
print(word2idx["hello"])
print(np.argmax(one_hot_encode("hello", word2idx)))

5028
5028


In [66]:
from random import shuffle
def generate_batches(data_length, mini_batch_size):
    for begin in range(0, data_length, mini_batch_size):
        end = min(begin + mini_batch_size, data_length)
        yield begin, end

def load_batch(xs, ys, begin, end):
    batch_size = end-begin
    
    x_train = np.zeros((batch_size, SEQUENCE_LENGTH, EMBEDDING_SIZE))
    y_train = np.zeros((batch_size, vocab_size))
    
    xs_batch = xs[begin:end]
    ys_batch = ys[begin:end]
    
    c = list(zip(xs_batch, ys_batch))
    shuffle(c)
    xs_batch, ys_batch = zip(*c)
    
    for i in range(batch_size):
        x_train[i] = encode_words(xs_batch[i], w2v)
        y_train[i] = one_hot_encode(ys_batch[i], word2idx)
    
    return x_train, y_train

In [67]:
batches = generate_batches(len(x_vec), 512)
begin, end = next(batches)

x_train, y_train = load_batch(x_vec, y_vec, begin, end)
print(x_train.shape, y_train.shape)

print("X train")
for j in range(10):
    ws = ' '.join([decode_vec(x_train[j][i], w2v) for i in range(SEQUENCE_LENGTH)])
    print(ws)
    
print("\nY train")
for j in range(10):
    print(one_hot_decode(y_train[j], idx2word))

(512, 6, 103) (512, 5111)
X train
i am your music and i
<newline> please do n't talk ,
? <newline> she 's just my
could ever believe that she could
who could ever believe that she
walking for hours and talking <newline>
to me <newline> look at the
down <newline> there 's a shimmer
about all the things that we
and make me strong <newline> (

Y train
am
go
kind
be
could
about
way
in
plan
play


In [77]:
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.layers import LeakyReLU
def build_model(vocab_size):
    model = Sequential()
    model.add(LSTM(128, input_shape=(SEQUENCE_LENGTH, EMBEDDING_SIZE), return_sequences=True))
    model.add(Dropout(0.4))
    model.add(LSTM(1024))
    model.add(Dropout(0.4))
    model.add(Dense(2048))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer="rmsprop", metrics = ['accuracy'])
    return model
model = build_model(vocab_size)

In [98]:
batches = list(generate_batches(len(x_vec), 2048))

In [99]:
shuffle(batches)
for i in range(4):
    begin, end = batches[i]
    x_train, y_train = load_batch(x_vec, y_vec, begin, end)
    model.fit(x_train, y_train, batch_size=256, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [125]:
def sample(preds, temperature=1.0):
    preds = preds.reshape(preds.shape[1])
    arr = np.asarray(preds).astype('float64')
    log_preds_scaled = np.log(arr) / temperature
    preds_scaled = np.exp(log_preds_scaled)
    softmaxed = preds_scaled / np.sum(preds_scaled)
    probas = np.random.multinomial(1, softmaxed, 1)
    return np.argmax(probas)
    

In [134]:
#words = ["never", "gonna", "give", "you", "up", ","]
words = ["look", "at", "her", "face", ",", "it"]
#words = ["when", "there", "'s", "a", "dark", "storm"]
#words = ["do", "better", ",", "who", "better", "?"]
words_seq = encode_words(words, w2v)
words_seq = words_seq.reshape(1, SEQUENCE_LENGTH, EMBEDDING_SIZE)
#print(' '.join([decode_vec(words[0][i], w2v) for i in range(SEQUENCE_LENGTH)]))

result = words
for j in range(60):
    word = idx2word[sample(model.predict(words_seq), temperature=1.4)]
    #word = one_hot_decode(model.predict(words_seq), idx2word)
    result.append(word)
    
    new_words = np.zeros((1, SEQUENCE_LENGTH, EMBEDDING_SIZE))
    for i in range(SEQUENCE_LENGTH-1):
        new_words[0, i] = words_seq[0, i+1]
    new_words[0, SEQUENCE_LENGTH-1] = encode_word(word, w2v)
    words_seq = new_words

#print(' '.join([decode_vec(words[0][i], w2v) for i in range(SEQUENCE_LENGTH)]))

print(' '.join(result))
    #words = new_words
        #new_words[0] = words[0, 1]
        #new_words[0, 1] = words[0, 2]
        #new_words[0, 2] = words[0, 3]
        #new_words[0, 3] = encode_word(word, word2idx)
    

look at her face , it 's a party <newline> i remember sitting half <newline> little while half in the clay <newline> i 'm still trapped my , <newline> cold gim yet there another end without <newline> there <newline> fixin angel , going <newline> loving it do <newline> please knew empty feeling what again hairs in the dream <newline> when the twilight seems like through to
