In [30]:
SEQUENCE_LENGTH = 6

In [31]:
from gensim.models import KeyedVectors
w2v = KeyedVectors.load_word2vec_format('glove.6B.100d.bin.word2vec', binary=True)

In [32]:
import util
words = util.load_vocab("fixed_vocab.pkl")
vocab_size = len(words)
print("Vocab size:", vocab_size)
print("W2V vocab size:", len(w2v.vocab))
word2idx = { word:i for i,word in enumerate(words) }
idx2word = { i:word for i,word in enumerate(words) }

Vocab size: 71514
W2V vocab size: 400000


In [4]:
from random import shuffle
songs = []
with open("data/sentences.txt", "r") as f:
    for line in f:
        songs.append(line.rstrip().split(" "))
shuffle(songs)

In [5]:
song_count = len(songs)
train_size = int(song_count * 0.7)
test_dev_size = int((song_count - train_size) * 0.5)
print("Total number of songs:", song_count)
print("Number of songs for training:", train_size)
print("Number of songs for dev/test:", test_dev_size)

Total number of songs: 57650
Number of songs for training: 40355
Number of songs for dev/test: 8647


In [6]:
songs_train = songs[:train_size]
songs_dev = songs[train_size:train_size+test_dev_size]
songs_test = songs[train_size+test_dev_size:]
print("Train:", len(songs_train))
print("Dev:", len(songs_dev))
print("Test:", len(songs_test))

Train: 40355
Dev: 8647
Test: 8648


In [65]:
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding, Dropout, GRU, Bidirectional
from keras.layers import LeakyReLU
import util

from keras.callbacks import Callback
from keras.callbacks import ModelCheckpoint
from keras.optimizers import RMSprop, Adam

checkpoint = ModelCheckpoint("weights_word_{epoch:01d}.h5",
    monitor='loss',
    verbose=1,
    mode='auto',
    period=1,
    save_weights_only=True)

class LossHistory(Callback):
    def on_train_begin(self, logs={}):
        self.losses = []

    def on_batch_end(self, batch, logs={}):
        self.losses.append(logs.get('loss'))

SEQUENCE_LENGTH = 6

def build_model1(vocab_size, sequence_length, embedding_size):
    model = Sequential()
    model.add(LSTM(128, input_shape=(sequence_length, embedding_size), return_sequences=True))
    model.add(Dropout(0.4))
    model.add(LSTM(1024))
    model.add(Dropout(0.4))
    model.add(Dense(2048))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss = 'sparse_categorical_crossentropy', optimizer="rmsprop", metrics = ['accuracy'])
    return model

embedding_size = w2v.vector_size + util.EMBEDDING_EXT
model1 = build_model1(vocab_size, SEQUENCE_LENGTH, embedding_size)
model1.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_14 (LSTM)               (None, 6, 128)            118784    
_________________________________________________________________
dropout_15 (Dropout)         (None, 6, 128)            0         
_________________________________________________________________
lstm_15 (LSTM)               (None, 1024)              4722688   
_________________________________________________________________
dropout_16 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_16 (Dense)             (None, 2048)              2099200   
_________________________________________________________________
leaky_re_lu_6 (LeakyReLU)    (None, 2048)              0         
_________________________________________________________________
dense_17 (Dense)             (None, 71514)            

In [60]:
def tokenize_song(song, buffer_length):
    tokens = song

    x_train, y_train = [], []
    for i in range(0, len(song)):
        if i+buffer_length+1 >= len(tokens):
            #pad_length = (i+buffer_length+1) - len(tokens)
            #tokens += ['<pad>'] * pad_length
            continue
            
        xs = tokens[i:i+buffer_length]
        y = tokens[i+buffer_length]
        discard = False
        for x in xs:
            if x not in words:
                #print("Nope on", x)
                discard = True
                break
        if discard or y not in words:
            continue
            
        x_train.append(xs)
        y_train.append(y)

    return x_train,y_train

In [61]:
import math
from random import shuffle
import numpy as np

def generate_batches(songs, sequence_length, batch_size):
    x_train, y_train = [], []
    for song in songs:
        xs, ys = tokenize_song(song, sequence_length)
        x_train.extend(xs)
        y_train.extend(ys)
        if len(x_train) >= batch_size:
            yield x_train[0:batch_size], y_train[0:batch_size]

            x_train = x_train[batch_size:]
            y_train = y_train[batch_size:]
    if len(x_train) > 0:
        yield x_train, y_train
        
def generate_samples(songs, sequence_length, batch_size):
    while True:
        for xs_batch, ys_batch in generate_batches(songs, sequence_length, batch_size):
            #c = list(zip(xs_batch, ys_batch))
            #shuffle(c)
            #xs_batch, ys_batch = zip(*c)

            batch_size = len(xs_batch)
            x_train = np.zeros((batch_size, sequence_length, embedding_size))
            y_train = np.zeros((batch_size, ))

            for i in range(batch_size):
                x_train[i] = util.encode_word_sequence(xs_batch[i], w2v)
                y_train[i] = word2idx[ys_batch[i]]
                
            yield x_train, y_train

In [62]:
def generate_ngrams(batch_size, sequence_length):
    with open("prep.txt", "r") as f:
        x_train = np.zeros((batch_size, sequence_length, embedding_size))
        y_train = np.zeros((batch_size, ))
        i = 0
        for line in f:
            nums = line.split(" ")
            xs = list(map(lambda x: idx2word[x], map(int, nums[:6])))
            xs = util.encode_word_sequence(xs, w2v)
            y = int(nums[-1])
            x_train[i] = xs
            y_train[i] = y
            if i % batch_size == 0:
                yield x_train, y_train
                i = 0
            else:
                i += 1

In [63]:
sampler = generate_samples(songs_train, SEQUENCE_LENGTH, 8)
x_batch, y_batch = next(sampler)
print(x_batch.shape, y_batch.shape)

(8, 6, 103) (8,)


In [66]:
model1.fit_generator(generate_ngrams(64, 6), samples_per_epoch=100, epochs=4, callbacks=[checkpoint])

  """Entry point for launching an IPython kernel.


Epoch 1/4


ResourceExhaustedError: OOM when allocating tensor with shape[2048,71514] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
	 [[{{node training_5/RMSprop/mul_26}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


In [54]:
x_dev, y_dev = [], []
for song in songs_dev[:5000]:
    xs, ys = tokenize_song(song, SEQUENCE_LENGTH)
    x_dev.extend(xs)
    y_dev.extend(ys)

In [55]:
devset = list(zip(x_dev, y_dev))
shuffle(devset)
avg_perplexity = 0
buffer_length = 1000
idx = 0

for i in range(4):

    perplexity = 0.0
    for x, y in devset[idx:idx+buffer_length]:
        enc_seq = util.encode_word_sequence(x, w2v).reshape(1, SEQUENCE_LENGTH, embedding_size)
        preds = model1.predict(enc_seq)[0]

        likelihood = preds[word2idx[y]]
        perplexity += np.log2(likelihood)
        
    perplexity = np.power(2, perplexity * -1/buffer_length)
    print("Perpl:", perplexity)
    avg_perplexity += perplexity
    idx += buffer_length

print("Avg perpl:", avg_perplexity / 4)

KeyError: 'exposed'

In [58]:
words = ["look", "at", "her", "face", ",", "it"]
#words = ["when", "there", "'s", "a", "dark", "storm"]
#words = ["do", "better", ",", "who", "better", "?"]
words_seq = util.encode_word_sequence(words, w2v).reshape(1, SEQUENCE_LENGTH, embedding_size)
#print(' '.join([decode_vec(words[0][i], w2v) for i in range(SEQUENCE_LENGTH)]))

result = words
for j in range(60):
    word = idx2word[util.sample(model1.predict(words_seq))]
    #word = one_hot_decode(model.predict(words_seq), idx2word)
    result.append(word)
    
    new_words = np.zeros((1, SEQUENCE_LENGTH, embedding_size))
    for i in range(SEQUENCE_LENGTH-1):
        new_words[0, i] = words_seq[0, i+1]
    new_words[0, SEQUENCE_LENGTH-1] = util.encode_word(word, w2v)
    words_seq = new_words

#print(' '.join([decode_vec(words[0][i], w2v) for i in range(SEQUENCE_LENGTH)]))

print(util.textify(result))

  log_preds_scaled = np.log(arr) / temperature


look at her face, it 68th swordplay 
babe garry muscatel lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense lense
