## Data preprocessing

In [2]:
from gensim.models import KeyedVectors
w2v = KeyedVectors.load_word2vec_format('glove.6B.100d.bin.word2vec', binary=True)

In [3]:
import numpy as np
SEQUENCE_LENGTH = 6
HIDDEN_SIZE = 256

EMBEDDING_SIZE_ORIG = 100
EMBEDDING_SIZE = 103

def encode_word(word, w2v):
    if word == "<pad>":
        v = np.zeros((EMBEDDING_SIZE,))
        v[EMBEDDING_SIZE-1] = 1
        return v
    elif word == "<newline>":
        v = np.zeros((EMBEDDING_SIZE,))
        v[EMBEDDING_SIZE-2] = 1
        return v
    elif word == "<unk>" or word not in w2v:
        v = np.zeros((EMBEDDING_SIZE,))
        v[EMBEDDING_SIZE-3] = 1
        return v
    else:        
        v = w2v[word]
        w = np.zeros((3,))
        return np.append(v, w, axis=0)

def encode_words(words, w2v):
    vec = np.zeros((len(words), EMBEDDING_SIZE))
    for (i,word) in enumerate(words):
        vec[i] = encode_word(word, w2v)
    return vec

In [4]:
v = encode_word("hello", w2v)
print(v.shape)
w = encode_word("boom-a-boomerang", w2v)
print(w.shape)

(103,)
(103,)


In [5]:
v_newline = encode_word("<newline>", w2v)
v_pad = encode_word("<pad>", w2v)
v_unk = encode_word("<unk>", w2v)

print(np.argmax(v_newline))
print(np.argmax(v_pad))
print(np.argmax(v_unk))

101
102
100


In [6]:
def decode_vec(vec, w2v):
    base_vec = vec[:EMBEDDING_SIZE_ORIG]
    ext_vec = vec[EMBEDDING_SIZE_ORIG:]
    if ext_vec[0]:
        return "<unk>"
    elif ext_vec[1]:
        return "<newline>"
    elif ext_vec[2]:
        return "<pad>"
    else:
        return w2v.similar_by_vector(base_vec)[0][0]

In [7]:
print(decode_vec(v, w2v))
print(decode_vec(v_newline, w2v))
print(decode_vec(v_pad, w2v))
print(decode_vec(v_unk, w2v))

hello
<newline>
<pad>
<unk>


In [8]:
def tokenize_song(song, buffer_length):
    tokens = song

    x_train = []
    y_train = []
    for i in range(0, len(song)):
        if i+buffer_length+1 >= len(tokens):
            pad_length = (i+buffer_length+1) - len(tokens)
            tokens += ['<pad>'] * pad_length

        x_train.append(tokens[i:i+buffer_length])
        y_train.append(tokens[i+buffer_length])

    return x_train,y_train

In [9]:
token_vocab = {'<pad>', '<unk>'}
songs = []
with open("data/sentences.txt", "r") as f:
    for line in f.readlines():
        tokens = [token for token in line.rstrip().split(" ")]
        songs.append(tokens)
        token_vocab = token_vocab.union(set(tokens))

In [10]:
x_vec = []
y_vec = []
for song in songs[:30]:
    x_vec_i, y_vec_i = tokenize_song(song, SEQUENCE_LENGTH)
    x_vec.extend(x_vec_i)
    y_vec.extend(y_vec_i)
print(len(x_vec))
print(x_vec[0])

8836
['look', 'at', 'her', 'face', ',', 'it']


In [32]:
words = list(token_vocab)
vocab_size = len(words)
print("Vocab size:", vocab_size)
print("W2V vocab size:", len(w2v.vocab))
word2idx = { word:i for i,word in enumerate(words) }
idx2word = { i:word for i,word in enumerate(words) }

Vocab size: 5111
W2V vocab size: 400000


In [33]:
with open("vocab.txt", "w") as f:
    for word in words:
        f.write(word + "\n")

In [12]:
def one_hot_encode(word, word2idx):
    v = np.zeros((len(word2idx, )))
    v[word2idx[word]] = 1
    return v

def one_hot_decode(word, idx2word):
    return idx2word[np.argmax(word)]

In [13]:
print(word2idx["hello"])
print(np.argmax(one_hot_encode("hello", word2idx)))

4216
4216


## Train/Test split

In [23]:
from sklearn.model_selection import train_test_split
import math

# 70% Train, 15% Dev, 15% Test
X_train, X_test, Y_train, Y_test = train_test_split(x_vec, y_vec, test_size=0.3)
X_dev, X_test, Y_dev, Y_test = train_test_split(X_test, Y_test, test_size=0.5)

In [25]:
from random import shuffle
def generate_batches(data_length, mini_batch_size):
    for begin in range(0, data_length, mini_batch_size):
        end = min(begin + mini_batch_size, data_length)
        yield begin, end

def load_batch(xs, ys, begin, end):
    batch_size = end-begin
    
    x_train = np.zeros((batch_size, SEQUENCE_LENGTH, EMBEDDING_SIZE))
    y_train = np.zeros((batch_size, vocab_size))
    
    xs_batch = xs[begin:end]
    ys_batch = ys[begin:end]
    
    c = list(zip(xs_batch, ys_batch))
    shuffle(c)
    xs_batch, ys_batch = zip(*c)
    
    for i in range(batch_size):
        x_train[i] = encode_words(xs_batch[i], w2v)
        y_train[i] = one_hot_encode(ys_batch[i], word2idx)
    
    return x_train, y_train

In [26]:
batches = generate_batches(len(X_train), 512)
begin, end = next(batches)

x_train, y_train = load_batch(X_train, Y_train, begin, end)
print(x_train.shape, y_train.shape)

print("X train")
for j in range(10):
    ws = ' '.join([decode_vec(x_train[j][i], w2v) for i in range(SEQUENCE_LENGTH)])
    print(ws)
    
print("\nY train")
for j in range(10):
    print(one_hot_decode(y_train[j], idx2word))

(512, 6, 103) (512, 5111)
X train
she holds me and squeezes my
is something '' , you said
<newline> when you feel that you
my life was so lonely ,
dancing queen , young and sweet
my life <newline> and one of
this <newline> in the firelight fernando
your brother <newline> love him ,
sound <newline> ( you make me
you 've been living in a

Y train
hand
,
've
did
,
them
<newline>
that
sing
dreamworld


## Machine Learning Time

In [27]:
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.layers import LeakyReLU
def build_model(vocab_size):
    model = Sequential()
    model.add(LSTM(128, input_shape=(SEQUENCE_LENGTH, EMBEDDING_SIZE), return_sequences=True))
    model.add(Dropout(0.4))
    model.add(LSTM(1024))
    model.add(Dropout(0.4))
    model.add(Dense(2048))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer="rmsprop", metrics = ['accuracy'])
    return model
model = build_model(vocab_size)

Using TensorFlow backend.


In [29]:
batches = list(generate_batches(len(X_train), 2048))

In [99]:
shuffle(batches)
for i in range(4):
    begin, end = batches[i]
    x_batch, y_batch = load_batch(X_train, Y_train, begin, end)
    model.fit(x_batch, y_batch, batch_size=256, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
model.save_weights("weights.h5")

## Test on development set

In [36]:
def sample(preds, temperature=1.0):
    preds = preds.reshape(preds.shape[1])
    arr = np.asarray(preds).astype('float64')
    log_preds_scaled = np.log(arr) / temperature
    preds_scaled = np.exp(log_preds_scaled)
    softmaxed = preds_scaled / np.sum(preds_scaled)
    probas = np.random.multinomial(1, softmaxed, 1)
    return np.argmax(probas)

In [44]:
accuracy = 0
temperature = 1.4
for x, y in zip(X_dev, Y_dev):
    words_seq = encode_words(x, w2v)
    words_seq = words_seq.reshape(1, SEQUENCE_LENGTH, EMBEDDING_SIZE)
    y_hat = idx2word[sample(model.predict(words_seq), temperature=temperature)]
    if y_hat == y:
        accuracy += 1
    print(x, y_hat, y)
print("Accuracy: ", accuracy / len(X_dev))

['i', 'know', 'what', 'i', "'d", 'like'] bettin to
["n't", 'you', 'see', '<newline>', 'that', "'s"] dooo the
['as', 'new', 'and', 'growing', 'too', '<newline>'] gritty yes
['<newline>', 'then', 'you', 'say', 'you', "'ll"] absent-minded be
['what', 'a', 'feeling', ')', '<newline>', 'over'] affair mountains
['i', 'had', 'ever', 'dreamed', '<newline>', 'everything'] ridin in
['wild', '<newline>', 'ah', ',', 'but', 'girl'] hammer you
['<newline>', 'andante', ',', 'andante', '<newline>', 'tread'] hiding lightly
['<newline>', 'wo', "n't", 'somebody', 'help', 'me'] sounded chase
['we', "'re", 'going', 'on', '<newline>', 'for'] siya every
['<newline>', 'seems', 'to', 'me', 'you', 'shine'] shit your
['you', 'know', 'i', 'wo', "n't", 'forget'] wipe you
['<newline>', '(', 'fade', ')', '<pad>', '<pad>'] become <pad>
['closed', 'my', 'eyes', ',', 'never', 'leave'] white me
['on', 'the', 'floor', '<newline>', 'it', "'s"] call-in the
['<newline>', 'i', "'ve", 'been', 'waiting', 'but'] seduce my
['tou

['take', 'me', 'through', 'the', 'darkness', 'to'] trash the
['<newline>', 'as', 'i', 'got', 'into', 'a'] sparkling fight
['chiquitita', '<newline>', 'try', 'once', 'more', 'like'] , you
['on', 'a', 'straw', '<pad>', '<pad>', '<pad>'] image <pad>
['after', 'midnight', '<newline>', 'take', 'me', 'through'] underwater the
['say', 'she', "'s", 'been', 'mad', 'at'] shone you
['your', 'mother', 'know', '?', '<newline>', 'take'] x4 it
['so', 'sad', ',', 'so', 'quiet', '<newline>'] shelf chiquitita
['a', 'helping', 'hand', '<newline>', 'i', 'should'] mistaking have
['that', 'i', 'please', '<newline>', 'as', 'all'] meet good
['a', 'question', 'of', 'give', 'and', 'take'] junkie <newline>
['be', 'working', '<newline>', 'on', 'a', 'day'] houses like
['believe', '<newline>', 'you', 'really', 'had', 'the'] hand power
['eagle', '<newline>', 'and', 'i', 'dream', 'i'] supposed can
['remember', 'long', 'ago', 'another', 'starry', 'night'] gliding like
['through', 'my', 'room', '<newline>', 'long', 'aw

['better', '<newline>', 'as', 'good', 'as', 'new'] word ,
['a', 'tiny', 'figure', '<newline>', 'rigid', 'and'] st restrained
['touch', '<newline>', 'do', "n't", 'you', 'know'] crystal that
['ground', '<newline>', 'when', 'the', 'air', 'gets'] body cold
['mind', '<newline>', 'she', 'pats', 'his', 'head'] alone and
['setting', 'the', 'pace', '<newline>', 'running', 'the'] 'll gauntlet
['is', 'love', '<newline>', 'a', 'boom-a-boomerang', 'is'] rides love
['<newline>', 'chiquitita', ',', 'tell', 'me', 'the'] tomorrow truth
['never', 'thought', 'you', "'d", 'leave', 'me'] tomb <newline>
['them', 'is', 'my', 'brother', 'joe', '<newline>'] pretendings he
[',', 'baby', ',', 'ca', "n't", 'you'] happened see
['what', 'to', 'do', '<newline>', 'everything', 'i'] haria had
['scar', '<newline>', 'but', 'then', ',', 'they'] tomorrow never
['through', 'the', 'darkness', 'to', 'the', 'break'] knocks of
['blue', 'eyes', 'filled', 'with', 'pain', '<newline>'] blessed sorry
['we', 'had', 'before', '<newli

['dreams', '<newline>', 'smiling', ',', 'laughing', 'from'] hart the
['the', 'fireworks', 'are', 'through', '<newline>', 'here'] a-callin we
['<newline>', 'disillusion', ',', 'disillusions', ',', 'now'] pakakawalan that
['everything', 'in', 'my', 'life', "'s", 'part'] gritty of
['we', 'were', 'caught', 'in', 'our', 'sleep'] warms <newline>
['boom-a-boomerang', '<newline>', 'dum-be-dum-dum', 'be-dum-be-dum-dum', '<newline>', 'oh'] nadie bang
['fernando', '<newline>', 'si', 'tuviera', 'que', 'volverlo'] ex-to-see a
['take', 'a', 'sip', 'from', '<newline>', 'every'] shorts flower
['nights', '<newline>', 'nobody', 'knew', 'how', 'to'] same fight
['disfrutar', '<newline>', 'se', 'durmio', 'el', 'tambor'] outdoors fernando
['<newline>', 'she', 'has', 'a', 'personal', 'style'] boys <newline>
['gim', 'me', 'a', 'man', 'after', 'midnight'] cast <newline>
['<newline>', 'i', "'m", 'always', 'givin', "'"] hacer a
['trying', '<newline>', 'but', 'we', 'did', "n't"] dreary make
['all', 'the', 'outsid

['chilly', 'winds', 'were', 'blowing', 'through', 'the'] living trees
['of', 'fun', '<newline>', 'so', 'maybe', 'i'] bet 'm
['time', 'when', 'he', 'speaks', 'his', 'mind'] henry <newline>
['<newline>', 'long', 'awaited', 'darkness', 'falls', '<newline>'] endowed casting
['it', '<newline>', 'i', 'can', 'see', 'that'] down you
['i', 'can', 'see', 'that', 'you', "'re"] rocking oh
['and', 'restrained', ',', 'blue', 'eyes', 'filled'] month with
['<newline>', 'breaking', 'her', 'way', '<newline>', 'pushing'] sling through
['want', 'to', 'hold', 'you', 'tight', '<newline>'] quittin we
['empty', ',', 'there', 'was', 'nothing', 'to'] start live
['will', 'be', 'sailing', '<newline>', 'now', 'that'] sa your
['y', 'una', 'noche', 'alla', '<newline>', 'en'] tuve la
['<newline>', 'i', 'only', 'saw', 'it', 'as'] force dreams
['us', 'would', '<newline>', 'listen', 'to', 'words'] bore of
['we', 'talk', 'all', 'night', ',', 'and'] having we
['yourself', 'a', 'break', '<newline>', 'every', 'smile'] heart

['i', "'m", 'blue', '<newline>', 'i', "'m"] arm cryin
['i', 'am', 'your', 'music', 'and', 'i'] de am
['we', 'could', 'make', 'it', 'right', '<newline>'] surely gon
['not', 'fair', '<newline>', 'and', 'you', "'re"] busy only
['morning', 'sun', '<newline>', 'could', "n't", 'sleep'] everyday ,
['our', 'sleep', '<newline>', 'sorry', 'cassandra', 'i'] talent did
['eternity', '<newline>', 'what', 'a', 'miracle', 'to'] carpet happen
["'s", 'a', 'gentleness', 'to', 'everything', 'you'] forgiving do
['can', 'look', 'for', 'it', 'anywhere', '<newline>'] press when
['outdoors', '<newline>', 'lie', 'in', 'the', 'grass'] lip and
['watched', 'the', 'ship', 'leaving', 'harbor', 'at'] sneeze sunrise
['for', 'that', 'kind', 'of', 'fun', '<newline>'] sang so
['right', 'music', ',', 'getting', 'in', 'the'] performed swing
['the', 'morning', 'seems', 'so', 'grey', '<newline>'] drawn so
[',', 'she', 'makes', 'me', 'feel', 'fine'] snowin <newline>
['fade', ')', '<pad>', '<pad>', '<pad>', '<pad>'] silver <pa

['redoblar', '<newline>', 'se', 'acercaban', 'mas', 'fernando'] beacon <newline>
['they', "'re", 'in', 'a', 'dreamworld', '<newline>'] dale here
['me', '<newline>', 'but', 'now', 'i', 'know'] afar it
['use', 'it', 'as', 'a', 'selfish', 'tool'] nag-aantok <newline>
['playing', 'games', 'within', 'my', 'mind', '<newline>'] annie like
["'ve", 'had', 'enough', '<newline>', 'and', 'now'] bighway ,
["'s", 'a', 'crazy', 'world', '<pad>', '<pad>'] arbuckle <pad>
['you', "'re", 'doing', '<newline>', 'i', 'ca'] foolish n't
['face', 'it', 'my', 'friend', '<newline>', 'agnetha'] fuck ,
['the', 'power', '<newline>', 'i', 'only', 'saw'] dame it
['speak', 'strangely', 'but', 'i', 'understand', '<newline>'] lakad and
['en', 'morir', '<newline>', 'y', 'no', 'siento'] nevermind hoy
['his', 'hand', '<newline>', 'hey', 'honolulu', ','] happiest i
['you', "'re", 'making', 'me', 'strong', ')'] dies <newline>
['i', 'feel', 'so', 'cold', 'without', 'you'] hallelujah <newline>
['you', 'near', '<newline>', 'i',

['have', 'for', 'you', 'feels', 'as', 'good'] pinagdurusa as
['smile', ')', '<newline>', 'but', 'girl', 'you'] bill 're
['<newline>', 'every', 'smile', 'and', 'every', 'little'] everybody touch
['like', 'an', 'angel', 'passing', 'through', 'my'] yoo-hoo room
['the', 'pace', '<newline>', 'running', 'the', 'gauntlet'] carries in
['can', 'dance', 'with', 'you', 'honey', '<newline>'] add if
['praying', '<newline>', 'treat', 'him', 'well', ','] hope he
['<newline>', 'some', 'of', 'us', 'wanted', 'but'] ashes none
['<newline>', 'it', "'s", 'gon', 'na', 'be'] petty our
['star', '<newline>', 'and', 'he', 'knows', '<newline>'] manufacture what
['is', 'love', '<newline>', 'love', 'is', 'always'] bass around
['<newline>', 'swimming', 'and', 'surfing', ',', 'enjoying'] forgotten the
['man', 'in', 'the', 'street', '<newline>', 'as'] connie i
['gon', 'na', 'sing', 'you', 'my', 'love'] funny song
['of', 'your', 'life', '<newline>', 'see', 'that'] quiet girl
['pride', '<newline>', 'no', 'un-right', 'n

["'s", 'that', 'look', 'in', 'your', 'eyes'] wipe <newline>
['stay', '<newline>', 'you', "'ll", 'be', 'seeing'] bogus a
['comes', 'back', 'to', 'me', 'tonight', '<newline>'] praying in
['prays', '<newline>', 'gim', 'me', 'gim', 'me'] beg gim
['laughing', 'at', 'the', 'clouds', '<newline>', 'the'] none rain
['playing', 'ground', 'where', 'she', 'goes', 'rushing'] closest )
['that', 'our', 'love', 'was', 'at', 'an'] sins end
['on', 'the', 'road', '<newline>', 'that', 'we'] heartache 're
['when', 'you', 'feel', 'that', 'you', "'ve"] nore found
['travel', 'alone', 'if', 'i', 'can', '<newline>'] servants every
['surrender', '<newline>', 'bang', ',', 'a', 'boom-a-boomerang'] pale <newline>
['did', 'things', 'turn', 'out', 'so', 'bad'] arrested ?
['in', 'the', 'rain', '<newline>', 'out', 'to'] shovel the
['does', 'your', 'mother', 'know', 'that', 'you'] looking 're
['and', 'forests', 'and', 'seas', '<newline>', 'and'] answers to
['die', '<newline>', 'love', 'was', 'one', 'prolonged'] monkey g

[',', 'high', ',', 'i', "'m", 'a'] bashful bird
['the', 'morning', 'your', 'ship', 'will', 'be'] riding sailing
['for', 'a', 'walk', 'in', 'the', 'park'] buckle <newline>
['love', 'song', ',', 'gon', 'na', 'make'] jacket it
[')', '<newline>', 'make', 'me', 'sing', ','] cha make
['boom-a-boomerang', '<newline>', 'dumb-be-dumb-dumb', 'be-dumb-be-dumb-dumb', '<newline>', 'oh'] sloppy bang
['and', 'time', 'again', 'and', 'make', 'me'] ptomaine strong
['bang', ',', 'a', 'boom-a-boomerang', '<newline>', 'dum-be-dum-dum'] gentry be-dum-be-dum-dum
['going', 'anyway', '<newline>', 'happy', 'new', 'year'] destined <newline>
['should', 'i', 'pretend', '<newline>', 'in', 'a'] cooler few
['to', 'say', '<newline>', 'happy', 'new', 'year'] sings <newline>
['than', 'on', 'those', 'happy', 'autumn', 'days'] leopard <pad>
['velvet', 'of', 'the', 'night', '<newline>', 'touch'] thousand my
['just', 'a', 'tiny', 'figure', '<newline>', 'rigid'] mutual and
['room', '<newline>', 'long', 'awaited', 'darkness',

In [134]:
#words = ["never", "gonna", "give", "you", "up", ","]
words = ["look", "at", "her", "face", ",", "it"]
#words = ["when", "there", "'s", "a", "dark", "storm"]
#words = ["do", "better", ",", "who", "better", "?"]
words_seq = encode_words(words, w2v)
words_seq = words_seq.reshape(1, SEQUENCE_LENGTH, EMBEDDING_SIZE)
#print(' '.join([decode_vec(words[0][i], w2v) for i in range(SEQUENCE_LENGTH)]))

result = words
for j in range(60):
    word = idx2word[sample(model.predict(words_seq), temperature=1.4)]
    #word = one_hot_decode(model.predict(words_seq), idx2word)
    result.append(word)
    
    new_words = np.zeros((1, SEQUENCE_LENGTH, EMBEDDING_SIZE))
    for i in range(SEQUENCE_LENGTH-1):
        new_words[0, i] = words_seq[0, i+1]
    new_words[0, SEQUENCE_LENGTH-1] = encode_word(word, w2v)
    words_seq = new_words

#print(' '.join([decode_vec(words[0][i], w2v) for i in range(SEQUENCE_LENGTH)]))

print(' '.join(result))
    #words = new_words
        #new_words[0] = words[0, 1]
        #new_words[0, 1] = words[0, 2]
        #new_words[0, 2] = words[0, 3]
        #new_words[0, 3] = encode_word(word, word2idx)
    

look at her face , it 's a party <newline> i remember sitting half <newline> little while half in the clay <newline> i 'm still trapped my , <newline> cold gim yet there another end without <newline> there <newline> fixin angel , going <newline> loving it do <newline> please knew empty feeling what again hairs in the dream <newline> when the twilight seems like through to
