## Data preprocessing

In [1]:
from gensim.models import KeyedVectors
# Load pre-trained word vectors stored in binary word2vec format.
# (The file name suggests 100-dimensional GloVe 6B vectors — confirm against the data source.)
w2v = KeyedVectors.load_word2vec_format('glove.6B.100d.bin.word2vec', binary=True)

In [2]:
import numpy as np
# Number of tokens in each input window fed to the models.
SEQUENCE_LENGTH = 6
# NOTE(review): not referenced by any of the visible cells — confirm whether it is still needed.
HIDDEN_SIZE = 256

# Length of the word-vector part of an encoded token (decode_vec splits vectors at this index).
EMBEDDING_SIZE_ORIG = 100
# Full encoded length: the word vector plus three trailing indicator slots
# (<unk>, <newline>, <pad>, in that order).
EMBEDDING_SIZE = 103

def encode_word(word, w2v):
    """Encode one token as a vector of length ``EMBEDDING_SIZE``.

    The last three positions are indicator slots. ``<pad>`` sets the final
    slot, ``<newline>`` the second-to-last, and ``<unk>`` (or any token that is
    not in ``w2v``) the third-to-last, with every other entry left at zero.
    Any other token yields its ``w2v`` vector followed by three zeros.
    """
    if word == "<pad>":
        slot_from_end = 1
    elif word == "<newline>":
        slot_from_end = 2
    elif word == "<unk>" or word not in w2v:
        slot_from_end = 3
    else:
        # Known word: keep the embedding and leave all indicator slots unset.
        return np.append(w2v[word], np.zeros((3,)), axis=0)

    indicator = np.zeros((EMBEDDING_SIZE,))
    indicator[EMBEDDING_SIZE - slot_from_end] = 1
    return indicator

def encode_words(words, w2v):
    """Encode a sequence of tokens into a ``(len(words), EMBEDDING_SIZE)`` array.

    Row ``i`` holds ``encode_word(words[i], w2v)``.
    """
    encoded = np.zeros((len(words), EMBEDDING_SIZE))
    # Iterating the 2-D array yields row views, so writing into them fills `encoded`.
    for row, token in zip(encoded, words):
        row[:] = encode_word(token, w2v)
    return encoded

In [3]:
# Sanity check: a common word and an unusual token (presumably absent from w2v)
# should both encode to vectors of the same length.
v = encode_word("hello", w2v)
print(v.shape)
w = encode_word("boom-a-boomerang", w2v)
print(w.shape)

(103,)
(103,)


In [4]:
# Encode the three special tokens; these vectors are reused by the decode check below.
v_newline = encode_word("<newline>", w2v)
v_pad = encode_word("<pad>", w2v)
v_unk = encode_word("<unk>", w2v)

# Each special token should set a different indicator slot (index >= EMBEDDING_SIZE_ORIG).
print(np.argmax(v_newline))
print(np.argmax(v_pad))
print(np.argmax(v_unk))

101
102
100


In [5]:
def decode_vec(vec, w2v):
    """Turn an encoded vector back into a token.

    The entries after ``EMBEDDING_SIZE_ORIG`` are indicator slots for
    ``<unk>``, ``<newline>`` and ``<pad>`` (in that order). The first non-zero
    slot decides the result. When none is set, the token whose ``w2v`` vector
    is most similar to the leading ``EMBEDDING_SIZE_ORIG`` entries is returned.
    """
    embedding_part = vec[:EMBEDDING_SIZE_ORIG]
    indicator_part = vec[EMBEDDING_SIZE_ORIG:]

    for slot, token in enumerate(("<unk>", "<newline>", "<pad>")):
        if indicator_part[slot]:
            return token

    # similar_by_vector returns (word, similarity) pairs; keep the best word.
    nearest = w2v.similar_by_vector(embedding_part)
    return nearest[0][0]

In [6]:
# Round-trip check: decoding the vectors encoded above should give back the original tokens.
print(decode_vec(v, w2v))
print(decode_vec(v_newline, w2v))
print(decode_vec(v_pad, w2v))
print(decode_vec(v_unk, w2v))

hello
<newline>
<pad>
<unk>


In [7]:
def tokenize_song(song, buffer_length):
    """Slice a token sequence into (window, next-token) training pairs.

    One pair is produced for every token of ``song``: the window is the
    ``buffer_length`` tokens starting at that position, and the target is the
    token immediately after the window. Positions past the end of the song are
    filled with ``'<pad>'``.

    The input is copied before padding, so ``song`` is never modified and
    calling the function again with the same list gives the same result.

    Args:
        song: sequence of tokens.
        buffer_length: number of tokens in each input window.

    Returns:
        ``(x_train, y_train)`` where ``x_train`` is a list of token lists and
        ``y_train`` the list of matching next tokens; both have ``len(song)``
        entries.
    """
    n_tokens = len(song)
    # The last window starts at n_tokens - 1 and needs buffer_length more tokens
    # (window remainder + target), so buffer_length pads are always enough.
    padded = list(song) + ['<pad>'] * buffer_length

    x_train = [padded[i:i + buffer_length] for i in range(n_tokens)]
    y_train = [padded[i + buffer_length] for i in range(n_tokens)]
    return x_train, y_train

In [8]:
# Every distinct token seen in the corpus, seeded with the special tokens.
token_vocab = {'<pad>', '<unk>'}
# One token list per line of the corpus file.
songs = []
with open("data/sentences.txt", "r") as f:
    for line in f:
        # Lines are pre-tokenised: tokens are separated by single spaces.
        tokens = line.rstrip().split(" ")
        songs.append(tokens)
        # Grow the set in place instead of rebuilding it for every line.
        token_vocab.update(tokens)

In [9]:
# Collect the windows of every song into one flat list of inputs (x_vec)
# and one flat list of next-token targets (y_vec).
x_vec = []
y_vec = []
for song in songs:
    x_vec_i, y_vec_i = tokenize_song(song, SEQUENCE_LENGTH)
    x_vec.extend(x_vec_i)
    y_vec.extend(y_vec_i)
# Report the number of samples and show the first window as a spot check.
print(len(x_vec))
print(x_vec[0])

114929
['look', 'at', 'her', 'face', ',', 'it']


In [10]:
from pathlib import Path

def load_vocab(path):
    """Read a vocabulary file into a list, one entry per line.

    Trailing whitespace (including the newline) is stripped from each line and
    the order of the file is preserved.
    """
    with path.open("r") as handle:
        return [entry.rstrip() for entry in handle.readlines()]

def write_vocab(path, vocab=None):
    """Write a vocabulary to ``path``, one entry per line.

    Args:
        path: ``pathlib.Path`` of the file to (over)write.
        vocab: iterable of words to write. When omitted, the module-level
            ``words`` list is written, so that list must already hold the
            vocabulary at call time.
    """
    if vocab is None:
        vocab = words
    with path.open("w") as f:
        f.writelines(word + "\n" for word in vocab)
        
# Read or create vocab path
vocab_path = Path("vocab.txt")
words = load_vocab(vocab_path) if vocab_path.is_file() else []
if not words:
    # No usable vocabulary file (missing, or empty from an earlier run):
    # build the vocabulary FIRST, then persist it. Sorting makes the
    # token -> index mapping independent of set iteration order.
    words = sorted(token_vocab)
    write_vocab(vocab_path)

vocab_size = len(words)
print("Vocab size:", vocab_size)
# NOTE(review): `.vocab` is the gensim-3 API; newer gensim exposes `key_to_index`
# instead — confirm against the installed version.
print("W2V vocab size:", len(w2v.vocab))
# Lookup tables between tokens and their one-hot positions.
word2idx = { word:i for i,word in enumerate(words) }
idx2word = { i:word for i,word in enumerate(words) }

Vocab size: 5111
W2V vocab size: 400000


In [11]:
def one_hot_encode(word, word2idx):
    """Return a one-hot vector of length ``len(word2idx)`` marking ``word``.

    Raises ``KeyError`` when ``word`` is not a key of ``word2idx``.
    """
    hot_index = word2idx[word]
    encoded = np.zeros(len(word2idx))
    encoded[hot_index] = 1
    return encoded

def one_hot_decode(word, idx2word):
    """Return the ``idx2word`` entry at the position of the largest value in ``word``."""
    hottest = np.argmax(word)
    return idx2word[hottest]

In [12]:
# The hot position of the one-hot vector must equal the word's vocabulary index.
print(word2idx["hello"])
print(np.argmax(one_hot_encode("hello", word2idx)))

1448
1448


## Train/Dev/Test split

In [13]:
from sklearn.model_selection import train_test_split
import math

# 80% Train, 10% Dev, 10% Test
# A fixed seed keeps the three subsets identical across kernel restarts. This
# matters because model weights are saved and reloaded between sessions: with
# a fresh random split, examples trained on earlier could land in dev/test.
SPLIT_SEED = 42
X_train, X_holdout, Y_train, Y_holdout = train_test_split(
    x_vec, y_vec, test_size=0.2, random_state=SPLIT_SEED)
# Split the held-out 20% evenly into development and test sets.
X_dev, X_test, Y_dev, Y_test = train_test_split(
    X_holdout, Y_holdout, test_size=0.5, random_state=SPLIT_SEED)

In [14]:
# Report how many samples ended up in each subset.
print("Total size:", len(x_vec))
print("Training size:", len(X_train))
print("Development set size:", len(X_dev))
print("Test set size:", len(X_test))

Total size: 114929
Training size: 91943
Development set size: 11493
Test set size: 11493


In [15]:
from random import shuffle
def generate_batches(data_length, mini_batch_size):
    """Yield ``(begin, end)`` index pairs that cover ``range(data_length)``.

    Consecutive pairs span ``mini_batch_size`` items each; the final pair is
    shortened so that ``end`` never exceeds ``data_length``.
    """
    starts = range(0, data_length, mini_batch_size)
    yield from (
        (start, min(start + mini_batch_size, data_length))
        for start in starts
    )

def load_batch(xs, ys, begin, end):
    """Encode the samples ``xs[begin:end]`` / ``ys[begin:end]`` into arrays.

    The selected (window, target) pairs are shuffled together. Each window is
    then embedded with ``encode_words`` and each target is one-hot encoded
    over the vocabulary.

    Returns:
        ``(x, y)`` with shapes ``(end - begin, SEQUENCE_LENGTH, EMBEDDING_SIZE)``
        and ``(end - begin, vocab_size)``.
    """
    n_samples = end - begin
    inputs = np.zeros((n_samples, SEQUENCE_LENGTH, EMBEDDING_SIZE))
    targets = np.zeros((n_samples, vocab_size))

    # Shuffle windows and targets in lockstep so every pair stays aligned.
    pairs = list(zip(xs[begin:end], ys[begin:end]))
    shuffle(pairs)
    windows, next_tokens = zip(*pairs)

    for row in range(n_samples):
        inputs[row] = encode_words(windows[row], w2v)
        targets[row] = one_hot_encode(next_tokens[row], word2idx)

    return inputs, targets

In [16]:
# Load the first 512-sample batch and decode part of it back to text as a visual check.
batches = generate_batches(len(X_train), 512)
begin, end = next(batches)

x_train, y_train = load_batch(X_train, Y_train, begin, end)
print(x_train.shape, y_train.shape)

# First ten input windows, decoded token by token.
print("X train")
for j in range(10):
    ws = ' '.join([decode_vec(x_train[j][i], w2v) for i in range(SEQUENCE_LENGTH)])
    print(ws)
    
# Targets that belong to the ten windows printed above.
print("\nY train")
for j in range(10):
    print(one_hot_decode(y_train[j], idx2word))

(512, 6, 103) (512, 5111)
X train
they ask for daddy <newline> hey
in a free world <newline> i
my oasis in the night ,
home , <newline> searching for this
) <newline> like i always do
hey darling ( have to tell
baby do n't want me no
goes where you can find me
day of your life <newline> gon
and to receive <newline> for every

Y train
hey
whistle
yeah
land
<newline>
you
more
,
na
little


## Machine Learning Time

In [62]:
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding, Dropout, GRU
from keras.layers import LeakyReLU
def build_model1(vocab_size):
    """Build and compile the deeper model.

    Architecture: LSTM(128, returning sequences) -> Dropout(0.4) -> LSTM(1024)
    -> Dropout(0.4) -> Dense(2048) -> LeakyReLU(0.1) -> softmax over
    ``vocab_size`` classes. Compiled with categorical cross-entropy, the Adam
    optimizer and an accuracy metric.
    """
    model = Sequential()
    layer_stack = [
        LSTM(128, input_shape=(SEQUENCE_LENGTH, EMBEDDING_SIZE), return_sequences=True),
        Dropout(0.4),
        LSTM(1024),
        Dropout(0.4),
        Dense(2048),
        LeakyReLU(alpha=0.1),
        Dense(vocab_size, activation='softmax'),
    ]
    for layer in layer_stack:
        model.add(layer)
    model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
    return model

def build_model2(vocab_size):
    """Build and compile the smaller model.

    Architecture: LSTM(512) -> Dropout(0.5) -> LeakyReLU(0.1) -> softmax over
    ``vocab_size`` classes. Compiled with categorical cross-entropy, the Adam
    optimizer and an accuracy metric.
    """
    model = Sequential()
    layer_stack = [
        LSTM(512, input_shape=(SEQUENCE_LENGTH, EMBEDDING_SIZE)),
        Dropout(0.5),
        LeakyReLU(alpha=0.1),
        Dense(vocab_size, activation='softmax'),
    ]
    for layer in layer_stack:
        model.add(layer)
    model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
    return model

# Instantiate both architectures; later cells iterate over `models`.
model1 = build_model1(vocab_size)
model2 = build_model2(vocab_size)
models = [model1, model2]
# Resume from previously saved weights when a matching file exists
# (file names match the ones used when the weights are saved).
for (i, model) in enumerate(models):
    weights_path= Path(f"weights_model{i+1}.h5")
    if weights_path.is_file():
        model.load_weights(weights_path.resolve())

In [63]:
# Split the training set into chunks of up to 4096 samples and visit them in random order.
# Each chunk is encoded on demand by load_batch during training.
batches = list(generate_batches(len(X_train), 4096))
shuffle(batches)
print("Batches:", len(batches))

Batches: 23


In [100]:
# Train every model on every chunk in turn. Each chunk is encoded by
# load_batch and fitted for 4 epochs (mini-batches of 256) before moving on.
for (i, model) in enumerate(models):
    print("Model:", i)
    for begin, end in batches:
        x_batch, y_batch = load_batch(X_train, Y_train, begin, end)
        model.fit(x_batch, y_batch, batch_size=256, epochs=4)

Model: 0
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4


Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Model: 1
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4


Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [120]:
# Persist the weights of each model; the file names match those probed when the models are created.
for (i, model) in enumerate(models):
    model.save_weights(f"weights_model{i+1}.h5")

## Test on development set

In [102]:
def sample(preds, temperature=1.0):
    """Draw a class index from a predicted distribution with temperature scaling.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalised. Values below 1 sharpen the distribution and values above 1
    flatten it. Classes with probability exactly zero keep probability zero and
    no longer trigger a divide-by-zero warning.

    Args:
        preds: probabilities for a single prediction, either shaped
            ``(1, n_classes)`` (as returned by ``model.predict`` for one
            sample) or ``(n_classes,)``.
        temperature: positive scaling factor.

    Returns:
        The sampled class index (NumPy integer).

    Raises:
        ValueError: if ``temperature`` is not positive, ``preds`` does not hold
            exactly one non-empty row, or the probabilities cannot be
            normalised (all zero, negative, or non-finite).
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    probs = np.asarray(preds, dtype='float64')
    if probs.ndim == 2 and probs.shape[0] == 1:
        probs = probs[0]
    if probs.ndim != 1 or probs.size == 0:
        raise ValueError("preds must hold exactly one non-empty probability row")

    # log(0) = -inf is intended here: exp(-inf) = 0 keeps impossible classes impossible.
    with np.errstate(divide='ignore', invalid='ignore'):
        log_preds_scaled = np.log(probs) / temperature
        # Shift by the maximum so the largest exponent is exp(0) = 1.
        log_preds_scaled = log_preds_scaled - np.max(log_preds_scaled)
        preds_scaled = np.exp(log_preds_scaled)

    total = np.sum(preds_scaled)
    if not np.isfinite(total) or total <= 0:
        raise ValueError("preds cannot be normalised into a probability distribution")

    probas = np.random.multinomial(1, preds_scaled / total, 1)
    return np.argmax(probas)

In [116]:
# Evaluate on the first 100 (window, target) pairs of the development set only.
samples = list(zip(X_dev, Y_dev))[:100]

def evaluate_model(model, samples, temperature=1.4):
    """Compute sampled accuracy and perplexity of ``model`` on ``samples``.

    Args:
        model: object whose ``predict`` returns one probability row over the
            vocabulary for a ``(1, SEQUENCE_LENGTH, EMBEDDING_SIZE)`` input.
        samples: non-empty sequence of ``(window_tokens, target_token)`` pairs.
        temperature: temperature passed to ``sample`` when drawing the
            predicted token.

    Returns:
        ``(accuracy, perplexity)``. Accuracy is the fraction of samples for
        which a token *sampled* from the prediction equals the target, so it
        varies between runs. Perplexity is ``2 ** (-mean log2 p(target))``.
    """
    log_likelihood = 0
    hits = 0

    for x, y in samples:
        words_seq = encode_words(x, w2v).reshape(1, SEQUENCE_LENGTH, EMBEDDING_SIZE)
        # One forward pass serves both the likelihood and the sampled guess.
        batch_preds = model.predict(words_seq)
        log_likelihood += np.log2(batch_preds[0][word2idx[y]])

        y_hat = idx2word[sample(batch_preds, temperature=temperature)]
        if y_hat == y:
            hits += 1

    num_samples = len(samples)
    perplexity = np.power(2, -log_likelihood / num_samples)

    return hits / num_samples, perplexity

In [117]:
# Report sampled accuracy and perplexity of each model on the development samples.
for (i, model) in enumerate(models):
    acc, perp = evaluate_model(model, samples)
    print(f"Model {i}: acc {round(acc*100, 2)}%, perp {round(perp, 2)}")

  after removing the cwd from sys.path.


Model 0: acc 39.0%, perp 50.01
Model 1: acc 21.0%, perp 32.26


In [119]:
# Seed text for generation; it must contain exactly SEQUENCE_LENGTH tokens.
# Named `seed_words` (not `words`) so the vocabulary list `words`, which
# write_vocab reads, is not overwritten by this cell.
# Alternative seeds:
#   ["never", "gonna", "give", "you", "up", ","]
#   ["when", "there", "'s", "a", "dark", "storm"]
#   ["do", "better", ",", "who", "better", "?"]
seed_words = ["look", "at", "her", "face", ",", "it"]
GENERATED_TOKENS = 60
GENERATION_TEMPERATURE = 1.4

words_seq = encode_words(seed_words, w2v).reshape(1, SEQUENCE_LENGTH, EMBEDDING_SIZE)

# Copy the seed so appending generated tokens leaves `seed_words` intact.
result = list(seed_words)
for _ in range(GENERATED_TOKENS):
    word = idx2word[sample(model2.predict(words_seq), temperature=GENERATION_TEMPERATURE)]
    result.append(word)

    # Slide the window: drop the oldest token and append the one just generated.
    next_vec = encode_word(word, w2v).reshape(1, 1, EMBEDDING_SIZE)
    words_seq = np.concatenate((words_seq[:, 1:], next_vec), axis=1)

print(' '.join(result))
    

look at her face , it 's only way drum <newline> i 'll have a little call me to show you <pad> , me knows a perfect day <newline> slipping here you that everybody me <newline> but you know what lovers something lonely <newline> just sing short it between baby say too lookin ( hehehe ) <newline> but there what who but yet ever dare can
