# 1) Installing Necessary Packages

In [1]:
!pip install dynet

Collecting dynet
[?25l  Downloading https://files.pythonhosted.org/packages/88/f0/01a561a301a8ea9aea1c28f82e108c38cd103964c7a46286ab01757a4092/dyNET-2.1-cp36-cp36m-manylinux1_x86_64.whl (28.1MB)
[K     |████████████████████████████████| 28.1MB 1.2MB/s 
Installing collected packages: dynet
Successfully installed dynet-2.1


# 2) Importing Packages

In [2]:
import json
import numpy as np
import dynet as dy
import nltk
import math
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# 3) Functions

In [0]:
def read_glove():
    word2vec = {}
    embeddings = []
    word2idx = {}
    with open('/content/drive/My Drive/SPRING 2020/BBM497/Assignment4/glove.6B.50d.txt', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            word2idx[word] = len(embeddings)
            vec = np.asarray(values[1:], dtype='float32')
            word2vec[word] = vec
            embeddings.append(vec)
    return np.array(embeddings), word2idx


def read_data():
    poems = []
    with open("/content/drive/My Drive/SPRING 2020/BBM497/Assignment4/unim_poem.json") as data:
        poems_and_ids = json.loads(data.read())
    for p in poems_and_ids[:100]:
        poems.append(p["poem"])
    return poems


def add_tags(poems):
    for i, p in enumerate(poems):
        p = p.replace("\n", " eol ")
        poems[i] = "bos {} eos".format(p)
    return poems


def create_bi_grams(poems_with_tags):
    poems_bi_gram = []
    for p in poems_with_tags:
        p_bi_gram = list(nltk.bigrams(p.split()))
        poems_bi_gram.extend(p_bi_gram)
    return poems_bi_gram


def get_vocab(poems_with_tags):
    vocab = []
    for p in poems_with_tags:
        vocab.extend(p.split())
    vocab = list(set(vocab))
    return vocab


def get_probs(word, params, word2idx):
    dy.renew_cg()

    W1 = params["pW1"]
    b1 = params["pb1"]
    W2 = params["pW2"]
    b2 = params["pb2"]
    lookup_table = params["look_up"]

    try:
        x = lookup_table[word2idx[word]]
    except KeyError:
        x = lookup_table[word2idx["unk"]]

    U = dy.tanh(W1 * x + b1)
    y = W2 * U + b2
    probs = dy.softmax(y)
    return probs


def predict_next_word(word, vocab, params, word2idx):
    probs = get_probs(word, params, word2idx)
    probs = probs.npvalue()

    cum_prob = np.cumsum(probs)
    random_prob = np.random.rand()

    predicted_word_prob = min(cum_prob[cum_prob >= random_prob])
    predicted_word_index = list(cum_prob).index(predicted_word_prob)
    predicted_word = vocab[predicted_word_index]
    return predicted_word


def generate_poem(vocab, params, word2idx):
    initial = "bos"
    num_of_new_line = 0
    poem = ["bos"]
    while True:
        next_word = predict_next_word(initial, vocab, params, word2idx)
        initial = next_word

        if next_word == "eol":
            num_of_new_line += 1
        if next_word == "eos" or num_of_new_line == 5:
            poem.append(next_word)
            break
        poem.append(next_word)
    return " ".join(poem)


def calculate_perplexity(poem, params, word2idx, vocab_word_to_index):
    poem_words = poem.split()
    poem_bi_gram = list(nltk.bigrams(poem_words))

    total_probs = 0
    for w1, w2 in poem_bi_gram:
        probs = get_probs(w1, params, word2idx)
        probs = probs.npvalue()
        total_probs += math.log2(probs[vocab_word_to_index[w2]])
    perplexity = 2 ** ((-1 / len(poem_words)) * total_probs)
    return perplexity





# Modeling

In [16]:
embeddings, word2idx = read_glove()  # read glove
poems = read_data()  # read poems
poems_with_tags = add_tags(poems)  # add tags to poems
vocab = get_vocab(poems_with_tags)  # get vocab of the poems
poems_bi_gram = create_bi_grams(poems_with_tags)  # create bi_gram of the poems

vocab_word_to_index = {word: i for i, word in enumerate(vocab)}
vocab_size = len(vocab)
hidden_neuron_size = 50
input_neuron_size = 50
EPOCHS = 20

model = dy.Model()
trainer = dy.SimpleSGDTrainer(model)

pW1 = model.add_parameters((hidden_neuron_size, input_neuron_size))
pb1 = model.add_parameters(hidden_neuron_size)
pW2 = model.add_parameters((vocab_size, hidden_neuron_size))
pb2 = model.add_parameters(vocab_size)
lookup_table = model.add_lookup_parameters((len(word2idx), input_neuron_size), init=embeddings)

params = {"pW1": pW1, "pW2": pW2, "pb1": pb1, "pb2": pb2, "look_up": lookup_table}

for epoch in range(EPOCHS):
  epoch_loss = 0.0
  for (w1, w2) in poems_bi_gram:
    probs = get_probs(w1, params, word2idx)
    loss = -dy.log(dy.pick(probs, vocab_word_to_index[w2]))
    epoch_loss += loss.scalar_value()

    loss.backward()
    trainer.update()

  if epoch % 1 == 0:
    print("Epoch %d. loss = %f" % (epoch, epoch_loss))


Epoch 0. loss = 26508.540470
Epoch 1. loss = 22741.818008
Epoch 2. loss = 20066.946369
Epoch 3. loss = 18006.316627
Epoch 4. loss = 16313.344533
Epoch 5. loss = 15071.908056
Epoch 6. loss = 14218.855844
Epoch 7. loss = 13629.933046
Epoch 8. loss = 13150.505507
Epoch 9. loss = 12892.602535
Epoch 10. loss = 12709.631286
Epoch 11. loss = 12665.339040
Epoch 12. loss = 12534.489432
Epoch 13. loss = 12537.475384
Epoch 14. loss = 12356.554114
Epoch 15. loss = 12302.613100
Epoch 16. loss = 12410.077007
Epoch 17. loss = 12404.600083
Epoch 18. loss = 12515.664844
Epoch 19. loss = 12525.350399


In [19]:
generated_poem = generate_poem(vocab, params, word2idx)
print(generated_poem.replace("eol", "\n"))
print("Perplexity of the generated sentence: %.3f" % calculate_perplexity(generated_poem, params, word2idx, vocab_word_to_index))

bos he heard my heart 
 only thing 
 wondered if he won my heart 
 i thought my heart 
 only thing was 

Perplexity of the generated sentence: 3.198
