In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

2023-04-27 13:22:45.641472: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
data = pd.read_csv('../data/haiku.csv')
data = data.replace("/", " / ", regex=True)
data = data.dropna()
data.head()

Unnamed: 0.1,Unnamed: 0,id,processed_title,ups,keywords
0,0,1020ac,There's nothing inside / There is nothing outs...,5,"[('inside', 0.5268), ('outside', 0.3751), ('se..."
1,1,107cob,From whole we crumble / Forever lost to chaos ...,1,"[('chaos', 0.5962), ('crumble', 0.4749), ('for..."
2,2,109a51,Indistinctiveness / Immeasurability / Capitalism,3,"[('indistinctiveness', 0.7664), ('immeasurabil..."
3,3,10eysi,Internet is down / Obligations go bye-bye / Of...,9,"[('office', 0.5033), ('obligations', 0.4663), ..."
4,4,10f79k,Cotton in my mouth / Needles in my blood and b...,1,"[('needles', 0.5314), ('cotton', 0.4806), ('bl..."


In [3]:
def tokenize(sentence: str):
    return list(sentence.lower().split())

def vectorize(tokens):
    vocab, index = {}, 1  # start indexing from 1
    vocab['<pad>'] = 0  # add a padding token
    for token in tokens:
        token = token.strip()
        if token not in vocab:
            vocab[token] = index
            index += 1
    return vocab

def find_max_length(vectorized_poems):
    max_length = 0
    for poem in vectorized_poems:
        # if len(poem)
        max_length = max(max_length, len(poem))   
    return max_length

In [4]:
all_text = " ".join(data["processed_title"].to_list())
tokens = tokenize(all_text)
vocab_map = vectorize(tokens)

In [5]:
data["vectorized"] = data["processed_title"].apply(lambda x: [vocab_map[t.strip()] for t in x.lower().split()])
data = data[[a.count(4) <= 2 for a in data['vectorized']]]
data = data[data['vectorized'].apply(lambda x: len(x) <= 19)]
data.head()

Unnamed: 0.1,Unnamed: 0,id,processed_title,ups,keywords,vectorized
0,0,1020ac,There's nothing inside / There is nothing outs...,5,"[('inside', 0.5268), ('outside', 0.3751), ('se...","[1, 2, 3, 4, 5, 6, 2, 7, 8, 4, 9, 10, 11, 12, 13]"
1,1,107cob,From whole we crumble / Forever lost to chaos ...,1,"[('chaos', 0.5962), ('crumble', 0.4749), ('for...","[14, 15, 16, 17, 4, 18, 19, 20, 21, 4, 22, 23,..."
2,2,109a51,Indistinctiveness / Immeasurability / Capitalism,3,"[('indistinctiveness', 0.7664), ('immeasurabil...","[25, 4, 26, 4, 27]"
3,3,10eysi,Internet is down / Obligations go bye-bye / Of...,9,"[('office', 0.5033), ('obligations', 0.4663), ...","[28, 6, 29, 4, 30, 31, 32, 4, 33, 34]"
4,4,10f79k,Cotton in my mouth / Needles in my blood and b...,1,"[('needles', 0.5314), ('cotton', 0.4806), ('bl...","[35, 12, 36, 37, 4, 38, 12, 36, 39, 40, 41, 4,..."


In [6]:
data_vectorized_list = data["vectorized"].to_list()

In [7]:
padded_data = pad_sequences(data_vectorized_list, padding='post')

In [8]:
haikus = np.array(padded_data)
haikus

array([[    1,     2,     3, ...,     0,     0,     0],
       [   14,    15,    16, ...,     0,     0,     0],
       [   25,     4,    26, ...,     0,     0,     0],
       ...,
       [  572,  6718, 24145, ...,     0,     0,     0],
       [ 1138,  7924,  1572, ...,     0,     0,     0],
       [  422, 24147,     4, ...,     0,     0,     0]], dtype=int32)

In [67]:
window_size = 6

In [68]:
training_examples = []

for haiku in haikus:
    for i in range(len(haiku) - window_size + 1):
        input_words = haiku[i:i+window_size-1]
        output_word = haiku[i+window_size-1]
        training_examples.append((input_words, output_word))
training_examples[:20]

[(array([1, 2, 3, 4, 5], dtype=int32), 6),
 (array([2, 3, 4, 5, 6], dtype=int32), 2),
 (array([3, 4, 5, 6, 2], dtype=int32), 7),
 (array([4, 5, 6, 2, 7], dtype=int32), 8),
 (array([5, 6, 2, 7, 8], dtype=int32), 4),
 (array([6, 2, 7, 8, 4], dtype=int32), 9),
 (array([2, 7, 8, 4, 9], dtype=int32), 10),
 (array([ 7,  8,  4,  9, 10], dtype=int32), 11),
 (array([ 8,  4,  9, 10, 11], dtype=int32), 12),
 (array([ 4,  9, 10, 11, 12], dtype=int32), 13),
 (array([ 9, 10, 11, 12, 13], dtype=int32), 0),
 (array([10, 11, 12, 13,  0], dtype=int32), 0),
 (array([11, 12, 13,  0,  0], dtype=int32), 0),
 (array([12, 13,  0,  0,  0], dtype=int32), 0),
 (array([14, 15, 16, 17,  4], dtype=int32), 18),
 (array([15, 16, 17,  4, 18], dtype=int32), 19),
 (array([16, 17,  4, 18, 19], dtype=int32), 20),
 (array([17,  4, 18, 19, 20], dtype=int32), 21),
 (array([ 4, 18, 19, 20, 21], dtype=int32), 4),
 (array([18, 19, 20, 21,  4], dtype=int32), 22)]

In [273]:
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense, SimpleRNN, GRU
from sklearn.model_selection import train_test_split

In [275]:
import keras.backend as K

def masked_loss_function(y_true, y_pred):
    mask = K.cast(K.not_equal(y_true, 0), K.floatx())
    loss = K.sparse_categorical_crossentropy(y_true, y_pred)
    masked_loss = loss * mask
    return K.sum(masked_loss) / K.sum(mask)


In [327]:
vocab_size = len(vocab_map)
embedding_size = 128
input_length = window_size - 1
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=input_length),
    GRU(32),
    Dense(512, activation='relu'),
    Dense(128, activation='relu'),
    Dense(vocab_size, activation='softmax')
])


In [328]:
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)
model.compile(loss=masked_loss_function, optimizer=optimizer, metrics=['accuracy'])

In [329]:
x = np.array([example[0] for example in training_examples[:10000]])
y = np.array([example[1] for example in training_examples[:10000]])

In [330]:
model.fit(x, y, batch_size=32, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fd608dde890>

In [331]:
vocab_map_inv = dict([(value, key) for key, value in vocab_map.items()])
vocab_map_inv

{0: '<pad>',
 1: "there's",
 2: 'nothing',
 3: 'inside',
 4: '/',
 5: 'there',
 6: 'is',
 7: 'outside',
 8: 'me',
 9: 'i',
 10: 'search',
 11: 'on',
 12: 'in',
 13: 'hope.',
 14: 'from',
 15: 'whole',
 16: 'we',
 17: 'crumble',
 18: 'forever',
 19: 'lost',
 20: 'to',
 21: 'chaos',
 22: 'never',
 23: 'one',
 24: 'again',
 25: 'indistinctiveness',
 26: 'immeasurability',
 27: 'capitalism',
 28: 'internet',
 29: 'down',
 30: 'obligations',
 31: 'go',
 32: 'bye-bye',
 33: 'office',
 34: 'rejoices',
 35: 'cotton',
 36: 'my',
 37: 'mouth',
 38: 'needles',
 39: 'blood',
 40: 'and',
 41: 'bones',
 42: 'hammers',
 43: 'head.',
 44: 'mighty',
 45: 'hummingbird',
 46: 'drinks',
 47: 'a',
 48: "grapefruit's",
 49: 'blossom;',
 50: 'blots',
 51: 'out',
 52: 'an',
 53: 'airplane',
 54: 'downvotes',
 55: 'fall',
 56: 'as',
 57: 'sharp',
 58: 'snowflakes',
 59: 'of',
 60: 'early',
 61: 'winter,',
 62: 'execution.',
 63: 'seven',
 64: 'ships',
 65: 'tonight',
 66: 'guess',
 67: "should've",
 68: 'said'

In [332]:
def generate_poem(input_words):
    vectorized_input = [vocab_map[word] for word in input_words]
    print(f"User specified words {input_words} which were vectorized as {vectorized_input}")
    output_poem = input_words
    for i in range(19 - window_size - 1):
        input = np.array(vectorized_input[i:i+window_size -1]).reshape((1,window_size -1))
        prediction = np.array(model.predict(input, verbose=0))
        new_word_vector = (prediction[0].argsort()[::-1])[np.random.randint(0, 1)]
        vectorized_input.append(new_word_vector)
        new_word = vocab_map_inv[new_word_vector]
        output_poem.append(new_word)
    output = " ".join(output_poem)
    print(f"OUTPUT POEM: {output}")

        
                                   


In [333]:
generate_poem(["internet", "is", "down", "and", "i"])

User specified words ['internet', 'is', 'down', 'and', 'i'] which were vectorized as [28, 6, 29, 40, 9]
OUTPUT POEM: internet is down and i / pure war over it <pad> <pad> <pad> <pad> <pad> <pad> <pad>


In [334]:
generate_poem(["fun", "today", "and", "tomorrow", "together"])

User specified words ['fun', 'today', 'and', 'tomorrow', 'together'] which were vectorized as [158, 648, 40, 1660, 2213]
OUTPUT POEM: fun today and tomorrow together / when i had that eat my dashboard / and into the


In [335]:
generate_poem(["the", "bear", "and", "i", "cherry"])

User specified words ['the', 'bear', 'and', 'i', 'cherry'] which were vectorized as [85, 1014, 40, 9, 1264]
OUTPUT POEM: the bear and i cherry / tiny it's crickets / at your daily rest of me. <pad>


In [336]:
generate_poem(["i", "am", "coming", "to", "a"])

User specified words ['i', 'am', 'coming', 'to', 'a'] which were vectorized as [9, 155, 2353, 20, 47]
OUTPUT POEM: i am coming to a worlds <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


In [262]:
def pick_random_words(vocab_map, number):
    vocabs = list(vocab_map.keys())
    inds = list(np.random.randint(0, len(vocabs), number))
    output_words = []
    for i in inds:
        output_words.append(vocabs[i])
    return output_words

In [341]:
generate_poem(pick_random_words(vocab_map, 5))

User specified words ['gnarled,', 'claiming', 'rasping', 'caffine', 'limbs'] which were vectorized as [4414, 11891, 12004, 11931, 5376]
OUTPUT POEM: gnarled, claiming rasping caffine limbs / under a gift, running, bye / never-ending oceans / pretending that


In [368]:
generate_poem(pick_random_words(vocab_map, 5))

User specified words ['squeeze', 'mask', 'tree-', 'loves!', 'routine'] which were vectorized as [13581, 3258, 21235, 19684, 6553]
OUTPUT POEM: squeeze mask tree- loves! routine / congealed the false reality / then whiskey calls to you <pad>
