In [79]:
import tensorflow as tf
import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [80]:
data = pd.read_csv('../data/haiku.csv')
data = data.replace("/", " / ", regex=True)
data = data.dropna()
data.head()

Unnamed: 0.1,Unnamed: 0,id,processed_title,ups,keywords
0,0,1020ac,There's nothing inside / There is nothing outs...,5,"[('inside', 0.5268), ('outside', 0.3751), ('se..."
1,1,107cob,From whole we crumble / Forever lost to chaos ...,1,"[('chaos', 0.5962), ('crumble', 0.4749), ('for..."
2,2,109a51,Indistinctiveness / Immeasurability / Capitalism,3,"[('indistinctiveness', 0.7664), ('immeasurabil..."
3,3,10eysi,Internet is down / Obligations go bye-bye / Of...,9,"[('office', 0.5033), ('obligations', 0.4663), ..."
4,4,10f79k,Cotton in my mouth / Needles in my blood and b...,1,"[('needles', 0.5314), ('cotton', 0.4806), ('bl..."


In [81]:
def tokenize(sentence: str):
    tokens = sentence.lower().split()
    tokens = [process_token(t) for t in tokens]
    return tokens

def process_token(token: str):
    if token.strip() == "/":
        return token
    return re.sub(r'[^\w\s]', '', token.strip())

def vectorize(tokens):
    vocab, index = {}, 1
    vocab['<pad>'] = 0
    for token in tokens:
        token = token.strip()
        if token not in vocab:
            vocab[token] = index
            index += 1
    return vocab

def find_max_length(vectorized_poems):
    max_length = 0
    for poem in vectorized_poems:
        max_length = max(max_length, len(poem))   
    return max_length

In [82]:
all_text = " ".join(data["processed_title"].to_list())
tokens = tokenize(all_text)
vocab_map = vectorize(tokens)

In [83]:
vocab_map

{'<pad>': 0,
 'theres': 1,
 'nothing': 2,
 'inside': 3,
 '/': 4,
 'there': 5,
 'is': 6,
 'outside': 7,
 'me': 8,
 'i': 9,
 'search': 10,
 'on': 11,
 'in': 12,
 'hope': 13,
 'from': 14,
 'whole': 15,
 'we': 16,
 'crumble': 17,
 'forever': 18,
 'lost': 19,
 'to': 20,
 'chaos': 21,
 'never': 22,
 'one': 23,
 'again': 24,
 'indistinctiveness': 25,
 'immeasurability': 26,
 'capitalism': 27,
 'internet': 28,
 'down': 29,
 'obligations': 30,
 'go': 31,
 'byebye': 32,
 'office': 33,
 'rejoices': 34,
 'cotton': 35,
 'my': 36,
 'mouth': 37,
 'needles': 38,
 'blood': 39,
 'and': 40,
 'bones': 41,
 'hammers': 42,
 'head': 43,
 'mighty': 44,
 'hummingbird': 45,
 'drinks': 46,
 'a': 47,
 'grapefruits': 48,
 'blossom': 49,
 'blots': 50,
 'out': 51,
 'an': 52,
 'airplane': 53,
 'downvotes': 54,
 'fall': 55,
 'as': 56,
 'sharp': 57,
 'snowflakes': 58,
 'of': 59,
 'early': 60,
 'winter': 61,
 'execution': 62,
 'seven': 63,
 'ships': 64,
 'tonight': 65,
 'guess': 66,
 'shouldve': 67,
 'said': 68,
 'goodb

In [84]:
data["vectorized"] = data["processed_title"].apply(lambda x: [vocab_map[t] for t in tokenize(x)])
data = data[[a.count(4) <= 2 for a in data['vectorized']]]
max_length = find_max_length(data["vectorized"])
data = data[data['vectorized'].apply(lambda x: len(x) <= 19)]
data.head()

Unnamed: 0.1,Unnamed: 0,id,processed_title,ups,keywords,vectorized
0,0,1020ac,There's nothing inside / There is nothing outs...,5,"[('inside', 0.5268), ('outside', 0.3751), ('se...","[1, 2, 3, 4, 5, 6, 2, 7, 8, 4, 9, 10, 11, 12, 13]"
1,1,107cob,From whole we crumble / Forever lost to chaos ...,1,"[('chaos', 0.5962), ('crumble', 0.4749), ('for...","[14, 15, 16, 17, 4, 18, 19, 20, 21, 4, 22, 23,..."
2,2,109a51,Indistinctiveness / Immeasurability / Capitalism,3,"[('indistinctiveness', 0.7664), ('immeasurabil...","[25, 4, 26, 4, 27]"
3,3,10eysi,Internet is down / Obligations go bye-bye / Of...,9,"[('office', 0.5033), ('obligations', 0.4663), ...","[28, 6, 29, 4, 30, 31, 32, 4, 33, 34]"
4,4,10f79k,Cotton in my mouth / Needles in my blood and b...,1,"[('needles', 0.5314), ('cotton', 0.4806), ('bl...","[35, 12, 36, 37, 4, 38, 12, 36, 39, 40, 41, 4,..."


In [85]:
data_vectorized_list = data["vectorized"].to_list()
padded_data = pad_sequences(data_vectorized_list, padding='post')
haikus = np.array(padded_data)
haikus

array([[    1,     2,     3, ...,     0,     0,     0],
       [   14,    15,    16, ...,     0,     0,     0],
       [   25,     4,    26, ...,     0,     0,     0],
       ...,
       [  549,  5557,   424, ...,     0,     0,     0],
       [ 1084,  6422,  1464, ...,     0,     0,     0],
       [  414, 10429,     4, ...,     0,     0,     0]], dtype=int32)

In [254]:
window_size = 4

In [255]:
training_examples = []

for haiku in haikus:
    for i in range(len(haiku) - window_size + 1):
        input_words = haiku[i:i+window_size-1]
        output_word = haiku[i+window_size-1]
        training_examples.append((input_words, output_word))
training_examples[:20]

[(array([1, 2, 3], dtype=int32), 4),
 (array([2, 3, 4], dtype=int32), 5),
 (array([3, 4, 5], dtype=int32), 6),
 (array([4, 5, 6], dtype=int32), 2),
 (array([5, 6, 2], dtype=int32), 7),
 (array([6, 2, 7], dtype=int32), 8),
 (array([2, 7, 8], dtype=int32), 4),
 (array([7, 8, 4], dtype=int32), 9),
 (array([8, 4, 9], dtype=int32), 10),
 (array([ 4,  9, 10], dtype=int32), 11),
 (array([ 9, 10, 11], dtype=int32), 12),
 (array([10, 11, 12], dtype=int32), 13),
 (array([11, 12, 13], dtype=int32), 0),
 (array([12, 13,  0], dtype=int32), 0),
 (array([13,  0,  0], dtype=int32), 0),
 (array([0, 0, 0], dtype=int32), 0),
 (array([14, 15, 16], dtype=int32), 17),
 (array([15, 16, 17], dtype=int32), 4),
 (array([16, 17,  4], dtype=int32), 18),
 (array([17,  4, 18], dtype=int32), 19)]

In [256]:
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense, SimpleRNN, GRU
from sklearn.model_selection import train_test_split
tf.keras.backend.clear_session()

In [257]:
vocab_size = len(vocab_map)
embedding_size = 128
input_length = window_size - 1
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=input_length),
    GRU(32),
    Dense(512, activation='relu'),
    Dense(128, activation='relu'),
    Dense(vocab_size, activation='softmax')
])


In [258]:
import keras.backend as K
def masked_loss(y_true, y_pred):
    mask = K.cast(K.not_equal(y_true, 0), K.floatx())
    loss = K.sparse_categorical_crossentropy(y_true, y_pred)
    masked_loss = loss * mask
    return K.sum(masked_loss) / K.sum(mask)

In [259]:
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)
model.compile(loss=masked_loss, optimizer=optimizer, metrics=['accuracy'])

In [260]:
x = np.array([example[0] for example in training_examples[:30000]])
y = np.array([example[1] for example in training_examples[:30000]])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.1)

In [261]:
tf.keras.backend.clear_session()
model.fit(x_train, y_train, batch_size=128, epochs=10, validation_data=(x_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

In [243]:
vocab_map_inv = dict([(value, key) for key, value in vocab_map.items()])

In [248]:
def generate_poem(input_words: list):
    while len(input_words) < window_size - 1:
        input_words.insert(0, "<pad>")
    vectorized_input = [vocab_map[word] for word in input_words]
    print(f"User specified words {input_words} which were vectorized as {vectorized_input}")
    output_poem = input_words
    
    for i in range(19 - window_size - 1):
        input = np.array(vectorized_input[i:i+window_size-1]).reshape((1, window_size-1))
        prediction = np.array(model.predict(input, verbose=0))
        # new_word_vector = (prediction[0].argsort()[::-1])[np.random.randint(0,1)]
        new_word_vector = (prediction[0].argsort()[::-1])[0]
        vectorized_input.append(new_word_vector)
        new_word = vocab_map_inv[new_word_vector]
        output_poem.append(new_word)
    output = " ".join(output_poem)
    print(f"OUTPUT POEM: {output}")

In [249]:
generate_poem(["once", "i"])

User specified words ['once', 'i'] which were vectorized as [1587, 9]
OUTPUT POEM: once i have been a haiku / i am not a haiku / i am not a


In [250]:
generate_poem(["college", "students", "are"])

User specified words ['college', 'students', 'are'] which were vectorized as [3571, 4015, 76]
OUTPUT POEM: college students are <pad> the <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


In [251]:
generate_poem(["fun", "today", "and", "tomorrow", "together"])

User specified words ['fun', 'today', 'and', 'tomorrow', 'together'] which were vectorized as [156, 620, 40, 1543, 2013]
OUTPUT POEM: fun today and tomorrow together / the / to a worst been the bad / a same / i haiku


In [252]:
generate_poem(["the", "bear", "and", "i", "will"])

User specified words ['the', 'bear', 'and', 'i', 'will'] which were vectorized as [85, 968, 40, 9, 160]
OUTPUT POEM: the bear and i will breaks flowers see be my / / <pad> life the and <pad> <pad> same fit


In [253]:
generate_poem(["you","are", "a", "movie"])

User specified words ['you', 'are', 'a', 'movie'] which were vectorized as [113, 76, 47, 4401]
OUTPUT POEM: you are a movie not fool / / <pad> but and <pad> me not <pad> me / <pad> me


In [236]:
def pick_random_words(vocab_map, number):
    vocabs = list(vocab_map.keys())
    inds = list(np.random.randint(0, len(vocabs), number))
    output_words = []
    for i in inds:
        output_words.append(vocabs[i])
    return output_words

In [238]:
generate_poem(pick_random_words(vocab_map, window_size-1))

User specified words ['sesquidecember', 'fairytale'] which were vectorized as [11301, 14997]
OUTPUT POEM: sesquidecember fairytale / when are be <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


In [145]:
generate_poem(pick_random_words(vocab_map, 5))

User specified words ['coulnt', 'rockefeller', 'cashmere', 'designing', 'weiner'] which were vectorized as [16218, 10467, 7594, 12341, 668]
OUTPUT POEM: coulnt rockefeller cashmere designing weiner / but i just be things <pad> <pad> <pad> <pad> <pad> <pad>


In [240]:
generate_poem(["hi", "park", "ball", "stick", "water"])

User specified words ['hi', 'park', 'ball', 'stick', 'water'] which were vectorized as [4104, 919, 4036, 2121, 500]
OUTPUT POEM: hi park ball stick water <pad> of snow / <pad> on / behind <pad> and happy in <pad> on /
