In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences

2023-05-01 19:38:25.641441: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
data = pd.read_csv('../data/haiku.csv')
data = data.replace("/", " / ", regex=True)
data = data.dropna()
split_by_row = data["processed_title"].str.split(" / ", n = 3, expand = True)
data["row_1"] = split_by_row[0]
data["row_2"] = split_by_row[1]
data["row_3"] = split_by_row[2]
data.head() 

Unnamed: 0.1,Unnamed: 0,id,processed_title,ups,keywords,row_1,row_2,row_3
0,0,1020ac,There's nothing inside / There is nothing outs...,5,"[('inside', 0.5268), ('outside', 0.3751), ('se...",There's nothing inside,There is nothing outside me,I search on in hope.
1,1,107cob,From whole we crumble / Forever lost to chaos ...,1,"[('chaos', 0.5962), ('crumble', 0.4749), ('for...",From whole we crumble,Forever lost to chaos,Never one again
2,2,109a51,Indistinctiveness / Immeasurability / Capitalism,3,"[('indistinctiveness', 0.7664), ('immeasurabil...",Indistinctiveness,Immeasurability,Capitalism
3,3,10eysi,Internet is down / Obligations go bye-bye / Of...,9,"[('office', 0.5033), ('obligations', 0.4663), ...",Internet is down,Obligations go bye-bye,Office rejoices
4,4,10f79k,Cotton in my mouth / Needles in my blood and b...,1,"[('needles', 0.5314), ('cotton', 0.4806), ('bl...",Cotton in my mouth,Needles in my blood and bones,Hammers in my head.


In [3]:
def tokenize(row: str):
    tokens = row.lower().split() # lower punctuations
    tokens = [process_token(t) for t in tokens] # take off punctuations 
    return tokens # returns an array words in a row 

def process_token(token: str):
    return re.sub(r'[^\w\s]', '', token.strip()) # gets rid of punctuations

def vectorize(tokens):
    vocab, index = {}, 1
    vocab['<pad>'] = 0
    for token in tokens:
        token = token.strip()
        if token not in vocab:
            vocab[token] = index
            index += 1
    return vocab # returns an dictionary (words to indices)

In [4]:
# row1 = " ".join(data["row_1"].to_list())
# row2 = " ".join(data["row_2"].to_list())
# row3 = " ".join(data["row_3"].to_list())
# # tokenize each row 
# tokens1 = tokenize(row1)
# tokens2 = tokenize(row2)
# tokens3 = tokenize(row3)
# # vectorizing each row
# vocab_map1 = vectorize(row1)
# vocab_map2 = vectorize(row2)
# vocab_map3 = vectorize(row3)

# for the entire poem
all_dictionary = " ".join(data["processed_title"].to_list())
tokens = tokenize(all_dictionary)
all_vocab_map = vectorize(tokens)

In [5]:
all_vocab_map

{'<pad>': 0,
 'theres': 1,
 'nothing': 2,
 'inside': 3,
 '': 4,
 'there': 5,
 'is': 6,
 'outside': 7,
 'me': 8,
 'i': 9,
 'search': 10,
 'on': 11,
 'in': 12,
 'hope': 13,
 'from': 14,
 'whole': 15,
 'we': 16,
 'crumble': 17,
 'forever': 18,
 'lost': 19,
 'to': 20,
 'chaos': 21,
 'never': 22,
 'one': 23,
 'again': 24,
 'indistinctiveness': 25,
 'immeasurability': 26,
 'capitalism': 27,
 'internet': 28,
 'down': 29,
 'obligations': 30,
 'go': 31,
 'byebye': 32,
 'office': 33,
 'rejoices': 34,
 'cotton': 35,
 'my': 36,
 'mouth': 37,
 'needles': 38,
 'blood': 39,
 'and': 40,
 'bones': 41,
 'hammers': 42,
 'head': 43,
 'mighty': 44,
 'hummingbird': 45,
 'drinks': 46,
 'a': 47,
 'grapefruits': 48,
 'blossom': 49,
 'blots': 50,
 'out': 51,
 'an': 52,
 'airplane': 53,
 'downvotes': 54,
 'fall': 55,
 'as': 56,
 'sharp': 57,
 'snowflakes': 58,
 'of': 59,
 'early': 60,
 'winter': 61,
 'execution': 62,
 'seven': 63,
 'ships': 64,
 'tonight': 65,
 'guess': 66,
 'shouldve': 67,
 'said': 68,
 'goodby

In [6]:
data["vectorized"] = data["processed_title"].apply(lambda x: [all_vocab_map[t] for t in tokenize(x)])
data["row_vector_1"] =  data["row_1"].apply(lambda x: [all_vocab_map[t] for t in tokenize(x)])
data["row_vector_2"] = data["row_2"].apply(lambda x: [all_vocab_map[t] for t in tokenize(x)])
data["row_vector_3"] = data["row_3"].apply(lambda x: [all_vocab_map[t] for t in tokenize(x)])
data = data[[a.count(4) <= 2 for a in data['vectorized']]]
data = data[data['vectorized'].apply(lambda x: len(x) <= 19)]
data.head()

Unnamed: 0.1,Unnamed: 0,id,processed_title,ups,keywords,row_1,row_2,row_3,vectorized,row_vector_1,row_vector_2,row_vector_3
0,0,1020ac,There's nothing inside / There is nothing outs...,5,"[('inside', 0.5268), ('outside', 0.3751), ('se...",There's nothing inside,There is nothing outside me,I search on in hope.,"[1, 2, 3, 4, 5, 6, 2, 7, 8, 4, 9, 10, 11, 12, 13]","[1, 2, 3]","[5, 6, 2, 7, 8]","[9, 10, 11, 12, 13]"
1,1,107cob,From whole we crumble / Forever lost to chaos ...,1,"[('chaos', 0.5962), ('crumble', 0.4749), ('for...",From whole we crumble,Forever lost to chaos,Never one again,"[14, 15, 16, 17, 4, 18, 19, 20, 21, 4, 22, 23,...","[14, 15, 16, 17]","[18, 19, 20, 21]","[22, 23, 24]"
2,2,109a51,Indistinctiveness / Immeasurability / Capitalism,3,"[('indistinctiveness', 0.7664), ('immeasurabil...",Indistinctiveness,Immeasurability,Capitalism,"[25, 4, 26, 4, 27]",[25],[26],[27]
3,3,10eysi,Internet is down / Obligations go bye-bye / Of...,9,"[('office', 0.5033), ('obligations', 0.4663), ...",Internet is down,Obligations go bye-bye,Office rejoices,"[28, 6, 29, 4, 30, 31, 32, 4, 33, 34]","[28, 6, 29]","[30, 31, 32]","[33, 34]"
4,4,10f79k,Cotton in my mouth / Needles in my blood and b...,1,"[('needles', 0.5314), ('cotton', 0.4806), ('bl...",Cotton in my mouth,Needles in my blood and bones,Hammers in my head.,"[35, 12, 36, 37, 4, 38, 12, 36, 39, 40, 41, 4,...","[35, 12, 36, 37]","[38, 12, 36, 39, 40, 41]","[42, 12, 36, 43]"


In [7]:
data_vectorized_list = data["vectorized"].to_list()
padded_data = pad_sequences(data_vectorized_list, padding='post')
haikus = np.array(padded_data)

row1_vectorized_list = data["row_vector_1"].to_list()
padded_data = pad_sequences(data_vectorized_list, padding='post')
row1_haiku = np.array(padded_data)

row2_vectorized_list = data["row_vector_2"].to_list()
padded_data = pad_sequences(data_vectorized_list, padding='post')
row2_haiku = np.array(padded_data)

row3_vectorized_list = data["row_vector_3"].to_list()
padded_data = pad_sequences(data_vectorized_list, padding='post')
row3_haiku = np.array(padded_data)

In [8]:
window_size = 2

In [9]:
training_row1 = []
# row 1
for word1 in row1_haiku:
  for i in range(len(word1) - window_size + 1):
    input_words = word1[i: i+window_size - 1]
    output_words = word1[i + window_size - 1]
    training_row1.append((input_words, output_words))

training_row2 = []
# row 2
for word2 in row1_haiku:
  for i in range(len(word2) - window_size + 1):
    input_words = word2[i: i+window_size - 1]
    output_words = word2[i + window_size - 1]
    training_row2.append((input_words, output_words))

training_row3 = []
# row 2
for word3 in row1_haiku:
  for i in range(len(word3) - window_size + 1):
    input_words = word3[i: i+window_size - 1]
    output_words = word3[i + window_size - 1]
    training_row3.append((input_words, output_words))

In [10]:
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense, SimpleRNN, GRU
from sklearn.model_selection import train_test_split
tf.keras.backend.clear_session()

In [11]:
vocab_size = len(all_vocab_map)
embedding_size = 128
input_length = window_size - 1
model1 = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=input_length),
    GRU(32),
    Dense(512, activation='relu'),
    Dense(128, activation='relu'),
    Dense(vocab_size, activation='softmax')
])

model2 = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=input_length),
    GRU(32),
    Dense(512, activation='relu'),
    Dense(128, activation='relu'),
    Dense(vocab_size, activation='softmax')
])

model3 = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=input_length),
    GRU(32),
    Dense(512, activation='relu'),
    Dense(128, activation='relu'),
    Dense(vocab_size, activation='softmax')
])


2023-05-01 19:38:39.239939: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
import keras.backend as K
def masked_loss(y_true, y_pred):
    mask = K.cast(K.not_equal(y_true, 0), K.floatx())
    loss = K.sparse_categorical_crossentropy(y_true, y_pred)
    masked_loss = loss * mask
    return K.sum(masked_loss) / K.sum(mask)

In [13]:
optimizer1 = tf.keras.optimizers.Adam(learning_rate = 0.001)
optimizer2 = tf.keras.optimizers.Adam(learning_rate = 0.001)
optimizer3 = tf.keras.optimizers.Adam(learning_rate = 0.001)
model1.compile(loss=masked_loss, optimizer=optimizer1, metrics=['accuracy'])
model2.compile(loss=masked_loss, optimizer=optimizer2, metrics=['accuracy'])
model3.compile(loss=masked_loss, optimizer=optimizer3, metrics=['accuracy'])

In [14]:
x1 = np.array([example[0] for example in training_row1[:15000]])
y1 = np.array([example[1] for example in training_row1[:15000]])
x_train1, x_test1, y_train1, y_test1 = train_test_split(x1, y1, test_size= 0.1)

x2 = np.array([example[0] for example in training_row2[:15000]])
y2 = np.array([example[1] for example in training_row2[:15000]])
x_train2, x_test2, y_train2, y_test2 = train_test_split(x2, y2, test_size= 0.1)

x3 = np.array([example[0] for example in training_row3[:15000]])
y3 = np.array([example[1] for example in training_row3[:15000]])
x_train3, x_test3, y_train3, y_test3 = train_test_split(x3, y3, test_size= 0.1)

In [15]:
tf.keras.backend.clear_session()
model1.fit(x_train1, y_train1, batch_size=128, epochs=3, validation_data=(x_test1, y_test1))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f8af8e67550>

In [16]:
tf.keras.backend.clear_session()
model2.fit(x_train2, y_train2, batch_size=128, epochs=3, validation_data=(x_test2, y_test2))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f8af785c0a0>

In [17]:
tf.keras.backend.clear_session()
model3.fit(x_train3, y_train3, batch_size=128, epochs=3, validation_data=(x_test3, y_test3))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f8ae78f07f0>

In [18]:
vocab_map_inv = dict([(value, key) for key, value in all_vocab_map.items()])
vocab_map_inv

{0: '<pad>',
 1: 'theres',
 2: 'nothing',
 3: 'inside',
 4: '',
 5: 'there',
 6: 'is',
 7: 'outside',
 8: 'me',
 9: 'i',
 10: 'search',
 11: 'on',
 12: 'in',
 13: 'hope',
 14: 'from',
 15: 'whole',
 16: 'we',
 17: 'crumble',
 18: 'forever',
 19: 'lost',
 20: 'to',
 21: 'chaos',
 22: 'never',
 23: 'one',
 24: 'again',
 25: 'indistinctiveness',
 26: 'immeasurability',
 27: 'capitalism',
 28: 'internet',
 29: 'down',
 30: 'obligations',
 31: 'go',
 32: 'byebye',
 33: 'office',
 34: 'rejoices',
 35: 'cotton',
 36: 'my',
 37: 'mouth',
 38: 'needles',
 39: 'blood',
 40: 'and',
 41: 'bones',
 42: 'hammers',
 43: 'head',
 44: 'mighty',
 45: 'hummingbird',
 46: 'drinks',
 47: 'a',
 48: 'grapefruits',
 49: 'blossom',
 50: 'blots',
 51: 'out',
 52: 'an',
 53: 'airplane',
 54: 'downvotes',
 55: 'fall',
 56: 'as',
 57: 'sharp',
 58: 'snowflakes',
 59: 'of',
 60: 'early',
 61: 'winter',
 62: 'execution',
 63: 'seven',
 64: 'ships',
 65: 'tonight',
 66: 'guess',
 67: 'shouldve',
 68: 'said',
 69: 'go

In [19]:
def generate_poem(input_words: list):
    while len(input_words) < window_size - 1:
        input_words.insert(0, "<pad>")
    vectorized_input = [all_vocab_map[word] for word in input_words]
    print(f"User specified words {input_words} which were vectorized as {vectorized_input}")
    output_poem = input_words
    
    for i in range(5 - window_size - 1):
        input = np.array(vectorized_input[i:i+window_size-1]).reshape((1, window_size-1))
        prediction = np.array(model1.predict(input, verbose=0))
        new_word_vector = (prediction[0].argsort()[::-1])[0]
        vectorized_input.append(new_word_vector)
        new_word = vocab_map_inv[new_word_vector]
        print(new_word)
        output_poem.append(new_word)
        print("run")
    output_poem.append(" / ")
    for i in range(7 - window_size - 1):
        input = np.array(vectorized_input[i:i+window_size-1]).reshape((1, window_size-1))
        prediction = np.array(model2.predict(input, verbose=0))
        new_word_vector = (prediction[0].argsort()[::-1])[0]
        vectorized_input.append(new_word_vector)
        new_word = vocab_map_inv[new_word_vector]
        output_poem.append(new_word)
    output_poem.append(" / ")
    for i in range(5 - window_size - 1):
        input = np.array(vectorized_input[i:i+window_size-1]).reshape((1, window_size-1))
        prediction = np.array(model3.predict(input, verbose=0))
        new_word_vector = (prediction[0].argsort()[::-1])[0]
        vectorized_input.append(new_word_vector)
        new_word = vocab_map_inv[new_word_vector]
        output_poem.append(new_word)

    output = " ".join(output_poem)
    print(f"OUTPUT POEM: {output}")

In [20]:
generate_poem(["you","are", "a", "movie"])

User specified words ['you', 'are', 'a', 'movie'] which were vectorized as [113, 76, 47, 4400]

run

run
OUTPUT POEM: you are a movie    /       /   
