In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences

2023-04-28 10:25:17.684943: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def tokenize(sentence: str):
    """Lower-case `sentence`, split it on whitespace and clean every piece.

    Each whitespace-separated piece is passed through `process_token`; the
    cleaned pieces are returned as a list in their original order.
    """
    return [process_token(piece) for piece in sentence.lower().split()]

def process_token(token: str):
    """Normalise a single token.

    A token that is just "/" (ignoring surrounding whitespace) is returned
    unchanged. Any other token is stripped and has every character that is
    neither a word character nor whitespace removed.
    """
    stripped = token.strip()
    if stripped != "/":
        return re.sub(r'[^\w\s]', '', stripped)
    return token

def vectorize(tokens):
    """Build a token -> integer-id vocabulary from an iterable of tokens.

    '<pad>' always gets id 0. Every other distinct (stripped) token gets the
    next free id, in order of first appearance.
    """
    vocab = {'<pad>': 0}
    for raw in tokens:
        # Ids are handed out densely, so len(vocab) is always the next free id.
        vocab.setdefault(raw.strip(), len(vocab))
    return vocab

def find_max_length(vectorized_poems):
    """Return the length of the longest poem, or 0 when there are no poems."""
    return max((len(poem) for poem in vectorized_poems), default=0)

In [5]:
# Load the haiku table, put spaces around every "/" so that it survives
# whitespace tokenisation as its own token, and drop rows with missing values.
data = (
    pd.read_csv('../data/haiku.csv')
    .replace("/", " / ", regex=True)
    .dropna()
)
data.head()

Unnamed: 0.1,Unnamed: 0,id,processed_title,ups,keywords
0,0,1020ac,There's nothing inside / There is nothing outs...,5,"[('inside', 0.5268), ('outside', 0.3751), ('se..."
1,1,107cob,From whole we crumble / Forever lost to chaos ...,1,"[('chaos', 0.5962), ('crumble', 0.4749), ('for..."
2,2,109a51,Indistinctiveness / Immeasurability / Capitalism,3,"[('indistinctiveness', 0.7664), ('immeasurabil..."
3,3,10eysi,Internet is down / Obligations go bye-bye / Of...,9,"[('office', 0.5033), ('obligations', 0.4663), ..."
4,4,10f79k,Cotton in my mouth / Needles in my blood and b...,1,"[('needles', 0.5314), ('cotton', 0.4806), ('bl..."


In [6]:
# Build the vocabulary from the whole corpus: glue every poem into one string,
# tokenise it once, then assign an integer id to each distinct token.
all_text = " ".join(data["processed_title"])
tokens = tokenize(all_text)
vocab_map = vectorize(tokens)

In [8]:
# Turn every poem into its list of vocabulary ids.
data["vectorized"] = data["processed_title"].apply(lambda x: [vocab_map[t] for t in tokenize(x)])

# Keep only poems with at most two "/" line separators. The separator id is
# looked up instead of hard-coding 4, which only matched because "/" happened
# to be the fourth distinct token of the first poem. `.get` yields None when
# the corpus has no "/", in which case every count is 0 and nothing is dropped.
separator_id = vocab_map.get("/")
data = data[[a.count(separator_id) <= 2 for a in data['vectorized']]]

# Longest poem *before* the length cap below (informational only).
max_length = find_max_length(data["vectorized"])

# Cap the poem length and pad to exactly that width. Without `maxlen`,
# pad_sequences pads to the longest surviving poem, which may be shorter than
# the fixed width the later slicing and the model's Input shapes assume.
MAX_POEM_LENGTH = 19
data = data[data['vectorized'].apply(lambda x: len(x) <= MAX_POEM_LENGTH)]
data_vectorized_list = data["vectorized"].to_list()
padded_data = pad_sequences(data_vectorized_list, maxlen=MAX_POEM_LENGTH, padding='post')
haikus = np.array(padded_data)
haikus

array([[    1,     2,     3, ...,     0,     0,     0],
       [   14,    15,    16, ...,     0,     0,     0],
       [   25,     4,    26, ...,     0,     0,     0],
       ...,
       [  549,  5557,   424, ...,     0,     0,     0],
       [ 1084,  6422,  1464, ...,     0,     0,     0],
       [  414, 10429,     4, ...,     0,     0,     0]], dtype=int32)

In [9]:
# Next-token pairs: x is every column but the last, y is every column but the
# first. Slicing relative to the end (instead of the literal 19-1) keeps x and
# y the same width whatever the padded width of `haikus` turns out to be.
x = haikus[:, :-1]
y = haikus[:, 1:]

In [10]:
from sklearn.model_selection import train_test_split
# Seed the shuffle so that re-running the notebook yields the same split.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [11]:
# training_examples = []
# window_size = 4
# for haiku in haikus:
#     for i in range(len(haiku) - window_size + 1):
#         input_words = haiku[i:i+window_size-1]
#         output_word = haiku[i+window_size-1]
#         training_examples.append((input_words, output_word))
# training_examples[:20]

## Transformer

In [12]:
import tensorflow as tf
from tensorflow import keras
from keras.layers import Input, Embedding, Dense, Dropout, LayerNormalization
from keras.layers import MultiHeadAttention, TimeDistributed, Add, Concatenate
from keras.models import Model

In [13]:
# Single-block encoder/decoder Transformer built with the Keras functional API.
#
# NOTE(review): there is no positional encoding and no padding mask
# (`mask_zero`), so the network is order-agnostic apart from the causal mask
# below and is also trained on the padded positions. Left as is here; worth
# revisiting.
embedding_dim = 32
hidden_dim = 256

# Both Embedding layers and the softmax use len(vocab_map)+1 classes, i.e. one
# spare id beyond the vocabulary (ids 0 .. len(vocab_map)-1).

# Encoder
# 16 and 15 presumably correspond to the 19-wide padded poems after the
# `[:, :-3]` / `[:, 3:]` slicing and the teacher-forcing shift - TODO confirm.
encoder_inputs = Input(shape=(16,), name='encoder_inputs')
enc = Embedding(len(vocab_map)+1, embedding_dim)(encoder_inputs)
# Self-attention layer (residual connection + layer norm)
attn_output = MultiHeadAttention(num_heads=4, key_dim=embedding_dim)(enc, enc)
enc = Add()([enc, attn_output])
enc = LayerNormalization(epsilon=1e-6)(enc)
# Feed-forward layer (residual connection + layer norm)
ff_output = TimeDistributed(Dense(hidden_dim, activation='relu'))(enc)
ff_output = TimeDistributed(Dense(embedding_dim))(ff_output)
enc = Add()([enc, ff_output])
encoder_outputs = LayerNormalization(epsilon=1e-6)(enc)
# -----
# Decoder
decoder_inputs = Input(shape=(15,), name='decoder_inputs')
dec = Embedding(len(vocab_map)+1, embedding_dim)(decoder_inputs)
# Self-attention layer. The causal mask stops position i from attending to
# later positions; without it the decoder can read the very token it is being
# trained to predict (the targets are the decoder inputs shifted by one).
# Requires a Keras version whose MultiHeadAttention call accepts
# `use_causal_mask` (2.10+, to my knowledge - TODO confirm installed version).
attn_output1 = MultiHeadAttention(num_heads=4, key_dim=embedding_dim)(dec, dec, use_causal_mask=True)
dec = Add()([dec, attn_output1])
dec = LayerNormalization(epsilon=1e-6)(dec)
# Encoder-decoder attention layer: decoder states query the encoder output.
attn_output2 = MultiHeadAttention(num_heads=4, key_dim=embedding_dim)(dec, encoder_outputs)
dec = Add()([dec, attn_output2])
dec = LayerNormalization(epsilon=1e-6)(dec)
# Per-position probability distribution over the vocabulary.
decoder_outputs = TimeDistributed(Dense(len(vocab_map)+1, activation='softmax'))(dec)

2023-04-28 10:27:02.022116: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
# Define the model
# Two inputs (encoder ids, decoder ids) -> per-position vocabulary distribution.
model = Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
)

In [15]:
# x drops the last `shift` columns of every poem, y drops the first `shift`
# columns, so both have the same width and y is x moved `shift` tokens ahead.
shift = 3
x = haikus[:, :-shift]
y = haikus[:, shift:]

In [16]:
# Reproducible 80/20 train/test split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Targets are integer token ids and the model ends in a softmax, hence the
# sparse categorical cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
)

In [18]:
# Teacher forcing: the decoder is fed y without its last token and is trained
# to output y without its first token (the same sequence shifted by one).
decoder_in = y_train[:, :-1]
decoder_target = y_train[:, 1:]
model.fit(x=[x_train, decoder_in], y=decoder_target, batch_size=256, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f9d70780550>

In [19]:
# Reverse lookup: integer id -> token.
vocab_map_inv = {token_id: token for token, token_id in vocab_map.items()}

In [23]:
def generate_poem(input_words: list):
    """Greedily extend a prompt into a poem and print the result.

    The prompt words are mapped to ids with the global `vocab_map`, then up to
    13 further words are generated one at a time with the global `model`. At
    each step the encoder receives everything produced so far (prompt plus
    generated words, padded with the '<pad>' id) and the decoder receives the
    last prompt word followed by the words generated so far, mirroring the
    shifted-by-one decoder input used for training.

    Args:
        input_words: prompt tokens. Each must be a key of `vocab_map`, so they
            need to be normalised the same way as the vocabulary. The list is
            not modified.

    Returns:
        None. The prompt and the finished poem are printed.

    Raises:
        KeyError: if a prompt word is not in the vocabulary.
        ValueError: if the prompt is longer than the encoder input (16 tokens).
    """
    encoder_len = 16     # must match the shape of the `encoder_inputs` Input
    decoder_len = 15     # must match the shape of the `decoder_inputs` Input
    max_new_words = 13
    pad_id = vocab_map['<pad>']  # same id pad_sequences used for the training data

    unknown_words = [word for word in input_words if word not in vocab_map]
    if unknown_words:
        raise KeyError(f"Words not in the vocabulary: {unknown_words}")
    vectorized_input = [vocab_map[word] for word in input_words]
    if len(vectorized_input) > encoder_len:
        raise ValueError(
            f"Prompt has {len(vectorized_input)} words but the encoder accepts at most {encoder_len}"
        )
    print(f"User specified words {input_words} which were vectorized as {vectorized_input}")

    # Copy, so the caller's list is not extended with the generated words.
    output_poem = list(input_words)

    # Decoder input starts with the last prompt word; generated words follow.
    decoder_input = np.full((1, decoder_len), pad_id, dtype=np.int32)
    if vectorized_input:
        decoder_input[0, 0] = vectorized_input[-1]

    # Never generate past the point where the text no longer fits the encoder.
    steps = min(max_new_words, decoder_len, encoder_len + 1 - len(vectorized_input))
    for i in range(steps):
        encoder_input = np.full((1, encoder_len), pad_id, dtype=np.int32)
        encoder_input[0, :len(vectorized_input)] = vectorized_input
        prediction = model.predict([encoder_input, decoder_input], verbose=0)
        # The softmax has one spare class beyond the vocabulary; restrict the
        # argmax to real vocabulary ids so the reverse lookup cannot fail.
        new_word_vector = int(np.argmax(prediction[0, i, :len(vocab_map)]))
        vectorized_input.append(new_word_vector)
        output_poem.append(vocab_map_inv[new_word_vector])
        if i + 1 < decoder_len:
            decoder_input[0, i + 1] = new_word_vector
    output = " ".join(output_poem)
    print(f"OUTPUT POEM: {output}")

In [25]:
# Example run with a three-word prompt.
prompt_words = ["hello", "where", "it"]
generate_poem(prompt_words)

User specified words ['hello', 'where', 'it'] which were vectorized as [220, 653, 129]
0
1
2
3
4
5
6
7
8
9
10
11
12
OUTPUT POEM: hello where it <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
