In [6]:
import tensorflow as tf
import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense, SimpleRNN, GRU, Dropout, GlobalAveragePooling1D, Bidirectional
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
# Raise the matplotlib figure resolution to 300 dpi via seaborn's rc settings.
sns.set(rc={"figure.dpi": 300})
# Reset Keras' global state so models built below start from a clean session.
tf.keras.backend.clear_session()

In [7]:
# Load the haiku table, surround every "/" with spaces so the line separator
# becomes a stand-alone token, then drop rows containing any missing value.
# Note: the replacement runs over every column of the frame, not only
# `processed_title`.
data = (
    pd.read_csv('../data/haiku.csv')
    .replace("/", " / ", regex=True)
    .dropna()
)
data.head()

Unnamed: 0.1,Unnamed: 0,id,processed_title,ups,keywords
0,0,1020ac,There's nothing inside / There is nothing outs...,5,"[('inside', 0.5268), ('outside', 0.3751), ('se..."
1,1,107cob,From whole we crumble / Forever lost to chaos ...,1,"[('chaos', 0.5962), ('crumble', 0.4749), ('for..."
2,2,109a51,Indistinctiveness / Immeasurability / Capitalism,3,"[('indistinctiveness', 0.7664), ('immeasurabil..."
3,3,10eysi,Internet is down / Obligations go bye-bye / Of...,9,"[('office', 0.5033), ('obligations', 0.4663), ..."
4,4,10f79k,Cotton in my mouth / Needles in my blood and b...,1,"[('needles', 0.5314), ('cotton', 0.4806), ('bl..."


In [8]:
def process_raw_haiku(haiku):
    """Return ``haiku`` as its cleaned tokens joined by single spaces."""
    return " ".join(tokenize(haiku))

def tokenize(sentence: str):
    """Lower-case ``sentence``, split it on whitespace and clean each token.

    Args:
        sentence: Raw text to tokenize.

    Returns:
        List of tokens after ``process_token`` has been applied to each one.
        Tokens that end up empty are left out.
    """
    cleaned = (process_token(raw) for raw in sentence.lower().split())
    # A token made only of punctuation (e.g. "-" or "...") is reduced to "".
    # Keeping it would put two consecutive spaces into the joined haiku.
    return [token for token in cleaned if token]

def process_token(token: str):
    """Clean a single token.

    A token that is just "/" (ignoring surrounding whitespace) is returned
    unchanged. Any other token is stripped of surrounding whitespace and has
    every character that is neither a word character nor whitespace removed.
    """
    stripped = token.strip()
    is_separator = stripped == "/"
    return token if is_separator else re.sub(r'[^\w\s]', '', stripped)

Create corpus

In [9]:
# Clean every haiku in the `processed_title` column, then collect the cleaned
# strings into a plain Python list.
data["processed_title"] = data["processed_title"].map(process_raw_haiku)
corpus = data["processed_title"].tolist()

In [10]:
# Spot-check the first three cleaned haiku.
corpus[:3]

['theres nothing inside / there is nothing outside me / i search on in hope',
 'from whole we crumble / forever lost to chaos / never one again',
 'indistinctiveness / immeasurability / capitalism']

In [16]:
# Characters whose presence disqualifies a haiku: accented Latin letters,
# Hebrew, Japanese/CJK and mathematical-styled alphabet symbols.
s = ['à', 'á', 'è', 'é', 'í', 'ñ', 'ó', 'ö', 'ü', 'ā', 'ī', 'ı', 'ō', 'ǝ', 'ɥ', 'ʇ', 'א', 'ב', 'ד', 'י', 'ל', 'מ', 'נ', 'ע', 'ר', 'ת', 'あ', 'い', 'え', 'お', 'か', 'が', 'き', 'ぎ', 'く', 'け', 'し', 'す', 'ず', 'せ', 'た', 'ち', 'っ', 'て', 'で', 'と', 'ど', 'な', 'に', 'ぬ', 'の', 'は', 'ぱ', 'び', 'べ', 'ま', 'む', 'も', 'ゃ', 'や', 'よ', 'ら', 'り', 'る', 'れ', 'わ', 'を', 'ザ', 'ピ', 'ム', 'ー', '上', '下', '古', '夏', '天', '好', '子', '山', '川', '広', '彼', '慮', '憂', '放', '方', '无', '日', '明', '春', '景', '月', '木', '村', '樹', '死', '気', '水', '江', '池', '浪', '海', '消', '涙', '涼', '漏', '無', '目', '真', '着', '空', '紅', '緑', '美', '者', '色', '花', '落', '葉', '蒼', '蛙', '見', '触', '込', '迷', '遠', '鏡', '降', '雨', '雪', '雷', '音', '飛', '食', '鬱', '鱗', '𝐴', '𝐷', '𝐿', '𝑁', '𝑇', '𝑎', '𝑐', '𝑑', '𝑒', '𝑓', '𝑔', '𝑖', '𝑙', '𝑛', '𝑜', '𝑟', '𝑠', '𝑡', '𝑢', '𝑤', '𝒶', '𝒷', '𝒸', '𝒹', '𝒻', '𝒽', '𝒾', '𝓀', '𝓁', '𝓂', '𝓃', '𝓅', '𝓇', '𝓈', '𝓉', '𝓊', '𝓋', '𝓌', '𝓎', '𝓏', '𝘈', '𝘏', '𝘕', '𝘞', '𝘢', '𝘣', '𝘤', '𝘥', '𝘦', '𝘨', '𝘩', '𝘪', '𝘯', '𝘰', '𝘳', '𝘴', '𝘵', '𝘺']
# Keep a haiku only when none of the listed symbols occurs in it.
filtered_corpus = [
    haiku
    for haiku in corpus
    if all(symbol not in haiku for symbol in s)
]

In [18]:
# Character vocabulary: every distinct character that occurs anywhere in the
# filtered corpus, in sorted order.
vocab = sorted({ch for haiku in filtered_corpus for ch in haiku})

In [21]:
# Two-way lookup between characters and integer ids; a character's id is its
# position in `vocab`.
# NOTE(review): ids start at 0, and pad_sequences pads with 0 by default, so
# the character with id 0 is indistinguishable from padding in the model
# input. Consider reserving id 0 for padding (this also requires
# vocab_size = len(vocab) + 1 where the model is built).
index_to_char = dict(enumerate(vocab))
char_to_index = {char: index for index, char in index_to_char.items()}

In [51]:
# Build the training pairs: for every position i >= 1 in a haiku, the ids of
# the characters before i form the input and the id at i is the target.
input_sequences = []
target_sequences = []
for line in filtered_corpus[0:2000]:
    # Encode each haiku once instead of re-encoding every one of its prefixes.
    encoded = [char_to_index[ch] for ch in line]
    for i in range(1, len(encoded)):
        # pad_sequences (default truncating='pre') keeps only the last 50 ids
        # of a longer prefix, so store just that window; the padded result is
        # the same and far less memory is held in the meantime.
        input_sequences.append(encoded[max(0, i - 50):i])
        target_sequences.append(encoded[i])

# Left-pad every input with 0 to a fixed width of 50.
input_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, maxlen=50, padding='pre')
target_sequences = np.array(target_sequences)

In [52]:
# Next-character model: embedding -> GRU -> two ReLU dense layers -> softmax
# over the character vocabulary.
vocab_size = len(vocab)
embedding_size = 16
input_length = 50

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=input_length))
model.add(GRU(64))
model.add(Dense(64, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))

In [53]:
# Targets are integer ids, hence the sparse categorical cross-entropy loss.
# NOTE(review): a learning rate of 1e-5 is 100x smaller than Adam's default
# (1e-3); with only 10 epochs the sampled text at the end of the notebook is
# still gibberish. Consider raising it.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [54]:
# Display the padded input id matrix (NumPy abbreviates the repr of large arrays).
input_sequences

array([[ 0,  0,  0, ...,  0,  0, 32],
       [ 0,  0,  0, ...,  0, 32, 20],
       [ 0,  0,  0, ..., 32, 20, 17],
       ...,
       [14, 24, 33, ..., 37,  0, 18],
       [24, 33, 17, ...,  0, 18, 17],
       [33, 17,  0, ..., 18, 17, 17]], dtype=int32)

In [55]:
# (number of training pairs, padded input width)
input_sequences.shape

(138270, 50)

In [56]:
# One target id per input row.
target_sequences.shape

(138270,)

In [57]:
# Train on all generated pairs; no validation data is held out here.
model.fit(x=input_sequences, y=target_sequences, epochs=10, batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f9d247c31f0>

In [58]:
def generate_haiku(seed_text, model, max_len, num_chars, char_to_idx, idx_to_char):
    """Sample a continuation of ``seed_text`` one character at a time.

    At every step the ids of the text so far are left-padded (or truncated to
    the most recent ids) to ``max_len``, passed to ``model.predict``, and the
    next character is drawn at random from the returned distribution.

    Args:
        seed_text: Text to start from; every character must be a key of
            ``char_to_idx``.
        model: Object whose ``predict(batch, verbose=0)`` returns, per input
            row, ``num_chars`` non-negative scores.
        max_len: Width of the model input window, and also the maximum number
            of characters that are generated.
        num_chars: Number of entries in the predicted distribution.
        char_to_idx: Mapping from character to integer id.
        idx_to_char: Mapping from integer id to character.

    Returns:
        ``seed_text`` followed by the generated characters. Generation stops
        early as soon as a newline character is produced.

    Raises:
        KeyError: If ``seed_text`` contains a character that is missing from
            ``char_to_idx``.
    """
    # Fail up front with a readable message instead of a bare KeyError on the
    # first offending character.
    unknown = sorted({ch for ch in seed_text if ch not in char_to_idx})
    if unknown:
        raise KeyError(f"seed_text contains characters not in char_to_idx: {unknown}")

    # Encode the seed once and extend the id list as characters are sampled,
    # rather than re-encoding the whole text on every step.
    encoded = [char_to_idx[ch] for ch in seed_text]
    generated = []
    for _ in range(max_len):
        # Only the most recent max_len ids can fit in the input window.
        window = pad_sequences([encoded[-max_len:]], maxlen=max_len, padding='pre')
        # verbose=0: no progress bar for each single-row prediction.
        probs = np.asarray(model.predict(window, verbose=0)[0], dtype=np.float64)
        # np.random.choice requires p to sum to 1; renormalise in float64.
        probs = probs / probs.sum()
        idx = int(np.random.choice(num_chars, p=probs))
        char = idx_to_char[idx]
        generated.append(char)
        encoded.append(char_to_idx[char])
        if char == '\n':
            break
    return seed_text + "".join(generated)

In [61]:
# Sample up to 50 characters following the seed, using the trained model.
generate_haiku(
    seed_text="a spring day",
    model=model,
    max_len=50,
    num_chars=vocab_size,
    char_to_idx=char_to_index,
    idx_to_char=index_to_char,
)



'a spring day dddyn rfeoi ekiwhcuh  tcek/wgun  infriektook fuei'