In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
df = pd.read_parquet("hf://datasets/shahules786/PoetryFoundationData/data/train-00000-of-00001-486832872ed96d17.parquet")
print(f"\n\n======\n\n")

print(df.columns, len(df))
newyork = df[df['author'].isin(["John Ashbery", "Barbara Guest", "James Schuyler", "Kenneth Koch", "Frank O'Hara"])]
shake = df[df['author'] == 'William Shakespeare']

print(f"Shakespeare: {len(shake)} examples\nNew Yorkers: {len(newyork)} examples")
print(f"Shakespeare avg length: {np.average([len(poem) for poem in shake['content']])}\nNew Yorkers avg length: {np.average([len(poem) for poem in newyork['content']])}")

  from .autonotebook import tqdm as notebook_tqdm






Index(['poem name', 'content', 'author', 'type', 'age'], dtype='object') 13854
Shakespeare: 85 examples
New Yorkers: 81 examples
Shakespeare avg length: 1468.5058823529412
New Yorkers avg length: 1810.6049382716049


In [3]:
def load_embedding_model():
    """ Load GloVe Vectors
        Return:
            wv_from_bin: All 400000 embeddings, each length 50
    """
    import gensim.downloader as api
    wv_from_bin = api.load("glove-wiki-gigaword-50")
    # wv_from_bin = api.load("glove.6B/glove.6B.50d.txt")
    print("Loaded vocab size %i" % len(list(wv_from_bin.index_to_key)))
    return wv_from_bin
wv_from_bin = load_embedding_model()

Loaded vocab size 400000


In [4]:
def process_poem(poem) :
  out = poem.replace('\n', ' ')
  out = out.lower()
  out = re.sub(r'[^a-zA-Z ]+', '', out)
  return [word for word in out.split(' ') if word!='']


newyork_processed = [process_poem(newyork['content'].iloc[i]) for i in range(len(newyork))]
newyork_labels = [0 for i in range(len(newyork))]
shake_processed = [process_poem(shake['content'].iloc[i]) for i in range(len(shake))]
shake_labels = [1 for i in range(len(shake))]
# processed_poems = [process_poem(df['content'].iloc[i]) for i in range(len(df))]
# print(processed_poems[0])
# #newyork_labels = [0 for i in range(len(df))]

processed_poems = newyork_processed + shake_processed
# perm = np.random.permutation(len(processed_poems))
# processed_poems = processed_poems[perm]
# labels = labels[perm]

vocab = sorted(list(set([word for poem in processed_poems for word in poem])))

word_to_idx = {word:idx for idx, word in enumerate(vocab)}
idx_to_word = {idx:word for idx, word in enumerate(vocab)}
input_sequences = []
labels = []

for poem in processed_poems :
  for i in range(len(poem)) :
    poem[i] = word_to_idx[poem[i]]
    if i < len(poem) - 1:
      n_gram_sequence = poem[max(0, i-99):i+1]
      input_sequences.append(n_gram_sequence)
    if i > 0:
      labels.append(poem[i])

print(processed_poems[0])
print(input_sequences[5], labels[5])

# print(processed_poems[0])

# embedding_matrix = np.zeros((len(vocab), 50))
# bad_count = 0
# for i, word in enumerate(vocab):
#     try:
#         embedding_matrix[i] = wv_from_bin.get_vector(word)
#     except:
#         print("this is bad", word)
#         bad_count += 1
# print(f"Total Bad Words: {bad_count} out of total vocab {len(vocab)}")

[3916, 277, 1130, 5168, 2838, 5188, 5137, 7516, 4134, 8039, 2896, 6310, 5526, 4076, 3548, 318, 5514, 4891, 1130, 2305, 3292, 56, 1549, 7151, 795, 2619, 351, 7546, 1442, 8442, 0, 6303, 386, 2551, 4242, 558, 7529, 3884, 2557, 8358, 3430, 3333, 2384, 7508, 8582, 4980, 4777, 7508, 8582, 238, 7546, 1374, 5137, 4277, 6393, 4695, 8442, 1754, 7516, 1740, 5520, 5220, 7255, 5516, 5107, 4092, 1267, 5713, 5107, 1268, 5073, 7542, 318, 1470, 7670, 4875, 8115, 5107, 224, 981, 7516, 4004, 3916, 2309, 7571, 4781, 351, 3692, 8202, 5188, 5107, 8589, 6249, 117, 867, 1663, 8442, 455, 238, 2932, 3135, 455, 3884, 4271, 2932, 3884, 7946, 4271, 8242, 3925, 5185, 2084, 238, 8242, 3925, 7516, 4561, 7516, 4423, 5107, 4260, 5163, 5107, 4331, 8286, 8325, 4593, 1581, 0, 4769, 6904, 7670, 541, 2910, 3766, 1660, 3122, 5107, 4271, 1535, 2127, 6559, 3766, 7571, 4781, 7510, 3409, 6485, 8049, 122, 3692, 4089, 7510, 3692, 844, 7695, 4833, 5137, 4875, 5238, 6850, 5399, 5107, 7561, 351, 7546, 1374, 7670, 4577, 7546, 318, 572

In [5]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_length = max(len(sequence) for sequence in input_sequences)
print(f"Max Length = {max_length}")
predictors = pad_sequences(input_sequences, maxlen=max_length, padding='post', truncating='post')

Max Length = 100


In [6]:
print(len(predictors))

44818


In [2]:
import pickle
import tensorflow as tf

# f = open('predictors.pickle', 'wb')
# pickle.dump(predictors, f)
# f.close()
# f = open('labels.pickle', 'wb')
# pickle.dump(labels, f)
# f.close()

f = open('predictors.pickle', 'rb')
predictors = pickle.load(f)
f.close()
f = open('labels.pickle', 'rb')
labels = pickle.load(f)
f.close()

2024-11-16 07:46:20.326448: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-16 07:46:21.691856: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/lib:/usr/lib
2024-11-16 07:46:21.691887: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-11-16 07:46:21.856517: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-16 07:46:24.402799

In [7]:
perm = np.random.permutation(len(predictors))
print("yay")
shuffled_predictors = np.array(predictors)[perm]
shuffled_labels = np.array(labels)[perm]
print('nice')
training_data = shuffled_predictors[:-5000]
training_labels = shuffled_labels[:-5000]
print('cool')
validation_data = shuffled_predictors[-5000:]
validation_labels = shuffled_labels[-5000:]

yay
nice
cool


## LSTM Model

In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
import tensorflow.keras.utils as ku

# len(vocab) = 161619
model_lstm = tf.keras.Sequential()
e = Embedding(len(vocab), 10, input_length = 100)  # used to be a trainable embedding layer to 10d
model_lstm.add(e)
model_lstm.add(LSTM(100))
model_lstm.add(Dropout(0.1))
model_lstm.add(Dense(len(vocab), activation='softmax'))   # used to have len(vocab) neurons
# lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(0.1, decay_rate=0.1, decay_steps=1000)
# optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
model_lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
print(model_lstm.summary())
print(len(vocab))
print(training_labels[0])



None
8602
3925


In [19]:
model_lstm.fit(np.array(training_data), np.array(ku.to_categorical(training_labels, num_classes=len(vocab))), epochs=20, verbose=1)

Epoch 1/20
[1m1245/1245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 48ms/step - accuracy: 0.0487 - loss: 6.9433
Epoch 2/20
[1m1245/1245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 62ms/step - accuracy: 0.0536 - loss: 6.8389
Epoch 3/20
[1m1245/1245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 58ms/step - accuracy: 0.0544 - loss: 6.7489
Epoch 4/20
[1m1245/1245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 62ms/step - accuracy: 0.0549 - loss: 6.6343
Epoch 5/20
[1m1245/1245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 78ms/step - accuracy: 0.0581 - loss: 6.5044
Epoch 6/20
[1m 196/1245[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m1:22[0m 79ms/step - accuracy: 0.0609 - loss: 6.3617

KeyboardInterrupt: 

In [15]:
loss, accuracy = model_lstm.evaluate(np.array(validation_data), np.array(ku.to_categorical(validation_labels, num_classes=len(vocab))), verbose=1)

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.0493 - loss: 7.1799


In [16]:
print(loss, accuracy)

7.213222503662109 0.04659999907016754


In [18]:
model_lstm(training_data[0])

ValueError: Exception encountered when calling Sequential.call().

[1mInvalid input shape for input [3517. 3766. 4260. 7546. 8513. 7929. 3643. 1572. 3925.  541. 5184. 8286.
 3333. 3282. 7695. 8023. 7643. 7516. 4523.  586. 7670. 3126. 8279.  386.
 8358. 5582.  174. 8242. 7284. 5163. 4383. 5163.  814. 5035. 3925. 8242.
 7645.  238. 7535. 8242. 5023. 2878.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.]. Expected shape (None, 100), but input has incompatible shape (100,)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(100,), dtype=int32)
  • training=None
  • mask=None