In [52]:
import tensorflow as tf
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.layers import Bidirectional, Dense, Embedding, LSTM
from keras.utils import pad_sequences

In [53]:
def clean(data: pd.DataFrame) -> None:
    """Drop, in place, every row whose ``"train"`` entry is empty or missing.

    Args:
        data: Frame containing a ``"train"`` column. The frame is modified
            in place and nothing is returned.
    """
    # Mask empty strings as missing on the column itself. Assigning the
    # filtered *frame* to a single column only works when the frame has
    # exactly one column; with any extra column pandas raises ValueError.
    data["train"] = data["train"].where(data["train"] != "")

    # Rows that were empty strings (now missing) or already missing are removed.
    data.dropna(subset=["train"], inplace=True)

def getLines(song: str):
    """Split a song's lyrics on newline characters and return the list of lines."""
    newline = "\n"
    lines = song.split(newline)
    return lines


In [54]:
FILEPATH = "./etc/imagine_dragons_lyrics.json"

# Load the lyrics file into a DataFrame
data = pd.read_json(FILEPATH)

# Strip out empty entries; clean() mutates `data` in place
clean(data)

# Break every song into its individual lines and keep them in a new column
lyrics_by_song = data["train"].apply(getLines)
data["lyrics"] = lyrics_by_song

In [55]:
# Flatten the per-song line lists into one flat list of lyric lines
corpus = []
for song_lines in data["lyrics"].values:
    corpus.extend(song_lines)

# Fit the tokenizer vocabulary on every line of the corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# One extra slot is added for the token used to pad sequences
total_words = len(tokenizer.word_index) + 1

print(tokenizer.word_index)
print("Total words: ", total_words)

# Encode all lines in a single call; each line becomes a list of word ids
encoded_lines = tokenizer.texts_to_sequences(corpus)

# Length of the longest encoded line; every sequence is padded up to this
max_sequence_len = max((len(encoded) for encoded in encoded_lines), default=0)

# Build the n-gram prefixes fed to the LSTM: for each line, every prefix of
# length 2..len(line). Prefixes shorter than 2 are skipped because they would
# leave no input tokens once the last token is split off as the label.
input_sequences = [
    encoded[:end]
    for encoded in encoded_lines
    for end in range(2, len(encoded) + 1)
]

print(max_sequence_len)

# Left-pad every prefix to the common length (pad_sequences already returns
# a NumPy array)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding="pre")

# Everything but the last token is the model input; the last token is the target
features = input_sequences[:, :-1]
labels = input_sequences[:, -1]

# One-hot encode the targets over the vocabulary
y = tf.keras.utils.to_categorical(labels, num_classes=total_words)

print(features)

Total words:  3361
32
[[  0   0   0 ...   0   0 320]
 [  0   0   0 ...   0 320 151]
 [  0   0   0 ...   0   0 748]
 ...
 [  0   0   0 ... 551   8  19]
 [  0   0   0 ...   8  19  15]
 [  0   0   0 ...  19  15  53]]


In [59]:
EMBEDDING_DIMS = 256

# Assemble the network one layer at a time
model = tf.keras.Sequential()

# Inputs are the padded sequences with their final token removed, hence
# max_sequence_len - 1 time steps
model.add(
    Embedding(
        input_dim=total_words,
        output_dim=EMBEDDING_DIMS,
        input_length=max_sequence_len - 1,
    )
)

# Two stacked bidirectional LSTMs; the first returns the full sequence so the
# second can consume it (stateful=True is intentionally left disabled)
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(Bidirectional(LSTM(128)))

# Dense head ending in a softmax over the whole vocabulary
model.add(Dense(128, activation="relu"))
model.add(Dense(total_words, activation="softmax"))

optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(
    optimizer=optimizer,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 31, 256)           860416    
                                                                 
 bidirectional_20 (Bidirecti  (None, 31, 512)          1050624   
 onal)                                                           
                                                                 
 bidirectional_21 (Bidirecti  (None, 256)              656384    
 onal)                                                           
                                                                 
 dense_10 (Dense)            (None, 128)               32896     
                                                                 
 dense_11 (Dense)            (None, 3361)              433569    
                                                                 
Total params: 3,033,889
Trainable params: 3,033,889
Non-trainable params: 0

In [61]:
EPOCHS = 100

# Fit on the padded n-gram inputs against the one-hot next-word targets,
# keeping the returned History object for later inspection
history = model.fit(x=features, y=y, epochs=EPOCHS, verbose=1)

Epoch 1/100
  15/1876 [..............................] - ETA: 14:07 - loss: 6.0964 - accuracy: 0.0271

KeyboardInterrupt: 