In [37]:
from keras.preprocessing.text import Tokenizer
import numpy as np

In [38]:
file_path = './rap_lyrics.txt'

In [39]:
# Get verses
verses = []
current_verse = []

with open(file_path, 'r') as file:
    for line in file:
        line = line.strip()

        if line:  # Non-empty line
            current_verse.extend(line.split())
        elif current_verse:  # Empty line, but we have words in the current verse
            verses.append(current_verse)
            current_verse = []

    if current_verse:  # If there's a verse left after reading the file
        verses.append(current_verse)


In [40]:
# Shuffle the verses here so that train/test is independent of rapper 

import random
random.shuffle(verses)

In [41]:
len(verses)

7836

In [42]:
# Tokenize verses (words --> numbers)
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(verses)
sequences = tokenizer.texts_to_sequences(verses)

In [80]:
# Split data into RNN-style splits 
# (words 1-50 predict word 51, words 2-51 predict word 52, etc.)

features = []
labels = []

training_length = 50

# Iterate through the sequences of tokens (for us, verses)
for seq in sequences:

    # Create multiple training examples from each sequence
    for i in range(training_length, len(seq)):
        
        # Extract the features and label
        extract = seq[i - training_length:i + 1]

        # Set the features and label
        features.append(extract[:-1])
        labels.append(extract[-1])
        
features = np.array(features)

In [81]:
features.shape

(620442, 50)

In [83]:
# One-hot the labels (i.e. Y)
num_words = len(tokenizer.word_index) + 1
label_array = np.zeros((len(features), num_words), dtype=np.int8)

for example_index, word_index in enumerate(labels):
    label_array[example_index, word_index] = 1
    

In [84]:
label_array.shape

(620442, 33434)

In [85]:
# Split into train/test
n = len(features)
train_ratio = 0.8
split = int(n*train_ratio)

train_x = features[:split]
train_y = label_array[:split]

test_x = features[split:]
test_y = label_array[split:]

In [86]:
train_y.shape

(496353, 33434)

In [48]:
# Now at a point where we've split the text data into features and labels via Tokenizer 
# (and one-hotted the labels into "label_array")

# Might be time to head back to Brownlee's tutorial to figure out how to split data to X/Y
# (Before that, stay on the Koehrsen tutorial to get embeddings)

# Then, on to RNN building!

In [49]:
# Steps:
# (1) Generate embeddings of features (Koehrsen)
# (2) Split train/test (Brownlee)
# (3) Build RNN model (Koehrsen)

In [50]:
# Pre-trained embeddings

# Load in embeddings
glove_vectors = './glove.6B.100d.txt'
glove = np.loadtxt(glove_vectors, dtype='str', comments=None)

# Extract the vectors and words
vectors = glove[:, 1:].astype('float')
words = glove[:, 0]

# Create lookup of words to vectors
word_lookup = {word: vector for word, vector in zip(words, vectors)}

# New matrix to hold word embeddings
embedding_matrix = np.zeros((num_words, vectors.shape[1]))

In [51]:
for i, word in enumerate(tokenizer.word_index.keys()):
    # Look up the word embedding
    vector = word_lookup.get(word, None)

    # Record in matrix
    if vector is not None:
        embedding_matrix[i + 1, :] = vector

In [None]:
# Vocabulary time

# num_words: size of vocabulary
# training_length: input feature length in the time direction (i.e. 50 words)

In [87]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding

model = Sequential()

# Embedding layer
model.add(
    Embedding(input_dim=num_words,
              #input_length = training_length,
              output_dim=100,
              weights=[embedding_matrix],
              trainable=False,
              mask_zero=True))

# Masking layer for pre-trained embeddings
model.add(Masking(mask_value=0.0))

# Recurrent layer
model.add(LSTM(64, return_sequences=False, 
               dropout=0.1, recurrent_dropout=0.1))

# Fully connected layer
model.add(Dense(64, activation='relu'))

# Dropout for regularization
model.add(Dropout(0.5))

# Output layer
model.add(Dense(num_words, activation='softmax'))

# Compile the model
model.compile(
    optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [88]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 100)         3343400   
                                                                 
 masking_3 (Masking)         (None, None, 100)         0         
                                                                 
 lstm_3 (LSTM)               (None, 64)                42240     
                                                                 
 dense_6 (Dense)             (None, 64)                4160      
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_7 (Dense)             (None, 33434)             2173210   
                                                                 
Total params: 5,563,010
Trainable params: 2,219,610
No

In [None]:
model.fit(train_x, train_y, epochs=20, batch_size=1, verbose=2)

Epoch 1/20
