In [22]:
import numpy as np
import pandas as pd
import gensim
import re
from gensim.parsing.preprocessing import strip_punctuation, strip_non_alphanum
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from keras.preprocessing.text import Tokenizer
from gensim.models import KeyedVectors
from keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Bidirectional, LSTM, Flatten, TimeDistributed, Dense, Dropout, Embedding, Input
from sklearn.model_selection import train_test_split
import copy
import math

In [2]:
# import Kaggle lyrics data into csv
df_songs = pd.read_csv('all_lyrics.csv', index_col=0)
df_songs = df_songs.dropna(subset=['Title', 'Lyric'])
df_songs.head()

Unnamed: 0,Artist,Title,Album,Date,Lyric,Year
0,Ariana Grande,"​thank u, next","thank u, next",2018-11-03,thought i'd end up with sean but he wasn't a m...,2018
1,Ariana Grande,7 rings,"thank u, next",2019-01-18,yeah breakfast at tiffany's and bottles of bub...,2019
2,Ariana Grande,​God is a woman,Sweetener,2018-07-13,you you love it how i move you you love it how...,2018
3,Ariana Grande,Side To Side,Dangerous Woman,2016-05-20,ariana grande nicki minaj i've been here all ...,2016
4,Ariana Grande,​​no tears left to cry,Sweetener,2018-04-20,right now i'm in a state of mind i wanna be in...,2018


In [3]:
# split into x (lyrics) and corresponding y (titles)
def split_x_y(df):
    lyrics = []
    titles = []

    for index,row in df.iterrows():
        lyrics.append(row['Lyric'])
        titles.append(row['Title'])
    
    return lyrics, titles

In [4]:
# pre-process lyrics and titles consistenly
def pre_process_lyrics_and_titles(lyrics: list, titles: list):
    for idx in range(len(lyrics)):
        song_lyrics = lyrics[idx]
        song_lyrics = strip_punctuation(song_lyrics) # remove punctuation
        song_lyrics = strip_non_alphanum(song_lyrics)
        song_lyrics = song_lyrics.replace('\u200b', '')
        song_lyrics = song_lyrics.lower() # make all text lowercase
        song_lyrics = word_tokenize(song_lyrics)
        lyrics[idx] = ['<s>'] + song_lyrics + ['</s>']

        title = titles[idx]
        title = strip_punctuation(title) # remove punctuation
        title = strip_non_alphanum(title)
        title = title.replace('\u200b', '') # remove punctuation
        title = title.lower() # make all text lowercase 
        title = word_tokenize(title)
        titles[idx] = ['<s>'] + title + ['</s>']
        
    return lyrics, titles

In [5]:
lyrics, titles = split_x_y(df_songs.head(2000))
lyrics, titles = pre_process_lyrics_and_titles(lyrics, titles)

In [6]:
# train word embeddings
EMBEDDINGS_SIZE = 200

def train_word_embeddings(lyrics, titles):
    sg = 1
    window = 5
    vector_size = EMBEDDINGS_SIZE
    min_count = 1
    model_lyrics = Word2Vec(sentences=lyrics, size=vector_size, window=window, min_count=min_count, sg=sg)
    model_lyrics.save("model_lyrics")
    model_lyrics.wv.save_word2vec_format('model_lyrics.txt', binary=False)

    model_titles = Word2Vec(sentences=titles, size=vector_size, window=window, min_count=min_count, sg=sg)
    model_titles.save("model_titles")
    model_titles.wv.save_word2vec_format('model_titles.txt', binary=False)
    return model_lyrics, model_titles

In [7]:
model_lyrics, model_titles = train_word_embeddings(lyrics, titles)

In [8]:
# encode data into integers

def encode_data(lyrics, titles):
    tokenizer_lyrics = Tokenizer()
    tokenizer_titles = Tokenizer()
    
    # fit the tokenizer on your data
    tokenizer_lyrics.fit_on_texts(lyrics)
    tokenizer_titles.fit_on_texts(titles)
    
    # convert your data to sequences
    lyrics_encoded = tokenizer_lyrics.texts_to_sequences(lyrics)
    titles_encoded = tokenizer_titles.texts_to_sequences(titles)
    
    return lyrics_encoded, titles_encoded, tokenizer_lyrics, tokenizer_titles

In [9]:
lyrics_encoded, titles_encoded, tokenizer_lyrics, tokenizer_titles = encode_data(lyrics, titles)

## MODEL 1: RNN with LSTM --> using pretrained embeddings from Word2Vec

In [10]:
# read in embeddings from folder

def read_embeddings():
    model_lyrics_loaded = KeyedVectors.load_word2vec_format("model_lyrics.txt")
    model_titles_loaded = KeyedVectors.load_word2vec_format("model_titles.txt")

    word_to_embedding_lyrics = {}
    index_to_embedding_lyrics = {}
    for word in model_lyrics_loaded.wv.vocab:
        embedding = model_lyrics_loaded.wv[word]
        word_to_embedding_lyrics[word] = embedding
        index = tokenizer_lyrics.word_index[word]
        index_to_embedding_lyrics[index] = embedding

    word_to_embedding_titles = {}
    index_to_embedding_titles = {}
    for word in model_titles_loaded.wv.vocab:
        embedding = model_titles_loaded.wv[word]
        word_to_embedding_titles[word] = embedding
        index = tokenizer_titles.word_index[word]
        index_to_embedding_titles[index] = embedding
        
    return (model_lyrics_loaded, model_titles_loaded, word_to_embedding_lyrics, 
index_to_embedding_lyrics, word_to_embedding_titles, index_to_embedding_titles)

In [11]:
embeddings = read_embeddings()
model_lyrics_loaded = embeddings[0]
model_titles_loaded = embeddings[1]
word_to_embedding_lyrics = embeddings[2]
index_to_embedding_lyrics = embeddings[3]
word_to_embedding_titles = embeddings[4]
index_to_embedding_titles = embeddings[5]

  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


In [12]:
def data_generator(lyrics_encoded, titles_encoded, index_to_embedding_lyrics, index_to_embedding_titles, batch_size):
    while True:

        # Loop over the batches
        for idx in range(len(lyrics_encoded)):
            start_idx = idx
            if (idx+batch_size < len(lyrics_encoded)):   
                end_idx = idx + batch_size
            else: 
                end_idx = batch_size
                batch_X_lyrics = lyrics_encoded[start_idx:end_idx]
                batch_y = titles_encoded[start_idx:end_idx]
                
                start_idx = 0
                end_idx = batch_size - (len(lyrics_encoded) - idx)
                batch_X_lyrics += lyrics_encoded[start_idx:end_idx]
                batch_y += titles_encoded[start_idx:end_idx]

            # Get the current batch of encoded lyrics and titles
            batch_X_lyrics = lyrics_encoded[start_idx:end_idx]
            batch_y = titles_encoded[start_idx:end_idx]

            # Calculate the maximum length of the sequences in the batch
            max_seq_length = max(len(seq) for seq in batch_X_lyrics + batch_y)

            # Pad the sequences to the maximum length
            batch_X_lyrics = pad_sequences(batch_X_lyrics, maxlen=max_seq_length, padding='post')
            batch_y = pad_sequences(batch_y, maxlen=max_seq_length, padding='post')
            
            # Create an array of shape (batch_size, sequence_length, embedding_dim) to hold the embeddings for the current batch of lyrics
            batch_X_lyrics_emb = np.array([[
                # if the index is in the index_to_embedding_lyrics dictionary, use its embedding
                # otherwise, use a zero vector of the appropriate length
                index_to_embedding_lyrics.get(idx, np.zeros(len(index_to_embedding_lyrics[1]))) for idx in seq
            ] for seq in batch_X_lyrics])
            
            # Create an array of shape (batch_size, sequence_length, vocab_size) to hold the one-hot encodings for the current batch of titles
            batch_y_onehot = to_categorical(batch_y, num_classes=len(index_to_embedding_titles))
            
            # Yield the current batch of embeddings for lyrics and titles
            yield (batch_X_lyrics_emb, batch_y_onehot)
            batch_X_lyrics_emb = []
            batch_y_onehot = []
            idx = end_idx # set back for next loop

In [13]:
#Examples
#initialize data_generator
num_sequences_per_batch = 32 # this is the batch size
steps_per_epoch_lyrics = len(lyrics_encoded)//num_sequences_per_batch  # Number of batches per epoch
train_generator_lyrics = data_generator(lyrics_encoded, titles_encoded, index_to_embedding_lyrics, index_to_embedding_titles, 32)

sample=next(train_generator_lyrics) # this is how you get data out of generators
print(sample[0].shape) 
print(sample[1].shape) 

(32, 793, 200)
(32, 793, 1974)


In [14]:
def train_model(generator, index_to_embedding_lyrics, index_to_embedding_titles, batch_size=128, epochs=5, validation_split=0.2, dropout_rate=0.2, lstm_units=5):
    # Split the data into training and validation sets
    X, y = next(generator)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=validation_split, shuffle=False)
    steps_per_epoch = math.ceil(len(X_train) / batch_size)

    # Build the model
    model = Sequential()
    model.add(Bidirectional(LSTM(units=lstm_units, return_sequences=True), input_shape=(None, len(index_to_embedding_lyrics[1]))))
    model.add(Dropout(dropout_rate))
    model.add(TimeDistributed(Dense(units=len(index_to_embedding_titles), activation='softmax')))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    print(model.summary())
    # Train the model
    model.fit(X_train, y_train, steps_per_epoch=steps_per_epoch, epochs=epochs, validation_data=(X_val, y_val), verbose=1)
    loss, accuracy = model.evaluate(X_val, y_val, steps=steps_per_epoch, verbose=1)
    print("Validation loss:", loss)
    print("Validation accuracy:", accuracy)
    return model


In [15]:
# hyperparameter tuning 

lstm_units = [5, 7]
dropout_rate = [0.1, 0.2, 0.25]
batch_size = [32]
epochs = [3, 7]

train_test_generator_split = 0.7

steps_per_epoch = len(lyrics_encoded) // num_sequences_per_batch
generator_train = data_generator(lyrics_encoded[:round(len(lyrics_encoded) * train_test_generator_split)], titles_encoded[:round(len(titles_encoded) * train_test_generator_split)], index_to_embedding_lyrics, index_to_embedding_titles, 32)
for units in lstm_units:
    for rate in dropout_rate:
        for size in batch_size:
            for epoch in epochs:
                print("running with lstm units: ", units, "\nrunning with dropout rate: ", rate, "\nrunning with batch size: ", size, "\nrunning with epochs: ", epoch)
                val_accuracy = train_model(generator_train, index_to_embedding_lyrics, index_to_embedding_titles, batch_size=size, epochs=epoch, dropout_rate=rate, lstm_units=units)
                print(val_accuracy)

running with lstm units:  5 
running with dropout rate:  0.1 
running with batch size:  32 
running with epochs:  3
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (None, None, 10)         8240      
 l)                                                              
                                                                 
 dropout (Dropout)           (None, None, 10)          0         
                                                                 
 time_distributed (TimeDistr  (None, None, 1974)       21714     
 ibuted)                                                         
                                                                 
Total params: 29,954
Trainable params: 29,954
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Please report this to the TensorFlow team. When filin

In [16]:
train_test_generator_split = 0.7
# create data generator with 70% of the data for training
train_generator = data_generator(lyrics_encoded[:round(len(lyrics_encoded) * train_test_generator_split)], titles_encoded[:round(len(titles_encoded) * train_test_generator_split)], index_to_embedding_lyrics, index_to_embedding_titles, 32)
# create data generator with 30% of the data for testing
test_generator = data_generator(lyrics_encoded[round(len(lyrics_encoded) * train_test_generator_split):], titles_encoded[round(len(titles_encoded) * train_test_generator_split):], index_to_embedding_lyrics, index_to_embedding_titles, 32)
# train best model on the best hyperparameters
best_model = train_model(train_generator, index_to_embedding_lyrics, index_to_embedding_titles, batch_size=64, epochs=7, dropout_rate=0.2, lstm_units=5)

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_12 (Bidirecti  (None, None, 10)         8240      
 onal)                                                           
                                                                 
 dropout_12 (Dropout)        (None, None, 10)          0         
                                                                 
 time_distributed_12 (TimeDi  (None, None, 1974)       21714     
 stributed)                                                      
                                                                 
Total params: 29,954
Trainable params: 29,954
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/7
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause:

In [41]:
def convert_one_hot_vectors_to_words(one_hot_vectors, tokenizer_title):
    encoding_list = []
    title_list = []
    len_title = 0
    reverse_word_map = dict(map(reversed, tokenizer_title.word_index.items()))
    for i in range(len(one_hot_vectors)):
        if (one_hot_vectors[i][0] == 1): # remove all zero vectors from padded y_test input
            continue
        else:
            encoding_list.append(list(one_hot_vectors[i]).index(1)) # add index of title (index where 1 is in encoding)
    
    for encoding in encoding_list:
        title_list.append(reverse_word_map[encoding]) # get word from index
     
    len_title = len(title_list)
    
    actual_title = ' '.join(title_list)
    
    return actual_title, len_title

In [42]:
def generate_title(model, test_generator, tokenizer):

    X_test, y_test = next(test_generator)
    reverse_word_map = dict(map(reversed, tokenizer.word_index.items())) # map index to word
    all_predicted_titles = []
    all_actual_titles = []
    
    for i in range(len(X_test)):
        current_song = np.expand_dims(X_test[i], axis=0)
        actual_title, len_actual_title = convert_one_hot_vectors_to_words(y_test[i], tokenizer)

        predictions = model.predict(current_song)

        # Get the most likely word for each time step in the output sequence
        predicted_indices = np.argmax(predictions, axis=1)[0]
        

        # Convert the predicted indices to words
        predicted_words = [reverse_word_map[idx] for idx in predicted_indices if idx != 0][0:len_actual_title]

        # Join the predicted words to form the title
        title = ' '.join(predicted_words)
        
        print("Actual title: ", actual_title)
        print("Predicted title: ", title)
        
        all_predicted_titles.append(title)
        all_actual_titles.append(actual_title)
        
    return all_actual_titles, all_predicted_titles

In [43]:
generate_title(best_model, test_generator, tokenizer_titles)

[0. 1. 0. ... 0. 0. 0.]
Actual title:  <s> bitter sweet symphony </s>
Predicted title:  why <s> hold why favor
[0. 1. 0. ... 0. 0. 0.]
Actual title:  <s> o fly on extended version </s>
Predicted title:  is chinese back make never like freestyle
[0. 1. 0. ... 0. 0. 0.]
Actual title:  <s> a head full of dreams live in buenos aires </s>
Predicted title:  partition chinese bout first gone touch live chinese 11 partition vida
[0. 1. 0. ... 0. 0. 0.]
Actual title:  <s> now my feet won t touch the ground </s>
Predicted title:  man chinese the mix song it life chinese chinese interlude
[0. 1. 0. ... 0. 0. 0.]
Actual title:  <s> moses </s>
Predicted title:  heart music only
[0. 1. 0. ... 0. 0. 0.]
Actual title:  <s> rainy day </s>
Predicted title:  life chinese do life
[0. 1. 0. ... 0. 0. 0.]
Actual title:  <s> moving to mars </s>
Predicted title:  head all lost all check
[0. 1. 0. ... 0. 0. 0.]
Actual title:  <s> careful where you stand </s>
Predicted title:  at chinese girls ocean edit single

(['<s> bitter sweet symphony </s>',
  '<s> o fly on extended version </s>',
  '<s> a head full of dreams live in buenos aires </s>',
  '<s> now my feet won t touch the ground </s>',
  '<s> moses </s>',
  '<s> rainy day </s>',
  '<s> moving to mars </s>',
  '<s> careful where you stand </s>',
  '<s> i ran away </s>',
  '<s> in the sun </s>',
  '<s> midnight kygo remix </s>',
  '<s> bigger stronger </s>',
  '<s> help is round the corner </s>',
  '<s> brothers sisters </s>',
  '<s> animals </s>',
  '<s> talk leaked version </s>',
  '<s> things i don t understand </s>',
  '<s> crests of waves </s>',
  '<s> such a rush </s>',
  '<s> 2000 miles </s>',
  '<s> 1 36 </s>',
  '<s> believe in love </s>',
  '<s> easy to please </s>',
  '<s> one i love </s>',
  '<s> o reprise </s>',
  '<s> amor argentina live in buenos aires </s>',
  '<s> wotw potp live in jordan version </s>',
  '<s> super bowl 50 halftime show </s>',
  '<s> imagine </s>',
  '<s> lethal drug </s>',
  '<s> colour spectrum live in b

## MODEL 2: RNN with Embedding Layer + Generation w/Shannon's Method

In [27]:
# pad each sequence of lyrics to a length of 130
batch_X_lyrics = pad_sequences(lyrics_encoded, maxlen=130, padding='post')
# pad each sequence of titles to a length of 130
batch_y = pad_sequences(titles_encoded, maxlen=130, padding='post')
# create one hot vectors for each title for RNN processing, num_classes adds 1 to account for padding token
batch_y_onehot = to_categorical(batch_y, num_classes=len(tokenizer_titles.word_index)+1)
# get total number of lyrics and titles for input shapes for the RNN
total_words_lyrics = len(tokenizer_lyrics.word_index) + 1
total_words_titles = len(tokenizer_titles.word_index) + 1

# split data into training and testing after creating batches (using train_test_split from Model 1)
X_train, X_test, y_train, y_test = train_test_split(batch_X_lyrics, batch_y_onehot, test_size=0.20)

In [29]:
def train_model_two(train_padseq, y_train, total_words_lyrics, total_words_titles, maxlen):
    # baseline model using embedding layers and simpleRNN
    model = Sequential()
    # embedding layer --> compressing input shape of total number of lyrics to dense vectors of length 200
    model.add(Embedding(total_words_lyrics, 200))
    # LSTM layer
    model.add(Bidirectional(LSTM(5, dropout=0.5, recurrent_dropout=0.50, activation='tanh', return_sequences=True)))
    # Dense output layer of output shape of total number of titles
    model.add(TimeDistributed(Dense(total_words_titles, activation='softmax')))

    print(model.summary())
    
    # compile model using optimizer, loss function, metrics
    model.compile(optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['accuracy']
            )
    
    # fit model to the data
    history = model.fit(train_padseq, y_train,
    batch_size=128,
    epochs=3
    )
    
    return model

model_two = train_model_two(X_train, y_train, total_words_lyrics, total_words_titles, 130)

Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 200)         6889800   
                                                                 
 bidirectional_16 (Bidirecti  (None, None, 10)         8240      
 onal)                                                           
                                                                 
 time_distributed_16 (TimeDi  (None, None, 1975)       21725     
 stributed)                                                      
                                                                 
Total params: 6,919,765
Trainable params: 6,919,765
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.


In [30]:
def generate_titles(model, X_test, y_test, tokenizer, temperature=1.0):
    # Map index to word
    reverse_word_map = dict(map(reversed, tokenizer.word_index.items())) 
    
    all_predicted_titles = []
    all_actual_titles = []
    
    for i in range(len(X_test)):
        # get the current lyrics
        current_song = np.expand_dims(X_test[i], axis=0)
        # get the actual title (using helper function from part 1)
        actual_title, actual_title_len = convert_one_hot_vectors_to_words(y_test[i], tokenizer) # Get actual title from label

        # create probability distribution for words
        predictions = model.predict(current_song)

        # Initialize list to store predicted words
        predicted_words = []

        # Sample one word at a time, excluding start and end characters
        for timestep in range(actual_title_len - 2):
            probs = predictions[0][timestep]
            # Exclude start and end characters
            probs[0] = 0
            probs[-1] = 0
            # Apply temperature scaling
            probs = np.power(probs, 1.0/temperature)
            probs /= np.sum(probs)
            # Sample from the probability distribution
            word_idx = np.random.choice(len(probs), p=probs)
            # Convert index to word
            word = reverse_word_map.get(word_idx, 'unk')
            predicted_words.append(word)

        title = ' '.join(predicted_words)
        # add sentence start and end characters to predicted title
        title = "<s> " + title + " </s>"
        
        print("Actual title: ", actual_title)
        print("Predicted title: ", title)
        
        all_predicted_titles.append(title)
        all_actual_titles.append(actual_title)
        
    return all_predicted_titles

all_predicted_titles = generate_titles(model_two, X_test, y_test, tokenizer_titles, 0.6)
all_predicted_titles

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Actual title:  <s> step on over </s>
Predicted title:  <s> invented thrill nala</s>
Actual title:  <s> love is everything </s>
Predicted title:  <s> daylight sydney night</s>
Actual title:  <s> stigma </s>
Predicted title:  <s> november</s>
Actual title:  <s> awake </s>
Predicted title:  <s> freestyle</s>
Actual title:  <s> new york live </s>
Predicted title:  <s> rebel bigger 轉</s>
Actual title:  <s> share </s>
Predicted title:  <s> 잠시</s>
Actual title:  <s> animals </s>
Predicted title:  <s> bonds</s>
Actual title:  <s> yellow live in buenos aires </s>
Predi

['<s> invented thrill nala</s>',
 '<s> daylight sydney night</s>',
 '<s> november</s>',
 '<s> freestyle</s>',
 '<s> rebel bigger 轉</s>',
 '<s> 잠시</s>',
 '<s> bonds</s>',
 '<s> edit new summer 1999 hills</s>',
 '<s> yourself</s>',
 '<s> motto lust moses</s>',
 '<s> black 학교의눈물 d about settle</s>',
 '<s> florida own under da</s>',
 '<s> gay bands cream</s>',
 '<s> strawberry</s>',
 '<s> light control days</s>',
 '<s> intoxicated around yoncé ballad</s>',
 '<s> hide that f</s>',
 '<s> dance left boy need</s>',
 '<s> siembab</s>',
 '<s> money more street</s>',
 '<s> wizard ペップセ show theatre pain</s>',
 '<s> as bria 11 dollar critiquing</s>',
 '<s> cred catalyst 6pm rather jaded</s>',
 '<s> tú mind èkó goode stand 꺼줄래</s>',
 '<s> madonna c into cred coluccio</s>',
 '<s> bangtan florida kevorkian</s>',
 '<s> seoul faded sex</s>',
 '<s> logic donk</s>',
 '<s> terjemahan city road jochen</s>',
 '<s> fantasy 흔한 nonstop answer</s>',
 '<s> off</s>',
 '<s> seeb 힙합성애자</s>',
 '<s> west trivia all xo