# Generating Lyrics with LSTM Networks


In [1]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, CuDNNLSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping,LambdaCallback
from keras.models import Sequential
import keras.utils as ku 
import numpy as np
import pandas as pd
import tensorflow as tf
import pickle
# NOTE: the CuDNNLSTM layer used by the model below only runs on a GPU, so on
# Colab switch the runtime's hardware accelerator to GPU before running this.
device_name = tf.test.gpu_device_name()
if not device_name:
    # gpu_device_name() returns an empty string when no GPU is visible.
    print('WARNING: no GPU device found; the CuDNNLSTM layer requires a GPU runtime.')

Using TensorFlow backend.


To access files on Colab, you can upload the data to your Google Drive account, mount the drive in the notebook, and read the files from there.

In [27]:
# Mount Google Drive at /content/drive/ so files stored there can be read
# from the notebook with ordinary filesystem paths (Colab only).
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Data preprocessing


1.   Get a dataset of a particular artist's songs
2.   Create one string of all the songs
3.   Chunk this string into n-word sequences, where the word that follows each sequence in the string is that sequence's target variable (explained more clearly in the paper)



In [28]:
# Load the lyrics dataset; replace PATH_TO_DATA with the folder that holds the CSV.
data = pd.read_csv('PATH_TO_DATA/taylor_swift.csv')
# Preview the first rows (the Lyrics column holds each song's full text).
data.head()

Unnamed: 0.1,Unnamed: 0,Name,Lyrics
0,0,tim mcgraw,\n\r\nhe said the way my blue eyes shined\nput...
1,1,picture to burn,"\n\r\nstate the obvious, i didn't get my perfe..."
2,2,teardrops on my guitar,\n\r\ndrew looks at me.\ni fake a smile so he ...
3,3,a place in this world,"\n\r\ni don't know what i want, so don't ask m..."
4,4,cold as you,\n\r\nyou have a way of coming easily to me\na...


#### Make one string from all the songs

In [29]:
# Join every song's lyrics into one corpus string, and keep each song's
# space-separated pieces so the average song length can be reported.
text = "".join(data.Lyrics)
test = [song.split(' ') for song in data.Lyrics]

print('average taylor swift song length')
x = list(map(len, test))
sum(x) / len(x)


average taylor swift song length


307.14

### Prepare Dataset
- Chunk the data into sequences

In [0]:
# Shared tokenizer: fitted inside data_prep() and reused afterwards (it is
# pickled and used to encode seed text when generating lyrics).
tokenizer = Tokenizer()


def data_prep(data):
    """Turn a lyrics corpus into padded n-gram training data.

    The corpus is lower-cased and split on newline characters. Every line is
    encoded with the module-level ``tokenizer`` (fitted here as a side effect),
    and each prefix of a line that holds at least two tokens becomes one
    sample: all tokens but the last are the input, the last one is the target.

    Args:
        data: The whole corpus as a single string with newline-separated lines.

    Returns:
        A tuple ``(X, y, total_words, max_len)`` where
        ``X`` has shape (num_samples, max_len - 1) and is padded at the front,
        ``y`` is the one-hot encoded target of shape (num_samples, total_words),
        ``total_words`` is ``len(tokenizer.word_index) + 1``, and
        ``max_len`` is the length of the longest n-gram (input plus target).

    Raises:
        ValueError: If no line has at least two tokens, so no training sample
            can be built.
    """
    # NOTE(review): lines are split on the newline character only, so a stray
    # carriage return in the lyrics can presumably survive as its own
    # vocabulary token. Left unchanged on purpose: removing it would change
    # total_words and therefore the layer shapes saved weights were trained with.
    lines = data.lower().split('\n')
    tokenizer.fit_on_texts(lines)
    total_words = len(tokenizer.word_index) + 1

    # Every prefix of length >= 2 of every encoded line is one training n-gram.
    sequences = []
    for token_list in tokenizer.texts_to_sequences(lines):
        for end in range(2, len(token_list) + 1):
            sequences.append(token_list[:end])

    if not sequences:
        raise ValueError('data_prep needs at least one line with two or more words')

    # Pad at the front so the target word is always the last column.
    max_len = max(len(seq) for seq in sequences)
    sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')
    print(sequences.shape)

    X, y = sequences[:, :-1], sequences[:, -1]
    y = ku.to_categorical(y, num_classes=total_words)
    return X, y, total_words, max_len
    

In [31]:
# Build the training arrays; data_prep() also fits the shared tokenizer.
X,y,total_words,max_len = data_prep(text)
# Not displayed: only a cell's final expression is echoed by the notebook.
max_len

# Persist the fitted tokenizer to tokenizer.pickle (presumably so the same
# word-to-index mapping can be reused without refitting).
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

(30729, 25)


In [32]:
# Sanity-check the prepared data, printing one value per line.
for value in (
    X.shape,       # all inputs
    X[1].shape,    # a single input sequence
    max_len,       # longest n-gram, target word included
    total_words,   # len(tokenizer.word_index) + 1
):
    print(value)

(30729, 24)
(24,)
25
2446


## Build our Model

## We build a 1-layer LSTM model with an Embedding layer and a Dropout layer (hyperparameter justifications are in our paper)
- Through testing we found that Adam was the best optimizer for text generation.
- We used categorical cross-entropy as the loss because this is a multi-class classification problem. The network has a dense output layer with n nodes, where n is the number of unique words in our corpus plus one (index 0 is reserved for padding).

In [10]:
# One-layer LSTM language model: padded word indices in, next-word distribution out.
model = Sequential([
    # Map each of the max_len - 1 input word indices to a 50-dimensional vector.
    Embedding(total_words, 50, input_length=max_len-1),
    # Single recurrent layer with 128 units.
    CuDNNLSTM(128),
    # Dropout with rate 0.2 on the LSTM output.
    Dropout(0.2),
    # One softmax output per vocabulary index.
    Dense(total_words, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 24, 50)            122300    
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (None, 128)               92160     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2446)              315534    
Total params: 529,994
Trainable params: 529,994
Non-trainable params: 0
_________________________________________________________________


We chose to train our network for 200 epochs.

In [0]:
# Train on all n-gram samples; no validation data or callbacks are passed here.
model.fit(X, y, epochs=200, verbose=1)

## Pretrained model can be found on GitHub

In [0]:
# Load pretrained weights into the model built above; replace PATH_TO_DATA
# with the folder that holds the weights file.
model.load_weights('PATH_TO_DATA/data/final_tswift_model_weights.h5')

## Generating text with our model

In [0]:
def generate_text(epoch, logs):
    """Epoch-end hook: every 10th epoch, print 50 words generated from a fixed seed.

    Uses the module-level ``tokenizer``, ``model`` and ``max_len``. At each step
    the most likely next word is appended to the running text, which is then
    re-encoded and fed back into the model.

    Args:
        epoch: Index of the epoch that just finished, as passed by Keras.
        logs: Metrics dict passed by Keras; unused.
    """
    num_of_words_to_gen = 50
    seed_text = "those georgia stars"
    if epoch % 10 != 0:
        return

    print('----- Generating text after Epoch: %d' % epoch)
    # Build the index -> word lookup once instead of scanning word_index for
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(num_of_words_to_gen):
        # Encode the running text and pad it to the model's input length.
        seed_token = tokenizer.texts_to_sequences([seed_text])[0]
        seed_token = pad_sequences([seed_token], maxlen=max_len-1, padding='pre')
        # argmax over the softmax output is what predict_classes() returned;
        # predict() + argmax also works on Keras versions without predict_classes().
        probabilities = model.predict(seed_token, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])
        # An index with no vocabulary entry (e.g. 0) contributes an empty word.
        seed_text += " " + index_to_word.get(predicted, "")
    print(seed_text)
    print('\n')


# NOTE(review): this rebinds the name generate_text to the callback object, and
# a later cell defines another function with the same name. The callback only
# has an effect if it is passed to model.fit(..., callbacks=[...]).
generate_text = LambdaCallback(on_epoch_end=generate_text)

In [0]:

# Reverse lookup (token index -> word) used to decode model predictions.
ind_to_word = {index: word for word, index in tokenizer.word_index.items()}


In [0]:
def generate_text(seed_text, num_of_words_to_gen):
    """Extend ``seed_text`` with words predicted by the trained model.

    Uses the module-level ``tokenizer``, ``model``, ``max_len`` and
    ``ind_to_word``. At each step the most likely next word is appended to the
    text, which is then re-encoded and fed back into the model.

    Args:
        seed_text: Starting text to continue.
        num_of_words_to_gen: Number of words to append.

    Returns:
        The seed text followed by the generated words, separated by spaces.
    """
    for _ in range(num_of_words_to_gen):
        # Encode the running text and pad it to the model's input length.
        seed_token = tokenizer.texts_to_sequences([seed_text])[0]
        seed_token = pad_sequences([seed_token], maxlen=max_len-1, padding='pre')
        # argmax over the softmax output is what predict_classes() returned;
        # predict() + argmax also works on Keras versions without predict_classes().
        probabilities = model.predict(seed_token, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])
        # Index 0 is the padding value and has no entry in ind_to_word; append
        # an empty word for it instead of raising KeyError.
        output_word = ind_to_word.get(predicted, "")
        seed_text = seed_text + " " + output_word
    print('\n')
    return seed_text

In [23]:
# Generate 200 more words from a seed phrase; the returned string is the cell's output.
generate_text('those georgia stars to shame that night', 200)





"those georgia stars to shame that night you need to feel like i like you and i didn't wonder you i loved you down to listen to you but 'bout you feel come to light right now christmas love you alone for me yeah yeah yeah yeah yeah yeah yeah i i i i i i i i i i i begging you fragile i was only everything i have to long gone i bring you now finally breathe at you met you like right like why all i feel it all right now same things happen when you were into right through all right all in time is a drawer we’d knowing rains in the team around now we need you’ but i'm pacing back up for the back of us i where tim incredible \r you and right in this room at night lives right here snow radio wouldn't wanna' back door for us to be small old bitter anymore onto trees dreaming tried cheer memories come back turning every asking trust you've killing me to be sorry the time for you said \r and you hope me why ain't right here me year i'm mad is you feel the one day to the whole"