!pip install tensorflow

In [55]:
import pandas as pd
import numpy as np
import string, os 
import tensorflow as tf

# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dropout, LSTM, Dense, Bidirectional 
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential

In [1]:
# Path of the lyrics CSV that is handed to prepare_data().
# (A stray single quote used to be part of this string, so the file could not be found.)
dataset_path = "~/Downloads/archive-2/lyrics-data.csv"
# Default upper bound on tokens per lyric; tokenize() takes it as its `max_len_filter` default.
sequence_len = 655


In [56]:

def prepare_data(path, target_lang='en', max_row=800):
    """
    Load the lyrics CSV and keep at most `max_row` rows written in `target_lang`.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; it must contain the columns
        'ALink', 'SName', 'SLink' and 'language'.
    target_lang : str
        Value of the 'language' column to keep.
    max_row : int or None
        Maximum number of matching rows to return (the first ones, in file order);
        None keeps all of them.

    Returns
    -------
    pandas.DataFrame
        The matching rows without the 'ALink', 'SName' and 'SLink' columns.

    Raises
    ------
    KeyError
        If one of the dropped columns or 'language' is missing from the file.
    """
    # drop() without inplace keeps the freshly read frame untouched and chains cleanly.
    songs_df = pd.read_csv(path).drop(columns=['ALink', 'SName', 'SLink'])
    # Compare with a boolean mask rather than an f-string `query`: the language value is
    # then never parsed as an expression, so a quote inside it cannot break the filter.
    lang_filter_song_df = songs_df[songs_df['language'] == target_lang]
    return lang_filter_song_df.iloc[:max_row]


Unnamed: 0,Lyric,language
count,379854,365296
unique,371181,52
top,Instrumental,en
freq,2087,191814


In [60]:
# Load the CSV at `dataset_path` with prepare_data()'s defaults (target_lang='en', max_row=800).
english_song_df = prepare_data(dataset_path)

In [61]:
# Show (rows, columns) of the filtered frame.
# NOTE(review): the saved output below reports 191814 rows, which a call with the default
# max_row=800 cannot produce — the output looks stale; re-run this cell.
english_song_df.shape

(191814, 2)

In [None]:
def tokenize(data, max_len_filter=sequence_len):
    """
    Fit a Keras `Tokenizer` on the lyrics and turn every lyric into a list of word indices.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Lyric' column; the values are cast to `str` and lower-cased.
    max_len_filter : int
        Lyrics with more than this many tokens are dropped from the result.

    Returns
    -------
    list of list of int
        One index sequence per kept lyric.

    Side effects
    ------------
    Publishes the fitted `tokenizer` and `total_words` (vocabulary size + 1) at notebook
    scope. The cells that build the targets, the model and the generated text read these
    two names as globals, so keeping them local made a fresh "Run All" fail with NameError.
    """
    global tokenizer, total_words

    lyrics = data['Lyric'].astype(str).str.lower()

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lyrics)
    # Word indices start at 1 (0 is never assigned to a word), hence the + 1.
    total_words = len(tokenizer.word_index) + 1

    # Map every lyric to its sequence of word indices.
    tokenized_sentences = tokenizer.texts_to_sequences(lyrics)
    # Honour the caller's limit; the global `sequence_len` used to be applied here instead,
    # which silently ignored the `max_len_filter` argument.
    return [s for s in tokenized_sentences if len(s) <= max_len_filter]

In [63]:
# Convert the filtered lyrics into lists of word indices (default length filter applies).
tokenized_sentences = tokenize(english_song_df)

In [2]:
def n_gram_and_pad_sequences(tokenized_sentences):
    """
    Expand every tokenized sentence into its growing prefixes and left-pad them.

    A sentence [w1, w2, ..., wk] contributes [w1, w2], [w1, w2, w3], ... up to the
    full sentence; sentences with fewer than two tokens contribute nothing.
    All prefixes are then padded at the beginning to the length of the longest one.

    Returns a 2-D array with one padded prefix per row.
    """
    prefixes = [
        sentence[:end]
        for sentence in tokenized_sentences
        for end in range(2, len(sentence) + 1)
    ]
    longest = max(map(len, prefixes))
    padded = pad_sequences(prefixes, maxlen=longest, padding='pre')
    return np.array(padded)

In [None]:
# Build the padded n-gram matrix from the tokenized lyrics.
input_sequences = n_gram_and_pad_sequences(tokenized_sentences)

### Training set 

In [65]:
# Split every padded n-gram row into model input and target:
# X keeps all columns except the last one, labels is the last column
# (the word index the model has to predict).
X = input_sequences[:, :-1]
labels = input_sequences[:, -1]

# One-hot encode the targets: every row gets `total_words` columns,
# so there is one class per possible word index.
y = tf.keras.utils.to_categorical(labels, num_classes=total_words)


### Model Creation and Training

In [18]:
# create model
# `max_sequence_len` only exists as a local inside n_gram_and_pad_sequences(), so it is
# re-derived here from the width of the padded n-gram matrix. The network input is one
# token shorter because the last column of every row is the prediction target.
max_sequence_len = input_sequences.shape[1]
model = Sequential()
model.add(Embedding(total_words, 40, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(250)))
model.add(Dropout(0.1))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Watch the validation loss (fit() holds out part of the data via validation_split):
# stopping on the training loss cannot detect overfitting.
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')


In [19]:
# Train for at most 30 epochs with early stopping; 30% of the samples are held out
# as validation data.
history = model.fit(
    X,
    y,
    batch_size=32,
    epochs=30,
    callbacks=[earlystop],
    validation_split=0.3,
)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30

KeyboardInterrupt: 

In [None]:
# Write the current model to an .h5 file in the working directory.
model.save("lyrics_generation_model.h5")

In [77]:
from tensorflow.keras.models import load_model

# Reload the model from the file written by model.save() above. The previous absolute
# path pointed into one user's home directory and did not match the saved file, so the
# notebook could not be re-run anywhere else.
model = load_model("lyrics_generation_model.h5")

In [80]:
def generate_lyrics(lyrics_hint, next_words):
    """
    Extend a seed text word by word with the model's most likely next word.

    Parameters
    ----------
    lyrics_hint : str
        Seed text the generation starts from.
    next_words : int
        Number of words to append.

    Returns
    -------
    str
        The seed text followed by the generated words, separated by single spaces.

    Relies on the notebook-level `tokenizer` (fitted Keras Tokenizer) and `model`.
    """
    # Start from the caller's seed; previously `input_text` was read before it was ever
    # assigned, which raised UnboundLocalError on the first iteration.
    input_text = lyrics_hint
    # Pad to the length the model was built for, not to the tokenizer's length filter.
    # If the model accepts variable-length input this is None and no fixed length is forced.
    model_input_len = model.input_shape[1]

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([input_text])[0]
        token_list = pad_sequences([token_list], maxlen=model_input_len, padding='pre')
        # predict() returns one probability row per input; take the arg-max of the only row.
        predicted_index = int(np.argmax(model.predict(token_list, verbose=0), axis=-1)[0])

        # Converting from numeric to string; index 0 (padding) has no word and yields "".
        output_word = tokenizer.index_word.get(predicted_index, "")
        input_text += " " + output_word
    return input_text

In [81]:
# Extend the seed text by ten predicted words.
generate_lyrics("the starts in the sky", 10)



'the starts in the sky we up yo want me we fast hey be you'