In [1]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils.np_utils as ku 
import numpy as np 
import pandas as pd
from nltk import word_tokenize, ngrams
import contractions
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()

In [2]:
def create_model(predictors, label, max_sequence_len, total_words, num_epochs):
    
    model = Sequential()
    model.add(Embedding(total_words, 32, input_length=max_sequence_len-1))
    model.add(LSTM(150, return_sequences = True))
    # model.add(Dropout(0.2))
    model.add(LSTM(100))
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto')
    model.fit(predictors, label, epochs=num_epochs, verbose=1, callbacks=[earlystop])
    print(model.summary())
    return model 

In [3]:
def csv_to_lst(file):
    """
    Grabs the lyric section of the given csv and convert it to a string
    Parameters:
        file (CSV): file path where csv is located
    Returns:
        A list string representing all of the text section in the csv
    """
    lyrics_df = pd.read_csv(file) 
#     lyrics_df = lyrics_df

    lyrics_list = list(lyrics_df['lyrics'])


    return lyrics_list

print('done')

done


In [4]:
def pre_process_all_lyrics(lyric_list):
    processed = []
    for lyrics in lyric_list:
#         print(lyrics)
        processed.append(pre_process_text(lyrics))
    return processed

In [5]:
def pre_process_line(line):
    EMBED = 'embed'
    word_list = word_tokenize(line)
    
   
    #index = word_list.index('Lyrics')
    #word_list = word_list[index + 1:]
    
    no_brackets_list = []
    is_inside = False
    punctuation_list = '''!()-[];:'"\,<>./?@#$%^&*_~”“’‘--...``'''''
    
    for word in word_list:
        if word == '[':
            is_inside = True
        if word == ']':
            is_inside = False
        else:
            if not is_inside:
                if word not in punctuation_list:
                    no_brackets_list.append(word.lower())

    if len(no_brackets_list) > 0:
        end_word = no_brackets_list[len(no_brackets_list)-1]   
        if EMBED in end_word:
#             print(end_word)
            embed_location = end_word.index('embed') 
            # all lyric genius data comes with word 'Embed' at the end
            substr = end_word[:embed_location]
            if len(substr) > 0:
                no_brackets_list[len(no_brackets_list)-1] = substr
        
        end_word = no_brackets_list[len(no_brackets_list)-1]  
        new_end_word = ''
        # also sometimes has a number before embed in last word
        for c in end_word:
            if c.isdigit():
                break
            else:
                new_end_word = new_end_word + c
        if len(end_word) > 0:
            no_brackets_list[len(no_brackets_list)-1] = new_end_word
            
    ret = []
    for word in no_brackets_list:
        if len(word) < 0:
            break
        #idk why this is like this 
        elif word == "''":
            break
        else:
            ret.append(word)
    return ret

In [6]:
def pre_process_text(lyrics):
    """
    Preprocesses the text to add start and end tokens to each sentence
    Parameters:
        lst (list): list of lyrics 
    Returns:
        List of List of words with start and end tokens
    """
    
    lyrics = contractions.fix(lyrics)
#     print(lyrics)
    line_list = lyrics.split("\n")
    line_list = line_list[1:]
    
    ret = []
    for line in line_list:
        if not len(line) == 0:
            to_be_appended = pre_process_line(line)
            if len(to_be_appended) > 0:
                ret.append(to_be_appended)
    return ret

In [7]:
def dataset_preparation(data):

    # basic cleanup
#     corpus = data.lower().split("\n")
    # tokenization	
    corpus = flatten_lines(data)
    print(corpus[:10])
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    print('tokenization:')
    print(tokenizer.word_index)
    # create input sequences using list of tokens
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)

    print('input_sequences:')
    print(input_sequences)
    # pad sequences 
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    print('padded input_sequences:')
    print(input_sequences)
    # create predictors and label
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    
    print('X predictors:')
    print(predictors)
    print('y labels:')
    print(label)
    label = ku.to_categorical(label, num_classes=total_words)
    
    return predictors, label, max_sequence_len, total_words

In [8]:
def flatten_lines(data):
    corpus = []
    for song in data:
        for line in song:
            line_string = ""
            for word in line: 
                line_string = line_string + " " + word
            corpus.append(line_string)
    return corpus

In [9]:
lst = csv_to_lst('taylorswift.csv')
data = pre_process_all_lyrics(lst)[:10]
# print(data)

In [10]:
# for song in data:
#     for line in song:
#         print(line)

In [11]:
predictors, label, max_sequence_len, total_words = dataset_preparation(data)

[' i walked through the door with you the air was cold', " but something 'bout it felt like home somehow", " and i left my scarf there at your sister 's house", ' and you have still got it in your drawer even now', ' oh your sweet disposition and my wide-eyed gaze', ' we are singin in the car getting lost upstate', ' autumn leaves fallin down like pieces into place', ' and i can picture it after all these days', ' and i know it is long gone and', " that magic 's not here no more"]
tokenization:
input_sequences:
[[1, 180], [1, 180, 293], [1, 180, 293, 3], [1, 180, 293, 3, 121], [1, 180, 293, 3, 121, 58], [1, 180, 293, 3, 121, 58, 2], [1, 180, 293, 3, 121, 58, 2, 3], [1, 180, 293, 3, 121, 58, 2, 3, 228], [1, 180, 293, 3, 121, 58, 2, 3, 228, 7], [1, 180, 293, 3, 121, 58, 2, 3, 228, 7, 181], [17, 137], [17, 137, 138], [17, 137, 138, 4], [17, 137, 138, 4, 122], [17, 137, 138, 4, 122, 19], [17, 137, 138, 4, 122, 19, 99], [17, 137, 138, 4, 122, 19, 99, 294], [5, 1], [5, 1, 295], [5, 1, 295, 9

In [12]:
print('predictors: ', len(predictors))
print(predictors[:3])
print('labels: ', label.shape)
print(label[:3])
print('max sequence len: ', max_sequence_len)
print('total words: ', total_words)

predictors:  5007
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1 180]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   1 180 293]]
labels:  (5007, 813)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
max sequence len:  18
total words:  813


In [None]:
model = create_model(predictors, label, max_sequence_len, total_words, 100)

2022-04-29 16:28:49.943893: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100


Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100

In [None]:
def generate_text(tokenizer, seed_text, next_words, max_sequence_len):
#     lines = []
    LINE_LEN = np.random.randint(1, max_sequence_len)
    words = []
    words.extend(seed_text.split())
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
#         print('-----')
#         print(predicted.shape)
        choice = np.argmax(predicted)
#         choice = np.random.choice(len(predicted[0]), p=predicted[0])
#         choice = sample(predicted[0])
#         print('choice: ', choice)
#         print(index_of)
        word = tokenizer.index_word[choice]
        print('word: ', word)
        if (len(words) % LINE_LEN == 0): # new line time
#             print(words)
#             print(len(words))
            words.append('\n')
            seed_text = word
        else:
            words.append(word)
            seed_text = seed_text + " " + word
#         seed_text = word
#         output_word = ""
#         for word, index in tokenizer.word_index.items():
#             if index == predicted[0]:
#                 output_word = word
#                 break
#         seed_text += " " + output_word
    return words

In [None]:
def sample(preds, temperature=0.5):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
def format_lyrics(words):
    lyr = ""
    for word in words:
        lyr = lyr + " " + word
    return lyr

In [None]:
generated = generate_text(tokenizer, "you", 15, max_sequence_len)
lyrics = format_lyrics(generated)
print('final lyrics:')
print(lyrics)

In [None]:
generated2 = generate_text(tokenizer, "mom", 20, max_sequence_len)
print(generated2)

In [None]:
generated3 = generate_text(tokenizer, "this is", 20, max_sequence_len)
print(generated3)

In [None]:
generated4 = generate_text(tokenizer, "his", 20, max_sequence_len)
print(generated4)