In [1]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils.np_utils as ku 
import numpy as np 
import pandas as pd
from nltk import word_tokenize, ngrams
import contractions
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()

Notes:

Model -
2 LSTM (and associated dropout layers)
achieved 50% accuracy after 100 epochs

Data -
taylor swift songs truncated to the first 50 songs, represented as variable sized ngrams of sequences from line length in song

Results -
kinda makes sense, some direct quotes sometimes

In [76]:
def create_model(predictors, label, max_sequence_len, total_words, num_epochs):
    
    model = Sequential()
    model.add(Embedding(total_words, 32, input_length=max_sequence_len-1))
    model.add(LSTM(150, return_sequences = True))
    model.add(Dropout(0.2))
    model.add(LSTM(100))
    model.add(Dropout(0.2))
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
    earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto')
    model.fit(predictors, label, epochs=num_epochs, verbose=1, callbacks=[earlystop])
    print(model.summary())
    return model 

In [54]:
def csv_to_lst(file):
    """
    Grabs the lyric section of the given csv and convert it to a string
    Parameters:
        file (CSV): file path where csv is located
    Returns:
        A list string representing all of the text section in the csv
    """
    lyrics_df = pd.read_csv(file) 
#     lyrics_df = lyrics_df

    lyrics_list = list(lyrics_df['lyrics'])


    return lyrics_list

done


In [55]:
def pre_process_all_lyrics(lyric_list):
    processed = []
    for lyrics in lyric_list:
#         print(lyrics)
        processed.append(pre_process_text(lyrics))
    return processed

In [56]:
def pre_process_line(line):
    EMBED = 'embed'
    word_list = word_tokenize(line)
    
   
    #index = word_list.index('Lyrics')
    #word_list = word_list[index + 1:]
    
    no_brackets_list = []
    is_inside = False
    punctuation_list = '''!()-[];:'"\,<>./?@#$%^&*_~”“’‘--...``'''''
    
    for word in word_list:
        if word == '[':
            is_inside = True
        if word == ']':
            is_inside = False
        else:
            if not is_inside:
                if word not in punctuation_list:
                    no_brackets_list.append(word.lower())

    if len(no_brackets_list) > 0:
        end_word = no_brackets_list[len(no_brackets_list)-1]   
        if EMBED in end_word:
#             print(end_word)
            embed_location = end_word.index('embed') 
            # all lyric genius data comes with word 'Embed' at the end
            substr = end_word[:embed_location]
            if len(substr) > 0:
                no_brackets_list[len(no_brackets_list)-1] = substr
        
        end_word = no_brackets_list[len(no_brackets_list)-1]  
        new_end_word = ''
        # also sometimes has a number before embed in last word
        for c in end_word:
            if c.isdigit():
                break
            else:
                new_end_word = new_end_word + c
        if len(end_word) > 0:
            no_brackets_list[len(no_brackets_list)-1] = new_end_word
            
    ret = []
    for word in no_brackets_list:
        if len(word) < 0:
            break
        #idk why this is like this 
        elif word == "''":
            break
        else:
            ret.append(word)
    return ret

In [57]:
def pre_process_text(lyrics):
    """
    Preprocesses the text to add start and end tokens to each sentence
    Parameters:
        lst (list): list of lyrics 
    Returns:
        List of List of words with start and end tokens
    """
    
    lyrics = contractions.fix(lyrics)
#     print(lyrics)
    line_list = lyrics.split("\n")
    line_list = line_list[1:]
    
    ret = []
    for line in line_list:
        if not len(line) == 0:
            to_be_appended = pre_process_line(line)
            if len(to_be_appended) > 0:
                ret.append(to_be_appended)
    return ret

In [58]:
def dataset_preparation(data):

    # basic cleanup
#     corpus = data.lower().split("\n")
    # tokenization	
    corpus = flatten_lines(data)
    print(corpus[:10])
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    print('tokenization:')
    print(tokenizer.word_index)
    # create input sequences using list of tokens
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)

    print('input_sequences:')
    print(input_sequences)
    # pad sequences 
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    print('padded input_sequences:')
    print(input_sequences)
    # create predictors and label
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    
    print('X predictors:')
    print(predictors)
    print('y labels:')
    print(label)
    label = ku.to_categorical(label, num_classes=total_words)
    
    return predictors, label, max_sequence_len, total_words

In [59]:
def flatten_lines(data):
    corpus = []
    for song in data:
        for line in song:
            line_string = ""
            for word in line: 
                line_string = line_string + " " + word
            corpus.append(line_string)
    return corpus

In [77]:
lst = csv_to_lst('taylorswift.csv')
data = pre_process_all_lyrics(lst)[:50]
# print(data)

In [78]:
# for song in data:
#     for line in song:
#         print(line)

In [79]:
predictors, label, max_sequence_len, total_words = dataset_preparation(data)

[' i walked through the door with you the air was cold', " but something 'bout it felt like home somehow", " and i left my scarf there at your sister 's house", ' and you have still got it in your drawer even now', ' oh your sweet disposition and my wide-eyed gaze', ' we are singin in the car getting lost upstate', ' autumn leaves fallin down like pieces into place', ' and i can picture it after all these days', ' and i know it is long gone and', " that magic 's not here no more"]
tokenization:


input_sequences:
[[1, 291], [1, 291, 146], [1, 291, 146, 3], [1, 291, 146, 3, 176], [1, 291, 146, 3, 176, 44], [1, 291, 146, 3, 176, 44, 2], [1, 291, 146, 3, 176, 44, 2, 3], [1, 291, 146, 3, 176, 44, 2, 3, 373], [1, 291, 146, 3, 176, 44, 2, 3, 373, 13], [1, 291, 146, 3, 176, 44, 2, 3, 373, 13, 262], [21, 147], [21, 147, 182], [21, 147, 182, 4], [21, 147, 182, 4, 177], [21, 147, 182, 4, 177, 20], [21, 147, 182, 4, 177, 20, 97], [21, 147, 182, 4, 177, 20, 97, 408], [5, 1], [5, 1, 278], [5, 1, 278, 8], [5, 1, 278, 8, 292], [5, 1, 278, 8, 292, 17], [5, 1, 278, 8, 292, 17, 53], [5, 1, 278, 8, 292, 17, 53, 14], [5, 1, 278, 8, 292, 17, 53, 14, 409], [5, 1, 278, 8, 292, 17, 53, 14, 409, 66], [5, 1, 278, 8, 292, 17, 53, 14, 409, 66, 183], [5, 2], [5, 2, 33], [5, 2, 33, 117], [5, 2, 33, 117, 52], [5, 2, 33, 117, 52, 4], [5, 2, 33, 117, 52, 4, 12], [5, 2, 33, 117, 52, 4, 12, 14], [5, 2, 33, 117, 52, 4, 12, 14, 525], [5, 2, 33, 117, 52, 4, 12, 14, 525, 113], [5, 2, 33, 117, 52, 4, 12, 14, 525, 113

In [81]:
print('predictors: ', len(predictors))
# print(predictors[:3])
print('labels: ', label.shape)
# print(label[:3])
print('max sequence len: ', max_sequence_len)
print('total words: ', total_words)

predictors:  19137
labels:  (19137, 2090)
max sequence len:  19
total words:  2090


In [83]:
model = create_model(predictors, label, max_sequence_len, total_words, 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100


Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100


Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100


Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 18, 32)            66880     
                                                                 
 lstm_6 (LSTM)               (None, 18, 150)           109800    
                                                                 
 dropout_3 (Dropout)         (None, 18, 150)           0         
                                                                 
 lstm_7 (LSTM)               (None, 100)               100400    
                                                                 
 dropout_4 (Dropout)         (N

In [114]:
def generate_text(tokenizer, seed_text, next_words, max_sequence_len):
#     lines = []
    LINE_LEN = np.random.randint(1, max_sequence_len)
    words = []
    words.extend(seed_text.split())
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
#         print('-----')
#         print(predicted.shape)
#         choice = np.argmax(predicted)
        choice = np.random.choice(len(predicted[0]), p=predicted[0])
#         choice = sample(predicted[0])
#         print('choice: ', choice)
#         print(index_of)
        word = tokenizer.index_word[choice]
        words.append(word)
#         print('word: ', word)
        if (len(words) % LINE_LEN == 0): # new line time
#             print(words)
#             print(len(words))
            words.append('\n')
            seed_text = word
        else:
            seed_text = seed_text + " " + word
#         seed_text = word
#         output_word = ""
#         for word, index in tokenizer.word_index.items():
#             if index == predicted[0]:
#                 output_word = word
#                 break
#         seed_text += " " + output_word
    return format_lyrics(words)

In [115]:
def sample(preds, temperature=1):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [116]:
def format_lyrics(words):
    lyr = ""
    for word in words:
        lyr = lyr + " " + word
    return lyr

In [126]:
generated = generate_text(tokenizer, "you", 50, max_sequence_len)
print(generated)

 you should take the ground running each away with me now i am mad 
 into me trying to i ever work and the ballet then i owe 
 all the heart is folklore from some weight of you it was like 
 they say i do not like the ending to a my


In [127]:
generated2 = generate_text(tokenizer, "i", 50, max_sequence_len)
print(generated2)

 i would swing for to 
 be certain i go 
 pérignon i just made 
 me want to take 
 the eagles year down 
 the stairs i was 
 that it is a 
 string 's actin now 
 he have got problems 
 on the car keys 
 pérignon you would have 
 i sneak again i 
 want to


In [123]:
generated3 = generate_text(tokenizer, "this is", 50, max_sequence_len)
print(generated3)

 this is our 
 album on 
 the pool 
 that is 
 not it 
 was rare 
 i was 
 my man 
 i never 
 did not 
 weeping in 
 your wildest 
 rumors 's 
 have why 
 is at 
 the flickers 
 the really 
 precedes me 
 like a 
 million little 
 letters that 
 i can 
 i think 
 about the 
 ties were 
 dare


In [124]:
generated4 = generate_text(tokenizer, "his", 50, max_sequence_len)
print(generated4)

 his leaves falling down oh persona at friends i heard you say 
 then canceled my breaks loose good if the clothes was the 
 more that i had that all now i had you over 
 the stairs i was there i was there i was there 
 is daughter of this guitar big


In [125]:
generated5 = generate_text(tokenizer, "my", 50, max_sequence_len)
print(generated5)

 my hands are shaking down in my name you oh 
 lord save me i am a crumpled when i 
 remember it all too well yeah i would go 
 floor on my sweatshirt down around saving take my 
 tlc was quite od id my day times you 
 are not my type you


# Sources

## Dataset
## Text Preprocessing
## Word2Vec Visualizations
## Model Creation
## Text Generation