In [1]:
from tensorflow.keras.callbacks import LambdaCallback
import pandas as pd
import numpy as np
import sys
from functions import *
import random
from keras.utils import np_utils
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import RMSprop

Using TensorFlow backend.


In [2]:
# Load the dataset from CSV - see example.
# A later cell reads the `lyrics` column of this frame.
data = pd.read_csv('datasets/pop.csv')

In [3]:
# Line up all the lyrics to create the corpus.
# str.join assembles the text in a single pass instead of re-allocating a
# growing string on every `+=`.
# NOTE(review): songs are joined with no separator, so the end of one lyric
# runs straight into the start of the next unless each entry already ends
# with whitespace - confirm this is intended.
Corpus = ''.join(data.lyrics)

Corpus = Corpus.lower()  # convert all letters to lowercase
print("Number of unique characters:", len(set(Corpus)))

Number of unique characters: 470


In [4]:
# Show the distinct characters of the corpus in sorted order.
print("The unique characters:", sorted(set(Corpus)))

The unique characters: ['\n', ' ', '!', '"', '&', "'", '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '¡', '¿', 'à', 'á', 'é', 'í', 'ï', 'ñ', 'ó', 'ú', 'ü', 'ı', 'œ', 'е', '\u2005', '\u200a', '\u200b', '–', '—', '‘', '’', '“', '”', '…', '\u205f', '中', '在', '站', '間', '가', '각', '간', '갈', '감', '강', '같', '개', '거', '걱', '건', '걸', '것', '게', '겐', '겔', '겠', '겨', '결', '계', '고', '곳', '공', '과', '관', '교', '구', '국', '굴', '그', '금', '기', '긴', '길', '김', '깊', '까', '깜', '깨', '꺾', '껄', '께', '꼴', '꽃', '꽤', '꾸', '꿈', '꿨', '끄', '끝', '끼', '끽', '낌', '나', '난', '날', '남', '났', '낯', '내', '낸', '냐', '너', '넌', '널', '넘', '네', '녀', '녁', '놀', '놓', '누', '눈', '는', '니', '다', '닥', '단', '달', '닮', '당', '닿', '대', '더', '던', '덜', '데', '도', '돌', '됐', '되', '될', '두', '둘', '둠', '득', '든', '듣', '들', '듭', '듯', '따', '때', '떠', '떤', '떨', '또', '뚜', '뛰', '뜬', '뜻', '라', '란', 

In [5]:
# Replace every character that is not an ASCII letter, a digit, an apostrophe,
# a period or a newline with a single space.
# The pattern is a raw string: "\." inside a normal string literal is an
# invalid escape sequence (DeprecationWarning, SyntaxWarning on Python 3.12+).
# Inside a character class a bare "." is already literal, so the set of
# matched characters is unchanged.
# NOTE(review): `re` is not imported by name in the import cell; presumably it
# arrives via `from functions import *` - confirm.
Corpus = re.sub(r"[^A-Za-z0-9'.\n]", " ", Corpus)
print("The unique characters:", sorted(set(Corpus)))

The unique characters: ['\n', ' ', "'", '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [6]:
# Preview only the beginning of the corpus: printing the whole string would
# flood the cell output with the entire text of every song.
print(Corpus[:1000])


i got my driver's license last week
just like we always talked about
'cause you were so excited for me
to finally drive up to your house
but today  i drove through the suburbs
crying 'cause you weren't around

and you're probably with that blonde girl
who always made me doubt
she's so much older than me
she's everything i'm insecure about
yeah  today  i drove through the suburbs
'cause how could i ever love someone else 

and i know we weren't perfect
but i've never felt this way for no one
and i just can't imagine
how you could be so okay now that i'm gone
guess you didn't mean what you wrote in that song about me
'cause you said forever  now i drive alone past your street

and all my friends are tired
of hearing how much i miss you  but
i kinda feel sorry for them
'cause they'll never know you the way that i do
yeah  today  i drove through the suburbs
and pictured i was driving home to you

and i know we weren't perfect
but i've never felt this way for no one  oh
and i just can't im

In [7]:
# Store all the unique characters present in the corpus (sorted) to build
# the mapping dictionaries.
symb = sorted(set(Corpus))

L_corpus = len(Corpus)  # length of the corpus
L_symb = len(symb)      # number of unique characters

# Dictionaries to go from character to index and from index back to character
mapping = {char: idx for idx, char in enumerate(symb)}
reverse_mapping = dict(enumerate(symb))

print("Total number of characters:", L_corpus)
print("Number of unique characters:", L_symb)

Total number of characters: 571679
Number of unique characters: 40


In [8]:
# Slide a window of `length` characters over the corpus one position at a
# time: each window (encoded as character indices) is a feature and the
# character immediately after it is the target.
length = 40
window_starts = range(L_corpus - length)
features = [
    [mapping[ch] for ch in Corpus[start:start + length]]
    for start in window_starts
]
targets = [mapping[Corpus[start + length]] for start in window_starts]

L_datapoints = len(targets)
print("Total number of sequences in the Corpus:", L_datapoints)

Total number of sequences in the Corpus: 571639


In [9]:
# Arrange the index windows as (L_datapoints, length, 1) and scale them by
# the vocabulary size.
# NOTE(review): `X` is not referenced again in the visible notebook and `y`
# is reassigned in the next cell - confirm whether this cell is still needed.
X = np.asarray(features).reshape(L_datapoints, length, 1) / float(L_symb)

# One-hot encode the target indices
y = np_utils.to_categorical(targets)

In [10]:
# Vocabulary lookups used for one-hot encoding and for decoding predictions
chars = sorted(set(Corpus))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))

# Cut the corpus into non-overlapping chunks of seqlen + 1 characters:
# the first seqlen characters are the inputs, and the same chunk shifted by
# one character provides the per-timestep targets.
seqlen = 40
step = seqlen
sentences = [
    Corpus[i: i + seqlen + 1]
    for i in range(0, len(Corpus) - seqlen - 1, step)
]

# One-hot tensors of shape (number of chunks, seqlen, vocabulary size).
# The builtin `bool` replaces `np.bool`, an alias that was deprecated in
# NumPy 1.20 and removed in 1.24 (where it raises AttributeError).
x = np.zeros((len(sentences), seqlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), seqlen, len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, (char_in, char_out) in enumerate(zip(sentence[:-1], sentence[1:])):
        x[i, t, char_indices[char_in]] = 1
        y[i, t, char_indices[char_out]] = 1


# Character-level model: an LSTM that emits an output at every timestep
# (return_sequences=True), followed by a softmax layer over the vocabulary.
model = Sequential([
    LSTM(128, input_shape=(seqlen, len(chars)), return_sequences=True),
    Dense(len(chars), activation='softmax'),
])

model.compile(
    optimizer=RMSprop(learning_rate=0.01),
    loss='categorical_crossentropy',
    metrics=['categorical_crossentropy', 'accuracy'],
)

def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalised, then one index is drawn from the resulting distribution.
    Temperatures below 1 sharpen the distribution towards the most likely
    index; a temperature of 1 leaves it unchanged.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: Non-zero scaling factor applied to the log-probabilities.

    Returns:
        The sampled index, as returned by ``np.argmax``.
    """
    preds = np.asarray(preds).astype('float64')
    # log(0) is -inf, which is the right log-probability for an impossible
    # index; suppress NumPy's divide-by-zero warning for that case only.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0 before exponentiating. Without the
    # shift, a small temperature makes every exp() underflow to 0 and the
    # normalisation becomes 0 / 0 = NaN. The shift cancels in the ratio.
    logits = logits - np.max(logits)
    weights = np.exp(logits)
    probs = weights / np.sum(weights)
    probas = np.random.multinomial(1, probs, 1)  # one draw, one-hot encoded
    return np.argmax(probas)                     # position of the drawn index


def on_epoch_end(epoch, _):
    """Epoch-end hook: print text generated by the model in its current state.

    Picks a random seed of `seqlen` characters from the corpus and, for each
    diversity value, generates 400 characters one at a time, streaming them
    to stdout as they are produced.
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(Corpus) - seqlen - 1)
    seed = Corpus[start_index: start_index + seqlen]

    # Q5: What does diversity do?
    for diversity in [0.2, 0.5, 1.0]:
        print('----- diversity:', diversity)
        print('----- Generating with seed: "' + seed + '"')
        sys.stdout.write(seed)

        # Rolling window holding the last `seqlen` characters fed to the model
        window = seed
        for _step in range(400):
            # One-hot encode the current window as a batch of one sequence
            x_pred = np.zeros((1, seqlen, len(chars)))
            for pos, ch in enumerate(window):
                x_pred[0, pos, char_indices[ch]] = 1.

            # Q6: What is the dimensionality of `preds`? Why do we input `preds[0, -1]` to the `sample` function?
            preds = model.predict(x_pred, verbose=0)
            next_char = indices_char[sample(preds[0, -1], diversity)]

            # Drop the oldest character and append the newly generated one
            window = window[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

# Wrap the hook so Keras calls it at the end of every epoch
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

In [None]:
# Train the model; print_callback prints generated text after every epoch.
model.fit(
    x,
    y,
    batch_size=128,
    epochs=200,
    callbacks=[print_callback],
)

In [None]:
# Enter model name below, EX: 'user_pop_model'
# The model is written under the models/ directory using this name.
model_name = 'ARTIST_NAME_HERE'
model.save(f'models/{model_name}')