In [1]:
import keras
from keras.models import Sequential
from keras.layers import Activation,LSTM,Dense
from keras.optimizers import Adam
import pandas as pd
import numpy as np
from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping

Using TensorFlow backend.


In [2]:
df=pd.read_csv('songdata.csv')['text']

In [3]:
df.describe()

count                                                 57650
unique                                                57494
top       Chestnuts roasting on an open fire  \nJack Fro...
freq                                                      6
Name: text, dtype: object

In [4]:
data=np.array(df)

In [5]:
corpus=''
for ix in range(len(data)):
    corpus+=data[ix]
corpus = corpus.lower()

In [6]:
vocab=list(set(corpus))
char_ix={c:i for i,c in enumerate(vocab)}
ix_char={i:c for i,c in enumerate(vocab)}

In [7]:
char_ix

{'\n': 16,
 ' ': 20,
 '!': 5,
 '"': 30,
 "'": 18,
 '(': 4,
 ')': 22,
 ',': 14,
 '-': 23,
 '.': 42,
 '0': 29,
 '1': 37,
 '2': 41,
 '3': 11,
 '4': 43,
 '5': 10,
 '6': 44,
 '7': 27,
 '8': 31,
 '9': 7,
 ':': 0,
 '?': 26,
 '[': 28,
 ']': 47,
 'a': 46,
 'b': 9,
 'c': 32,
 'd': 15,
 'e': 6,
 'f': 12,
 'g': 36,
 'h': 21,
 'i': 13,
 'j': 8,
 'k': 34,
 'l': 17,
 'm': 2,
 'n': 1,
 'o': 25,
 'p': 19,
 'q': 45,
 'r': 3,
 's': 35,
 't': 33,
 'u': 39,
 'v': 49,
 'w': 24,
 'x': 38,
 'y': 48,
 'z': 40}

In [8]:
maxlen=40
vocab_size=len(vocab)
print(vocab_size)

50


In [None]:
sentences=[]
next_char=[]
for i in range(len(corpus)-maxlen-1):
    sentences.append(corpus[i:i+maxlen])
    next_char.append(corpus[i+maxlen])
split_count = int(0.02 * 67998416)
sentences_test = sentences[split_count:]
next_char_test = next_char[split_count:]
sentences = sentences[:split_count]
next_char = next_char[:split_count]

In [None]:
def generator(sentence_list, next_word_list, batch_size):
    index = 0
    while True:
        x = np.zeros((batch_size, maxlen, vocab_size), dtype=np.bool)
        y = np.zeros((batch_size, vocab_size), dtype=np.bool)
        for i in range(batch_size):
            for t, w in enumerate(sentence_list[index]):
                x[i, t, char_ix[w]] = 1
            y[i, char_ix[next_word_list[index]]] = 1

            index = index + 1
            if index == len(sentence_list):
                index = 0
        yield x, y

In [None]:
model=Sequential()
model.add(LSTM(128,input_shape=(maxlen,vocab_size)))
model.add(Dense(vocab_size))
model.add(Activation('softmax'))
model.summary()
model.compile(optimizer=Adam(lr=0.01),loss='categorical_crossentropy')

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dense_1 (Dense)              (None, 50)                6450      
_________________________________________________________________
activation_1 (Activation)    (None, 50)                0         
Total params: 98,098
Trainable params: 98,098
Non-trainable params: 0
_________________________________________________________________


In [None]:
def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    examples_file.write('\n----- Generating text after Epoch: %d\n' % epoch)

    # Randomly pick a seed sequence
    seed_index = np.random.randint(len(sentences+sentences_test))
    seed = (sentences+sentences_test)[seed_index]

    for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:
        sentence = seed
        examples_file.write('----- Diversity:' + str(diversity) + '\n')
        examples_file.write('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')
        examples_file.write(' '.join(sentence))

        for i in range(50):
            x_pred = np.zeros((1, maxlen))
            for t, word in enumerate(sentence):
                x_pred[0, t] = word_indices[word]

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_word = indices_word[next_index]

            sentence = sentence[1:]
            sentence.append(next_word)

            examples_file.write(" "+next_word)
        examples_file.write('\n')
    examples_file.write('='*80 + '\n')
    examples_file.flush()

In [None]:
BATCH_SIZE = 128
file_path = "./checkpoints/LSTM_LYRICS-epoch{epoch:03d}-words%d-sequence%d-minfreq%d-loss{loss:.4f}-acc{acc:.4f}-val_loss{val_loss:.4f}-val_acc{val_acc:.4f}" % (
    len(vocab),
    40,
    10
)
checkpoint = ModelCheckpoint(file_path, monitor='val_acc', save_best_only=True)
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
early_stopping = EarlyStopping(monitor='val_acc', patience=5)
callbacks_list = [checkpoint, print_callback, early_stopping]
model.fit_generator(generator(sentences, next_char, BATCH_SIZE),
    steps_per_epoch=int(len(sentences)/BATCH_SIZE) + 1,
    epochs=10,
    callbacks=callbacks_list,
    validation_data=generator(sentences_test, next_char_test, BATCH_SIZE)
                    ,validation_steps=int(len(sentences_test)/BATCH_SIZE) + 1)

Instructions for updating:
Use tf.cast instead.
Epoch 1/10

In [None]:
#serialize model to JSON  serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")

In [None]:
import random
txt = corpus
generated=''
start_index=random.randint(0,len(txt)-maxlen-1)
sent=txt[start_index:start_index+maxlen]
generated+=sent
for i in range(1900):
    x_sample=generated[i:i+maxlen]
    x=np.zeros((1,maxlen,vocab_size))
    for j in range(maxlen):
        x[0,j,char_ix[x_sample[j]]]=1
    probs=model.predict(x)
    probs=np.reshape(probs,probs.shape[1])
    ix=np.random.choice(range(vocab_size),p=probs.ravel())
    generated+=ix_char[ix]

In [None]:
print(generated)