In [34]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
import numpy as np
import pandas as pd

In [5]:
data = pd.read_pickle('../data/taylor_swift.pkl', compression='gzip')
data.head(2)

Unnamed: 0,Name,Lyrics
0,tim mcgraw,\n\r\nhe said the way my blue eyes shined\nput...
1,picture to burn,"\n\r\nstate the obvious, i didn't get my perfe..."


#### Make 1 string with all files

In [14]:
text = ""
for song in data.Lyrics:
    text = text + song
text[:100]

'\n\r\nhe said the way my blue eyes shined\nput those georgia stars to shame that night\ni said, "that\'s a'

#### prepare dataset
-chunk data into sequences

In [30]:
tokenizer = Tokenizer()
def data_prep(data):
    text = data.lower().split('\n')
    tokenizer.fit_on_texts(text)
    total_words = len(tokenizer.word_index) + 1
    #convert our list of split words into sequences
    sequences = []
    for line in text:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram = token_list[:i+1]
            sequences.append(n_gram)
    #find the longest sequences length so we can pad the others to that len
    max_len = max([len(x) for x in sequences])
    #we add our paddings to the beginning of each val
    sequences = np.array(pad_sequences(sequences, maxlen=max_len, padding='pre'))
    print(sequences.shape)
    X, y = sequences[:,:-1], sequences[:,-1]
    y = ku.to_categorical(y, num_classes=total_words)
    return X, y, total_words, max_len
    

In [31]:
X,y,total_words,max_len = data_prep(text)

(30729, 25)


In [39]:
print(X.shape)
print(X[1].shape)
print(max_len)
print(total_words)

(30729, 24)
(24,)
25
2446


In [40]:
model = Sequential()
model.add(Embedding(total_words, 50, input_length=max_len-1))
model.add(LSTM(150))
model.add(Dropout(0.1))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 24, 50)            122300    
_________________________________________________________________
lstm_5 (LSTM)                (None, 150)               120600    
_________________________________________________________________
dropout_4 (Dropout)          (None, 150)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 2446)              369346    
Total params: 612,246
Trainable params: 612,246
Non-trainable params: 0
_________________________________________________________________


In [41]:
model.fit(X, y, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0xb2d7440f0>

In [46]:
def generate_text(seed_text, num_of_words_to_gen, max_len):
    for j in range(num_of_words_to_gen):
        #format token as a model input
        seed_token = tokenizer.texts_to_sequences([seed_text])[0]
        seed_token = pad_sequences([seed_token], maxlen=max_len-1, padding='pre')
        #predict
        predicted = model.predict_classes(seed_token, verbose=0)
        print(predicted)
        pred = model.predict(seed_token)
        print('model pred: ', pred)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

In [48]:
seed_text = 'those georgia stars'
next_words = 50
generate_text(seed_text, next_words, max_len)

[4]
model pred:  [[1.7799412e-09 9.6874461e-03 3.0316454e-03 ... 2.4212113e-13
  1.5816868e-14 4.6232840e-09]]
[109]
model pred:  [[3.06171899e-07 3.34405974e-02 3.51045877e-02 ... 1.24907338e-08
  1.72145349e-07 1.19167045e-02]]
[21]
model pred:  [[7.7609319e-10 6.4764470e-02 1.2180117e-02 ... 7.4218999e-12
  5.5445388e-11 1.7969816e-07]]
[8]
model pred:  [[3.8437267e-09 9.2474863e-02 2.7643953e-02 ... 1.0018329e-07
  6.3868676e-08 6.0716772e-04]]
[97]
model pred:  [[1.8030312e-07 6.2273843e-06 1.2547576e-05 ... 3.6097994e-05
  7.3631475e-04 8.0861728e-06]]
[893]
model pred:  [[1.06896216e-07 1.80245081e-07 4.36654273e-06 ... 1.12209541e-07
  1.53980466e-07 8.68665637e-08]]
[14]
model pred:  [[7.1827877e-10 5.6894529e-03 5.2324412e-03 ... 1.3776544e-12
  1.6224958e-13 2.6695899e-09]]
[1791]
model pred:  [[3.6279127e-07 4.0804148e-03 5.3533819e-04 ... 2.4894882e-07
  1.0122638e-07 1.5113659e-03]]
[4]
model pred:  [[6.6890244e-08 3.1477783e-03 1.8150777e-03 ... 6.3096078e-10
  1.7007104

"those georgia stars and that's like a little kid of cards and we're away and we're lying on me at it all puts her'd her'd bleachers you were newest her'd bleachers bleachers yeah yeah yeah yeah fall down to be down it right will her'd bleachers bleachers right newest her'd bleachers bleachers bleachers"