In [1]:
def read_file(filepath):
    with open(filepath) as f:
        str_text = f.read()
        
    return str_text

In [2]:
# tokenize and clean text now

In [3]:
import spacy

In [4]:
nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])  # ner = named entity recognition

In [5]:
nlp.max_length = 1198623  # sometimes you have to set the length longer than default for number of words in text

In [6]:
def separate_punc(doc_text):
    
    # make sure you're not tracking too much punctuation and whatnot
    # otherwise when you generate text you might generate just a ton of periods
    # the string below is provided by keras for convenience
    # helps us stay interested in relationships between words
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [7]:
d = read_file('talk.txt')

In [8]:
tokens = separate_punc(d)

In [9]:
tokens

['zimmerman',
 'this',
 'is',
 'little',
 'richard',
 '...',
 '..',
 'little',
 'richard',
 "'s",
 'got',
 'a',
 'lot',
 'of',
 'expression',
 'zimmerman',
 'you',
 'got',
 'ta',
 'have',
 'some',
 'kind',
 'of',
 'expression',
 'zimmerman',
 'there',
 "'s",
 'no',
 'expression',
 'i',
 'met',
 'her',
 'at',
 'a',
 'dance',
 'st.',
 'paul',
 'minnesota',
 '...',
 'i',
 'walk',
 'the',
 'line',
 'because',
 'you',
 "'re",
 'mine',
 'because',
 'you',
 "'re",
 'mine',
 '...',
 'zimmerman',
 'rhythm',
 'and',
 'blues',
 'zimmerman',
 'ah',
 'rhythm',
 'and',
 'blues',
 'you',
 'see',
 'is',
 'something',
 'that',
 'you',
 'really',
 'ca',
 "n't",
 'quite',
 'explain',
 'see',
 'when',
 'you',
 'hear',
 'a',
 'song',
 'rhythm',
 'and',
 'blues',
 'when',
 'you',
 'hear',
 'it',
 "'s",
 'a',
 'good',
 'rhythm',
 'and',
 'blues',
 'song',
 'chills',
 'go',
 'up',
 'your',
 'spine',
 '...',
 'zimmerman',
 'when',
 'you',
 'hear',
 'a',
 'song',
 'li',
 'ke',
 'that',
 'but',
 'when',
 'you',


In [10]:
len(tokens)

69863

In [11]:
## now create sequence of tokens, pass in first 25 words of a sentence and have NN predict the 26th word

In [12]:
train_len = 15 + 1

text_sequences = []
for i in range(train_len, len(tokens)):
    seq = tokens[i - train_len:i]
    
    text_sequences.append(seq)

In [13]:
type(text_sequences)

list

In [14]:
text_sequences[0]

['zimmerman',
 'this',
 'is',
 'little',
 'richard',
 '...',
 '..',
 'little',
 'richard',
 "'s",
 'got',
 'a',
 'lot',
 'of',
 'expression',
 'zimmerman']

In [15]:
text_sequences[1]

['this',
 'is',
 'little',
 'richard',
 '...',
 '..',
 'little',
 'richard',
 "'s",
 'got',
 'a',
 'lot',
 'of',
 'expression',
 'zimmerman',
 'you']

In [16]:
' '.join(text_sequences[0])

"zimmerman this is little richard ... .. little richard 's got a lot of expression zimmerman"

In [17]:
# so this is one token over
' '.join(text_sequences[1])

"this is little richard ... .. little richard 's got a lot of expression zimmerman you"

In [18]:

' '.join(text_sequences[2])

"is little richard ... .. little richard 's got a lot of expression zimmerman you got"

In [19]:
# format to numerical system that keras can understand
from keras.preprocessing.text import Tokenizer

In [20]:
tokenizer = Tokenizer()

In [21]:
tokenizer.fit_on_texts(text_sequences)

In [22]:
sequences = tokenizer.texts_to_sequences(text_sequences)

In [23]:
sequences[0]

[475, 10, 8, 196, 630, 49, 166, 196, 630, 7, 51, 4, 91, 9, 1519, 475]

In [24]:
sequences[1]  # so text has now been put in numerical form representing an ID for a particular word

[10, 8, 196, 630, 49, 166, 196, 630, 7, 51, 4, 91, 9, 1519, 475, 2]

In [25]:
tokenizer.index_word

{1: 'the',
 2: 'you',
 3: 'i',
 4: 'a',
 5: 'on',
 6: 'to',
 7: "'s",
 8: 'is',
 9: 'of',
 10: 'this',
 11: 'it',
 12: 'that',
 13: 'and',
 14: 'thank',
 15: 'song',
 16: 'we',
 17: 'right',
 18: 'here',
 19: 'all',
 20: 'guitar',
 21: 'in',
 22: 'my',
 23: 'he',
 24: 'do',
 25: 'tonight',
 26: 'for',
 27: 'one',
 28: 'now',
 29: 'na',
 30: 'gon',
 31: "n't",
 32: '   ',
 33: 'was',
 34: 'about',
 35: 'know',
 36: '’s',
 37: 'there',
 38: 'called',
 39: 'everybody',
 40: 'from',
 41: 'wanna',
 42: 'me',
 43: 'out',
 44: 'be',
 45: 'drums',
 46: 'they',
 47: 'playing',
 48: 'bass',
 49: '...',
 50: 'with',
 51: 'got',
 52: "'re",
 53: 'him',
 54: 'time',
 55: 'who',
 56: 'but',
 57: 'have',
 58: '  ',
 59: 'just',
 60: 'people',
 61: 'songs',
 62: 'so',
 63: 'anyway',
 64: 'an',
 65: 'up',
 66: 'ha',
 67: 'new',
 68: 'man',
 69: 'play',
 70: 'sing',
 71: 'band',
 72: 'are',
 73: 'she',
 74: 'like',
 75: 'what',
 76: 'back',
 77: 'old',
 78: 'introduce',
 79: "'ll",
 80: "'m",
 81: 'get'

In [26]:
for i in sequences[0]:
    print(f"{i} : {tokenizer.index_word[i]}")

475 : zimmerman
10 : this
8 : is
196 : little
630 : richard
49 : ...
166 : ..
196 : little
630 : richard
7 : 's
51 : got
4 : a
91 : lot
9 : of
1519 : expression
475 : zimmerman


In [27]:
tokenizer.word_counts

OrderedDict([('zimmerman', 273),
             ('this', 16770),
             ('is', 17731),
             ('little', 892),
             ('richard', 190),
             ('...', 4038),
             ('..', 1111),
             ("'s", 19130),
             ('got', 3723),
             ('a', 28293),
             ('lot', 2237),
             ('of', 17406),
             ('expression', 47),
             ('you', 33024),
             ('ta', 656),
             ('have', 3440),
             ('some', 2224),
             ('kind', 1312),
             ('there', 5520),
             ('no', 1392),
             ('i', 28640),
             ('met', 560),
             ('her', 2080),
             ('at', 1424),
             ('dance', 80),
             ('st.', 96),
             ('paul', 160),
             ('minnesota', 64),
             ('walk', 32),
             ('the', 40064),
             ('line', 96),
             ('because', 256),
             ("'re", 3696),
             ('mine', 688),
             ('rhythm', 800),

In [28]:
vocabulary_size = len(tokenizer.word_counts)

In [29]:
vocabulary_size  # number of unique words across data set

3916

In [30]:
sequences

[[475, 10, 8, 196, 630, 49, 166, 196, 630, 7, 51, 4, 91, 9, 1519, 475],
 [10, 8, 196, 630, 49, 166, 196, 630, 7, 51, 4, 91, 9, 1519, 475, 2],
 [8, 196, 630, 49, 166, 196, 630, 7, 51, 4, 91, 9, 1519, 475, 2, 51],
 [196, 630, 49, 166, 196, 630, 7, 51, 4, 91, 9, 1519, 475, 2, 51, 264],
 [630, 49, 166, 196, 630, 7, 51, 4, 91, 9, 1519, 475, 2, 51, 264, 57],
 [49, 166, 196, 630, 7, 51, 4, 91, 9, 1519, 475, 2, 51, 264, 57, 92],
 [166, 196, 630, 7, 51, 4, 91, 9, 1519, 475, 2, 51, 264, 57, 92, 143],
 [196, 630, 7, 51, 4, 91, 9, 1519, 475, 2, 51, 264, 57, 92, 143, 9],
 [630, 7, 51, 4, 91, 9, 1519, 475, 2, 51, 264, 57, 92, 143, 9, 1519],
 [7, 51, 4, 91, 9, 1519, 475, 2, 51, 264, 57, 92, 143, 9, 1519, 475],
 [51, 4, 91, 9, 1519, 475, 2, 51, 264, 57, 92, 143, 9, 1519, 475, 37],
 [4, 91, 9, 1519, 475, 2, 51, 264, 57, 92, 143, 9, 1519, 475, 37, 7],
 [91, 9, 1519, 475, 2, 51, 264, 57, 92, 143, 9, 1519, 475, 37, 7, 138],
 [9, 1519, 475, 2, 51, 264, 57, 92, 143, 9, 1519, 475, 37, 7, 138, 1519],
 [1519, 

In [31]:
type(sequences)

list

In [32]:
import numpy as np

In [33]:
sequences = np.array(sequences)

In [34]:
sequences

array([[ 475,   10,    8, ...,    9, 1519,  475],
       [  10,    8,  196, ..., 1519,  475,    2],
       [   8,  196,  630, ...,  475,    2,   51],
       ...,
       [  13,  336,  113, ...,   36,  336,   47],
       [ 336,  113,  123, ...,  336,   47,    5],
       [ 113,  123,   23, ...,   47,    5,   48]])

In [35]:
# train_test_split to separate the first columns as features and very last column as label/answer
from keras.utils import to_categorical

In [36]:
X = sequences[:,:-1]  # for every row, grab every column but the last column

In [37]:
y = sequences[:,:-1]  # for every row, grab the last column

In [38]:
y = to_categorical(y, num_classes=vocabulary_size+1)  # one-hot encoding

In [39]:
y

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

In [40]:
seq_len = X.shape[1]

In [41]:
X.shape

(69847, 15)

In [42]:
import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding  # lstm for sequences, embedding for vocabulary

In [43]:
def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, 15, input_length=seq_len))
    model.add(LSTM(150, return_sequences=True))
    model.add(LSTM(150))
    model.add(Dense(150, activation='relu'))

    model.add(Dense(vocabulary_size, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
   
    model.summary()
    
    return model

In [44]:
model = create_model(vocabulary_size+1, seq_len)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 15, 15)            58755     
_________________________________________________________________
lstm (LSTM)                  (None, 15, 150)           99600     
_________________________________________________________________
lstm_1 (LSTM)                (None, 150)               180600    
_________________________________________________________________
dense (Dense)                (None, 150)               22650     
_________________________________________________________________
dense_1 (Dense)              (None, 3917)              591467    
Total params: 953,072
Trainable params: 953,072
Non-trainable params: 0
_________________________________________________________________


In [45]:
from keras.utils import to_categorical

In [46]:
X = sequences[:,:-1]

In [47]:
y = sequences[:,-1]

In [48]:
y = to_categorical(y, num_classes=vocabulary_size+1)

In [49]:
seq_len = X.shape[1]

In [50]:
model = create_model(vocabulary_size+1, seq_len)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 15, 15)            58755     
_________________________________________________________________
lstm_2 (LSTM)                (None, 15, 150)           99600     
_________________________________________________________________
lstm_3 (LSTM)                (None, 150)               180600    
_________________________________________________________________
dense_2 (Dense)              (None, 150)               22650     
_________________________________________________________________
dense_3 (Dense)              (None, 3917)              591467    
Total params: 953,072
Trainable params: 953,072
Non-trainable params: 0
_________________________________________________________________


In [51]:
from pickle import dump, load

In [52]:
model.fit(X, y, batch_size=128, epochs=300,verbose=1)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

Epoch 80/300
Epoch 81/300
Epoch 82/300
Epoch 83/300
Epoch 84/300
Epoch 85/300
Epoch 86/300
Epoch 87/300
Epoch 88/300
Epoch 89/300
Epoch 90/300
Epoch 91/300
Epoch 92/300
Epoch 93/300
Epoch 94/300
Epoch 95/300
Epoch 96/300
Epoch 97/300
Epoch 98/300
Epoch 99/300
Epoch 100/300
Epoch 101/300
Epoch 102/300
Epoch 103/300
Epoch 104/300
Epoch 105/300
Epoch 106/300
Epoch 107/300
Epoch 108/300
Epoch 109/300
Epoch 110/300
Epoch 111/300
Epoch 112/300
Epoch 113/300
Epoch 114/300
Epoch 115/300
Epoch 116/300
Epoch 117/300
Epoch 118/300
Epoch 119/300
Epoch 120/300
Epoch 121/300
Epoch 122/300
Epoch 123/300
Epoch 124/300
Epoch 125/300
Epoch 126/300
Epoch 127/300
Epoch 128/300
Epoch 129/300
Epoch 130/300
Epoch 131/300
Epoch 132/300
Epoch 133/300
Epoch 134/300
Epoch 135/300
Epoch 136/300
Epoch 137/300
Epoch 138/300
Epoch 139/300
Epoch 140/300
Epoch 141/300
Epoch 142/300
Epoch 143/300
Epoch 144/300
Epoch 145/300
Epoch 146/300
Epoch 147/300
Epoch 148/300
Epoch 149/300
Epoch 150/300
Epoch 151/300
Epoch 152/30

Epoch 158/300
Epoch 159/300
Epoch 160/300
Epoch 161/300
Epoch 162/300
Epoch 163/300
Epoch 164/300
Epoch 165/300
Epoch 166/300
Epoch 167/300
Epoch 168/300
Epoch 169/300
Epoch 170/300
Epoch 171/300
Epoch 172/300
Epoch 173/300
Epoch 174/300
Epoch 175/300
Epoch 176/300
Epoch 177/300
Epoch 178/300
Epoch 179/300
Epoch 180/300
Epoch 181/300
Epoch 182/300
Epoch 183/300
Epoch 184/300
Epoch 185/300
Epoch 186/300
Epoch 187/300
Epoch 188/300
Epoch 189/300
Epoch 190/300
Epoch 191/300
Epoch 192/300
Epoch 193/300
Epoch 194/300
Epoch 195/300
Epoch 196/300
Epoch 197/300
Epoch 198/300
Epoch 199/300
Epoch 200/300
Epoch 201/300
Epoch 202/300
Epoch 203/300
Epoch 204/300
Epoch 205/300
Epoch 206/300
Epoch 207/300
Epoch 208/300
Epoch 209/300
Epoch 210/300
Epoch 211/300
Epoch 212/300
Epoch 213/300
Epoch 214/300
Epoch 215/300
Epoch 216/300
Epoch 217/300
Epoch 218/300
Epoch 219/300
Epoch 220/300
Epoch 221/300
Epoch 222/300
Epoch 223/300
Epoch 224/300
Epoch 225/300
Epoch 226/300
Epoch 227/300
Epoch 228/300
Epoch 

Epoch 236/300
Epoch 237/300
Epoch 238/300
Epoch 239/300
Epoch 240/300
Epoch 241/300
Epoch 242/300
Epoch 243/300
Epoch 244/300
Epoch 245/300
Epoch 246/300
Epoch 247/300
Epoch 248/300
Epoch 249/300
Epoch 250/300
Epoch 251/300
Epoch 252/300
Epoch 253/300
Epoch 254/300
Epoch 255/300
Epoch 256/300
Epoch 257/300
Epoch 258/300
Epoch 259/300
Epoch 260/300
Epoch 261/300
Epoch 262/300
Epoch 263/300
Epoch 264/300
Epoch 265/300
Epoch 266/300
Epoch 267/300
Epoch 268/300
Epoch 269/300
Epoch 270/300
Epoch 271/300
Epoch 272/300
Epoch 273/300
Epoch 274/300
Epoch 275/300
Epoch 276/300
Epoch 277/300
Epoch 278/300
Epoch 279/300
Epoch 280/300
Epoch 281/300
Epoch 282/300
Epoch 283/300
Epoch 284/300
Epoch 285/300
Epoch 286/300
Epoch 287/300
Epoch 288/300
Epoch 289/300
Epoch 290/300
Epoch 291/300
Epoch 292/300
Epoch 293/300
Epoch 294/300
Epoch 295/300
Epoch 296/300
Epoch 297/300
Epoch 298/300
Epoch 299/300
Epoch 300/300


<tensorflow.python.keras.callbacks.History at 0x7fb8c43df0a0>

In [53]:
model.save('my_bobdylan.h5')

In [54]:
dump(tokenizer, open('my_bobdylantokenizer', 'wb'))  # need to save tokenizer as well

In [55]:
## generate new text based off of seed trained input

In [56]:
from keras.preprocessing.sequence import pad_sequences

In [57]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    
    output_text = []
    
    input_text = seed_text
    
    
    for i in range(num_gen_words):
        
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        
        # assigns probability across vocabulary for most probable next word
        pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0] 
        
        pred_word = tokenizer.index_word[pred_word_ind]
        
        input_text += ' '+pred_word
        
        output_text.append(pred_word)
    
    return ' '.join(output_text)

In [58]:
text_sequences[0]

['zimmerman',
 'this',
 'is',
 'little',
 'richard',
 '...',
 '..',
 'little',
 'richard',
 "'s",
 'got',
 'a',
 'lot',
 'of',
 'expression',
 'zimmerman']

In [59]:
import random
random.seed(101)


In [64]:
random_pick = random.randint(0, len(text_sequences))
random_seed_text = text_sequences[random_pick]
seed_text = ' '.join(random_seed_text)

generated = generate_text(model, tokenizer, seq_len, seed_text=seed_text, num_gen_words=20)

print("**************\nINPUT\n**************\n")
print(seed_text)

print("\n\n**************\nGENERATED\n**************\n\n")
print(generated)

**************
INPUT
**************

survive ” nicole contos a resilient woman this is theme time radio hour we ’re talking


**************
GENERATED
**************


about kissing which means we ’re centering on cupidity and not stupidity lucinda williams wrote a song all about passionate
