In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences 

In [2]:
# Toy corpus used to show how the Keras Tokenizer turns words into integer ids.
sentences = [
    'I love coding',
    'I hate coding',
    'I enjoy learning new technologies',
    'I dislike debugging late at night',
]

# `num_words` caps how many of the most frequent words are used when encoding;
# words that were not seen during fitting are encoded as the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=100)
tokenizer.fit_on_texts(sentences)

# Word -> integer id lookup learned from the sentences above.
word_index = tokenizer.word_index
# Every sentence re-expressed as a list of word ids.
sequences = tokenizer.texts_to_sequences(sentences)

# Show the vocabulary first, then the encoded sentences, one per line.
print(word_index, sequences, sep="\n")

{'<OOV>': 1, 'i': 2, 'coding': 3, 'love': 4, 'hate': 5, 'enjoy': 6, 'learning': 7, 'new': 8, 'technologies': 9, 'dislike': 10, 'debugging': 11, 'late': 12, 'at': 13, 'night': 14}
[[2, 4, 3], [2, 5, 3], [2, 6, 7, 8, 9], [2, 10, 11, 12, 13, 14]]


In [3]:
# Sentences built mostly from words that were not in the fitted vocabulary.
test_data = ['I admire creative ideas', 'I avoid complex problems']

# Encode with the already-fitted tokenizer; no re-fitting happens here, so
# words it has never seen cannot receive an id of their own.
test_sequences = tokenizer.texts_to_sequences(test_data)
print(test_sequences)

[[2, 1, 1, 1], [2, 1, 1, 1]]


In [4]:
# Fresh tokenizer for the lyrics model: no vocabulary cap and no OOV token.
# NOTE: this rebinds the name `tokenizer` used by the demo cells above.
tokenizer = Tokenizer()

# Raw training text: song lyrics, one lyric line per text line, with blank
# lines separating the stanzas.
data = """I don't know who to trust, no surprise
Everyone feels so far away from me
Heavy thoughts sift through dust and the lies

Tryin' not to break, but I'm so tired of this deceit
Every time I try to make myself get back up on my feet
(I) All I ever think about is this, all the tiring time between
And how trying to put my trust in you just takes so much out of me

Take everything from the inside
And throw it all away
'Cause I swear, for the last time
I won't trust myself with you

Tension is building inside, steadily
Everyone feels so far away from me
Heavy thoughts forcing their way out of me

Tryin' not to break, but I'm so tired of this deceit
Every time I try to make myself get back up on my feet
(I) All I ever think about is this, all the tiring time between
And how trying to put my trust in you just takes so much out of me

Take everything from the inside
And throw it all away
'Cause I swear, for the last time
I won't trust myself with you

I won't waste myself on you, you, you
Waste myself on you, you, you

I'll take everything from the inside
And throw it all away
'Cause I swear, for the last time
I won't trust myself with you
Everything from the inside
And just throw it all away
'Cause I swear, for the last time
I won't trust myself with you, you, you"""

# One lower-cased training document per lyric line. The blank lines between
# stanzas are dropped: they contain no words, so they would only add empty
# strings to the corpus without contributing any vocabulary or n-grams.
corpus = [line for line in data.lower().split("\n") if line.strip()]

In [5]:
# Learn the lyric vocabulary, treating every entry of `corpus` as one text.
tokenizer.fit_on_texts(corpus)

# The tokenizer never assigns id 0 to a word, so the number of distinct ids
# the model must handle is the vocabulary size plus that one reserved slot.
total_words = 1 + len(tokenizer.word_index)
print(total_words)

79


In [6]:
# Build the training n-grams: for every encoded lyric line, emit each prefix
# that holds at least two tokens (first 2 tokens, first 3 tokens, ... whole
# line). Lines with fewer than two tokens contribute nothing.
input_sequences = [
    encoded_line[:prefix_len]
    for encoded_line in tokenizer.texts_to_sequences(corpus)
    for prefix_len in range(2, len(encoded_line) + 1)
]

In [7]:
import numpy as np

In [8]:
# The longest n-gram fixes the common width of the training matrix.
max_sequence_len = max(map(len, input_sequences))

# Left-pad ('pre') the shorter n-grams so that the final column of every row
# is that n-gram's last token.
input_sequences = np.array(
    pad_sequences(input_sequences, padding='pre', maxlen=max_sequence_len)
)

# Features: everything except the last column. Targets: the last column.
xs, labels = input_sequences[:, :-1], input_sequences[:, -1]

In [9]:
# One-hot encode the target word ids; the second argument is the number of
# classes, i.e. one column per possible id.
ys = keras.utils.to_categorical(labels, total_words)

In [10]:
from tensorflow.keras import layers

In [11]:
# Tunable hyperparameters, named so they are easy to find and change.
SEED = 42
EMBEDDING_DIM = 240
LSTM_UNITS = 256
LEARNING_RATE = 0.005

# Seed the Python, NumPy and backend RNGs before any weights are created so
# that a fresh-kernel re-run starts from the same random state.
# NOTE(review): some GPU kernels are non-deterministic, so results may still
# vary slightly between runs on GPU.
keras.utils.set_random_seed(SEED)

# Next-word predictor: embedded token ids -> bidirectional LSTM -> softmax
# over every vocabulary id.
model = keras.Sequential([
    # Declare the input shape explicitly rather than through Embedding's
    # `input_length` argument, which is deprecated in Keras 3. Each sample is
    # a padded n-gram minus its final (target) token.
    keras.Input(shape=(max_sequence_len - 1,), dtype="int32"),
    layers.Embedding(total_words, EMBEDDING_DIM),
    layers.Bidirectional(layers.LSTM(LSTM_UNITS)),
    layers.Dense(total_words, activation='softmax')
])

# categorical_crossentropy matches one-hot encoded targets.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [12]:
# Train for 150 epochs without progress output (verbose=0); the object
# returned by fit() is kept in `history`.
history = model.fit(
    x=xs,
    y=ys,
    epochs=150,
    verbose=0,
)

In [14]:
# Greedy text generation: repeatedly append the model's most likely next word.
seed_text = "i don't know"
next_words = 5

for _ in range(next_words):
    # Encode the text generated so far and left-pad it to the width the model
    # was trained on (one less than the longest n-gram).
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

    # The batch holds a single sample, so take row 0 of the argmax.
    predicted = model.predict(token_list, verbose=0)
    predicted_index = int(np.argmax(predicted, axis=-1)[0])

    # Index 0 is the padding slot and has no entry in `index_word`; a plain
    # dict lookup would raise KeyError, so stop generating instead.
    output_word = tokenizer.index_word.get(predicted_index)
    if output_word is None:
        break
    seed_text += " " + output_word

print(seed_text)

i don't know who to trust no surprise
