In [1]:
# Toy training corpus: a short small-talk conversation, one turn per line.
# Later cells fit the tokenizer on this text and split it into sentences on
# '.', '?' and '!' to build the next-word training pairs.
dataset = """
  Hi! My name is Alex.
  Hello Alex, nice to meet you. What is your name?
  My name is Sarah. How are you today?
  I am doing great, thank you! And you?
  I am fine too. Did you have lunch?
  Yes, I had some pasta. What about you?
  I ate rice and vegetables. It was delicious.
  That sounds nice. What are you doing later?
  I plan to watch a movie. Do you like movies?
  Yes, I love movies! Which one are you going to watch?
  I want to watch Inception. Have you seen it?
  Yes, I have. It is one of my favorite movies!
  That's awesome. Maybe we can watch together next time.
  Sure, that would be fun. Do you also play games?
  Yes, I play video games sometimes. What about you?
  I enjoy playing chess with my friends.
  Wow, chess is a very smart game. Do you play often?
  Yes, almost every weekend. Do you like sports?
  I like football and basketball. What about you?
  I enjoy playing badminton with my cousin.
  That sounds fun! Let's play together someday.
  Yes, I would love that. Where do you usually play?
  I usually play in the park near my house.
  Great, I can join you next week.
  Perfect! I will be waiting. See you soon!
  Bye, take care!
"""

In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
# Word-level tokenizer with default settings.
tokenizer = Tokenizer()

# Learn the vocabulary from the whole conversation, passed as a single document.
tokenizer.fit_on_texts([dataset])

In [15]:
# Number of distinct words the tokenizer learned from the corpus.
len(tokenizer.word_index)

111

In [None]:
import re

# Training pairs for next-word prediction:
#   x_train[k] is a sentence prefix (list of token ids),
#   y_train[k] is the id of the token that follows that prefix.
x_train, y_train = [], []

# Sentences are delimited by '.', '?' or '!'; pairs never cross a sentence boundary.
for raw_sentence in re.split(r'[.?!]', dataset):
  sentence = raw_sentence.strip()
  if sentence:  # the split leaves empty fragments (e.g. after the final mark)
    token_ids = tokenizer.texts_to_sequences([sentence])[0]

    # Each proper prefix of the sentence is paired with the token right after it,
    # so a sentence of n tokens contributes n - 1 pairs.
    for cut, next_token in enumerate(token_ids[1:], start=1):
      x_train.append(token_ids[:cut])
      y_train.append(next_token)

In [12]:
# Number of (prefix, next-word) training pairs that were generated.
len(y_train)

168

In [7]:
# Longest prefix length; every input sequence is padded to this size.
max_len = max(map(len, x_train))

In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad every prefix on the left ('pre') up to max_len so the inputs form one
# rectangular array; the real tokens end up at the end of each row.
x_train_padded = pad_sequences(x_train, maxlen=max_len, padding='pre')

In [13]:
# Expected layout: (number of training pairs, max_len).
x_train_padded.shape

(168, 8)

In [16]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the next-word ids. The class count is the vocabulary size + 1
# so that the largest word id is still a valid column (ids are used directly
# as column positions).
y = to_categorical(y_train, num_classes=len(tokenizer.word_index)+1)

In [17]:
# Expected layout: (number of training pairs, vocabulary size + 1).
y.shape

(168, 112)

In [18]:
# One-hot target of the first training pair; the position of the single 1 is
# the token id of the word to predict.
y[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [20]:
# Next-word classifier: token ids -> 50-d embeddings -> LSTM(128) -> softmax
# over every possible token id.
# The size is vocabulary + 1 so the largest word id is a valid index/class.
vocab_size = len(tokenizer.word_index) + 1

model = Sequential()

# Declare the input shape with an explicit Input layer rather than the
# `input_length` argument of Embedding: Keras 3 deprecates that argument
# (it only triggers a warning and is ignored), which leaves the model unbuilt
# until the first call. An Input layer works on both Keras 2 and Keras 3.
model.add(tf.keras.Input(shape=(max_len,)))
model.add(Embedding(input_dim=vocab_size, output_dim=50))
model.add(LSTM(128))
model.add(Dense(vocab_size, activation='softmax'))



In [21]:
# categorical_crossentropy matches the one-hot encoded targets in `y`.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [22]:
# Print the layer-by-layer architecture of the model.
model.summary()

In [None]:
# Train on all pairs for 100 epochs; no validation data is passed, so the
# reported accuracy is training accuracy only.
model.fit(x_train_padded, y, epochs=100)

In [33]:
import numpy as np

# Prompt whose next word we want the model to predict.
text = "It is one of my favorite"

# tokenize (same tokenizer that produced the training ids)
tokenized_text = tokenizer.texts_to_sequences([text])[0]

# pad exactly like the training inputs: left-padded to max_len
padded_text = pad_sequences([tokenized_text], maxlen=max_len, padding='pre')

# predict: the batch holds a single row, so take row 0 and pick the most
# probable token id
probabilities = model.predict(padded_text)[0]
output = int(np.argmax(probabilities))

# Map the id back to its word with the tokenizer's own reverse dictionary
# instead of scanning word_index. An id with no word (e.g. 0, which
# word_index never contains) is reported explicitly rather than printing nothing.
predicted_word = tokenizer.index_word.get(output)
if predicted_word is not None:
  print("Output", predicted_word)
else:
  print("No vocabulary word for predicted index", output)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Output movies
