In [12]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import tensorflow as tf
from tensorflow.keras import Model,Sequential
from tensorflow.keras.layers import Dense, LSTM, Activation, Dropout, Input, Embedding
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
#generate a sequence from the model
def generate_seq(model, tokenizer, seed_text, n_words):
  in_text, result = seed_text, seed_text
  for i in range(n_words):
    #encode the text as interger
    encoded = tokenizer.text_to_sequences([in_text])[0]
    encoded = array(encoded)
    #predict a word in the vacabulary
    yhat  = model.predict_classes(encoded, verbose=0)
    #map predicted word index to word
    out_word = ''
    for word, index in tokenizer.word_index.items():
      if index == yhat:
        out_word = word
        break
    #append to input
    intext, result = out_word, result+ ' '+out_word
  return result

In [4]:
data = """
Trăng ơi... từ đâu đến?

Trăng ơi... từ đâu đến?
Hay từ cánh rừng xa
Trăng hồng như quả chín
Lửng lơ lên trước nhà

Trăng ơi... từ đâu đến?
Hay biển xanh diệu kỳ
Trăng tròn như mắt cá
Chẳng bao giờ chớp mi

Trăng ơi... từ đâu đến?
Hay từ một sân chơi
Trăng bay như quả bóng
Đứa nào đá lên trời

Trăng ơi... từ đâu đến?
Hay từ lời mẹ ru
Thương Cuội không được học
Hú gọi trâu đến giờ!

Trăng ơi... từ đâu đến?
Hay từ đường hành quân
Trăng soi chú bộ đội
Và soi vàng góc sân

Trăng ơi... từ đâu đến?
Trăng đi khắp mọi miền
Trăng ơi có nơi nào
Sáng hơn đất nước em...
"""

In [9]:
#integer encode text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
encoded = tokenizer.texts_to_sequences([data])[0]
print(encoded)
#determine the vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('vocabulary size: ', vocab_size)
#create word -> word sequences
sequences = list()
for i in range(2,len(encoded)):
  sequence = encoded[i-2:i+1]
  sequences.append(sequence)
print('total sequence: ', len(sequences))

[1, 3, 2, 5, 4, 1, 3, 2, 5, 4, 6, 2, 14, 15, 16, 1, 17, 7, 8, 18, 19, 20, 9, 21, 22, 1, 3, 2, 5, 4, 6, 23, 24, 25, 26, 1, 27, 7, 28, 29, 30, 31, 10, 32, 33, 1, 3, 2, 5, 4, 6, 2, 34, 11, 35, 1, 36, 7, 8, 37, 38, 12, 39, 9, 40, 1, 3, 2, 5, 4, 6, 2, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 4, 10, 1, 3, 2, 5, 4, 6, 2, 52, 53, 54, 1, 13, 55, 56, 57, 58, 13, 59, 60, 11, 1, 3, 2, 5, 4, 1, 61, 62, 63, 64, 1, 3, 65, 66, 12, 67, 68, 69, 70, 71]
vocabulary size:  72
total sequence:  123


In [13]:
#pad sequences
max_length = max([len(word) for word in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding = 'pre')
print("Max sequence length: ", max_length)

Max sequence length:  3


In [14]:
sequences = np.array(sequences)
X, y = sequences[:,0], sequences[:,1]
#one hot encode outputs 
y = to_categorical(y, num_classes=vocab_size)

In [15]:
#define model
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_length-1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 2, 10)             720       
_________________________________________________________________
lstm (LSTM)                  (None, 50)                12200     
_________________________________________________________________
dense (Dense)                (None, 72)                3672      
Total params: 16,592
Trainable params: 16,592
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
#compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [17]:
model.fit(X,y, epochs=500, verbose=2)

Epoch 1/500
4/4 - 0s - loss: 4.2763 - accuracy: 0.0325
Epoch 2/500
4/4 - 0s - loss: 4.2728 - accuracy: 0.1138
Epoch 3/500
4/4 - 0s - loss: 4.2698 - accuracy: 0.2276
Epoch 4/500
4/4 - 0s - loss: 4.2667 - accuracy: 0.2276
Epoch 5/500
4/4 - 0s - loss: 4.2634 - accuracy: 0.2439
Epoch 6/500
4/4 - 0s - loss: 4.2599 - accuracy: 0.2439
Epoch 7/500
4/4 - 0s - loss: 4.2561 - accuracy: 0.2439
Epoch 8/500
4/4 - 0s - loss: 4.2521 - accuracy: 0.2602
Epoch 9/500
4/4 - 0s - loss: 4.2479 - accuracy: 0.2764
Epoch 10/500
4/4 - 0s - loss: 4.2437 - accuracy: 0.2927
Epoch 11/500
4/4 - 0s - loss: 4.2387 - accuracy: 0.2927
Epoch 12/500
4/4 - 0s - loss: 4.2334 - accuracy: 0.2927
Epoch 13/500
4/4 - 0s - loss: 4.2276 - accuracy: 0.2927
Epoch 14/500
4/4 - 0s - loss: 4.2213 - accuracy: 0.2927
Epoch 15/500
4/4 - 0s - loss: 4.2139 - accuracy: 0.2927
Epoch 16/500
4/4 - 0s - loss: 4.2065 - accuracy: 0.2927
Epoch 17/500
4/4 - 0s - loss: 4.1989 - accuracy: 0.3252
Epoch 18/500
4/4 - 0s - loss: 4.1898 - accuracy: 0.3740
E

<tensorflow.python.keras.callbacks.History at 0x7f86e00f6cc0>

In [22]:
print(generate_seq(model= model,tokenizer= tokenizer, seed_text= max_length-1,n_words= 3))

AttributeError: ignored