<a href="https://colab.research.google.com/github/awadatul1992/DeepNLP/blob/main/DeepNLP_Word_Based_Neural_Language_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# source text: a four-line nursery rhyme used as the entire training corpus.
# Each line ends with an explicit "\n" escape followed by a real line break,
# so consecutive rhyme lines are separated by an empty line in the string.
data = """ Jack and Jill went up the hill\n
To fetch a pail of water\n
Jack fell down and broke his crown\n
And Jill came tumbling after\n """

# One word in, one word out

In [None]:
# integer encode text
# The tokenizer is fitted on the whole rhyme as a single document; `encoded`
# is then the rhyme as one flat list of word indices.
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
# texts_to_sequences returns one list per input text; [0] selects the only one
encoded = tokenizer.texts_to_sequences([data])[0]

In [None]:
# determine the vocabulary size
# +1 because the word indices used in this notebook start at 1, so the
# largest index equals len(word_index); index 0 is left for padding.
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 22


In [None]:
# create word -> word sequences
# Every adjacent pair of encoded words becomes one [input word, next word] sample.
sequences = [encoded[start:start + 2] for start in range(len(encoded) - 1)]
print('Total Sequences: %d' % len(sequences))

Total Sequences: 1


In [None]:
# show the [input word, next word] index pairs
sequences

[array([2, 1])]

In [None]:
# split into X and y elements
import numpy as np
sequences = np.array(sequences)
# column 0 holds the input word index, column 1 the word that follows it
X = sequences[:, 0]
y = sequences[:, 1]

In [None]:
# one hot encode outputs
# Each target word index becomes a vector with vocab_size classes, matching
# the softmax output of the model defined below.
# NOTE(review): this cell overwrites `y` with its own encoding, so running it
# twice would encode already-encoded targets -- re-run the split cell first.
from keras.utils import to_categorical
y = to_categorical(y, num_classes=vocab_size)

In [None]:
# define the model
from keras import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils import plot_model
def define_model(vocab_size):
  """Build, compile and summarize the one-word-in / one-word-out model.

  Args:
    vocab_size: number of embedding rows and of softmax output classes.

  Returns:
    The compiled Sequential model. As side effects the model summary is
    printed and the architecture diagram is written to 'model.png'.
  """
  # Embedding (10-dim, single-token input) -> LSTM(50) -> softmax over the vocabulary
  network = Sequential([
    Embedding(vocab_size, 10, input_length=1),
    LSTM(50),
    Dense(vocab_size, activation='softmax'),
  ])
  # compile network
  network.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
  )
  # summarize defined model
  network.summary()
  plot_model(network, show_shapes=True, to_file='model.png')
  return network

In [None]:
# build and compile the one-word-in / one-word-out model (also prints its summary)
model = define_model(vocab_size)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1, 10)             220       
_________________________________________________________________
lstm_3 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_3 (Dense)              (None, 22)                1122      
Total params: 13,542
Trainable params: 13,542
Non-trainable params: 0
_________________________________________________________________


In [None]:
# train on the (input word, one-hot next word) pairs for 40 epochs
model.fit(X, y, epochs=40, verbose=2)

Epoch 1/40
1/1 - 0s - loss: 3.0899 - accuracy: 0.0000e+00
Epoch 2/40
1/1 - 0s - loss: 3.0855 - accuracy: 1.0000
Epoch 3/40
1/1 - 0s - loss: 3.0810 - accuracy: 1.0000
Epoch 4/40
1/1 - 0s - loss: 3.0765 - accuracy: 1.0000
Epoch 5/40
1/1 - 0s - loss: 3.0719 - accuracy: 1.0000
Epoch 6/40
1/1 - 0s - loss: 3.0672 - accuracy: 1.0000
Epoch 7/40
1/1 - 0s - loss: 3.0625 - accuracy: 1.0000
Epoch 8/40
1/1 - 0s - loss: 3.0578 - accuracy: 1.0000
Epoch 9/40
1/1 - 0s - loss: 3.0529 - accuracy: 1.0000
Epoch 10/40
1/1 - 0s - loss: 3.0480 - accuracy: 1.0000
Epoch 11/40
1/1 - 0s - loss: 3.0429 - accuracy: 1.0000
Epoch 12/40
1/1 - 0s - loss: 3.0378 - accuracy: 1.0000
Epoch 13/40
1/1 - 0s - loss: 3.0326 - accuracy: 1.0000
Epoch 14/40
1/1 - 0s - loss: 3.0272 - accuracy: 1.0000
Epoch 15/40
1/1 - 0s - loss: 3.0217 - accuracy: 1.0000
Epoch 16/40
1/1 - 0s - loss: 3.0161 - accuracy: 1.0000
Epoch 17/40
1/1 - 0s - loss: 3.0103 - accuracy: 1.0000
Epoch 18/40
1/1 - 0s - loss: 3.0044 - accuracy: 1.0000
Epoch 19/40
1/1

<tensorflow.python.keras.callbacks.History at 0x7fef1bdbb668>

In [None]:
# evaluate: predict the word that follows a single seed word
in_text = 'Jack'
print(in_text)
# Use a dedicated name here: reusing `encoded` would overwrite the encoded
# training text, so re-running the sequence-building cell afterwards would
# silently train on the seed word only.
seed_encoded = np.array(tokenizer.texts_to_sequences([in_text])[0]).reshape(-1, 1)
# `predict_classes` was removed from Keras; take the argmax of the softmax output
yhat = int(np.argmax(model.predict(seed_encoded, verbose=0), axis=-1)[0])
# reverse lookup (index -> word); nothing is printed if the index has no word
index_to_word = {index: word for word, index in tokenizer.word_index.items()}
if yhat in index_to_word:
  print(index_to_word[yhat])

Jack
and


In [None]:
# generate a sequence from the model
def generate_seq(model, tokenizer, seed_text, n_words):
  """Generate text with the one-word-in / one-word-out model.

  Starting from the last encodable word of `seed_text`, the model is asked
  for the most likely next word, which is appended to the result and used as
  the input for the following step.

  Args:
    model: object with `predict(x, verbose=0)` returning one row of class
      scores per input row.
    tokenizer: object exposing `texts_to_sequences` and `word_index`.
    seed_text: text to start from; only its last known word is fed to the model.
    n_words: maximum number of words to generate.

  Returns:
    `seed_text` followed by the generated words, separated by single spaces.
    Generation stops early when the current input contains no known word.
  """
  # build the index -> word lookup once instead of scanning word_index per step
  index_to_word = {index: word for word, index in tokenizer.word_index.items()}
  in_text, result = seed_text, seed_text
  # generate a fixed number of words
  for _ in range(n_words):
    # encode the text as integer
    encoded = tokenizer.texts_to_sequences([in_text])[0]
    if not encoded:
      # unknown seed word (or empty prediction): nothing to feed the model
      break
    # the model takes exactly one word per sample -> shape (1, 1)
    encoded = np.array(encoded[-1:]).reshape(1, 1)
    # predict a word in the vocabulary (`predict_classes` was removed from Keras)
    yhat = int(np.argmax(model.predict(encoded, verbose=0), axis=-1)[0])
    # map predicted word index to word ('' when the index has no word)
    out_word = index_to_word.get(yhat, '')
    # append to input
    in_text, result = out_word, result + ' ' + out_word
  return result

In [None]:
# generate 20 words, each predicted from the previous word alone, starting at "Jack"
generate_seq(model, tokenizer, "Jack", 20)

'Jack and jill came tumbling after to fetch a pail of water jack and jill came tumbling after to fetch a'

# Multiple words in, one word out

In [None]:
# source text: re-declares the same rhyme used by the one-word-in model above.
# Each line ends with an explicit "\n" escape followed by a real line break,
# so splitting on '\n' yields an empty string between consecutive rhyme lines.
data = """ Jack and Jill went up the hill\n
To fetch a pail of water\n
Jack fell down and broke his crown\n
And Jill came tumbling after\n """

In [None]:
# create line-based sequences: for every rhyme line, each prefix of two or
# more words is one training sample (last word = target, the rest = input)
sequences = list()
for line in data.split('\n'):
  # Dedicated name: assigning to `encoded` here would overwrite the full-text
  # encoding from the tokenizer cell (ending on an empty list for the last,
  # blank line) and break a re-run of the word-pair cell.
  line_encoded = tokenizer.texts_to_sequences([line])[0]
  # blank lines encode to [] and therefore contribute nothing
  sequences.extend(line_encoded[:end] for end in range(2, len(line_encoded) + 1))
print('Total Sequences: %d' % len(sequences))

Total Sequences: 21


In [None]:
# show the per-line prefix sequences (variable length, not yet padded)
sequences

[[2, 1],
 [2, 1, 3],
 [2, 1, 3, 4],
 [2, 1, 3, 4, 5],
 [2, 1, 3, 4, 5, 6],
 [2, 1, 3, 4, 5, 6, 7],
 [8, 9],
 [8, 9, 10],
 [8, 9, 10, 11],
 [8, 9, 10, 11, 12],
 [8, 9, 10, 11, 12, 13],
 [2, 14],
 [2, 14, 15],
 [2, 14, 15, 1],
 [2, 14, 15, 1, 16],
 [2, 14, 15, 1, 16, 17],
 [2, 14, 15, 1, 16, 17, 18],
 [1, 3],
 [1, 3, 19],
 [1, 3, 19, 20],
 [1, 3, 19, 20, 21]]

In [None]:
# pad input sequences
from keras.preprocessing.sequence import pad_sequences
# the longest prefix sets the common length; shorter ones are padded on the left
max_length = max(map(len, sequences))
sequences = pad_sequences(sequences, padding='pre', maxlen=max_length)
print('Max Sequence Length: %d' % max_length)

Max Sequence Length: 7


In [None]:
# split into input and output elements
sequences = np.array(sequences)
# all but the last column are the (padded) input words
X = sequences[:, :-1]
# the last column is the target word, one-hot encoded over the vocabulary
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

In [None]:
# define the model
def define_model_line_by_line(vocab_size, max_length):
  """Build, compile and summarize the multiple-words-in / one-word-out model.

  Args:
    vocab_size: number of embedding rows and of softmax output classes.
    max_length: length of the padded sequences including the target word;
      the network itself takes max_length - 1 tokens as input.

  Returns:
    The compiled Sequential model. As side effects the model summary is
    printed and the architecture diagram is written to 'model.png'.
  """
  # one token fewer than max_length is fed to the embedding
  n_inputs = max_length - 1
  net = Sequential()
  for layer in (
      Embedding(vocab_size, 10, input_length=n_inputs),
      LSTM(50),
      Dense(vocab_size, activation='softmax'),
  ):
    net.add(layer)
  # compile network
  net.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
  )
  # summarize defined model
  net.summary()
  plot_model(net, show_shapes=True, to_file='model.png')
  return net

In [None]:
# build and compile the multiple-words-in model (also prints its summary)
model_line_by_line = define_model_line_by_line(vocab_size, max_length)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 6, 10)             220       
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_1 (Dense)              (None, 22)                1122      
Total params: 13,542
Trainable params: 13,542
Non-trainable params: 0
_________________________________________________________________


In [None]:
# train on the padded prefixes (X) and their one-hot next words (y) for 500 epochs
model_line_by_line.fit(X, y, epochs = 500, verbose = 2)

Epoch 1/500
1/1 - 0s - loss: 0.7710 - accuracy: 0.8095
Epoch 2/500
1/1 - 0s - loss: 0.7634 - accuracy: 0.8095
Epoch 3/500
1/1 - 0s - loss: 0.7562 - accuracy: 0.8095
Epoch 4/500
1/1 - 0s - loss: 0.7493 - accuracy: 0.8095
Epoch 5/500
1/1 - 0s - loss: 0.7426 - accuracy: 0.8095
Epoch 6/500
1/1 - 0s - loss: 0.7358 - accuracy: 0.8095
Epoch 7/500
1/1 - 0s - loss: 0.7289 - accuracy: 0.8095
Epoch 8/500
1/1 - 0s - loss: 0.7221 - accuracy: 0.8095
Epoch 9/500
1/1 - 0s - loss: 0.7154 - accuracy: 0.8095
Epoch 10/500
1/1 - 0s - loss: 0.7091 - accuracy: 0.8095
Epoch 11/500
1/1 - 0s - loss: 0.7030 - accuracy: 0.8571
Epoch 12/500
1/1 - 0s - loss: 0.6971 - accuracy: 0.8095
Epoch 13/500
1/1 - 0s - loss: 0.6913 - accuracy: 0.8571
Epoch 14/500
1/1 - 0s - loss: 0.6856 - accuracy: 0.8095
Epoch 15/500
1/1 - 0s - loss: 0.6797 - accuracy: 0.8571
Epoch 16/500
1/1 - 0s - loss: 0.6739 - accuracy: 0.8095
Epoch 17/500
1/1 - 0s - loss: 0.6680 - accuracy: 0.8571
Epoch 18/500
1/1 - 0s - loss: 0.6624 - accuracy: 0.8571
E

<tensorflow.python.keras.callbacks.History at 0x7fef1d2b8fd0>

In [None]:
# generate a sequence from a language model
def generate_seq_line_by_line(model, tokenizer, max_length, seed_text, n_words):
  """Generate text with the multiple-words-in / one-word-out model.

  At each step the whole text generated so far is encoded, pre-padded to
  `max_length` tokens and passed to the model; the most likely next word is
  appended to the text.

  Args:
    model: object with `predict(x, verbose=0)` returning one row of class
      scores per input row.
    tokenizer: object exposing `texts_to_sequences` and `word_index`.
    max_length: number of tokens the encoded text is padded to. This must be
      the model's input length, i.e. one less than the padded training
      sequence length for a model from `define_model_line_by_line`.
    seed_text: text to start from.
    n_words: number of words to generate.

  Returns:
    `seed_text` followed by the generated words, separated by single spaces.
  """
  # build the index -> word lookup once instead of scanning word_index per step
  index_to_word = {index: word for word, index in tokenizer.word_index.items()}
  in_text = seed_text
  # generate a fixed number of words
  for _ in range(n_words):
    # encode the text as integer
    encoded = tokenizer.texts_to_sequences([in_text])[0]
    # pre-pad sequences to a fixed length
    encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
    # most likely class of the single sample (`predict_classes` was removed from Keras)
    yhat = int(np.argmax(model.predict(encoded, verbose=0), axis=-1)[0])
    # map predicted word index to word ('' when the index has no word)
    out_word = index_to_word.get(yhat, '')
    # append to input
    in_text += ' ' + out_word
  return in_text

In [None]:
# The model was built with input_length = max_length - 1 and trained on the
# padded sequences without their last (target) token, so the seed text must
# be padded to max_length - 1 tokens, not max_length.
print(generate_seq_line_by_line(model_line_by_line, tokenizer, max_length - 1, "Jack", 20))

Jack fell down down and broke his crown crown hill hill hill hill hill hill water water water water water water
