# NLP - Text Generation

In [25]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

## Import libraries

In [3]:
import tensorflow as tf

In [4]:
import keras

In [5]:
from tensorflow .keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import numpy as np

## Load the Corpus

In [7]:
data = open('beatles.txt').read()

In [8]:
corpus = data.lower().split('\n')

In [30]:
corpus

['yesterday, all my troubles seemed so far away',
 "now it looks as though they're here to stay",
 "oh, i believe in yesterday suddenly, i'm not half the man i used to be",
 "there's a shadow hanging over me.",
 "oh, yesterday came suddenly why she had to go i don't know she wouldn't say",
 'i said something wrong, now i long for yesterday yesterday, love was such an easy game to play',
 'now i need a place to hide away',
 "oh, i believe in yesterday why she had to go i don't know she wouldn't say",
 'i said something wrong, now i long for yesterday yesterday, love was such an easy game to play',
 'now i need a place to hide away',
 'oh, i believe in yesterday',
 'mm mm mm mm mm mm mm when i find myself in times of trouble, mother mary comes to me',
 'speaking words of wisdom, let it be',
 'and in my hour of darkness she is standing right in front of me',
 'speaking words of wisdom, let it be',
 'let it be, let it be, let it be, let it be',
 'whisper words of wisdom, let it be and when

## Tokenization

In [9]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

In [10]:
print(tokenizer.word_index)



In [11]:
print(len(tokenizer.word_index))

1628


In [12]:
print(total_words)

1629


## Generating n-gram sequences

In [15]:
input_sequences = []
for line in corpus:
  token_list = tokenizer.texts_to_sequences([line])[0]
  for i in range(1, len(token_list)):
    n_gram_sequence = token_list[: i+1]
    input_sequences.append(n_gram_sequence)

input_sequences

[[200, 10],
 [200, 10, 12],
 [200, 10, 12, 907],
 [200, 10, 12, 907, 908],
 [200, 10, 12, 907, 908, 42],
 [200, 10, 12, 907, 908, 42, 909],
 [200, 10, 12, 907, 908, 42, 909, 121],
 [22, 13],
 [22, 13, 638],
 [22, 13, 638, 91],
 [22, 13, 638, 91, 217],
 [22, 13, 638, 91, 217, 349],
 [22, 13, 638, 91, 217, 349, 79],
 [22, 13, 638, 91, 217, 349, 79, 4],
 [22, 13, 638, 91, 217, 349, 79, 4, 314],
 [23, 2],
 [23, 2, 160],
 [23, 2, 160, 9],
 [23, 2, 160, 9, 200],
 [23, 2, 160, 9, 200, 520],
 [23, 2, 160, 9, 200, 520, 35],
 [23, 2, 160, 9, 200, 520, 35, 83],
 [23, 2, 160, 9, 200, 520, 35, 83, 910],
 [23, 2, 160, 9, 200, 520, 35, 83, 910, 3],
 [23, 2, 160, 9, 200, 520, 35, 83, 910, 3, 78],
 [23, 2, 160, 9, 200, 520, 35, 83, 910, 3, 78, 2],
 [23, 2, 160, 9, 200, 520, 35, 83, 910, 3, 78, 2, 911],
 [23, 2, 160, 9, 200, 520, 35, 83, 910, 3, 78, 2, 911, 4],
 [23, 2, 160, 9, 200, 520, 35, 83, 910, 3, 78, 2, 911, 4, 15],
 [173, 7],
 [173, 7, 912],
 [173, 7, 912, 913],
 [173, 7, 912, 913, 397],
 [173, 

## Padding

In [18]:
# Padding

max_sequence_len = max([len(x) for x in input_sequences]) # list comprehension
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
input_sequences

array([[  0,   0,   0, ...,   0, 200,  10],
       [  0,   0,   0, ..., 200,  10,  12],
       [  0,   0,   0, ...,  10,  12, 907],
       ...,
       [  0,   0,   0, ...,  41,  46,  59],
       [  0,   0,   0, ...,  46,  59,  40],
       [  0,   0,   0, ...,  59,  40,   8]])

In [19]:
max_sequence_len

34

## Model - Bidirectional LSTM

In [20]:
xs, label = input_sequences[:,:-1], input_sequences[:,-1]

In [21]:
label[55]

200

In [22]:
ys = tf.keras.utils.to_categorical(label, num_classes=total_words)
ys[0]

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [23]:
len(ys[0])

1629

In [26]:
model= Sequential()
model.add(Embedding(total_words, 100, input_length= max_sequence_len-1))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words,activation='softmax'))
adam= Adam(lr=0.01)
model.compile(loss='categorical_crossentropy',optimizer=adam, metrics=['accuracy'])
history=model.fit(xs , ys, epochs= 25, verbose=1)
print(model)



Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
<keras.src.engine.sequential.Sequential object at 0x00000283785CD790>


## Text Generation

In [28]:
seed_text = 'The battle in'
next_words = 20

for _ in range(next_words):
  token_list = tokenizer.texts_to_sequences([seed_text])[0]
  token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
  predicted = np.argmax(model.predict(token_list), axis=-1)
  output_words = ""
  for word, index in tokenizer.word_index.items():
    if index==predicted:
      output_word=word
      break
  seed_text += " " + output_word

  print(seed_text)

The battle in with
The battle in with the
The battle in with the dark
The battle in with the dark tryin
The battle in with the dark tryin to
The battle in with the dark tryin to the
The battle in with the dark tryin to the faces
The battle in with the dark tryin to the faces blackbird
The battle in with the dark tryin to the faces blackbird gone
The battle in with the dark tryin to the faces blackbird gone in
The battle in with the dark tryin to the faces blackbird gone in the
The battle in with the dark tryin to the faces blackbird gone in the dead
The battle in with the dark tryin to the faces blackbird gone in the dead of
The battle in with the dark tryin to the faces blackbird gone in the dead of night
The battle in with the dark tryin to the faces blackbird gone in the dead of night and
The battle in with the dark tryin to the faces blackbird gone in the dead of night and some
The battle in with the dark tryin to the faces blackbird gone in the dead of night and some have
The batt