<a href="https://colab.research.google.com/github/ZahinAwosaf/NLP-Projects/blob/main/text_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import random
import time
import datetime

import numpy as np
import tensorflow as tf

from tensorflow import keras
from keras import layers, utils, Sequential
from keras.layers import *

In [2]:
# Silence NumPy's divide warnings for the rest of the session; the call
# returns the previous error settings, which the notebook displays.
np.seterr(divide="ignore")

{'divide': 'warn', 'invalid': 'warn', 'over': 'warn', 'under': 'ignore'}

In [3]:
# Fetch the corpus through Keras' get_file helper and keep the local path it returns.
path = tf.keras.utils.get_file(
    fname="Alice.txt",
    origin="https://raw.githubusercontent.com/ZahinAwosaf/NLP-Projects/main/data/Alice.txt",
)

Downloading data from https://raw.githubusercontent.com/ZahinAwosaf/NLP-Projects/main/data/Alice.txt


In [4]:
# Read the raw bytes and decode them explicitly so the file's original line
# endings are kept as-is; the context manager closes the handle deterministically
# instead of leaving it to the garbage collector.
with open(path, "rb") as corpus_file:
    text = corpus_file.read().decode(encoding="utf-8")

In [5]:
# Corpus size in characters (len of the decoded string, not bytes on disk).
text_length = len(text)
print("Length Text:", text_length)

Length Text: 147673


In [7]:
# Sanity check: show the first 200 characters of the corpus.
print(text[:200])

CHAPTER I. Down the Rabbit-Hole

Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothing to do: once or twice she had peeped into the
book her sister was 


In [8]:
# The vocabulary is every distinct character in the corpus, in sorted order
# so that character ids are stable from run to run.
chars = sorted(set(text))
vocab_size = len(chars)
print("Total unique characters:", vocab_size)

Total unique characters: 70


In [9]:
# Lookup tables over the sorted vocabulary: position -> character, and the
# inverse character -> position (characters are unique, so it inverts cleanly).
ids_to_chars = dict(enumerate(chars))
chars_to_ids = {char: idx for idx, char in ids_to_chars.items()}

In [10]:
# Encode the whole corpus as an array of vocabulary ids, then show the same
# leading slice as text and as ids for a side-by-side check.
preview_len = 100
num_rep = np.array([chars_to_ids[symbol] for symbol in text])
print(text[:preview_len])
print(num_rep[:preview_len])

CHAPTER I. Down the Rabbit-Hole

Alice was beginning to get very tired of sitting by her sister on
[17 22 15 30 34 19 32  2 23 11  2 18 58 66 57  2 63 51 48  2 32 44 45 45
 52 63 10 22 58 55 48  1  0  1  0 15 55 52 46 48  2 66 44 62  2 45 48 50
 52 57 57 52 57 50  2 63 58  2 50 48 63  2 65 48 61 68  2 63 52 61 48 47
  2 58 49  2 62 52 63 63 52 57 50  2 45 68  2 51 48 61  2 62 52 62 63 48
 61  2 58 57]


In [11]:
# One id per corpus character, so this should match the text length printed earlier.
num_rep.shape

(147673,)

In [12]:
# --- Data pipeline ---
max_length = 200   # ids per training input; windows are cut at max_length + 1
batch_size = 64    # sequences per training batch
buffer = 10000     # shuffle buffer size

# --- Model size ---
embedding_dim = 256   # width of each character embedding
units = 1024          # hidden size of each LSTM layer

In [13]:
# Cut the id stream into consecutive windows of max_length + 1 ids; the extra
# id lets each window be split into an input and a one-step-shifted target.
# A trailing window that is too short is dropped.
train_ds = (
    tf.data.Dataset.from_tensor_slices(num_rep)
    .batch(max_length + 1, drop_remainder=True)
)

In [14]:
def split_input_target(text):
    """Split one sequence into an (input, target) pair offset by one step.

    Args:
        text: any sliceable sequence (here, a window of character ids).

    Returns:
        A tuple ``(input, target)`` where ``input`` is ``text`` without its
        last element and ``target`` is ``text`` without its first element, so
        ``target[i]`` is the element that follows ``input[i]``.
    """
    return text[:-1], text[1:]

In [15]:
# Turn every window into an (input, target) pair, then shuffle the pairs and
# group them into fixed-size batches (an incomplete final batch is dropped).
sequence_pairs = train_ds.map(split_input_target)
train_ds = sequence_pairs.shuffle(buffer).batch(batch_size, drop_remainder=True)
print(train_ds)

<BatchDataset shapes: ((64, 200), (64, 200)), types: (tf.int64, tf.int64)>


In [16]:
def build_model(vocab_size, embedding_dim,
                units, batch_size):
    """Assemble the character-level network.

    The stack is: embedding -> two stateful LSTMs -> dropout -> dense.

    Args:
        vocab_size: number of distinct characters; sizes both the embedding
            table and the final dense layer.
        embedding_dim: width of each character embedding.
        units: hidden size of each LSTM layer.
        batch_size: batch dimension fixed into the input shape; the sequence
            length dimension is left open.

    Returns:
        An uncompiled ``Sequential`` model. The dense layer has no activation,
        so it emits one ``vocab_size``-wide score vector per time step.
    """
    def recurrent_block():
        # Both recurrent layers share exactly the same configuration.
        return LSTM(units,
                    return_sequences=True,
                    stateful=True,
                    recurrent_initializer="glorot_uniform")

    stack = [
        Embedding(vocab_size, embedding_dim,
                  batch_input_shape=[batch_size, None]),
        recurrent_block(),
        recurrent_block(),
        Dropout(0.2),
        Dense(vocab_size),
    ]
    return Sequential(stack)

In [17]:
# Training model: built with the training batch size.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    units=units,
    batch_size=batch_size,
)

In [18]:
# Print the layer-by-layer output shapes and parameter counts of the training model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           17920     
                                                                 
 lstm (LSTM)                 (64, None, 1024)          5246976   
                                                                 
 lstm_1 (LSTM)               (64, None, 1024)          8392704   
                                                                 
 dropout (Dropout)           (64, None, 1024)          0         
                                                                 
 dense (Dense)               (64, None, 70)            71750     
                                                                 
Total params: 13,729,350
Trainable params: 13,729,350
Non-trainable params: 0
_________________________________________________________________


In [19]:
# The final Dense layer has no activation, so the loss is told to expect raw
# logits; targets are integer character ids, hence the sparse variant.
optimizer = keras.optimizers.Adam()
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss)

In [20]:
# Each execution of this cell starts a fresh, timestamped checkpoint directory.
# The time fields are separated with hyphens rather than colons because ':' is
# not a legal character in file or directory names on Windows.
# NOTE: re-running this cell after training points checkpoint_dir at a new,
# empty directory, so the weight-loading cell will no longer find checkpoints.
run_stamp = datetime.datetime.now().strftime("_%Y.%m.%d-%H-%M-%S")
checkpoint_dir = "./checkpoints" + run_stamp

# "{epoch}" is filled in by the callback, giving one weights file per epoch.
checkpoint_prefix = os.path.join(checkpoint_dir, "checkpoint_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

In [21]:
# Train on the shuffled batches, writing a weights checkpoint after every epoch.
# The returned History object is the cell's displayed value.
model.fit(
    train_ds,
    epochs=100,
    callbacks=[checkpoint_callback],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f4dd734bed0>

In [22]:
# Locate the most recent checkpoint first, and fail with a clear message if
# there is none, before `model` is rebound, so the trained model is not lost.
latest_ckpt = tf.train.latest_checkpoint(checkpoint_dir)
if latest_ckpt is None:
    raise FileNotFoundError(
        "No checkpoint found in {!r}; run the training cell first.".format(checkpoint_dir)
    )

# Rebuild the same architecture with a batch size of 1 for generation (the
# batch dimension is fixed in the input shape), then restore the trained weights.
model = build_model(vocab_size, embedding_dim, units, batch_size = 1)
model.load_weights(latest_ckpt)
model.build(tf.TensorShape([1, None]))
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (1, None, 256)            17920     
                                                                 
 lstm_2 (LSTM)               (1, None, 1024)           5246976   
                                                                 
 lstm_3 (LSTM)               (1, None, 1024)           8392704   
                                                                 
 dropout_1 (Dropout)         (1, None, 1024)           0         
                                                                 
 dense_1 (Dense)             (1, None, 70)             71750     
                                                                 
Total params: 13,729,350
Trainable params: 13,729,350
Non-trainable params: 0
_________________________________________________________________


In [23]:
def generate_text(model, start_string, diversity, generate):
    """Sample characters from ``model``, primed with ``start_string``.

    The start string is fed through the model once to warm up its recurrent
    state; after that, each sampled character is fed back in as the next
    input. The start string and a header are printed; only the newly
    generated characters are returned.

    Args:
        model: the trained network built with a batch size of 1.
        start_string: non-empty priming text; every character must be a key
            of ``chars_to_ids``.
        diversity: positive temperature the logits are divided by. Values
            below 1 sharpen the distribution, values above 1 flatten it.
        generate: number of characters to sample.

    Returns:
        The generated characters as a single string (without ``start_string``).

    Raises:
        ValueError: if ``diversity`` is not positive or ``start_string`` is empty.
        KeyError: if ``start_string`` contains a character outside the vocabulary.
    """
    # Validate up front: a zero or negative temperature would turn the logits
    # into inf/NaN or invert the distribution, and an empty prompt would only
    # fail deep inside the model with an opaque shape error.
    if diversity <= 0:
        raise ValueError("diversity must be positive, got {!r}".format(diversity))
    if not start_string:
        raise ValueError("start_string must contain at least one character")

    print("Start String:", start_string)
    print("Generated:")

    # Encode the prompt and add a leading batch dimension of 1.
    input_eval = tf.expand_dims([chars_to_ids[s] for s in start_string], 0)

    # Collect pieces in a list and join once, instead of repeated string +=.
    pieces = []

    # Start from a clean recurrent state so earlier calls do not leak in.
    model.reset_states()
    for _ in range(generate):
        # Explicit inference mode keeps the Dropout layer inactive.
        predictions = model(input_eval, training=False)
        # Drop the batch dimension, then apply the temperature.
        predictions = tf.squeeze(predictions, 0) / diversity
        # Sample one id per time step and keep only the one for the last step.
        predicted_id = tf.random.categorical(predictions,
                                             num_samples=1)[-1, 0].numpy()
        # The sampled character is the sole input of the next step; the
        # stateful layers carry the preceding context.
        input_eval = tf.expand_dims([predicted_id], 0)
        pieces.append(ids_to_chars[predicted_id])

    return "".join(pieces)

In [26]:
# Draw a 200-character continuation at the unmodified temperature and show it.
sample = generate_text(
    model,
    start_string="Either the well was very deep, or",
    diversity=1.0,
    generate=200,
)
print(sample)

Start String: Either the well was very deep, or
Generated:
 she fell very sloty and saythe greet beautiful, but chow only then they bit, choked it was
very like him took change, and she wanted a little quire (ut said, and a confus of siitsef? And it was the 
