<a href="https://colab.research.google.com/github/ZahinAwosaf/NLP-Projects/blob/main/character_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import os
import random
import time

import numpy as np
import tensorflow as tf

from tensorflow import keras
from keras import layers, utils, Sequential
from keras.layers import *

In [25]:
# Globally silence NumPy's divide-by-zero floating-point warnings for the session.
# NOTE(review): presumably added for the np.log call in text_sampling_function,
# which may receive zero probabilities — confirm.
np.seterr(divide = 'ignore') 

{'divide': 'warn', 'invalid': 'warn', 'over': 'warn', 'under': 'ignore'}

In [2]:
# Download the Alice corpus and remember the local file path that get_file returns.
path = tf.keras.utils.get_file(
    fname = "Alice.txt",
    origin = "https://raw.githubusercontent.com/ZahinAwosaf/NLP-Projects/main/data/Alice.txt",
)

Downloading data from https://raw.githubusercontent.com/ZahinAwosaf/NLP-Projects/main/data/Alice.txt


In [3]:
# Read the corpus as raw bytes and decode it as UTF-8. The context manager
# closes the file handle as soon as the read finishes instead of leaving the
# open handle for the garbage collector to clean up.
with open(path, "rb") as corpus_file:
    text = corpus_file.read().decode(encoding = "utf-8")

In [4]:
# Flatten the corpus onto a single line: every newline becomes one space.
text = " ".join(text.split("\n"))

# Total number of characters available for building training windows.
text_length = len(text)
print("Length Text:", text_length)

Length Text: 147673


In [5]:
# Sanity check: show the first 250 characters of the corpus.
print(text[:250])

CHAPTER I. Down the Rabbit-Hole  Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations i


In [6]:
# Vocabulary: every distinct character of the corpus, in sorted order so that
# the character <-> id mapping is the same on every run.
chars = sorted(set(text))
vocab_size = len(chars)
print("Total unique characters:", vocab_size)

Total unique characters: 69


In [7]:
# Lookup tables between characters and their integer ids (position in `chars`).
chars_to_ids = dict(zip(chars, range(len(chars))))
ids_to_chars = dict(enumerate(chars))

In [8]:
# Sliding-window settings: each training sample is `max_length` characters long
# and consecutive windows start `step` characters apart.
max_length = 50
step = 3

# Every window start that still leaves one character after the window,
# which serves as the prediction target.
window_starts = range(0, text_length - max_length, step)
sentences = [text[start : start + max_length] for start in window_starts]
next_chars = [text[start + max_length] for start in window_starts]

print("Number of sequences:", len(sentences))

Number of sequences: 49208


In [9]:
# One-hot encode the training data.
#   X[n, t, c] is True when character t of window n has vocabulary id c.
#   y[n, c]    is True when the character that follows window n has id c.
# `np.bool_` is used instead of the `np.bool` alias: the alias was deprecated
# in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
X = np.zeros((len(sentences), max_length, vocab_size), dtype = np.bool_)
y = np.zeros((len(sentences), vocab_size), dtype = np.bool_)

for i, sentence in enumerate(sentences):
    for j, char in enumerate(sentence):
        X[i, j, chars_to_ids[char]] = True
    y[i, chars_to_ids[next_chars[i]]] = True

In [10]:
# Number of units passed to the LSTM layer when the model is built.
units = 128
# Mini-batch size passed to model.fit during training.
batch_size = 256

In [11]:
# Character-level language model: a window of one-hot characters goes through a
# single LSTM layer, and a softmax layer scores every vocabulary entry as the
# next character.
model = Sequential()
model.add(Input(shape = (max_length, vocab_size)))
model.add(LSTM(units))
model.add(Dense(vocab_size, activation = "softmax"))

# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer = keras.optimizers.RMSprop(learning_rate = 0.01),
    loss = keras.losses.CategoricalCrossentropy(),
)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 128)               101376    
                                                                 
 dense (Dense)               (None, 69)                8901      
                                                                 
Total params: 110,277
Trainable params: 110,277
Non-trainable params: 0
_________________________________________________________________


In [12]:
def text_sampling_function(predictions, diversity = 1.0):
    """Sample one index from a probability vector after temperature rescaling.

    The probabilities are moved to log space, divided by ``diversity`` and
    re-normalised with a softmax. Values of ``diversity`` below 1 sharpen the
    distribution (more conservative picks); values above 1 flatten it.

    Args:
        predictions: 1-D array-like of probabilities, e.g. one row of the
            model's softmax output.
        diversity: Positive temperature used to rescale the log-probabilities.

    Returns:
        The sampled index into ``predictions`` (a NumPy integer).

    Raises:
        ValueError: If ``diversity`` is not strictly positive.
    """
    if diversity <= 0:
        raise ValueError("diversity must be a positive number")

    # A plain NumPy float64 conversion is enough; no TensorFlow tensor is needed.
    probabilities = np.asarray(predictions, dtype = np.float64)

    # log(0) is expected for characters the model rules out entirely: it yields
    # -inf, which turns back into probability 0 below. Silence that warning
    # here only, rather than relying on a global np.seterr setting.
    with np.errstate(divide = "ignore"):
        logits = np.log(probabilities) / diversity

    # Subtracting the largest logit leaves the softmax unchanged but keeps
    # np.exp from underflowing every entry to 0 (and the normalisation from
    # becoming 0 / 0) when diversity is very small.
    logits = logits - np.max(logits)
    weights = np.exp(logits)
    probabilities = weights / np.sum(weights)

    # One multinomial draw gives a one-hot row; its argmax is the sampled index.
    draw = np.random.multinomial(1, probabilities, 1)

    return np.argmax(draw)

In [13]:
# Train on the one-hot windows (X) and next-character targets (y) for 300 epochs.
# As the last expression of the cell, the call's return value is shown as output.
model.fit(X, y, batch_size = batch_size, epochs = 300)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<keras.callbacks.History at 0x7f212b31d5d0>

In [14]:
def generate_text(model, diversities, 
                  generate, start_string):
    """Print text generated by ``model`` for each sampling diversity.

    For every value in ``diversities`` the function starts from
    ``start_string``, repeatedly one-hot encodes the current window, asks the
    model for next-character probabilities, samples a character with
    ``text_sampling_function`` and slides the window forward.

    Args:
        model: Object whose ``predict(x, verbose=0)`` returns one row of
            next-character probabilities per input window.
        diversities: Iterable of diversity (temperature) values to try.
        generate: Number of characters to generate per diversity.
        start_string: Seed text. Only its last ``max_length`` characters fit in
            the model window; shorter seeds grow until the window is full.

    Raises:
        KeyError: If the seed contains a character missing from ``chars_to_ids``.
    """
    for diversity in diversities:
        print()
        print("Diversity:", diversity)
        print("Starting String:", start_string)

        # The input tensor has exactly `max_length` time steps, so a longer
        # seed is trimmed to its most recent characters instead of indexing
        # past the end of X_pred.
        sentence = start_string[-max_length:]
        generated_chars = []

        for _ in range(generate):
            X_pred = np.zeros((1, max_length, vocab_size))
            for j, char in enumerate(sentence):
                X_pred[0, j, chars_to_ids[char]] = 1.0
            
            predictions = model.predict(X_pred, verbose = 0)[0]
            next_index = text_sampling_function(predictions, diversity)
            next_char = ids_to_chars[next_index]

            # Append the new character and keep at most `max_length` of
            # context; a short seed therefore grows into a full window.
            sentence = (sentence + next_char)[-max_length:]
            generated_chars.append(next_char)

        print("Generated:", "".join(generated_chars))
        print()

In [40]:
generate_text(model, diversities = [0.2, 0.5, 1.0], generate = 100,
                    start_string = "In another moment down went Alice after it, never ")


Diversity: 0.2
Starting String: In another moment down went Alice after it, never 
 a Gry now and the were three could not at


Diversity: 0.5
Starting String: In another moment down went Alice after it, never 
 when she had someted her head wondent wordend, which was at the 


Diversity: 1.0
Starting String: In another moment down went Alice after it, never 
Generated: over the miling under that she next would she pupting on by try and no did nosered on the song, wond

