<a href="https://colab.research.google.com/github/bikash119/learn_tensorflow/blob/main/nlp_tf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Character RNN

## Creating the Training Dataset

In [2]:
import tensorflow as tf

# Fetch the Shakespeare corpus via Keras' download helper and load the whole
# file into memory as a single string.
filepath = tf.keras.utils.get_file(
    "shakespeare_txt",
    origin="https://homl.info/shakespeare",
)
with open(filepath) as text_file:
  shakespeare_tex = text_file.read()

Downloading data from https://homl.info/shakespeare


In [4]:
# Sanity check: show the first 80 characters of the downloaded text.
print(shakespeare_tex[0:80])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


## Perform Text Vectorization

In [11]:
# Character-level vectorizer: the text is lowercased and then split into
# individual characters, each mapped to an integer id. The vocabulary is
# learned from the full corpus by adapt().
text_vec_layer = tf.keras.layers.TextVectorization(
    standardize="lower",
    split="character",
)
text_vec_layer.adapt([shakespeare_tex])

In [13]:
# Encode a short sentence to see the character -> id mapping in action.
sample_sentence = " I am here to learn nlp"
print(len(sample_sentence))
# Wrapping the string in a list produces a batch of one encoded sequence.
text_vec_layer([sample_sentence])

23


<tf.Tensor: shape=(1, 23), dtype=int64, numpy=
array([[ 2,  7,  2,  6, 16,  2,  8,  3, 10,  3,  2,  4,  5,  2, 13,  3,
         6, 10, 11,  2, 11, 13, 24]])>

In [21]:
# Encode the entire corpus as a batch of one and keep its single row.
encoded_batch = text_vec_layer([shakespeare_tex])
tokens = encoded_batch[0]
# Show the first 80 characters of the raw text next to their token ids,
# followed by the total number of tokens.
print(f" Original Text : {shakespeare_tex[:80]} ")
print(f" Vectorized tokens: {tokens[:80]}")
print(len(tokens))

 Original Text : First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak. 
 Vectorized tokens: [21  7 10  9  4  2 20  7  4  7 37  3 11 25 12 23  3 21  5 10  3  2 18  3
  2 24 10  5 20  3  3 14  2  6 11 17  2 21 15 10  4  8  3 10 19  2  8  3
  6 10  2 16  3  2  9 24  3  6 26 28 12 12  6 13 13 25 12  9 24  3  6 26
 19  2  9 24  3  6 26 28]
1115394


In [22]:
# Inspect the vocabulary learned by the vectorizer: its size and the entries
# at both ends of the list returned by get_vocabulary().
chars_in_vocab = text_vec_layer.get_vocabulary()
top_5_chars, bottom_5_chars = chars_in_vocab[:5], chars_in_vocab[-5:]
print(f" Number of characters in vocab :{text_vec_layer.vocabulary_size()}")
print(f" top 5 common tokens: {top_5_chars}")
print(f" bottom 5 common tokens: {bottom_5_chars}")

 Number of characters in vocab :41
 top 5 common tokens: ['', '[UNK]', ' ', 'e', 't']
 bottom 5 common tokens: ['x', 'z', '3', '&', '$']


In [30]:
# Ids 0 (padding) and 1 ([UNK]) are reserved by TextVectorization and are not
# needed here: the vectorizer was adapted on this very text, so every character
# is in the vocabulary. Shift all ids down by 2 so they span 0..n_tokens-1,
# matching the n_tokens-sized Embedding input and Dense output used later.
# The shifted ids are re-derived from the vectorizer (instead of `tokens -= 2`)
# so that re-running this cell never subtracts twice.
tokens = text_vec_layer([shakespeare_tex])[0] - 2
n_tokens = text_vec_layer.vocabulary_size() - 2  # number of distinct characters
dataset_size = len(tokens)
print(f" dataset size : {dataset_size}")

 dataset size : 1115394


## Prepare Dataset

In [46]:
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
  """Turn a token sequence into batched (input, target) windows.

  The sequence is cut into overlapping windows of `length + 1` tokens
  (stride 1, incomplete trailing windows dropped). Each batch of windows is
  then split into inputs (all tokens but the last) and targets (all tokens
  but the first), i.e. the targets are the inputs shifted by one step.

  Args:
    sequence: Token ids to slice into windows.
    length: Number of time steps in each input/target sequence.
    shuffle: If true, shuffle the windows (buffer of 100,000) before batching.
    seed: Seed passed to the shuffle step.
    batch_size: Number of windows per batch.

  Returns:
    A `tf.data.Dataset` yielding `(inputs, targets)` pairs, prefetching one
    batch ahead.
  """
  window_size = length + 1
  windows = tf.data.Dataset.from_tensor_slices(sequence).window(
      window_size, shift=1, drop_remainder=True)
  # Each window is itself a small dataset; batching it by its full size
  # flattens it into a single tensor of window_size tokens.
  windows = windows.flat_map(lambda nested: nested.batch(window_size))
  if shuffle:
    windows = windows.shuffle(buffer_size=100_000, seed=seed)
  batches = windows.batch(batch_size)

  def split_inputs_targets(batch):
    return batch[:, :-1], batch[:, 1:]

  return batches.map(split_inputs_targets).prefetch(1)


In [47]:
length = 100  # number of time steps per training sequence
tf.random.set_seed(42)
# Only the first 100,000 tokens are used for training here.
train_set = to_dataset(tokens[:100_000], length=length,
                       shuffle=True, seed=42)
valid_set = to_dataset(tokens[1_000_000:1_060_000], length=length)
# The corpus holds 1,115,394 tokens, so the test split must start where the
# validation split ends (1_060_000). A start index of 1_600_000 lies past the
# end of the data and would produce an empty test set.
test_set = to_dataset(tokens[1_060_000:], length=length)

## Creating an Embedding using an Embedding Layer

In [48]:
tf.random.set_seed(42)
# Trainable lookup table with n_tokens rows, each a 16-dimensional vector.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=n_tokens,
    output_dim=16,
    input_length=length,
)
embedding_layer

<keras.layers.core.embedding.Embedding at 0x7b78a1cc9b40>

In [55]:
# Push one sentence through the vectorizer and the embedding layer to check
# the resulting shapes. The sentence is assigned to the same name that is read
# below, so this cell no longer depends on a variable left over from an
# earlier cell.
sample_sentence = " I am here to learn NLP"
print(f" Sample Sentenc : {sample_sentence}")
# A single (unbatched) string yields a 1-D tensor of character ids.
tokens_of_sample_sentence = text_vec_layer(sample_sentence)
print(f" length of tokens from TextVectorizer for sample sentence: {len(tokens_of_sample_sentence)}")
# Each id is replaced by its embedding vector: (n_chars,) -> (n_chars, 16).
embeddings_of_sample_sentence = embedding_layer(tokens_of_sample_sentence)
print(f" Shape of Embeddings from embeding layer for sample sentence: {embeddings_of_sample_sentence.shape}")


 Sample Sentenc :  I am here to learn nlp
 length of tokens from TextVectorizer for sample sentence: 23
 Shape of Embeddings from embeding layer for sample sentence: (23, 16)


## Building and Training the Char-RNN model


In [None]:
# Character-level language model: embedding -> GRU -> per-step softmax over
# the n_tokens possible next characters.
char_model = tf.keras.Sequential([
    embedding_layer,
    # return_sequences=True makes the GRU emit an output at every time step,
    # which is required because the targets are full shifted sequences.
    tf.keras.layers.GRU(128, return_sequences=True),
    tf.keras.layers.Dense(n_tokens, activation="softmax"),
])

# Targets are integer class ids, hence the sparse categorical cross-entropy.
# NOTE: the ids fed to this model must lie in [0, n_tokens).
char_model.compile(
    loss=tf.keras.losses.sparse_categorical_crossentropy,
    optimizer=tf.keras.optimizers.Nadam(),
    metrics=["accuracy"],
)

# Keep only the checkpoint with the best validation accuracy seen so far.
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    "my_shakespeare_model",
    monitor="val_accuracy",
    save_best_only=True,
)

history = char_model.fit(
    train_set,
    validation_data=valid_set,
    epochs=2,
    callbacks=[model_ckpt],
)