In [None]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

--2022-07-29 06:37:26--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2022-07-29 06:37:35 (9.93 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [None]:
import tensorflow as tf 
from tensorflow import keras
# Read every text file under aclImdb/ as unlabeled strings, 256 per batch.
dataset = keras.utils.text_dataset_from_directory(
    directory="aclImdb",
    label_mode=None,
    batch_size=256,
)
# Some reviews contain literal "<br />" tags; replace each one with a blank space.
dataset = dataset.map(
    lambda review: tf.strings.regex_replace(review, "<br />", " ")
)

Found 100006 files belonging to 1 classes.


In [None]:
import keras.layers as layers

In [None]:
from tensorflow.keras.layers import TextVectorization
# Every review is truncated/padded to this many tokens. The model itself sees
# one token fewer, because inputs and targets are offset by one position.
sequence_length = 100
# Vocabulary is capped at 15000 entries; anything else is treated as [UNK].
vocab_size = 15000
text_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",  # return sequences of integer word indices
    output_sequence_length=sequence_length,
)
# Build the vocabulary from the review texts.
text_vectorization.adapt(dataset)

In [None]:
def prepare_lm_dataset(text_batch):
    """Turn a batch of strings into (inputs, targets) for next-word prediction.

    The batch is vectorized with the module-level ``text_vectorization`` layer.
    Inputs are every token except the last; targets are the same sequences
    shifted left by one position, so target ``t`` is the token after input ``t``.
    """
    token_ids = text_vectorization(text_batch)
    return token_ids[:, :-1], token_ids[:, 1:]


lm_dataset = dataset.map(prepare_lm_dataset, num_parallel_calls=4)

In [None]:
class PositionalEmbedding(layers.Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        sequence_length: number of distinct positions the position embedding
            table holds.
        input_dim: size of the token vocabulary.
        output_dim: dimensionality of both embedding tables.
    """

    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim
        # Token table is created before the position table, as in the original.
        self.token_embeddings = layers.Embedding(
            input_dim=input_dim, output_dim=output_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim
        )

    def call(self, inputs):
        """Embed ``inputs`` and add the embedding of each position index."""
        num_positions = tf.shape(inputs)[-1]
        position_ids = tf.range(start=0, limit=num_positions, delta=1)
        return self.token_embeddings(inputs) + self.position_embeddings(position_ids)

    def compute_mask(self, inputs, mask=None):
        """Mark every entry that is not token id 0 as valid."""
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        return {
            **super().get_config(),
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        }



class TransformerDecoder(layers.Layer):
    """Transformer decoder block: causal self-attention, a second attention
    over ``encoder_outputs``, then a two-layer dense projection. Each stage is
    followed by a residual connection and layer normalization.

    Args:
        embed_dim: size of the input/output feature dimension; also used as
            ``key_dim`` for both attention layers.
        dense_dim: width of the hidden dense layer in the projection.
        num_heads: number of attention heads in both attention layers.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.attention2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.denseProj = keras.Sequential([layers.Dense(dense_dim, activation='relu'), layers.Dense(embed_dim), ])
        self.layernorm1 = layers.LayerNormalization()
        self.layernorm2 = layers.LayerNormalization()
        self.layernorm3 = layers.LayerNormalization()
        self.supports_masking = True

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super().get_config()
        config.update({'embed_dim': self.embed_dim, 'dense_dim': self.dense_dim, 'num_heads': self.num_heads})
        return config

    def get_casual_attention_mask(self, inputs):
        """Build a (batch, seq, seq) int32 lower-triangular mask.

        Entry ``[b, i, j]`` is 1 when ``i >= j`` and 0 otherwise, so position
        ``i`` may only attend to positions at or before it. The first two
        dimensions of ``inputs`` are read as batch size and sequence length.

        NOTE: the method name keeps its original spelling ("casual") so that
        existing callers continue to work.
        """
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype='int32')
        mask = tf.reshape(mask, (1, sequence_length, sequence_length))
        # Repeat the single mask once per batch element.
        mult = tf.concat([tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], axis=0)
        return tf.tile(mask, mult)

    def call(self, inputs, encoder_outputs, mask=None):
        """Run the decoder block.

        Args:
            inputs: target-side sequence; used as query, key and value of the
                first (causal) attention.
            encoder_outputs: sequence used as key and value of the second
                attention.
            mask: optional padding mask; when given it is combined with the
                causal mask and applied to the second attention.
        """
        causal_mask = self.get_casual_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype='int32')
            padding_mask = tf.minimum(padding_mask, causal_mask)
        else:
            # Without an incoming mask there is nothing to combine; previously
            # padding_mask was left unassigned here and the call below failed.
            padding_mask = None
        attention_output_1 = self.attention1(query=inputs, value=inputs, key=inputs, attention_mask=causal_mask)
        attention_output_1 = self.layernorm1(inputs + attention_output_1)
        attention_output_2 = self.attention2(query=attention_output_1, value=encoder_outputs, key=encoder_outputs, attention_mask=padding_mask)
        attention_output_2 = self.layernorm2(attention_output_1 + attention_output_2)
        proj_output = self.denseProj(attention_output_2)
        return self.layernorm3(attention_output_2 + proj_output)

In [None]:
from tensorflow.keras import layers
# Model hyperparameters.
embed_dim = 256
latent_dim = 2048
num_heads = 2

# Variable-length sequences of integer token ids.
inputs = keras.Input(shape=(None,), dtype="int64")
x = PositionalEmbedding(
    sequence_length=sequence_length,
    input_dim=vocab_size,
    output_dim=embed_dim,
)(inputs)
# The embedded sequence is passed as both decoder input and "encoder" output.
x = TransformerDecoder(
    embed_dim=embed_dim,
    dense_dim=latent_dim,
    num_heads=num_heads,
)(x, x)
# One softmax over the vocabulary per output timestep.
outputs = layers.Dense(units=vocab_size, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy")
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 positional_embedding (Position  (None, None, 256)   3865600     ['input_1[0][0]']                
 alEmbedding)                                                                                     
                                                                                                  
 transformer_decoder (Transform  (None, None, 256)   2104576     ['positional_embedding[0][0]',   
 erDecoder)                                                       'positional_embedding[0][0]']   
                                                                                              

In [None]:
import numpy as np

# Reverse lookup from word index to word string, used to decode sampled tokens.
tokens_index = {
    index: token
    for index, token in enumerate(text_vectorization.get_vocabulary())
}

def sample_next(predictions, temperature=1.0):
    """Sample one index from a probability vector after temperature scaling.

    Args:
        predictions: 1-D array-like of probabilities.
        temperature: positive scaling factor. Values below 1 sharpen the
            distribution, values above 1 flatten it.

    Returns:
        The sampled index (a NumPy integer).

    Raises:
        ValueError: if ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature!r}")
    predictions = np.asarray(predictions).astype("float64")
    scaled_logits = np.log(predictions) / temperature
    # Shift so the largest value is 0 before exponentiating. This leaves the
    # normalized result unchanged but prevents every term from underflowing to
    # zero (and the division below from producing NaN) at low temperatures.
    scaled_logits = scaled_logits - np.max(scaled_logits)
    exp_preds = np.exp(scaled_logits)
    probabilities = exp_preds / np.sum(exp_preds)
    # Draw a single one-hot sample and return the position of the 1.
    probas = np.random.multinomial(1, probabilities, 1)
    return np.argmax(probas)



class TextGenerator(keras.callbacks.Callback):
    """Callback that prints text sampled from the model at the end of epochs.

    Relies on the module-level ``text_vectorization`` layer, ``tokens_index``
    mapping and ``sample_next`` function.

    Args:
        prompt: seed text every generated sentence starts with.
        generate_length: maximum number of words to append to the prompt.
        model_input_length: maximum number of tokens the model input can hold;
            generation stops early once the sentence fills it.
        temperatures: sampling temperatures; one sentence is generated for each.
        print_freq: generate only every ``print_freq`` epochs.
    """

    def __init__(self, prompt, generate_length, model_input_length, temperatures=(1.,), print_freq=1):
        super().__init__()
        self.prompt = prompt
        self.generate_length = generate_length
        self.model_input_length = model_input_length
        self.temperatures = temperatures
        self.print_freq = print_freq

    def on_epoch_end(self, epoch, logs=None):
        """Generate and print one sentence per configured temperature."""
        if (epoch + 1) % self.print_freq != 0:
            return
        for temperature in self.temperatures:
            print("== Generating with temperature", temperature)
            sentence = self.prompt
            for _ in range(self.generate_length):
                # Feed the current sentence into the model.
                tokenized_sentence = text_vectorization([sentence])
                # Token id 0 is padding, so the non-zero count is the number of
                # real tokens currently in the sentence.
                num_tokens = int(np.count_nonzero(np.asarray(tokenized_sentence)[0]))
                if num_tokens >= self.model_input_length:
                    # Input window is full; further words would be truncated.
                    break
                predictions = self.model(tokenized_sentence)
                # The prediction for the next word sits at the position of the
                # last real token, not at the loop counter.
                last_position = max(num_tokens - 1, 0)
                next_token = sample_next(predictions[0, last_position, :], temperature)
                sampled_token = tokens_index[next_token]
                # Append the new word and repeat.
                sentence += " " + sampled_token
            print(sentence)

# Seed text and sampling settings for the per-epoch generation callback.
prompt = "This movie"
text_gen_callback = TextGenerator(
    prompt=prompt,
    generate_length=50,
    model_input_length=sequence_length,
    temperatures=(0.2, 0.5, 0.7, 1., 1.5),
)

In [None]:
# Train for 5 epochs; the callback prints sampled text after each epoch.
model.fit(x=lm_dataset, epochs=5, callbacks=[text_gen_callback])

Epoch 1/5
This movie was is about a here [UNK] a [UNK] book [UNK] called like powers a we series were [UNK] going creatures to in take following place 1994 movies a not block as the white band creaky gave era nothing or new with version a shooting plot characters when are the pretty
== Generating with temperature 0.5
This movie is is simply true a the good destruction although keen it on a the [UNK] released of a story [UNK] ive a been few closer years to ago appreciate but the after patience hearing hired how for much lou like finds that that have alist been director able james to
== Generating with temperature 0.7
This movie is is [UNK] a young pathetic guy look like but strange they go do see love it accept out some and [UNK] everything through to laughing be through reduced it captive and that lends sequels a are morgue an and [UNK] feel cons the cheese lamest than right youve the
== Generating with temperature 1.0
This movie is is made best at known first cliched it [UNK] short mi

<keras.callbacks.History at 0x7f8a957ba2d0>