In [None]:
!pip install -q --upgrade keras-nlp tensorflow

# Import Libraries

In [None]:
import os
import keras_nlp
import tensorflow as tf
from tensorflow import keras

# Switch Keras to the "mixed_float16" policy for every layer created from
# here on (see the Keras mixed-precision guide for what the policy implies).
policy = keras.mixed_precision.Policy(name="mixed_float16")
keras.mixed_precision.set_global_policy(policy=policy)

# Get Dataset

In [None]:
# Remote locations of everything this notebook downloads.
# - pretraining corpus (WikiText-103, raw)
link_to_dataset = "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip"
# - fine-tuning data (SST-2)
link_to_fine_tune_dataset = "https://dl.fbaipublicfiles.com/glue/data/SST-2.zip"
# - WordPiece vocabulary (uncased BERT vocabulary)
vcab_text_link = "https://storage.googleapis.com/tensorflow/keras-nlp/examples/bert/bert_vocab_uncased.txt"

# Fetch and unpack the two archives; the return values are not needed because
# the extracted folders are addressed through the fixed paths below.
for archive_url in (link_to_dataset, link_to_fine_tune_dataset):
  keras.utils.get_file(origin=archive_url, extract=True)

# Folders the extracted archives are read from by the data-loading cell.
# NOTE(review): this assumes get_file extracts under ~/.keras/datasets -- confirm
# for the installed Keras version.
wiki_dir = os.path.expanduser("~/.keras/datasets/wikitext-103-raw/")
sst_dir = os.path.expanduser("~/.keras/datasets/SST-2/")

# The vocabulary is a single text file, so keep the path returned by get_file.
vacab_file = keras.utils.get_file(origin=vcab_text_link)

In [None]:
# Preprocessing parameters (declared again, with the model/training parameters,
# in a later cell).
PRETRAINING_BATCH_SIZE = 128  # lines of WikiText per pretraining batch
FINETUNING_BATCH_SIZE = 32    # SST-2 rows per fine-tuning batch
SEQ_LENGTH = 128              # tokenizer sequence_length
MASK_RATE = 0.25              # masker mask_selection_rate
PREDICTIONS_PER_SEQ = 32      # masker mask_selection_length

In [None]:
# Load the SST-2 splits. Each file is tab-separated with a header row and
# yields (sentence: string, label: int32) pairs, batched for fine-tuning.
sst_train_ds = tf.data.experimental.CsvDataset(
    sst_dir + "train.tsv", [tf.string, tf.int32], header=True, field_delim="\t"
).batch(FINETUNING_BATCH_SIZE)
sst_val_ds = tf.data.experimental.CsvDataset(
    sst_dir + "dev.tsv", [tf.string, tf.int32], header=True, field_delim="\t"
).batch(FINETUNING_BATCH_SIZE)

# Load the WikiText lines, dropping lines of 100 characters or fewer.
# The training split must come from wiki.train.raw: reading the validation
# file for both datasets would make the validation metrics meaningless,
# because the model would be evaluated on the exact text it was trained on.
wiki_train_ds = (
    tf.data.TextLineDataset(wiki_dir + "wiki.train.raw")
    .filter(lambda x: tf.strings.length(x) > 100)
    .batch(PRETRAINING_BATCH_SIZE)
)

wiki_valid_dataset = (
    tf.data.TextLineDataset(wiki_dir + "wiki.valid.raw")
    .filter(lambda x: tf.strings.length(x) > 100)
    .batch(PRETRAINING_BATCH_SIZE)
)

# Take a peek at the first four SST-2 examples.
print(sst_train_ds.unbatch().batch(4).take(1).get_single_element())



(<tf.Tensor: shape=(4,), dtype=string, numpy=
array([b'hide new secretions from the parental units ',
       b'contains no wit , only labored gags ',
       b'that loves its characters and communicates something rather beautiful about human nature ',
       b'remains utterly satisfied to remain the same throughout '],
      dtype=object)>, <tf.Tensor: shape=(4,), dtype=int32, numpy=array([0, 0, 1, 0], dtype=int32)>)


# Two main components

keras_nlp.tokenizers.Tokenizer - transforms text into sequences of input token ids

keras_nlp.tokenizers.WordPieceTokenizer - subword tokenization; it is popular when training on a large corpus:
it allows the model to learn from uncommon words
while not requiring a massive vocabulary containing every word in our training set

keras_nlp.layers.MaskedLMMaskGenerator -
this layer randomly selects a set of input tokens and masks them out

tf.data.Dataset.map - both the tokenizer and the masking layer can be used inside a single map call

tf.data - efficiently precomputes each batch on the CPU

In [None]:
# Toy example: WordPiece tokenization returning integer token ids.
vocab = [
    "[UNK]", "the", "qu", "##ick",
    "br", "##own", "fox", ".",
]
inputs = ["The quick brown fox."]

# Build a word-piece tokenizer over the toy vocabulary; text is lowercased
# before lookup, so "The" matches the "the" entry.
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=vocab, lowercase=True)

# Each id in the result is the index of the matched piece in `vocab`.
tokenizer(inputs)

<tf.RaggedTensor [[1, 2, 3, 4, 5, 6, 7]]>

In [None]:
# Toy example: the same tokenization, but returning the string pieces
# themselves (dtype="string") instead of their ids.
vocab = [
    "[UNK]", "the", "qu", "##ick",
    "br", "##own", "fox", ".",
]
inputs = ["The quick brown fox."]

tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab, lowercase=True, dtype="string"
)

tokenizer(inputs)

<tf.RaggedTensor [[b'the', b'qu', b'##ick', b'br', b'##own', b'fox', b'.']]>

In [None]:
# --- Preprocessing parameters ---
PRETRAINING_BATCH_SIZE = 128  # lines of WikiText per pretraining batch
FINETUNING_BATCH_SIZE = 32    # SST-2 rows per fine-tuning batch
SEQ_LENGTH = 128              # tokenizer sequence_length
MASK_RATE = 0.25              # masker mask_selection_rate
PREDICTIONS_PER_SEQ = 32      # masker mask_selection_length

# --- Model parameters (encoder stack) ---
NUM_LAYERS = 3          # number of encoder layers; 12 or 24 in general
MODEL_DIM = 256         # embedding width; 512 min
INTERMIDIATE_DIM = 512  # intermediate_dim of each TransformerEncoder
NUM_HEAD = 4            # attention heads per encoder layer; change number
DROPOUT = 0.1
NORM_EPSILON = 1e-5

# --- Training parameters ---
PRETRAINING_LEARNING_RATE = 5e-4
PRETRAINING_EPOCHS = 8
FINETUNEING_LEANING_RATE = 5e-5
FINETUNING_EPOCHS = 3


In [None]:
# Real tokenizer, built from the downloaded vocabulary file.
# Setting sequence_length trims or pads the token outputs to shape
# (batch_size, SEQ_LENGTH).
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vacab_file,
    lowercase=True,
    strip_accents=True,
    sequence_length=SEQ_LENGTH,
)

# Mask generator for the masked-language-model objective.
# Setting mask_selection_length trims or pads the mask outputs to shape
# (batch_size, PREDICTIONS_PER_SEQ).
masker = keras_nlp.layers.MaskedLMMaskGenerator(
    mask_token_id=tokenizer.token_to_id("[MASK]"),
    vocabulary_size=tokenizer.vocabulary_size(),
    mask_selection_rate=MASK_RATE,
    mask_selection_length=PREDICTIONS_PER_SEQ,
)

def preprocess(inputs):
  """Tokenize raw text and mask it for masked-language-model pretraining.

  The masking layer's output is split into the (features, labels, weights)
  tuple that keras.Model.fit() accepts.

  Args:
    inputs: raw text passed straight to the module-level `tokenizer`.

  Returns:
    A tuple `(features, labels, weights)` where `features` is a dict with the
    masker's "token_ids" and "mask_positions", `labels` is its "mask_ids"
    and `weights` is its "mask_weights".
  """
  masked = masker(tokenizer(inputs))
  features = {key: masked[key] for key in ("token_ids", "mask_positions")}
  return features, masked["mask_ids"], masked["mask_weights"]


# Run tokenization + masking inside tf.data and prefetch, so preprocessed
# batches are computed on the fly on the CPU.
pretrain_ds = wiki_train_ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
pretrain_ds = pretrain_ds.prefetch(tf.data.AUTOTUNE)

pretrain_val_ds = wiki_valid_dataset.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
pretrain_val_ds = pretrain_val_ds.prefetch(tf.data.AUTOTUNE)

# Preview a single preprocessed batch.
# The mask is random, so the output changes each time this cell runs.
print(pretrain_val_ds.take(1).get_single_element())

({'token_ids': <tf.Tensor: shape=(128, 128), dtype=int32, numpy=
array([[7570, 7849, 2271, ..., 9673, 1012,  103],
       [7570,  103, 2271, ..., 1007, 1012, 2023],
       [1996, 2034, 3940, ...,    0,    0,    0],
       ...,
       [2076, 1996, 2307, ...,    0,    0,    0],
       [3216, 2225, 2083, ...,    0,    0,    0],
       [9794, 2007,  103, ...,    0,    0,    0]], dtype=int32)>, 'mask_positions': <tf.Tensor: shape=(128, 32), dtype=int64, numpy=
array([[  5,  11,  15, ..., 112, 119, 127],
       [  1,   5,   6, ..., 121, 122, 124],
       [  3,   8,  11, ...,   0,   0,   0],
       ...,
       [  9,  11,  18, ..., 119, 120,   0],
       [ 15,  21,  22, ...,   0,   0,   0],
       [  2,   4,  17, ...,   0,   0,   0]])>}, <tf.Tensor: shape=(128, 32), dtype=int32, numpy=
array([[ 1010,  2030,  2003, ...,  1996,  2077,  7570],
       [ 7849,  2003,  1037, ...,  1006,  9587,  2075],
       [ 1997,  4273,  2312, ...,     0,     0,     0],
       ...,
       [23133,  1996,  6032, ..

In [None]:
#building block of bert which is an encoder only block
#transformer encoder layer

# This class follows the architecture of the transformer encoder layer in the
# paper Attention is All You Need. Users
# can instantiate multiple instances of this class to stack up an encoder.

# This layer will correctly compute an attention mask from an implicit
# Keras padding mask (for example, by passing mask_zero=True to a
# keras.layers.Embedding layer).

# keras_nlp.layers.TransformerEncoder(
#     intermediate_dim,
#     num_heads,
#     dropout=0,
#     activation="relu",
#     layer_norm_epsilon=1e-5,
#     kernel_initializer='glorot_uniform',
#     bias_initializer="zeros",
#     name=None,
#     **kwargs
# )


#simple example of token and position embedding

# keras_nlp.layers.TokenAndPositionEmbedding(
#     vocabulary_size,
#     sequence_length,
#     embedding_dim,
#     embedding_initializer="glorot_uniform",
#     mask_zero=False,
#     **kwargs
# )

# Layer normalization layer (Ba et al., 2016).

# Normalize the activations of the previous layer for each given example in a
# batch independently, rather than across a batch like Batch Normalization.
# i.e. applies a transformation that maintains the mean activation within each
# example close to 0 and the activation standard deviation close to 1.

# Given a tensor inputs, moments are calculated and normalization
# is performed across the axes specified in axis.


# tf.keras.layers.LayerNormalization(
#     axis=-1,
#     epsilon=0.001,
#     center=True,
#     scale=True,
#     beta_initializer="zeros",
#     gamma_initializer="ones",
#     beta_regularizer=None,
#     gamma_regularizer=None,
#     beta_constraint=None,
#     gamma_constraint=None,
#     **kwargs
# )

# tf.keras.layers.Dropout(rate,noise_shape=None,seed=None,**kwargs)


# Pretraining the model


In [None]:
# Encoder input: a batch of token-id sequences of length SEQ_LENGTH.
inputs = keras.Input(shape=(SEQ_LENGTH,), dtype=tf.int32)

# Token + position embedding. It is kept in its own variable because the
# pretraining cell reads `embedding_layer.token_embedding` for the LM head.
embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=tokenizer.vocabulary_size(),
    sequence_length=SEQ_LENGTH,
    embedding_dim=MODEL_DIM,
)
embedding_norm = keras.layers.LayerNormalization(epsilon=NORM_EPSILON)
embedding_dropout = keras.layers.Dropout(rate=DROPOUT)

# Embed -> layer-normalise -> dropout.
outputs = embedding_dropout(embedding_norm(embedding_layer(inputs)))

# Stack NUM_LAYERS transformer encoder blocks (the attention mechanism).
for _ in range(NUM_LAYERS):
  encoder_block = keras_nlp.layers.TransformerEncoder(
      intermediate_dim=INTERMIDIATE_DIM,
      num_heads=NUM_HEAD,
      dropout=DROPOUT,
      layer_norm_epsilon=NORM_EPSILON,
  )
  outputs = encoder_block(outputs)

# Wrap input and output into the reusable encoder model and show its layers.
encoder_model = keras.Model(inputs, outputs)
encoder_model.summary()

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_16 (InputLayer)       [(None, 128)]             0         
                                                                 
 token_and_position_embeddi  (None, 128, 256)          7846400   
 ng_1 (TokenAndPositionEmbe                                      
 dding)                                                          
                                                                 
 layer_normalization_1 (Lay  (None, 128, 256)          512       
 erNormalization)                                                
                                                                 
 dropout_1 (Dropout)         (None, 128, 256)          0         
                                                                 
 transformer_encoder_3 (Tra  (None, 128, 256)          527104    
 nsformerEncoder)                                          

In [None]:
# Pretrain the transformer: attach a masked-language-model head to the
# encoder and train it on the masked WikiText batches.
token_ids_input = keras.Input(shape=(SEQ_LENGTH,), dtype=tf.int32)
mask_positions_input = keras.Input(shape=(PREDICTIONS_PER_SEQ,), dtype=tf.int32)
inputs = {
    "token_ids": token_ids_input,
    "mask_positions": mask_positions_input,
}

encoded_tokens = encoder_model(inputs["token_ids"])

# Predict an output word for each masked position. The head reuses the token
# embedding weights to project the encoded vectors to vocabulary logits, which
# has been shown to improve training efficiency.
masked_lm_head = keras_nlp.layers.MaskedLMHead(
    embedding_weights=embedding_layer.token_embedding.embeddings,
    activation="softmax",
)
# The mask positions are passed positionally on purpose: the upstream example
# used a wrong keyword name (masked_positions instead of mask_positions), see
# https://github.com/keras-team/keras-io/issues/1446
outputs = masked_lm_head(encoded_tokens, inputs["mask_positions"])

# Define the model.
pretraining_model = keras.Model(inputs, outputs)

# Compile it. weighted_metrics is used because the dataset also yields
# per-position mask weights as its third element.
pretraining_optimizer = keras.optimizers.experimental.AdamW(PRETRAINING_LEARNING_RATE)
pretraining_model.compile(
    optimizer=pretraining_optimizer,
    loss="sparse_categorical_crossentropy",
    weighted_metrics=["sparse_categorical_accuracy"],
    jit_compile=True,
)

# Pretrain on WikiText.
pretraining_model.fit(
    pretrain_ds,
    epochs=PRETRAINING_EPOCHS,
    validation_data=pretrain_val_ds,
)

# Only the encoder is saved; the fine-tuning cell reloads it from this path.
encoder_model.save("encoder_model")

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8




In [None]:
# Debugging aid: display the symbolic "mask_positions" input of the
# pretraining model (its shape and dtype).
inputs["mask_positions"]

<KerasTensor: shape=(None, 32) dtype=int32 (created by layer 'input_26')>

# Fine Tuning

In [None]:
def preprocess(sentences, labels):
  """Tokenize SST-2 sentences and pass their labels through unchanged.

  Note: this definition replaces the earlier one-argument `preprocess` used
  for pretraining; the pretraining datasets were already mapped with it.

  Args:
    sentences: raw sentences passed to the module-level `tokenizer`.
    labels: labels returned as-is.

  Returns:
    A `(token_ids, labels)` tuple.
  """
  token_ids = tokenizer(sentences)
  return token_ids, labels

# Tokenize the SST-2 batches inside tf.data and prefetch them.
finetune_ds = sst_train_ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
finetune_ds = finetune_ds.prefetch(tf.data.AUTOTUNE)

finetune_val_ds = sst_val_ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
finetune_val_ds = finetune_val_ds.prefetch(tf.data.AUTOTUNE)

# Preview one tokenized validation batch: (token ids, labels).
print(finetune_val_ds.take(1).get_single_element())


(<tf.Tensor: shape=(32, 128), dtype=int32, numpy=
array([[ 2009,  1005,  1055, ...,     0,     0,     0],
       [ 4895, 10258,  2378, ...,     0,     0,     0],
       [ 4473,  2149,  2000, ...,     0,     0,     0],
       ...,
       [ 1045,  2018,  2000, ...,     0,     0,     0],
       [ 4283,  2000,  3660, ...,     0,     0,     0],
       [ 1012,  1012,  1012, ...,     0,     0,     0]], dtype=int32)>, <tf.Tensor: shape=(32,), dtype=int32, numpy=
array([1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0], dtype=int32)>)


In [None]:
# Reload the pretrained encoder saved by the pretraining cell.
encoder_model = keras.models.load_model("encoder_model", compile=False)

# Layers of the classification head.
pooling_layer = keras.layers.GlobalAveragePooling1D()
classifier_layer = keras.layers.Dense(1, activation="sigmoid")

# Tokenized input -> encoder -> average over the sequence axis -> one
# sigmoid unit that predicts the output label.
inputs = keras.Input(shape=(SEQ_LENGTH,), dtype=tf.int32)
encoded_tokens = encoder_model(inputs)
pooled_tokens = pooling_layer(encoded_tokens)
outputs = classifier_layer(pooled_tokens)

# Define and compile the model.
finetune_model = keras.Model(inputs, outputs)
finetune_optimizer = keras.optimizers.experimental.AdamW(FINETUNEING_LEANING_RATE)
finetune_model.compile(
    optimizer=finetune_optimizer,
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Fine-tune the model for the SST-2 task.
finetune_model.fit(
    finetune_ds,
    epochs=FINETUNING_EPOCHS,
    validation_data=finetune_val_ds,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7e294bd45ab0>

In [None]:
# Save the model together with the tokenization layer.
# KerasNLP preprocessing runs inside the TensorFlow graph, so the tokenizer
# can be part of the saved model. The benefit is that the model can be saved
# and restored and then run inference directly on raw text, without loading
# a tokenizer separately.

# Raw strings in -> tokenizer -> fine-tuned classifier.
inputs = keras.Input(shape=(), dtype=tf.string)
tokens = tokenizer(inputs)
outputs = finetune_model(tokens)

final_model = keras.Model(inputs=inputs, outputs=outputs)
final_model.save("final_model")




# Model Inference

In [None]:
# The saved model contains the tokenizer, so it can predict on raw text.

# First, reload the model from disk.
restore_finetuned_model = keras.models.load_model("final_model")

# Second, run inference on a couple of raw sentences.
inference_data = tf.constant(["terrible,no good,trash", "So great: I loved it!"])
predictions = restore_finetuned_model.predict(inference_data)
print("inference on pretrained and finetuned bert model =", predictions)




inference on pretrained and finetuned bert model = [[0.04788]
 [1.     ]]
