In [1]:
import os

import keras_nlp
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from utils import *
import numpy as np

policy = keras.mixed_precision.Policy("mixed_float16")
keras.mixed_precision.set_global_policy(policy)

--ip=127.0.0.1
The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


In [2]:
# Download pretraining data.
keras.utils.get_file(
    origin="https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip",
    extract=True,
)
wiki_dir = os.path.expanduser("~/.keras/datasets/wikitext-103-raw/")

# Download vocabulary data (WordPiece vocabulary, to do sub-word tokenization)
vocab_file = keras.utils.get_file(
    origin="https://storage.googleapis.com/tensorflow/keras-nlp/examples/bert/bert_vocab_uncased.txt",
)

In [3]:
# Preprocessing params.
PRETRAINING_BATCH_SIZE = 128
FINETUNING_BATCH_SIZE = 32
SEQ_LENGTH = 128
MASK_RATE = 0.25
PREDICTIONS_PER_SEQ = 32

# Model params.
NUM_LAYERS = 3
MODEL_DIM = 256
INTERMEDIATE_DIM = 512
NUM_HEADS = 4
DROPOUT = 0.1
NORM_EPSILON = 1e-5

# Training params.
PRETRAINING_LEARNING_RATE = 5e-4
PRETRAINING_EPOCHS = 8
FINETUNING_LEARNING_RATE = 5e-5
FINETUNING_EPOCHS = 3

In [4]:
# Load wikitext-103 and filter out short lines.
wiki_train_ds = (
    tf.data.TextLineDataset(wiki_dir + "wiki.train.raw")
    .filter(lambda x: tf.strings.length(x) > 100)
    .batch(PRETRAINING_BATCH_SIZE)
)
wiki_val_ds = (
    tf.data.TextLineDataset(wiki_dir + "wiki.valid.raw")
    .filter(lambda x: tf.strings.length(x) > 100)
    .batch(PRETRAINING_BATCH_SIZE)
)

In [5]:
# Setting sequence_length will trim or pad the token outputs to shape
# (batch_size, SEQ_LENGTH).
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab_file,
    sequence_length=SEQ_LENGTH,
    lowercase=True,
    strip_accents=True,
)

In [6]:
# Setting mask_selection_length will trim or pad the mask outputs to shape
# (batch_size, PREDICTIONS_PER_SEQ).
masker = keras_nlp.layers.MLMMaskGenerator(
    vocabulary_size=tokenizer.vocabulary_size(),
    mask_selection_rate=MASK_RATE,
    mask_selection_length=PREDICTIONS_PER_SEQ,
    mask_token_id=tokenizer.token_to_id("[MASK]"),
)


def preprocess(inputs):
    inputs = tokenizer(inputs)
    outputs = masker(inputs)
    # Split the masking layer outputs into a (features, labels, and weights)
    # tuple that we can use with keras.Model.fit().
    features = {
        "tokens": outputs["tokens"],
        "mask_positions": outputs["mask_positions"],
    }
    labels = outputs["mask_ids"]
    weights = outputs["mask_weights"]
    return features, labels, weights


# We use prefetch() to pre-compute preprocessed batches on the fly on the CPU.
pretrain_ds = wiki_train_ds.map(
    preprocess, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)
pretrain_val_ds = wiki_val_ds.map(
    preprocess, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)

# Preview a single input example.
# The masks will change each time you run the cell.
print(pretrain_val_ds.take(1).get_single_element())

({'tokens': <tf.Tensor: shape=(128, 128), dtype=int32, numpy=
array([[ 7570,   103,  2271, ...,   103,  1012,  7570],
       [ 7570, 23283,  2271, ...,   103,   103,  2023],
       [ 1996,  2034,  3940, ...,     0,     0,     0],
       ...,
       [  103,  1996,   103, ...,     0,     0,     0],
       [ 3216,   103,  2083, ...,     0,     0,     0],
       [ 9794,   103,  1045, ...,     0,     0,     0]])>, 'mask_positions': <tf.Tensor: shape=(128, 32), dtype=int64, numpy=
array([[  1,   4,   5, ..., 115, 124, 125],
       [  1,   8,   9, ..., 124, 125, 126],
       [  5,   6,  12, ...,   0,   0,   0],
       ...,
       [  0,   2,   4, ..., 119, 120,   0],
       [  1,   2,   3, ...,   0,   0,   0],
       [  1,   5,   7, ...,   0,   0,   0]], dtype=int64)>}, <tf.Tensor: shape=(128, 32), dtype=int32, numpy=
array([[ 7849,  7946,  1010, ...,  2039, 25009,  9673],
       [ 7849, 19116, 10732, ...,  2075,  1007,  1012],
       [ 3695, 22925,  1010, ...,     0,     0,     0],
       ...

In [7]:
inputs = keras.Input(shape=(SEQ_LENGTH,), dtype=tf.int32)

# Embed our tokens with a positional embedding.
embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=tokenizer.vocabulary_size(),
    sequence_length=SEQ_LENGTH,
    embedding_dim=MODEL_DIM,
)
outputs = embedding_layer(inputs)

# Apply layer normalization and dropout to the embedding.
outputs = keras.layers.LayerNormalization(epsilon=NORM_EPSILON)(outputs)
outputs = keras.layers.Dropout(rate=DROPOUT)(outputs)

# Add a number of encoder blocks
for i in range(NUM_LAYERS):
    outputs = keras_nlp.layers.TransformerEncoder(
        intermediate_dim=INTERMEDIATE_DIM,
        num_heads=NUM_HEADS,
        dropout=DROPOUT,
        layer_norm_epsilon=NORM_EPSILON,
    )(outputs)

encoder_model = keras.Model(inputs, outputs)
encoder_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 128)]             0         
                                                                 
 token_and_position_embeddin  (None, 128, 256)         7846400   
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 layer_normalization (LayerN  (None, 128, 256)         512       
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 128, 256)          0         
                                                                 
 transformer_encoder (Transf  (None, 128, 256)         527104    
 ormerEncoder)                                               

In [8]:
# Create the pretraining model by attaching a masked language model head.
inputs = {
    "tokens": keras.Input(shape=(SEQ_LENGTH,), dtype=tf.int32),
    "mask_positions": keras.Input(shape=(PREDICTIONS_PER_SEQ,), dtype=tf.int32),
}

# Encode the tokens.
encoded_tokens = encoder_model(inputs["tokens"])

# Predict an output word for each masked input token.
# We use the input token embedding to project from our encoded vectors to
# vocabulary logits, which has been shown to improve training efficiency.
outputs = keras_nlp.layers.MLMHead(
    embedding_weights=embedding_layer.token_embedding.embeddings, activation="softmax",
)(encoded_tokens, mask_positions=inputs["mask_positions"])

# Define and compile our pretraining model.
pretraining_model = keras.Model(inputs, outputs)
pretraining_model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=keras.optimizers.Adam(learning_rate=PRETRAINING_LEARNING_RATE),
    weighted_metrics=["sparse_categorical_accuracy"],
    jit_compile=True,
)

# Pretrain the model on our wiki text dataset.
pretraining_model.fit(
    pretrain_ds, validation_data=pretrain_val_ds, epochs=PRETRAINING_EPOCHS,
)

# Save this base model for further finetuning.
encoder_model.save("encoder_model")

Epoch 1/8


## Finetuning

In [None]:
df_train = pd.read_csv("../Data/PreprocessedData/train_preprocessed.csv")
df_test = pd.read_csv("../Data/PreprocessedData/test_preprocessed.csv")
df_val = pd.read_csv("../Data/PreprocessedData/val_preprocessed.csv")

df_train = df_train[['preprocessed_text', 'label']]
df_test = df_test[['preprocessed_text', 'label']]
df_val = df_val[['preprocessed_text', 'label']]

In [None]:
# Convert DataFrame to tf.data.Dataset
train_ds = tf.data.Dataset.from_tensor_slices((df_train['preprocessed_text'].values, df_train['label'].values))
test_ds = tf.data.Dataset.from_tensor_slices((df_test['preprocessed_text'].values, df_test['label'].values))
val_ds = tf.data.Dataset.from_tensor_slices((df_val['preprocessed_text'].values, df_val['label'].values))

In [None]:
# Batch and shuffle the dataset
train_ds = train_ds.batch(FINETUNING_BATCH_SIZE).shuffle(10000)
test_ds = test_ds.batch(FINETUNING_BATCH_SIZE)
val_ds = val_ds.batch(FINETUNING_BATCH_SIZE).shuffle(10000)

In [None]:

def preprocess(sentences, labels):
    return tokenizer(sentences), labels


# We use prefetch() to pre-compute preprocessed batches on the fly on our CPU.
finetune_train_ds = train_ds.map(
    preprocess, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)
finetune_val_ds = val_ds.map(
    preprocess, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)
finetune_test_ds = test_ds.map(
    preprocess, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)

# Preview a single input example.
print(finetune_val_ds.take(1).get_single_element())

(<tf.Tensor: shape=(32, 128), dtype=int32, numpy=
array([[1026, 5310, 1028, ...,    0,    0,    0],
       [4485, 2139, 2480, ...,    0,    0,    0],
       [4720, 2575, 2397, ...,    0,    0,    0],
       ...,
       [1026, 5310, 1028, ...,    0,    0,    0],
       [1026, 5310, 1028, ...,    0,    0,    0],
       [1026, 5310, 1028, ...,    0,    0,    0]])>, <tf.Tensor: shape=(32,), dtype=string, numpy=
array([b'NOT', b'OFF', b'NOT', b'NOT', b'NOT', b'NOT', b'NOT', b'OFF',
       b'OFF', b'OFF', b'OFF', b'NOT', b'NOT', b'OFF', b'NOT', b'NOT',
       b'NOT', b'NOT', b'NOT', b'OFF', b'OFF', b'NOT', b'NOT', b'NOT',
       b'NOT', b'NOT', b'OFF', b'OFF', b'NOT', b'NOT', b'OFF', b'NOT'],
      dtype=object)>)


In [None]:
# Reload the encoder model from disk so we can restart fine-tuning from scratch.
encoder_model = keras.models.load_model("encoder_model", compile=False)

# Take as input the tokenized input.
inputs = keras.Input(shape=(SEQ_LENGTH,), dtype=tf.int32)

# Encode and pool the tokens.
encoded_tokens = encoder_model(inputs)
pooled_tokens = keras.layers.GlobalAveragePooling1D()(encoded_tokens)

# Predict an output label.
outputs = keras.layers.Dense(1, activation="sigmoid")(pooled_tokens)

# Define and compile our finetuning model.
finetuning_model = keras.Model(inputs, outputs)
finetuning_model.compile(
    loss="binary_crossentropy",
    optimizer=keras.optimizers.Adam(learning_rate=FINETUNING_LEARNING_RATE),
    metrics=["accuracy"],
)

# Finetune the model for the SST-2 task.
finetuning_model.fit(
    finetune_train_ds, validation_data=finetune_val_ds, epochs=FINETUNING_EPOCHS,
)

In [None]:
# Add our tokenization into our final model.
inputs = keras.Input(shape=(), dtype=tf.string)
tokens = tokenizer(inputs)
outputs = finetuning_model(tokens)
final_model = keras.Model(inputs, outputs)
final_model.save("final_model")

# This model can predict directly on raw text.
restored_model = keras.models.load_model("final_model", compile=False)
inference_data = tf.constant(["Terrible, no good, trash.", "So great; I loved it!"])
print(restored_model(inference_data))

In [None]:
# Predictions
train_pred = final_model.predict(finetune_train_ds)
test_pred = final_model.predict(finetune_test_ds)
val_pred = final_model.predict(finetune_val_ds)

# Convert predictions to labels
train_pred = np.where(train_pred > 0.5, 1, 0)
test_pred = np.where(test_pred > 0.5, 1, 0)
val_pred = np.where(val_pred > 0.5, 1, 0)

# Convert labels to numpy arrays
computeAllScores(train_pred, val_pred, test_pred)