## Setup

### Install libraries

In [None]:
!pip3 install google-cloud-bigquery==3.25.0 -U
!pip install google-cloud-aiplatform==1.59.0
!pip uninstall -y shapely pygeos geopandas
!pip install shapely==1.8.5.post1 pygeos==0.12.0 geopandas>=0.12.2
# Install pydot and graphviz
!pip install pydot
!sudo apt install graphviz -y

In [None]:
!pip install tensorflow==2.15.0 tensorflow-hub==0.15.0 tensorflow-text

### Restart Kernel

In [None]:
import os

# Restart the kernel so the packages installed above are importable.
# The restart is skipped when IS_TESTING is set to a non-empty value.
if not os.environ.get("IS_TESTING"):
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

### Define constants

In [None]:
# Add installed library dependencies to the PATH environment variable.
# The first line reads the current PATH through the %env magic into a Python
# variable; the second re-exports it with /home/jupyter/.local/bin appended.
PATH=%env PATH
%env PATH={PATH}:/home/jupyter/.local/bin

In [None]:
# Project-level settings used throughout the notebook (plain Python variables).
# TODO: Fill in the PROJECT_ID and REGION provided in the lab manual.
PROJECT_ID = "..."
REGION = "us-east1"
# Cloud Storage bucket URI derived from the project ID.
GCS_BUCKET = f"gs://{PROJECT_ID}"

In [None]:
!gcloud storage buckets create -l $REGION $GCS_BUCKET

### Import Libraries

In [None]:
import os
import shutil
import logging

# TensorFlow model building libraries.
import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub

# Re-create the AdamW optimizer used in the original BERT paper.
from official.nlp import optimization  

# Libraries for data and plot model training metrics.
import pandas as pd
import matplotlib.pyplot as plt

# Import the Vertex AI Python SDK.
from google.cloud import aiplatform as vertexai


### Initialize Vertex AI Python SDK

Initialize the Vertex AI Python SDK with your GCP Project, Region, and Google Cloud Storage Bucket.

In [None]:
# Point the Vertex AI SDK at the project, region and staging bucket defined above.
vertexai.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=GCS_BUCKET,
)

## Build and train your model locally in a Vertex Notebook

### Lab Dataset

In this lab, we will use the [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment) that contains the text of 50,000 movie reviews from the Internet Movie Database. These are split into 25,000 reviews for training and 25,000 reviews for testing. The training and testing sets are balanced, meaning they contain an equal number of positive and negative reviews.

### Data ingestion and processing

#### Import dataset

In [None]:
# Source archive for the Large Movie Review Dataset.
DATA_URL = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
# Directory the archive is downloaded to and unpacked in (the notebook's working directory).
LOCAL_DATA_DIR = "."

In [None]:
def download_data(data_url, local_data_dir):
    """Download and unpack the dataset archive.

    The unlabeled `train/unsup` folder is removed after unpacking. The removal
    is skipped when the folder is already absent, so the function can be
    called again without raising.

    Args:
      data_url(str): Source data URL path.
      local_data_dir(str): Local data download directory path.
    Returns:
      dataset_dir(str): Local unpacked data directory path.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(local_data_dir, exist_ok=True)

    dataset = tf.keras.utils.get_file(
        fname="aclImdb_v1.tar.gz",
        origin=data_url,
        untar=True,
        cache_dir=local_data_dir,
        cache_subdir="")

    # The archive unpacks into an "aclImdb" folder next to the downloaded file.
    dataset_dir = os.path.join(os.path.dirname(dataset), "aclImdb")
    train_dir = os.path.join(dataset_dir, "train")

    # Remove unused folders to make it easier to load the data.
    remove_dir = os.path.join(train_dir, "unsup")
    if os.path.isdir(remove_dir):
        shutil.rmtree(remove_dir)

    return dataset_dir

In [None]:
# Download and unpack the dataset; DATASET_DIR is the unpacked "aclImdb" folder.
DATASET_DIR = download_data(data_url=DATA_URL, local_data_dir=LOCAL_DATA_DIR)

In [None]:
# Create a dictionary to iteratively add data pipeline and model training hyperparameters.
HPARAMS = {
    # Seed passed to both the training and validation split calls in
    # load_datasets(), so the two subsets come from the same shuffle.
    "seed": 42,
    # Number of examples per batch for every dataset built by load_datasets().
    "batch-size": 32
}

In [None]:
def load_datasets(dataset_dir, hparams):
    """Build the train, validation and test datasets from the unpacked archive.

    The `train` folder is split 80/20 into training and validation subsets,
    using the same seed for both calls; the `test` folder is loaded as is.

    Args:
      dataset_dir(str): Directory containing the `train` and `test` folders.
      hparams(dict): A dictionary containing model training arguments;
        'batch-size' and 'seed' are read here.
    Returns:
      raw_train_ds(tf.dataset): Train split dataset (20k examples).
      raw_val_ds(tf.dataset): Validation split dataset (5k examples).
      raw_test_ds(tf.dataset): Test split dataset (25k examples).
    """
    from_directory = tf.keras.preprocessing.text_dataset_from_directory
    batch_size = hparams['batch-size']
    train_path = os.path.join(dataset_dir, 'train')
    test_path = os.path.join(dataset_dir, 'test')

    # Options shared by the two subsets carved out of the train folder.
    split_options = {
        'batch_size': batch_size,
        'validation_split': 0.2,
        'seed': hparams['seed'],
    }

    raw_train_ds = from_directory(train_path, subset='training', **split_options)
    raw_val_ds = from_directory(train_path, subset='validation', **split_options)
    raw_test_ds = from_directory(test_path, batch_size=batch_size)

    return raw_train_ds, raw_val_ds, raw_test_ds

In [None]:
# Build the three raw (batched, un-prefetched) datasets from the unpacked archive.
raw_train_ds, raw_val_ds, raw_test_ds = load_datasets(DATASET_DIR, HPARAMS)

In [None]:
AUTOTUNE = tf.data.AUTOTUNE
# Read the class names from the raw dataset before it is wrapped by prefetch().
CLASS_NAMES = raw_train_ds.class_names

# Apply the same prefetch setting to each split.
train_ds, val_ds, test_ds = (
    split.prefetch(buffer_size=AUTOTUNE)
    for split in (raw_train_ds, raw_val_ds, raw_test_ds)
)

Let's print a few example reviews:

In [None]:
# Show the first three reviews of one training batch with their labels.
for text_batch, label_batch in train_ds.take(1):
    reviews = text_batch.numpy()
    labels = label_batch.numpy()
    for i in range(3):
        label = labels[i]
        print(f'Review {i}: {reviews[i]}')
        print(f'Label : {label} ({CLASS_NAMES[label]})')

### Choose a pre-trained BERT model to fine-tune for higher accuracy

In [None]:
HPARAMS.update({
    # TF Hub BERT modules.
    # Preprocessing model applied to the raw text input in build_text_classifier().
    "tfhub-bert-preprocessor": "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3",
    # Encoder loaded with trainable=True in build_text_classifier().
    "tfhub-bert-encoder": "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1",
})

Text inputs need to be transformed to numeric token ids and arranged in several Tensors before being input to BERT. TensorFlow Hub provides a matching preprocessing model for each of the BERT models discussed above, which implements this transformation using TF ops from the TF.text library. Since this text preprocessor is a TensorFlow model, it can be included in your model directly.

For fine-tuning, you will use the same optimizer that BERT was originally trained with: the "Adaptive Moments" (Adam). This optimizer minimizes the prediction loss and does regularization by weight decay (not using moments), which is also known as [AdamW](https://arxiv.org/abs/1711.05101).

For the learning rate `initial-learning-rate`, you will use the same schedule as BERT pre-training: linear decay of a notional initial learning rate, prefixed with a linear warm-up phase over the first 10% of training steps `n_warmup_steps`. In line with the BERT paper, the initial learning rate is smaller for fine-tuning.

In [None]:
HPARAMS.update({
    # Model training hyperparameters for fine tuning and regularization.
    # Number of passes over the training dataset.
    "epochs": 5,
    # Peak learning rate of the warm-up / linear-decay schedule.
    "initial-learning-rate": 3e-5,
    # Dropout rate applied to the pooled BERT output before the classifier layer.
    "dropout": 0.1 
})

In [None]:
# https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3
# https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1

In [None]:
HPARAMS.update({
    # Directory that train_evaluate() saves the trained model to.
    'model-dir': './saved_model'
})

In [None]:
# Display the hyperparameters collected so far.
HPARAMS

In [None]:
# NOTE(review): this cell repeats the previous one; it can be removed.
HPARAMS

In [None]:
# Calculate training steps: total optimizer steps and the warm-up length
# (the first 10% of all steps).
epochs = HPARAMS['epochs']
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
n_train_steps = steps_per_epoch * epochs
n_warmup_steps = int(0.1 * n_train_steps)


class WarmupLinearDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning rate schedule with linear warm-up followed by linear decay.

    The rate rises linearly from 0 to `initial_learning_rate` over
    `warmup_steps`, then falls linearly to 0 at `decay_steps`, and is
    clamped at 0 afterwards.

    Args:
      initial_learning_rate(float): Peak learning rate reached after warm-up.
      decay_steps(int): Total number of training steps.
      warmup_steps(int): Number of warm-up steps at the start of training.
    """

    def __init__(self, initial_learning_rate, decay_steps, warmup_steps):
        super().__init__()
        self.initial_learning_rate = initial_learning_rate
        self.decay_steps = decay_steps
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for the given optimizer step."""
        # Cast to float32 to avoid dtype issues
        step = tf.cast(step, tf.float32)
        warmup_steps = tf.cast(self.warmup_steps, tf.float32)
        decay_steps = tf.cast(self.decay_steps, tf.float32)

        # Both denominators are clamped to at least one step so that very short
        # runs (zero warm-up steps, or warm-up equal to the total) never divide
        # by zero. For any run with at least one step in each phase the values
        # are unchanged.
        warmup_span = tf.maximum(warmup_steps, 1.0)
        decay_span = tf.maximum(decay_steps - warmup_steps, 1.0)

        # Warmup phase: linear increase from 0 to initial_lr
        warmup_lr = self.initial_learning_rate * step / warmup_span

        # Decay phase: linear decrease from initial_lr to 0
        decay_lr = self.initial_learning_rate * (
            1.0 - (step - warmup_steps) / decay_span
        )

        # Choose between warmup and decay
        return tf.cond(
            step < warmup_steps,
            lambda: warmup_lr,
            lambda: tf.maximum(decay_lr, 0.0)  # Don't go below 0
        )

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized."""
        # Plain Python numbers keep the config JSON-friendly even when the
        # step counts were computed as NumPy integers.
        return {
            "initial_learning_rate": float(self.initial_learning_rate),
            "decay_steps": int(self.decay_steps),
            "warmup_steps": int(self.warmup_steps),
        }


# Create the learning rate schedule
lr_schedule = WarmupLinearDecay(
    initial_learning_rate=HPARAMS['initial-learning-rate'],
    decay_steps=n_train_steps,
    warmup_steps=n_warmup_steps
)

# Create optimizer with warmup schedule
OPTIMIZER = tf.keras.optimizers.AdamW(
    learning_rate=lr_schedule,
    weight_decay=0.01,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-6
)

### Build and compile a TensorFlow BERT sentiment classifier

Next, we will define and compile our model by assembling pre-built TF-Hub components and tf.keras layers.

In [None]:
def build_text_classifier(hparams, optimizer):
    """Assemble and compile the BERT-based sentiment classifier.

    The graph is: raw string input -> TF Hub preprocessing -> trainable BERT
    encoder -> dropout on the pooled output -> one dense unit that emits a logit.

    Args:
      hparams(dict): A dictionary containing model training arguments;
        'tfhub-bert-preprocessor', 'tfhub-bert-encoder' and 'dropout' are read here.
      optimizer: Optimizer instance passed through to `model.compile`.
    Returns:
      model(tf.keras.Model): A compiled TensorFlow model.
    """
    reviews = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

    # Turn raw strings into the tensors the encoder expects.
    bert_inputs = hub.KerasLayer(
        hparams['tfhub-bert-preprocessor'], name='preprocessing')(reviews)

    # trainable=True so the encoder weights are updated during fine-tuning.
    bert_outputs = hub.KerasLayer(
        hparams['tfhub-bert-encoder'], trainable=True, name='BERT_encoder')(bert_inputs)

    # `pooled_output` represents each input sequence as a whole, with shape
    # [batch_size, H]; it acts as an embedding of the entire review.
    pooled = bert_outputs['pooled_output']
    # Dropout guards against overfitting while fine-tuning.
    pooled = tf.keras.layers.Dropout(hparams['dropout'], name='dropout')(pooled)
    # No activation: the layer outputs a logit, matching from_logits=True below.
    logits = tf.keras.layers.Dense(1, activation=None, name='classifier')(pooled)

    model = tf.keras.Model(reviews, logits, name='bert-sentiment-classifier')
    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=tf.metrics.BinaryAccuracy())

    return model

In [None]:
# Build a compiled classifier for inspection; it is not fitted in this cell.
model = build_text_classifier(HPARAMS, OPTIMIZER)

In [None]:
# Visualize the architecture of the BERT sentiment classifier built above
# (the model has not been trained at this point).
tf.keras.utils.plot_model(model)

In [None]:
# Single-element batch of raw text used to smoke-test the model's forward pass.
TEST_REVIEW = ['the passion of CHRIST is such an amazing movie!']

In [None]:
# Forward pass on the sample review. The classifier layer has no activation,
# so the printed value is a raw logit, and `model` has not been fitted yet.
BERT_RAW_RESULT = model(tf.constant(TEST_REVIEW))
print(BERT_RAW_RESULT)

In [None]:
# Second sample review for the same forward-pass check.
TEST_REVIEW_0 = ['this is such an amazing movie!']

In [None]:
# Forward pass on the second sample; this overwrites BERT_RAW_RESULT from the
# previous cell. The printed value is again a raw logit from the unfitted model.
BERT_RAW_RESULT = model(tf.constant(TEST_REVIEW_0))
print(BERT_RAW_RESULT)

In [None]:
def train_evaluate(hparams):
    """Train and evaluate TensorFlow BERT sentiment classifier.

    Loads the datasets from the module-level DATASET_DIR, builds the
    module-level WarmupLinearDecay schedule and an AdamW optimizer, trains the
    classifier under a MirroredStrategy, evaluates it on the test split and
    saves the trained model to hparams['model-dir'].

    Args:
      hparams(dict): A dictionary containing model training arguments. Reads
        'epochs', 'initial-learning-rate' and 'model-dir' here, plus the keys
        used by load_datasets() and build_text_classifier().
    Returns:
      history(tf.keras.callbacks.History): Keras callback that records training event history.
    """
    raw_train_ds, raw_val_ds, raw_test_ds = load_datasets(DATASET_DIR, hparams)

    autotune = tf.data.AUTOTUNE
    train_ds = raw_train_ds.cache().prefetch(buffer_size=autotune)
    val_ds = raw_val_ds.cache().prefetch(buffer_size=autotune)
    test_ds = raw_test_ds.cache().prefetch(buffer_size=autotune)

    # Total optimizer steps, with the first 10% used for warm-up.
    epochs = hparams['epochs']
    steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
    n_train_steps = steps_per_epoch * epochs
    n_warmup_steps = int(0.1 * n_train_steps)

    mirrored_strategy = tf.distribute.MirroredStrategy()
    with mirrored_strategy.scope():
        # The optimizer is created inside the strategy scope together with the
        # model so that its variables belong to the same distribution strategy.
        lr_schedule = WarmupLinearDecay(
            initial_learning_rate=hparams['initial-learning-rate'],
            decay_steps=n_train_steps,
            warmup_steps=n_warmup_steps
        )
        optimizer = tf.keras.optimizers.AdamW(
            learning_rate=lr_schedule,
            weight_decay=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6
        )
        model = build_text_classifier(hparams=hparams, optimizer=optimizer)

    # summary() prints the table itself and returns None, so it is called
    # directly rather than being passed to logging.info().
    model.summary()

    history = model.fit(x=train_ds,
                        validation_data=val_ds,
                        epochs=epochs)

    # return_dict=True labels each value (loss and accuracy) by name.
    test_metrics = model.evaluate(test_ds, return_dict=True)
    logging.info("Test metrics: %s", test_metrics)

    # Export Keras model in TensorFlow SavedModel format.
    model.save(hparams['model-dir'])

    return history

Based on the `History` object returned by `model.fit()`, we can plot the training and validation loss for comparison, as well as the training and validation accuracy:

In [None]:
# Run training and evaluation; `history` holds the per-epoch metrics returned by model.fit().
history = train_evaluate(HPARAMS)

In [None]:
history_dict = history.history
print(history_dict.keys())

acc = history_dict['binary_accuracy']
val_acc = history_dict['val_binary_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

# 1-based epoch numbers for the x-axis. A separate name is used so the integer
# `epochs` hyperparameter defined earlier is not overwritten with a range.
epoch_numbers = range(1, len(acc) + 1)
fig, (loss_ax, acc_ax) = plt.subplots(2, 1, figsize=(10, 6))

# Solid red lines are training metrics; solid blue lines are validation metrics.
loss_ax.plot(epoch_numbers, loss, 'r', label='Training loss')
loss_ax.plot(epoch_numbers, val_loss, 'b', label='Validation loss')
loss_ax.set_title('Training and validation loss')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

acc_ax.plot(epoch_numbers, acc, 'r', label='Training acc')
acc_ax.plot(epoch_numbers, val_acc, 'b', label='Validation acc')
acc_ax.set_title('Training and validation accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend(loc='lower right')

# tight_layout() only has an effect once the axes, titles and labels exist.
fig.tight_layout();

In this plot, the red lines represent the training loss and accuracy, and the blue lines are the validation loss and accuracy. Based on the plots above, we should see model accuracy of around 78-80% which exceeds our business requirements target of greater than 75% accuracy.