Search hyperparameters for LLM

# Imports & Setup

In [17]:
# Authenticate this Colab session with the user's Google account.
# NOTE(review): presumably required so the later `gsutil` calls can access the GCS bucket — confirm.
from google.colab import auth
auth.authenticate_user()

In [18]:
%%bash
pip3 install transformers emoji==0.6.0 keras_nlp



In [19]:
!git clone https://github.com/carlosinator/cil-sentiment.git

fatal: destination path 'cil-sentiment' already exists and is not an empty directory.


In [20]:
%%bash
# Bring the checkout up to date. `git -C` replaces the `cd`/`cd ..` pair: with
# a plain `cd`, a missing directory would let `git pull` run in the current
# working directory instead of failing.
git -C cil-sentiment pull

Updating 6a44b06..1c8ac5c
Fast-forward
 hyperparam_search.ipynb | 1218 ++---------------------------------------------
 1 file changed, 50 insertions(+), 1168 deletions(-)


From https://github.com/carlosinator/cil-sentiment
   6a44b06..1c8ac5c  main       -> origin/main


In [23]:

import tensorflow as tf
import tensorflow_probability as tfp
import keras_nlp
import numpy as np
import pandas as pd
import sklearn
from sklearn import metrics
import transformers
from transformers import AutoTokenizer, TFAutoModel, AutoConfig, TFAutoModelForSequenceClassification
import matplotlib.pyplot as plt
import pickle

import sys
# Put the cloned repository on the import path so the project modules below
# can be imported (presumably `gru_models` lives in models/ and `utils` in the
# repo root — confirm against the repo layout).
sys.path.append("./cil-sentiment/models/")
sys.path.append("./cil-sentiment")
from gru_models import GRUModel, VGRUModel
import utils

# reproducibility
transformers.set_seed(0) # sets the seed in random, numpy, and tf

In [24]:
# Copy the positive and negative training files from the GCS bucket into the
# current working directory.
!gsutil cp "gs://cil_2023/train_pos_full_preprocessed_without_duplicates.txt" .
!gsutil cp "gs://cil_2023/train_neg_full_preprocessed_without_duplicates.txt" .

# Checkpoint name (used for the tokenizer and the models) and the local names
# of the two files copied above.
model_name = "vinai/bertweet-base"
filename_train_pos = "train_pos_full_preprocessed_without_duplicates.txt"
filename_train_neg = "train_neg_full_preprocessed_without_duplicates.txt"

# Resolve the TPU, connect to it, initialise it, and create the distribution
# strategy under which the models are built (see get_model).
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.TPUStrategy(tpu)

Copying gs://cil_2023/train_pos_full_preprocessed_without_duplicates.txt...
- [1 files][ 74.6 MiB/ 74.6 MiB]                                                
Operation completed over 1 objects/74.6 MiB.                                     
Copying gs://cil_2023/train_neg_full_preprocessed_without_duplicates.txt...
\ [1 files][ 91.6 MiB/ 91.6 MiB]                                                
Operation completed over 1 objects/91.6 MiB.                                     




In [25]:
# read dataset
# Each file is read into a single `text` column. The multi-character `sep` is
# interpreted by pandas as a regular expression, which only the python parsing
# engine supports; requesting that engine explicitly avoids the ParserWarning
# raised when pandas silently falls back from the C engine.
dataset_pos_pd = pd.read_table(filename_train_pos, sep='\r\n', header=None, names=['text'], engine='python')
dataset_neg_pd = pd.read_table(filename_train_neg, sep='\r\n', header=None, names=['text'], engine='python')
# Label convention used throughout: positive file -> 0, negative file -> 1.
dataset_pos_pd['label'] = 0
dataset_neg_pd['label'] = 1
dataset_pd = pd.concat([dataset_pos_pd, dataset_neg_pd])

# shuffle (fixed random_state so the train/validation split is reproducible)
dataset_pd = dataset_pd.sample(frac=1, random_state=0).reset_index(drop=True)

# tokenize data set; every text is padded to the longest sequence in the data
tokenizer = AutoTokenizer.from_pretrained(model_name)
texts = tokenizer.batch_encode_plus(dataset_pd['text'].tolist(),
                                    padding=True, truncation=True,
                                    return_tensors='tf')

dataset = tf.data.Dataset.from_tensor_slices((dict(texts), dataset_pd['label']))

# split training / validation
# The global batch size scales with the number of TPU replicas.
batch_size = 32 * tpu_strategy.num_replicas_in_sync
AUTOTUNE = tf.data.experimental.AUTOTUNE

# The first 10% of the shuffled examples form the validation set, the rest the
# training set. drop_remainder=True discards a final partial batch so that all
# batches have the same size.
val_data_size = int(0.1 * len(dataset_pd.index))
train_data_size = len(dataset_pd.index) - val_data_size
val_ds = dataset.take(val_data_size).batch(batch_size, drop_remainder=True)
train_ds = dataset.skip(val_data_size).batch(batch_size, drop_remainder=True)
train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)

  dataset_pos_pd = pd.read_table(filename_train_pos, sep='\r\n', header=None, names=['text'])
  dataset_neg_pd = pd.read_table(filename_train_neg, sep='\r\n', header=None, names=['text'])


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

Downloading (…)solve/main/bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Set Model

In [26]:
def get_model(learning_rate, use_model="basemodel"):
  """Build and compile one of the supported models inside the TPU strategy scope.

  Args:
    learning_rate: learning rate handed to the Adam optimizer.
    use_model: "basemodel" for the pretrained sequence classifier,
      "read" for GRUModel, or "read-var" for VGRUModel.

  Returns:
    The compiled model, ready to be trained.

  Raises:
    AssertionError: if use_model is not one of the three supported names.
  """
  supported_names = ("basemodel", "read", "read-var")
  assert use_model in supported_names, "invalid model name, use 'basemodel', 'read' or 'read-var'"

  # Only the base model is compiled with a logits-based loss and gradient-norm
  # clipping; both GRU variants use the loss default and no clipping.
  is_basemodel = use_model == "basemodel"
  clip_value = 1. if is_basemodel else None

  with tpu_strategy.scope():
    if is_basemodel:
      base_config = AutoConfig.from_pretrained(model_name)
      model = TFAutoModelForSequenceClassification.from_pretrained(model_name, config=base_config)
    elif use_model == "read":
      model = GRUModel(model_name, 2, num_gru_units=8)
    else:
      model = VGRUModel(model_name, 2, train_data_size=train_data_size, num_gru_units=8)

    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=is_basemodel)
    adam = tf.keras.optimizers.Adam(learning_rate=learning_rate, amsgrad=False, clipnorm=clip_value)
    model.compile(
        loss=loss_fn,
        optimizer=adam,
        metrics=[tf.metrics.SparseCategoricalAccuracy()]
    )

  return model

# Hyperparameter Search

In [27]:
# Model variant to tune; passed to get_model ("read" is the alternative).
USE_MODEL = "read-var"

# Learning-rate sweep: NUM_POINTS values between SMALLEST_LR and LARGEST_LR.
SMALLEST_LR, LARGEST_LR = 1e-6, 1e-3
NUM_POINTS = 16

# Maximum number of training epochs spent on each learning rate.
MAX_EPOCHS_PER_CONFIG = 1

In [28]:
lr_range = 10**np.linspace(np.log10(SMALLEST_LR), np.log10(LARGEST_LR), NUM_POINTS)
history_dict = {}

for i in range(0, len(lr_range)):
  lr = lr_range[i]
  print(f"HPS for lr = {lr:.2e}")
  model = get_model(lr, USE_MODEL)
  callback = tf.keras.callbacks.EarlyStopping(patience=3)
  history = model.fit(train_ds, validation_data=val_ds, epochs=MAX_EPOCHS_PER_CONFIG, verbose=1, callbacks=[callback])

  run_name = "hps_" + USE_MODEL + "_lr=" + f"{lr:.2e}"
  mpath = run_name + ".h5"

  history_dict[run_name] = history
  hd_name = run_name + "_dict.pkl"



  model.save_weights(mpath)
  with open(hd_name, 'wb') as f:
      pickle.dump(history_dict, f)

  !gs cp $mpath "gs://cil_2023/models/"
  !gs cp $hd_name "gs://cil_2023/models/"

HPS for lr = 1.00e-06


Some layers from the model checkpoint at vinai/bertweet-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at vinai/bertweet-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


KeyboardInterrupt: ignored