Search hyperparameters for LLM

# Imports & Setup

In [1]:
# Authenticate the Colab user; the gsutil commands used in later cells run
# against the project bucket and presumably rely on these credentials.
from google.colab import auth
auth.authenticate_user()

In [2]:
%%bash
# Every package is pinned so a fresh runtime resolves the same versions this
# notebook was developed against (transformers 4.30.2 and keras_nlp 0.6.0 are
# the versions recorded in this cell's install log).
pip3 install transformers==4.30.2 emoji==0.6.0 keras_nlp==0.6.0

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.2/7.2 MB 57.9 MB/s eta 0:00:00
Collecting emoji==0.6.0
  Downloading emoji-0.6.0.tar.gz (51 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 51.0/51.0 kB 6.0 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting keras_nlp
  Downloading keras_nlp-0.6.0-py3-none-any.whl (576 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 576.5/576.5 kB 46.5 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 268.8/268.8 kB 27.8 MB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.8/7.8 MB 103.5 

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
chex 0.1.7 requires jax>=0.4.6, but you have jax 0.3.25 which is incompatible.
flax 0.6.11 requires jax>=0.4.2, but you have jax 0.3.25 which is incompatible.
numba 0.56.4 requires numpy<1.24,>=1.18, but you have numpy 1.24.3 which is incompatible.
orbax-checkpoint 0.2.6 requires jax>=0.4.9, but you have jax 0.3.25 which is incompatible.


In [3]:
# Fetch the project repository; a later cell imports GRUModel from its
# models/ directory. Re-running this cell when the checkout already exists
# makes git refuse to clone, so the existing copy is kept as-is.
!git clone https://github.com/carlosinator/cil-sentiment.git

Cloning into 'cil-sentiment'...
remote: Enumerating objects: 34, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 34 (delta 9), reused 18 (delta 4), pack-reused 0[K
Unpacking objects: 100% (34/34), 80.38 KiB | 2.51 MiB/s, done.


In [4]:

import tensorflow as tf
import tensorflow_probability as tfp
import keras_nlp
import numpy as np
import pandas as pd
import sklearn
from sklearn import metrics
import transformers
from transformers import AutoTokenizer, TFAutoModel, AutoConfig, TFAutoModelForSequenceClassification
import matplotlib.pyplot as plt
import pickle

# Reproducibility: fix the global seed once, before any data shuffling or
# model construction happens in later cells.
transformers.set_seed(0) # sets the seed in random, numpy, and tf

Using TensorFlow backend


In [5]:
# Make the cloned repository's model definitions importable, then pull in the
# GRU-based classifier that get_model() builds when use_model == "read".
import sys
sys.path.append("./cil-sentiment/models")
from gru_models import GRUModel

In [24]:
!gsutil cp "gs://cil_2023/train_pos_preprocessed.txt" .
!gsutil cp "gs://cil_2023/train_neg_preprocessed.txt" .

model_name = "vinai/bertweet-base"
filename_train_pos = "train_pos_preprocessed.txt"
filename_train_neg = "train_neg_preprocessed.txt"

tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.TPUStrategy(tpu)

Copying gs://cil_2023/train_pos_preprocessed.txt...
\
Operation completed over 1 objects/82.2 MiB.                                     
Copying gs://cil_2023/train_neg_preprocessed.txt...
|
Operation completed over 1 objects/101.5 MiB.                                    




In [21]:
# Sanity check: load the positive-tweet file once through its literal name and
# once through filename_train_pos. Both name the same path, so the two frames
# are expected to be identical.
# NOTE(review): read_fwf is pandas' fixed-width reader; confirm that combining
# it with sep='\n' really yields one complete tweet per row.
old_pd = pd.read_fwf("train_pos_preprocessed.txt", sep='\n', header=None, names=['text'])
new_pd = pd.read_fwf(filename_train_pos, sep='\n', header=None, names=['text'])

In [None]:
# Display the frame that was loaded via filename_train_pos for a visual check.
new_pd

In [25]:
# Load one frame per class and attach its integer class id
# (positive file -> 0, negative file -> 1).
# NOTE(review): this is the reverse of the common "positive = 1" convention;
# confirm that downstream evaluation expects this mapping.
dataset_pos_pd = pd.read_fwf(filename_train_pos, sep='\n', header=None, names=['text']).assign(label=0)
dataset_neg_pd = pd.read_fwf(filename_train_neg, sep='\n', header=None, names=['text']).assign(label=1)

# Merge both classes and shuffle deterministically (fixed random_state) so the
# positional train/validation split below mixes the two classes.
dataset_pd = (
    pd.concat([dataset_pos_pd, dataset_neg_pd])
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)

# Tokenize all tweets in one call; padding/truncation are enabled so the
# encodings can be returned as TensorFlow tensors.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tweets = dataset_pd['text'].tolist()
texts = tokenizer.batch_encode_plus(
    tweets,
    padding=True,
    truncation=True,
    return_tensors='tf',
)

# Pair the tokenizer outputs (as a plain dict) with the labels.
dataset = tf.data.Dataset.from_tensor_slices((dict(texts), dataset_pd['label']))

# Split sizes: the first 10% of the shuffled rows are held out for validation.
num_examples = len(dataset_pd.index)
val_data_size = int(0.1 * num_examples)
train_data_size = num_examples - val_data_size

# Global batch size: 32 examples per TPU replica.
batch_size = 32 * tpu_strategy.num_replicas_in_sync
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Incomplete final batches are dropped in both splits; only the training
# pipeline is prefetched.
val_ds = dataset.take(val_data_size).batch(batch_size, drop_remainder=True)
train_ds = (
    dataset.skip(val_data_size)
    .batch(batch_size, drop_remainder=True)
    .prefetch(buffer_size=AUTOTUNE)
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Set Model

In [None]:
# Architecture that get_model() builds for the search: "basemodel" loads the
# pretrained sequence-classification model, "read" builds the GRUModel variant.
USE_MODEL = "basemodel" # alternatively "read"

In [None]:
def get_model(learning_rate, use_model="basemodel"):
  """Create and compile the classifier selected by `use_model`.

  The model is built and compiled inside `tpu_strategy.scope()`; the
  pretrained checkpoint is taken from the notebook-level `model_name`.

  Args:
    learning_rate: learning rate handed to the Adam optimizer.
    use_model: "basemodel" for the pretrained sequence-classification model
      (loss computed from logits, gradient norm clipped to 1.0), or "read"
      for `GRUModel` with 2 outputs and 8 GRU units (loss with the default
      `from_logits` setting, no gradient clipping).

  Returns:
    The compiled model, ready to be passed to `fit`.

  Raises:
    AssertionError: if `use_model` is neither "basemodel" nor "read".
  """
  assert use_model in ("basemodel", "read"), "invalid model name, use 'basemodel' or 'read'"

  with tpu_strategy.scope():
    if use_model == "basemodel":
      model = TFAutoModelForSequenceClassification.from_pretrained(
          model_name, config=AutoConfig.from_pretrained(model_name))
      # The classification head emits raw logits.
      loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
      grad_clipnorm = 1.
    else:
      model = GRUModel(model_name, 2, num_gru_units=8)
      # NOTE(review): default from_logits is used here, which presumably means
      # GRUModel already outputs probabilities - confirm in gru_models.
      loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
      grad_clipnorm = None

    # Both variants share the optimizer family and the accuracy metric; only
    # the loss configuration and the clipping threshold differ.
    model.compile(
        loss=loss_fn,
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, amsgrad=False, clipnorm=grad_clipnorm),
        metrics=[tf.metrics.SparseCategoricalAccuracy()]
    )

  return model

# Hyperparameter Search

In [None]:
# Bounds of the learning-rate grid; the search cell spaces the points evenly
# in log10 between these two values.
LARGEST_LR = 1e-3
SMALLEST_LR = 1e-6
# Number of grid points. With a single point the grid contains only
# SMALLEST_LR (np.linspace returns just the start value for num=1).
NUM_POINTS = 1

# Number of epochs each learning-rate configuration is trained for.
MAX_EPOCHS_PER_CONFIG = 1

In [None]:
lr_range = 10**np.linspace(np.log10(SMALLEST_LR), np.log10(LARGEST_LR), NUM_POINTS)
history_dict = {}

for lr in lr_range:
  model = get_model(lr, USE_MODEL)
  history = model.fit(train_ds, validation_data=val_ds, epochs=MAX_EPOCHS_PER_CONFIG, verbose=1)

  model_name = "hps_" + USE_MODEL + "_lr=" + f"{lr:.2e}"
  mpath = model_name + ".h5"

  history_dict[model_name] = history

  model.save_weights(mpath)
  with open('history_dict.pkl', 'wb') as f:
      pickle.dump(history_dict, f)

  !gs cp $mpath "gs://cil_2023/models/"
  !gs cp $'history_dict.pkl' "gs://cil_2023/models/"