# Model Fine Tuning


# Notebook Environment

For a unified research environment, enable the flags below:

In [None]:
UPGRADE_PY = False
INSTALL_DEPS = False
if INSTALL_DEPS:
  # !pip install -q tensorboard==2.15.2
  # !pip install -q tensorflow[and-cuda]==2.15.1
  # !pip install -q tensorflow==2.15.0
  !pip install -q focal-loss
  # !pip install -q tensorflow-io-gcs-filesystem==0.36.0
  # !pip install -q tensorflow-text==2.15.0
  # !pip install -q tf_keras==2.15.1
  # !pip install -q tokenizers==0.15.2
  # !pip install -q torch==2.2.0+cpu
  # !pip install -q torch-xla==2.2.0+libtpu
  # !pip install -q torchdata==0.7.1
  !pip install -q transformers==4.38.2

if UPGRADE_PY:
    !mamba create -n py311 -y
    !source /opt/conda/bin/activate py312 && mamba install python=3.11 jupyter mamba -y

    !sudo rm /opt/conda/bin/python3
    !sudo ln -sf /opt/conda/envs/py312/bin/python3 /opt/conda/bin/python3
    !sudo rm /opt/conda/bin/python3.10
    !sudo ln -sf /opt/conda/envs/py312/bin/python3 /opt/conda/bin/python3.10
    !sudo rm /opt/conda/bin/python
    !sudo ln -sf /opt/conda/envs/py312/bin/python3 /opt/conda/bin/python

!python --version

In [None]:
import os
import sys
import warnings
warnings.filterwarnings("ignore")

# Transformers cannot use keras3
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['TF_USE_LEGACY_KERAS'] = '1'
IN_KAGGLE = IN_COLAB = False
!export CUDA_LAUNCH_BLOCKING=1
!export XLA_FLAGS=--xla_cpu_verbose=0

try:
  # https://www.tensorflow.org/install/pip#windows-wsl2
  import google.colab
  from google.colab import drive
  drive.mount('/content/drive')
  DATA_PATH = "/content/drive/MyDrive/EDT dataset"
  IN_COLAB = True
  print('Colab!')
except:
  IN_COLAB = False
if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ and not IN_COLAB:
    print('Running in Kaggle...')
    for dirname, _, filenames in os.walk('/kaggle/input'):
        for filename in filenames:
            print(os.path.join(dirname, filename))
    DATA_PATH = "/kaggle/input"
    IN_KAGGLE = True
    print('Kaggle!')
elif not IN_COLAB and not IN_KAGGLE:
    IN_KAGGLE = False
    DATA_PATH = "./data/"
    print('Normal!')

MODEL_PATH = "google-bert/bert-base-cased"

# Accelerators Configuration

If you have a GPU, TPU or in one of the collaborative notebooks. Configure your setup below:

In [None]:
import numpy as np
import math
import shutil
import pandas as pd

from pathlib import Path
import re
import pickle
from copy import deepcopy

from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras import mixed_precision

print(f'Tensorflow version: [{tf.__version__}]')

tf.get_logger().setLevel('INFO')

#tf.config.set_soft_device_placement(True)
#tf.config.experimental.enable_op_determinism()
#tf.random.set_seed(1)
try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()

  tf.config.experimental_connect_to_cluster(tpu)
  tf.tpu.experimental.initialize_tpu_system(tpu)
  strategy = tf.distribute.TPUStrategy(tpu)
except Exception as e:
  # Not an exception, just no TPUs available, GPU is fallback
  # https://www.tensorflow.org/guide/mixed_precision
  print(e)
  policy = mixed_precision.Policy('mixed_float16')
  mixed_precision.set_global_policy(policy)
  gpus = tf.config.experimental.list_physical_devices('GPU')
  if len(gpus) > 0:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, False)
        tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=12288)])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        strategy = tf.distribute.MirroredStrategy()

        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        print(e)
    finally:
        print("Running on", len(tf.config.list_physical_devices('GPU')), "GPU(s)")
  else:
    # CPU is final fallback
    strategy = tf.distribute.get_strategy()
    print("Running on CPU")

def is_tpu_strategy(strategy):
    return isinstance(strategy, tf.distribute.TPUStrategy)

print("Number of accelerators:", strategy.num_replicas_in_sync)
os.getcwd()

In [None]:
MAX_LEN = 256 # Default 256
LEARN_RATE=5e-5 # 5e-5
LR_FACTOR=0.1
LR_MINDELTA=1e-4
EPOCHS=100
PATIENCE=10
BATCH_SIZE = 8 * strategy.num_replicas_in_sync # Default 8

NUM_LABELS = 12 # See Labels description above.
SPECIAL_TOKEN = '[CLS]' # Use for classification and hidden state placeholder.
UNK_ID = NUM_LABELS # Unknown token will be the max class ID + 1
UNK = '[UNK]'
OTHER_ID = 11
OTHER = 'O'

# Fine-Tuning with Masked Models

In [None]:
from transformers import BertTokenizerFast,TFBertForMaskedLM

# https://huggingface.co/transformers/v3.0.2/model_doc/bert.html#berttokenizerfast
tokenizer = BertTokenizerFast.from_pretrained(MODEL_PATH)
MASK = tokenizer.mask_token

masked_text = [f"Jim Cramer is consistently bullish when it comes to {MASK}. What this means in practicality is that Cramer routinely recommends buying stocks, and he rarely offers up a sell call. Analysis of his recommendations between 2016 and 2022 (via the data project Jim Cramer's Recommendations: A Six-Year Analysis) shows a 10.32% distribution of {MASK} recommendations alongside 61.27% buys, plus a smattering of positive or negative commentary without a formal buy or sell recommendation attached."]

inputs = tokenizer(masked_text, return_tensors="tf", padding=True, truncation=True)

model = TFBertForMaskedLM.from_pretrained(MODEL_PATH)
logits = model(**inputs).logits
mask_token_idxs = tf.where(inputs["input_ids"] == tokenizer.mask_token_id)
print(mask_token_idxs)
print(logits)

In [None]:
mask_logits = tf.gather_nd(logits, mask_token_idxs)
top_5 = tf.math.top_k(mask_logits, k=5)
[tokenizer.decode([idx]) for idx in top_5.indices.numpy().flatten()]
for i in range(5):
    new_text = masked_text[0]
    for j in range(2):
        token_idx = top_5.indices[j, i]
        top5_logits = top_5.values[j]

        proba = tf.nn.softmax(top5_logits)
        predicted_token = tokenizer.decode([token_idx])
        new_text = new_text.replace(MASK, f'[{predicted_token}:{proba[i].numpy()*100.:.01f}%]', 1)
    print(new_text)


# Financial Conditioning

In [None]:
from tqdm import tqdm

adapt_train_file = os.path.join(DATA_PATH, 'Domain_adapation/train.txt')
adapt_test_file = os.path.join(DATA_PATH, 'Domain_adapation/dev.txt')
def text_dataset(tokenizer, file_path):
    def generator():
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in tqdm(file, desc="text_dataset"):
                tokens = tokenizer(line.strip(),
                                   add_special_tokens=True,
                                   truncation=False,
                                   padding=False)
                yield {
                    'input_ids': tf.ragged.constant([tokens['input_ids']]),
                    'attention_mask': tf.ragged.constant([tokens['attention_mask']])
                }

    return tf.data.Dataset.from_generator(
        generator,
        output_signature={
            'input_ids': tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int32),
            'attention_mask': tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int32)
        })

train_dataset = text_dataset(tokenizer, adapt_train_file)
eval_dataset = text_dataset(tokenizer, adapt_test_file)
for example in train_dataset.take(3):
    inputs = example['input_ids'].numpy()[0]
    print(f"Input IDs (len: {len(inputs)}):", inputs)
    print("Attention Mask:", example['attention_mask'].numpy())

The MLM needs chunked sequences which are comprised of the whole corpus concatenated. Chunks are sized on the given hardware or the max dictionary the  tokenizer has - in general 128 is a good number for modern hardward.

As we concatenate, we add a lable column on which the MLM can use as a ground truth

In [79]:
def chunked_text_dataset(tokenizer, file_path, chunk_len=MAX_LEN):
    all_tokens = []
    all_attention_masks = []
    all_special_tokens_masks = []  # To store special tokens masks

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            tokens = tokenizer(line.strip(),
                               truncation=True,
                               add_special_tokens=True,
                               return_special_tokens_mask=True,
                               padding=False)
            all_tokens.extend(tokens['input_ids'])
            all_attention_masks.extend(tokens['attention_mask'])
            all_special_tokens_masks.extend(tokens['special_tokens_mask'])  # Capture special token masks

    def generator():
        num_chunks = len(all_tokens) // chunk_len
        for i in range(num_chunks):
            start = i * chunk_len
            end = start + chunk_len
            input_ids_chunk = all_tokens[start:end]
            attention_mask_chunk = all_attention_masks[start:end]
            special_tokens_mask_chunk = all_special_tokens_masks[start:end]  # Special tokens for the chunk

            yield {
                'input_ids': tf.convert_to_tensor(input_ids_chunk, dtype=tf.int32),
                'attention_mask': tf.convert_to_tensor(attention_mask_chunk, dtype=tf.int32),
                'labels': tf.convert_to_tensor(input_ids_chunk, dtype=tf.int32),
                'special_tokens_mask': tf.convert_to_tensor(special_tokens_mask_chunk, dtype=tf.int32)  # Include this in the output
            }

    return tf.data.Dataset.from_generator(
        generator,
        output_signature={
            'input_ids': tf.TensorSpec(shape=(chunk_len,), dtype=tf.int32),
            'attention_mask': tf.TensorSpec(shape=(chunk_len,), dtype=tf.int32),
            'labels': tf.TensorSpec(shape=(chunk_len,), dtype=tf.int32),
            'special_tokens_mask': tf.TensorSpec(shape=(chunk_len,), dtype=tf.int32)  # Ensure this is also defined
        })


train_dataset = chunked_text_dataset(tokenizer, adapt_train_file)
eval_dataset = chunked_text_dataset(tokenizer, adapt_test_file)
for example in train_dataset.take(1):
    inputs = example['input_ids'].numpy()
    print(f"Input IDs (len: {len(inputs)}):", inputs)
    print("Decoded IDs:", tokenizer.decode(inputs))

Input IDs (len: 256): [  101  1327  1110  1126   138   118   139  4623   136  1760   138   118
   139  3496  1110   170  4091  3496  1687  1118   170  1597  2337  1111
  1103  3007  1104  8715 25596  3327  7538   119  1760   138   118   139
  3496  1110   170  3496  1115 22646  1154  1160  1852  1103  1473  1104
  1103  1148 20846   119  1135  1110  1824  1114  1296 20846  6544  6661
  1107  1103  3496  1105 10505  1112  1103  1509 26181 11470 27989  3113
  1251  6736  1825  2589  1103  1168 20846   119  1109  3496  3370  1157
  1271  1121  1103  1864  1115  1122 22141  1154  1160  1852  1103  1148
 20846   112   188  1473  3496   138  1137  1103 17544   112   188  3496
   117  1105  3496   139  1137  1103  1260 19482  2227   112   188  3496
   119  1731  1126   138   118   139  4623  5853  1258  1103  1473  1104
  1126  2510   117  1117  3327  1110  3641  1174  3777  1196  1117 26181
 11470 27989  5927  3531  1122   119  1370  1859   117   170  1597  2337
  1144  1126  3327  3869   10

For MLMs huggingface offers a specific data collector that does the masking. Although we can mask random tokens using the `[MASK]` special token at random intervals, as long as there is a labals column with the ground truth.

In [83]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="tf")
batched_dataset = train_dataset.batch(2).take(1)

for batch in batched_dataset:
    batch = {k: v.numpy() for k, v in batch.items()}
    examples = [{k: v[i] for k, v in batch.items()} for i in range(batch['input_ids'].shape[0])]
    collated_batch = data_collator(examples)
    for input_ids, labels in zip(collated_batch['input_ids'], collated_batch['labels']):
        masked_text = tokenizer.decode(input_ids)
        original_text = tokenizer.decode([label if label != -100 else input_id for label, input_id in zip(labels, input_ids)])

        print(f"Masked: {masked_text}")
        print(f"Original: {original_text}")

Masked: [CLS] What is an A - B Trust? An [MASK] - B trust is a joint trust created by [MASK] married couple for the purpose of minimizing estate taxes. An A - B trust is [MASK] trust that divides into two upon the [MASK] of the first spouse. It is formed [MASK] each spouse [MASK] assets in the trust and naming as [MASK] final beneficiary any suitable person except [MASK] other spouse. The trust [MASK] its name from the fact that it splits into two upon the [MASK] spouse's death [MASK] A or [MASK] survivor [MASK] s [MASK], andspace B or the decedent's trust. How an [MASK] [MASK] B Trust Works After the death of an individual, his estate is taxed heavily before his beneficiaries receive it. For example, a married couple has an estate worth $ 3 [MASK] by the time [MASK] of the spouses die. The surviving spouse is left with $ 3 million which is not taxed due to the unlimited marital deduction for [MASK] flowing from [MASK] [MASK] spouse to a surviving [MASK]. However, if the other spouse d

In [81]:

training_args = TFTrainingArguments(
    output_dir='./models',
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    evaluation_strategy="steps",
    eval_steps=500,
    warmup_steps=500,
    learning_rate=LEARN_RATE,
    do_train=True,
    do_eval=True,
    logging_dir='./logs',
    logging_steps=500,
    gradient_accumulation_steps=2,
    max_steps=10_000,
    load_best_model_at_end=True
)
trainer = TFTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

trainer.train()
eval_results = trainer.evaluate()

eval_results

NameError: name 'TFTrainingArguments' is not defined