# Model Fine Tuning

# Notebook Environment

For a unified research environment, enable the flags below:

In [1]:
UPGRADE_PY = False
INSTALL_DEPS = False
if INSTALL_DEPS:
  # !pip install -q tensorboard==2.15.2
  # !pip install -q tensorflow[and-cuda]==2.15.1
  # !pip install -q tensorflow==2.15.0
  # !pip install -q tensorflow-io-gcs-filesystem==0.36.0
  # !pip install -q tensorflow-text==2.15.0
  # !pip install -q tf_keras==2.15.1
  # !pip install -q tokenizers==0.15.2
  # !pip install -q torch==2.2.0+cpu
  # !pip install -q torch-xla==2.2.0+libtpu
  # !pip install -q torchdata==0.7.1
  !pip install -q transformers==4.38.2

if UPGRADE_PY:
    !mamba create -n py311 -y
    !source /opt/conda/bin/activate py312 && mamba install python=3.11 jupyter mamba -y

    !sudo rm /opt/conda/bin/python3
    !sudo ln -sf /opt/conda/envs/py312/bin/python3 /opt/conda/bin/python3
    !sudo rm /opt/conda/bin/python3.10
    !sudo ln -sf /opt/conda/envs/py312/bin/python3 /opt/conda/bin/python3.10
    !sudo rm /opt/conda/bin/python
    !sudo ln -sf /opt/conda/envs/py312/bin/python3 /opt/conda/bin/python

!python --version

Python 3.10.13


In [2]:
import os
import sys
import warnings
warnings.filterwarnings("ignore")

# Transformers cannot use keras3
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['TF_USE_LEGACY_KERAS'] = '1'
IN_KAGGLE = IN_COLAB = False
!export CUDA_LAUNCH_BLOCKING=1
!export XLA_FLAGS=--xla_cpu_verbose=0

try:
    # https://www.tensorflow.org/install/pip#windows-wsl2
    import google.colab
    from google.colab import drive
    drive.mount('/content/drive')
    DATA_PATH = "/content/drive/MyDrive/EDT dataset"
    MODEL_PATH = "/content/drive/MyDrive/models"
    IN_COLAB = True
    print('Colab!')
except:
    IN_COLAB = False
if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ and not IN_COLAB:
    print('Running in Kaggle...')
    for dirname, _, filenames in os.walk('/kaggle/input'):
        for filename in filenames:
            print(os.path.join(dirname, filename))
    MODEL_PATH = "./models"
    DATA_PATH = "/kaggle/input"
    IN_KAGGLE = True
    print('Kaggle!')
elif not IN_COLAB and not IN_KAGGLE:
    IN_KAGGLE = False
    MODEL_PATH = "./models"
    DATA_PATH = "./data"
    print('Normal!')

MODEL_BASE = "google-bert/bert-base-cased"

Running in Kaggle...
/kaggle/input/Event_detection/train.txt
/kaggle/input/Event_detection/dev.txt
/kaggle/input/Trading_benchmark/evaluate_news.json
/kaggle/input/Domain_adapation/train.txt
/kaggle/input/Domain_adapation/dev.txt
Kaggle!


# Accelerators Configuration

If you have a GPU, TPU or in one of the collaborative notebooks. Configure your setup below:

In [3]:
import numpy as np
import math
import shutil
import pandas as pd

from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras import mixed_precision

print(f'Tensorflow version: [{tf.__version__}]')

tf.get_logger().setLevel('INFO')

#tf.config.set_soft_device_placement(True)
#tf.config.experimental.enable_op_determinism()
#tf.random.set_seed(1)
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()

    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
except Exception as e:
    # Not an exception, just no TPUs available, GPU is fallback
    # https://www.tensorflow.org/guide/mixed_precision
    print(e)
    policy = mixed_precision.Policy('mixed_float16')
    mixed_precision.set_global_policy(policy)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if len(gpus) > 0:

        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, False)
            tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=12288)])
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            strategy = tf.distribute.MirroredStrategy()

            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
        except RuntimeError as e:
            print(e)
        finally:
            print("Running on", len(tf.config.list_physical_devices('GPU')), "GPU(s)")
    else:
        # CPU is final fallback
        strategy = tf.distribute.get_strategy()
        print("Running on CPU")

def is_tpu_strategy(strategy):
    return isinstance(strategy, tf.distribute.TPUStrategy)

print("Number of accelerators:", strategy.num_replicas_in_sync)
os.getcwd()

2024-05-04 21:41:32.684624: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-04 21:41:32.684720: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-04 21:41:32.828505: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Tensorflow version: [2.15.0]
Please provide a TPU Name to connect to.
1 Physical GPUs, 1 Logical GPUs
Running on 1 GPU(s)
Number of accelerators: 1


'/kaggle/working'

# Fine-Tuning with Masked Models

In [4]:
from transformers import BertTokenizerFast,TFBertForMaskedLM

# https://huggingface.co/transformers/v3.0.2/model_doc/bert.html#berttokenizerfast
tokenizer = BertTokenizerFast.from_pretrained(MODEL_BASE)
MASK = tokenizer.mask_token

masked_text = [f"Jim Cramer is consistently bullish when it comes to {MASK}. What this means in practicality is that Cramer routinely recommends buying stocks, and he rarely offers up a sell call. Analysis of his recommendations between 2016 and 2022 (via the data project Jim Cramer's Recommendations: A Six-Year Analysis) shows a 10.32% distribution of {MASK} recommendations alongside 61.27% buys, plus a smattering of positive or negative commentary without a formal buy or sell recommendation attached."]

inputs = tokenizer(masked_text, return_tensors="tf", padding=True, truncation=True)

model = TFBertForMaskedLM.from_pretrained(MODEL_BASE)
logits = model(**inputs).logits
mask_token_idxs = tf.where(inputs["input_ids"] == tokenizer.mask_token_id)
print(mask_token_idxs)
print(logits)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


tf.Tensor(
[[ 0 13]
 [ 0 81]], shape=(2, 2), dtype=int64)
tf.Tensor(
[[[ -7.363  -7.258  -7.37  ...  -6.348  -6.03   -6.34 ]
  [ -7.21   -7.266  -6.906 ...  -6.49   -5.54   -6.582]
  [-11.97  -11.59  -10.73  ...  -9.36   -8.42  -12.26 ]
  ...
  [ -5.293  -5.055  -5.56  ...  -5.207  -5.664  -4.508]
  [-10.17  -10.64  -10.67  ...  -9.55   -8.18  -10.38 ]
  [-10.1   -10.61  -10.6   ...  -9.55   -8.195 -10.32 ]]], shape=(1, 110, 28996), dtype=float16)


In [5]:
mask_logits = tf.gather_nd(logits, mask_token_idxs)
top_5 = tf.math.top_k(mask_logits, k=5)
[tokenizer.decode([idx]) for idx in top_5.indices.numpy().flatten()]
for i in range(5):
    new_text = masked_text[0]
    for j in range(2):
        token_idx = top_5.indices[j, i]
        top5_logits = top_5.values[j]

        proba = tf.nn.softmax(top5_logits)
        predicted_token = tokenizer.decode([token_idx])
        new_text = new_text.replace(MASK, f'[{predicted_token}:{proba[i].numpy()*100.:.01f}%]', 1)
    print(new_text)

Jim Cramer is consistently bullish when it comes to [buying:54.0%]. What this means in practicality is that Cramer routinely recommends buying stocks, and he rarely offers up a sell call. Analysis of his recommendations between 2016 and 2022 (via the data project Jim Cramer's Recommendations: A Six-Year Analysis) shows a 10.32% distribution of [his:56.5%] recommendations alongside 61.27% buys, plus a smattering of positive or negative commentary without a formal buy or sell recommendation attached.
Jim Cramer is consistently bullish when it comes to [sales:14.3%]. What this means in practicality is that Cramer routinely recommends buying stocks, and he rarely offers up a sell call. Analysis of his recommendations between 2016 and 2022 (via the data project Jim Cramer's Recommendations: A Six-Year Analysis) shows a 10.32% distribution of [sales:17.4%] recommendations alongside 61.27% buys, plus a smattering of positive or negative commentary without a formal buy or sell recommendation a

# Financial Conditioning

In [6]:
adapt_train_file = os.path.join(DATA_PATH, 'Domain_adapation/train.txt')
adapt_test_file = os.path.join(DATA_PATH, 'Domain_adapation/dev.txt')
def text_dataset(tokenizer, file_path):
    def generator():
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in tqdm(file, desc="text_dataset"):
                tokens = tokenizer(line.strip(),
                                   add_special_tokens=True,
                                   truncation=False,
                                   padding=False)
                yield {
                    'input_ids': tf.ragged.constant([tokens['input_ids']]),
                    'attention_mask': tf.ragged.constant([tokens['attention_mask']])
                }
    return tf.data.Dataset.from_generator(
        generator,
        output_signature={
            'input_ids': tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int32),
            'attention_mask': tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int32)
        })

train_dataset = text_dataset(tokenizer, adapt_train_file)
eval_dataset = text_dataset(tokenizer, adapt_test_file)

iterator = iter(eval_dataset.as_numpy_iterator())
example = next(iterator)
inputs = example['input_ids'][0]
print(f"Input IDs (len: {len(inputs)}):", inputs)
print("Attention Mask:", example['attention_mask'])

text_dataset: 0it [00:00, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (4034 > 512). Running this sequence through the model will result in indexing errors


Input IDs (len: 4034): [  101 10054 11522 ...   119  3254   102]
Attention Mask: [[1 1 1 ... 1 1 1]]


The MLM needs chunked sequences which are comprised of the whole corpus concatenated. Chunks are sized on the given hardware or the max dictionary the  tokenizer has - in general 128 is a good number for modern hardward.

As we concatenate, we add a lable column on which the MLM can use as a ground truth

In [7]:
def chunked_text_dataset(tokenizer, file_path, chunk_len=512):
    all_tokens = []
    all_attention_masks = []
    all_special_tokens_masks = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in tqdm(file, desc="Reading file lines", position=0, leave=True):
            tokens = tokenizer(line.strip(),
                               truncation=True,
                               add_special_tokens=True,
                               return_special_tokens_mask=True,
                               padding=False)
            all_tokens.extend(tokens['input_ids'])
            all_attention_masks.extend(tokens['attention_mask'])
            all_special_tokens_masks.extend(tokens['special_tokens_mask'])

    def generator():
        num_chunks = len(all_tokens) // chunk_len
        for i in tqdm(range(num_chunks), desc= "chunking...", position=0, leave=True):
            start = i * chunk_len
            end = start + chunk_len
            input_ids_chunk = all_tokens[start:end]
            attention_mask_chunk = all_attention_masks[start:end]
            special_tokens_mask_chunk = all_special_tokens_masks[start:end]
            yield {
                'input_ids': tf.convert_to_tensor(input_ids_chunk, dtype=tf.int32),
                'attention_mask': tf.convert_to_tensor(attention_mask_chunk, dtype=tf.int32),
                'labels': tf.convert_to_tensor(input_ids_chunk, dtype=tf.int32),
                'special_tokens_mask': tf.convert_to_tensor(special_tokens_mask_chunk, dtype=tf.int32)
            }

    return tf.data.Dataset.from_generator(
        generator,
        output_signature={
            'input_ids': tf.TensorSpec(shape=(chunk_len,), dtype=tf.int32),
            'attention_mask': tf.TensorSpec(shape=(chunk_len,), dtype=tf.int32),
            'labels': tf.TensorSpec(shape=(chunk_len,), dtype=tf.int32),
            'special_tokens_mask': tf.TensorSpec(shape=(chunk_len,), dtype=tf.int32)
        })


train_dataset = chunked_text_dataset(tokenizer, adapt_train_file)
iterator = iter(eval_dataset.as_numpy_iterator())
example = next(iterator)
inputs = example['input_ids'][0]
print(f"Input IDs (len: {len(inputs)}):", inputs)
print("Decoded IDs:", tokenizer.decode(inputs)[:50])

Reading file lines: 15463it [00:24, 624.74it/s]
text_dataset: 1it [00:24, 24.96s/it]
text_dataset: 0it [00:00, ?it/s]

Input IDs (len: 4034): [  101 10054 11522 ...   119  3254   102]
Decoded IDs: [CLS] Lancaster Colony Reports Fourth Quarter and 


For MLMs huggingface offers a specific data collector that does the masking. Although we can mask random tokens using the `[MASK]` special token at random intervals, as long as there is a labals column with the ground truth.

In [8]:
from transformers import DataCollatorForLanguageModeling, BertConfig

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="tf")
batched_dataset = train_dataset.batch(1).take(1)

batch = next(iter(eval_dataset.as_numpy_iterator()))
batch = {k: v for k, v in batch.items()}
examples = [{k: v[i] for k, v in batch.items()} for i in range(batch['input_ids'].shape[0])]
print(examples)
collated_batch = data_collator(examples)
for input_ids, labels in tqdm(zip(collated_batch['input_ids'], collated_batch['labels']), desc="tokenizing batches"):
    masked_text = tokenizer.decode(input_ids)
    original_text = tokenizer.decode([label if label != -100 else input_id for label, input_id in zip(labels, input_ids)])

    print(f"Masked: {masked_text[:50]}")
    print(f"Labels: {labels[:50]}")
    print(f"Original: {original_text[:50]}")
collated_batch


text_dataset: 1it [00:00, 25.23it/s]

[{'input_ids': array([  101, 10054, 11522, ...,   119,  3254,   102], dtype=int32), 'attention_mask': array([1, 1, 1, ..., 1, 1, 1], dtype=int32)}]




tokenizing batches: 0it [00:00, ?it/s][A
tokenizing batches: 1it [00:03,  3.85s/it]

Masked: [CLS] Lancaster [MASK] Reports Fourth Quarter and 
Labels: [ -100  -100 11522  -100  -100  -100  -100  -100  -100  -100  2381  -100
  -100  -100  -100  -100  -100  -100  -100  -100  -100  -100  -100  -100
  -100  -100  -100  -100  -100  2249  -100  -100  -100  -100  -100  -100
  -100  -100  -100 11896  -100  -100  -100  -100  -100  -100  -100  -100
  -100  -100]
Original: [CLS] Lancaster Colony Reports Fourth Quarter and 





{'input_ids': <tf.Tensor: shape=(1, 4034), dtype=int64, numpy=array([[  101, 10054,   103, ...,   119,  3254,   102]])>, 'attention_mask': <tf.Tensor: shape=(1, 4034), dtype=int32, numpy=array([[1, 1, 1, ..., 1, 1, 1]], dtype=int32)>, 'labels': <tf.Tensor: shape=(1, 4034), dtype=int64, numpy=array([[ -100,  -100, 11522, ...,  -100,  -100,  -100]])>}

Add everything together

In [9]:
MAX_LEN = 512 # Default 256, MAX 512
def mlm_text_dataset(file_path, tokenizer, data_collator, chunk_len=MAX_LEN):
    all_tokens = []
    all_attention_masks = []
    all_special_tokens_masks = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in tqdm(file, position=0, leave=True, desc="Processing file..."):
            tokens = tokenizer(line.strip(),
                               truncation=True,
                               add_special_tokens=True,
                               return_special_tokens_mask=True,
                               padding=False)
            all_tokens.extend(tokens['input_ids'])
            all_attention_masks.extend(tokens['attention_mask'])
            all_special_tokens_masks.extend(tokens['special_tokens_mask'])
            

    num_chunks = len(all_tokens) // chunk_len
    tokens_chunks = []
    attention_mask_chunks = []
    label_chunks = []
    special_tokens_mask_chunk=[]
    for i in tqdm(range(num_chunks), position=0, leave=True, desc="Chunking..."):
        start = i * chunk_len
        end = start + chunk_len
        input_ids_chunk = all_tokens[start:end]
        attention_mask_chunk = all_attention_masks[start:end]
        special_tokens_mask_chunk = all_special_tokens_masks[start:end]

        masked_chunks = data_collator([{
                'input_ids': tf.convert_to_tensor(input_ids_chunk, dtype=tf.int32),
                'attention_mask': tf.convert_to_tensor(attention_mask_chunk, dtype=tf.int32),
                'special_tokens_mask': tf.convert_to_tensor(special_tokens_mask_chunk, dtype=tf.int32),}])
        tokens_chunks.extend(masked_chunks['input_ids'])
        label_chunks.extend(masked_chunks['labels'])
        attention_mask_chunks.extend(masked_chunks['attention_mask'])
        special_tokens_mask_chunk.extend(special_tokens_mask_chunk)
    return tf.data.Dataset.from_tensor_slices((
        {
            'input_ids': tokens_chunks,
            'attention_mask': attention_mask_chunks,
            'labels': label_chunks,
            # 'special_tokens_mask': special_tokens_mask_chunk
        },
    ))

with strategy.scope():
    tokenizer = BertTokenizerFast.from_pretrained(MODEL_BASE)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="np")
    mlm_train_dataset = mlm_text_dataset(adapt_train_file, tokenizer, data_collator)
    mlm_test_dataset = mlm_text_dataset(adapt_test_file, tokenizer, data_collator)

iterex = iter(mlm_test_dataset.as_numpy_iterator())
next(iterex)

Processing file...: 15463it [00:24, 629.20it/s]
Chunking...: 100%|██████████| 7675/7675 [00:32<00:00, 233.06it/s]
Processing file...: 999it [00:02, 441.88it/s]
Chunking...: 100%|██████████| 491/491 [00:02<00:00, 238.60it/s]


({'input_ids': array([  101, 10054, 11522, 16098,  7652, 12664,  1105, 17355, 26996,
           103,  2381, 16005,   160,  9919, 12880,  2069, 23314, 23955,
          2036,   117,  3197,   117, 16892,   119,  1765,   117, 12795,
           120, 11629,  2249, 17540, 24952,   120,   118,   118,  7737,
           103,  3436,   113,   103,  1116,  1810,  4426,   131, 10722,
         15517,   114,  2052,  2103,  2686,  1111,  1103,  2223,  3861,
          1105, 12087,  1214,  2207,  1340,  1476,   103, 12795,   119,
           103, 13231,  1132,  1112,  3226,   131,  7652, 12664, 16005,
         23582,  5795,   103,  5799,   121,   119,   130,   110,  1106,
           109, 14116,   119,   130,  1550,  6055,   109,  2724,  1495,
           119,   128,  1550,  1314,  1214,   119, 16409,  1665,  7535,
          3408,  1155,  3813,  6547,  1106,   170,   103,  3880,  3311,
          3694,  1121,  1103,  1379,  1479,   117,  1857,  7626,  5416,
           152,  1306,   103, 18757,  4419,  1881, 

## BERT Conditioning

From the paper:

* Batch size: 16, 32
* Learning rate (Adam): 5e-5, 3e-5, 2e-5
* Number of epochs: 2, 3, 4

In [10]:
BATCH_SIZE = 16 * strategy.num_replicas_in_sync # Default 8
BUFFER_SIZE = 10000

def eval_mlm(model, batched_dataset):
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
    total_loss = 0.
    total_accuracy = 0.
    total_examples = 0.

    # TODO: convert this to a TF function for distributed strat.
    for batch in tqdm(batched_dataset, desc="eval_mlm", position=0, leave=True):
        for dataset_output in batch:
            input_ids = dataset_output['input_ids']
            attention_mask = dataset_output['attention_mask']
            labels = dataset_output['labels']

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            mask = (labels != -100)
            masked_logits = tf.boolean_mask(logits, mask)
            masked_labels = tf.boolean_mask(labels, mask)
            batch_loss = loss_fn(masked_labels, masked_logits)
            predictions = tf.argmax(masked_logits, axis=-1)
            batch_accuracy = tf.reduce_sum(tf.cast(tf.equal(predictions, masked_labels), dtype=tf.float32))
            print(batch_loss)

            total_loss += tf.cast(batch_loss,tf.float32)
            total_accuracy += batch_accuracy
            total_examples += tf.size(masked_labels, out_type=tf.float32)

    avg_loss = total_loss / total_examples
    avg_perplexity = tf.exp(avg_loss).numpy()
    avg_accuracy = total_accuracy / total_examples

    print(f"Average Cross-Entropy Loss: {avg_loss.numpy()}")
    print(f"Average Perplexity: {avg_perplexity}")
    print(f"Average Accuracy: {avg_accuracy.numpy()}")

config = BertConfig.from_pretrained(MODEL_BASE)
model = TFBertForMaskedLM.from_pretrained(MODEL_BASE, config=config)
test_dataset = mlm_test_dataset.shuffle(buffer_size=BUFFER_SIZE).batch(BATCH_SIZE).cache().prefetch(tf.data.experimental.AUTOTUNE)
eval_mlm(model, test_dataset)

All PyTorch model weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.
eval_mlm:   3%|▎         | 1/31 [00:00<00:15,  1.90it/s]

tf.Tensor(4236.0, shape=(), dtype=float16)


eval_mlm:   6%|▋         | 2/31 [00:00<00:13,  2.11it/s]

tf.Tensor(3956.0, shape=(), dtype=float16)


eval_mlm:  10%|▉         | 3/31 [00:01<00:12,  2.17it/s]

tf.Tensor(3936.0, shape=(), dtype=float16)


eval_mlm:  13%|█▎        | 4/31 [00:01<00:12,  2.20it/s]

tf.Tensor(3872.0, shape=(), dtype=float16)


eval_mlm:  16%|█▌        | 5/31 [00:02<00:11,  2.23it/s]

tf.Tensor(3858.0, shape=(), dtype=float16)


eval_mlm:  19%|█▉        | 6/31 [00:02<00:11,  2.25it/s]

tf.Tensor(4088.0, shape=(), dtype=float16)


eval_mlm:  23%|██▎       | 7/31 [00:03<00:10,  2.26it/s]

tf.Tensor(4364.0, shape=(), dtype=float16)


eval_mlm:  26%|██▌       | 8/31 [00:03<00:10,  2.27it/s]

tf.Tensor(4100.0, shape=(), dtype=float16)


eval_mlm:  29%|██▉       | 9/31 [00:04<00:09,  2.28it/s]

tf.Tensor(3846.0, shape=(), dtype=float16)


eval_mlm:  32%|███▏      | 10/31 [00:04<00:09,  2.28it/s]

tf.Tensor(4640.0, shape=(), dtype=float16)


eval_mlm:  35%|███▌      | 11/31 [00:04<00:08,  2.28it/s]

tf.Tensor(4002.0, shape=(), dtype=float16)


eval_mlm:  39%|███▊      | 12/31 [00:05<00:08,  2.28it/s]

tf.Tensor(4360.0, shape=(), dtype=float16)


eval_mlm:  42%|████▏     | 13/31 [00:05<00:07,  2.28it/s]

tf.Tensor(3908.0, shape=(), dtype=float16)


eval_mlm:  45%|████▌     | 14/31 [00:06<00:07,  2.29it/s]

tf.Tensor(4204.0, shape=(), dtype=float16)


eval_mlm:  48%|████▊     | 15/31 [00:06<00:06,  2.29it/s]

tf.Tensor(3888.0, shape=(), dtype=float16)


eval_mlm:  52%|█████▏    | 16/31 [00:07<00:06,  2.29it/s]

tf.Tensor(3908.0, shape=(), dtype=float16)


eval_mlm:  55%|█████▍    | 17/31 [00:07<00:06,  2.29it/s]

tf.Tensor(4010.0, shape=(), dtype=float16)


eval_mlm:  58%|█████▊    | 18/31 [00:07<00:05,  2.29it/s]

tf.Tensor(4484.0, shape=(), dtype=float16)


eval_mlm:  61%|██████▏   | 19/31 [00:08<00:05,  2.29it/s]

tf.Tensor(4244.0, shape=(), dtype=float16)


eval_mlm:  65%|██████▍   | 20/31 [00:08<00:04,  2.28it/s]

tf.Tensor(4296.0, shape=(), dtype=float16)


eval_mlm:  68%|██████▊   | 21/31 [00:09<00:04,  2.29it/s]

tf.Tensor(3924.0, shape=(), dtype=float16)


eval_mlm:  71%|███████   | 22/31 [00:09<00:03,  2.29it/s]

tf.Tensor(4264.0, shape=(), dtype=float16)


eval_mlm:  74%|███████▍  | 23/31 [00:10<00:03,  2.29it/s]

tf.Tensor(4172.0, shape=(), dtype=float16)


eval_mlm:  77%|███████▋  | 24/31 [00:10<00:03,  2.29it/s]

tf.Tensor(4210.0, shape=(), dtype=float16)


eval_mlm:  81%|████████  | 25/31 [00:11<00:02,  2.29it/s]

tf.Tensor(4336.0, shape=(), dtype=float16)


eval_mlm:  84%|████████▍ | 26/31 [00:11<00:02,  2.29it/s]

tf.Tensor(4300.0, shape=(), dtype=float16)


eval_mlm:  87%|████████▋ | 27/31 [00:11<00:01,  2.29it/s]

tf.Tensor(4236.0, shape=(), dtype=float16)


eval_mlm:  90%|█████████ | 28/31 [00:12<00:01,  2.29it/s]

tf.Tensor(4244.0, shape=(), dtype=float16)


eval_mlm:  94%|█████████▎| 29/31 [00:12<00:00,  2.29it/s]

tf.Tensor(3866.0, shape=(), dtype=float16)


eval_mlm:  97%|█████████▋| 30/31 [00:13<00:00,  2.29it/s]

tf.Tensor(4144.0, shape=(), dtype=float16)


eval_mlm: 100%|██████████| 31/31 [00:13<00:00,  2.29it/s]

tf.Tensor(2776.0, shape=(), dtype=float16)
Average Cross-Entropy Loss: 3.3830089569091797
Average Perplexity: 29.459278106689453
Average Accuracy: 0.48690009117126465





In [11]:
MAX_LEN = 512 # Default 256, MAX 512
LEARN_RATE=5e-5 # 5e-5
PATIENCE=10
EPOCHS=50

TOTAL_STEPS = 100000 
WARM_STEPS = 10000
INIT_LR = 1e-4
BETA_1 = 0.9
BETA_2 = 0.999
WEIGHT_DECAY = 0.01

In [12]:
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, TerminateOnNaN
from tensorflow.keras.optimizers import AdamW


with strategy.scope():
    # https://huggingface.co/transformers/v3.0.2/_modules/transformers/configuration_bert.html#BertConfig
    config = BertConfig.from_pretrained(MODEL_BASE)
    cond_model = TFBertForMaskedLM.from_pretrained(MODEL_BASE, config=config)
    
    # https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/TensorBoard
    tensorboard_callback = TensorBoard(log_dir=f"{MODEL_PATH}/logs",
                                        histogram_freq=2,
                                        embeddings_freq=2)
    # https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/EarlyStopping
    early_stopping = EarlyStopping(mode='min', patience=PATIENCE, start_from_epoch=1)
    #tf.debugging.enable_check_numerics() # - Assert if no Infs or NaNs go through. not for TPU!
    #tf.config.run_functions_eagerly(not is_tpu_strategy(strategy)) # - Easy debugging
    # https://www.tensorflow.org/api_docs/python/tf/keras/Model#fit
    train_dataset = (mlm_train_dataset.shuffle(buffer_size=BUFFER_SIZE)
                                    .batch(BATCH_SIZE)
                                    .cache()
                                    .prefetch(tf.data.experimental.AUTOTUNE))
    test_dataset = (mlm_test_dataset.shuffle(buffer_size=BUFFER_SIZE)
                                    .batch(BATCH_SIZE)
                                    .cache()
                                    .prefetch(tf.data.experimental.AUTOTUNE))
    cond_model.compile(optimizer=AdamW(learning_rate=LEARN_RATE))
    history = cond_model.fit(train_dataset,
                        epochs=EPOCHS,
                        callbacks=[early_stopping, TerminateOnNaN()],
                        verbose="auto",
                        validation_data=test_dataset)

All PyTorch model weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


Epoch 1/50
Cause: for/else statement not yet supported
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <gast.gast.Expr object at 0x7a62141ca350>


I0000 00:00:1714859119.852272      71 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50


In [13]:
import zipfile

# CAn get very big if run in kaggle
SAVE_ZIP = False
cond_model.save_pretrained(f"{MODEL_PATH}/model")
config.save_pretrained(f"{MODEL_PATH}/config")
tokenizer.save_pretrained(f"{MODEL_PATH}/tokenizer")

if SAVE_ZIP:
    def zip_models(directory, output_filename, compression_level = 9):
        with zipfile.ZipFile(output_filename, 'w', zipfile.ZIP_DEFLATED, compresslevel=compression_level) as zipf:
            for root, dirs, files in os.walk(directory):
                for file in files:
                    zipf.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), os.path.join(directory, '..')))    

    zip_models(MODEL_PATH, './cond_bert.zip')

# Entropy

Entropy is a measure that quantifies uncertainty or the inverse of probability of an event occurring;h igher the probability, lesser is the uncertainty. 


Hence, the goal of the language model is to minimize the entropy of generating a sequence of words that are similar to the training sequences. The formula for calculating Entropy is as given below where P(x) is the probability of the word x:

$$
H(X) = -\sum_{i=1}^n P(x_i) \log_b P(x_i)
$$

Where:
* H(X) is the entropy of the random variable X, which represents the different outcomes in the language model
* P(x_i) is the probability of occurrence of each outcome x_i
* n is the number of possible outcomes.
* logb is the logarithm base, e.g base 2 for binary entropy calculations.

Cross enthropy measures 2 distributions the true outcome distributions and the models. Using the equaltion above the second p(x_i) is replaced with the models distribution.

# Perplexity

Perplexity means the model is surprised to see new data. The lower the perplexity, the better the training is.

The formula for perplexity is the exponent of mean of log likelihood of all the words in an input sequence:

$$
\text{PPL}(X) = \exp\left(-\frac{1}{T} \sum_{i=1}^T \log p_{\theta}(x_i | x_{< i}) \right)
$$

In [14]:
import matplotlib.pyplot as plt

train_loss = history.history["loss"][-1]
try:
    train_perplexity = math.exp(train_loss)
except OverflowError:
    train_perplexity = math.inf
validation_loss = history.history["val_loss"][-1]
try:
    validation_perplexity = math.exp(validation_loss)
except OverflowError:
    validation_perplexity = math.inf
results_dict = {}
results_dict["train_loss"] = train_loss
results_dict["train_perplexity"] = train_perplexity
results_dict["eval_loss"] = validation_loss
results_dict["eval_perplexity"] = validation_perplexity


results_dict

{'train_loss': 0.16385002434253693,
 'train_perplexity': 1.1780376248761306,
 'eval_loss': 2.0024571418762207,
 'eval_perplexity': 7.407234382261664}

# Evaluate Conditioned

Base:

1. Average Cross-Entropy Loss: 3.3887569904327393
1. Average Perplexity: 29.629100799560547
1. Average Accuracy: 0.48579704761505127

Conditioned:

* Average Cross-Entropy Loss: 7.493394374847412
* Average Perplexity: 1796.1385498046875
* Average Accuracy: 0.02848563715815544

In [15]:
config = BertConfig.from_pretrained(f"./{MODEL_PATH}/config")
model = TFBertForMaskedLM.from_pretrained(f"./{MODEL_PATH}/model", config=config)
test_dataset = mlm_test_dataset.shuffle(buffer_size=10000).batch(BATCH_SIZE).cache().prefetch(tf.data.experimental.AUTOTUNE)

eval_mlm(model, test_dataset)

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at ././models/model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.
eval_mlm:   3%|▎         | 1/31 [00:00<00:13,  2.17it/s]

tf.Tensor(2718.0, shape=(), dtype=float16)


eval_mlm:   6%|▋         | 2/31 [00:00<00:12,  2.24it/s]

tf.Tensor(2380.0, shape=(), dtype=float16)


eval_mlm:  10%|▉         | 3/31 [00:01<00:12,  2.25it/s]

tf.Tensor(2314.0, shape=(), dtype=float16)


eval_mlm:  13%|█▎        | 4/31 [00:01<00:11,  2.26it/s]

tf.Tensor(2003.0, shape=(), dtype=float16)


eval_mlm:  16%|█▌        | 5/31 [00:02<00:11,  2.26it/s]

tf.Tensor(2222.0, shape=(), dtype=float16)


eval_mlm:  19%|█▉        | 6/31 [00:02<00:11,  2.26it/s]

tf.Tensor(2638.0, shape=(), dtype=float16)


eval_mlm:  23%|██▎       | 7/31 [00:03<00:10,  2.26it/s]

tf.Tensor(2130.0, shape=(), dtype=float16)


eval_mlm:  26%|██▌       | 8/31 [00:03<00:10,  2.26it/s]

tf.Tensor(2548.0, shape=(), dtype=float16)


eval_mlm:  29%|██▉       | 9/31 [00:03<00:09,  2.26it/s]

tf.Tensor(2522.0, shape=(), dtype=float16)


eval_mlm:  32%|███▏      | 10/31 [00:04<00:09,  2.26it/s]

tf.Tensor(2212.0, shape=(), dtype=float16)


eval_mlm:  35%|███▌      | 11/31 [00:04<00:08,  2.26it/s]

tf.Tensor(2558.0, shape=(), dtype=float16)


eval_mlm:  39%|███▊      | 12/31 [00:05<00:08,  2.26it/s]

tf.Tensor(2424.0, shape=(), dtype=float16)


eval_mlm:  42%|████▏     | 13/31 [00:05<00:07,  2.26it/s]

tf.Tensor(2638.0, shape=(), dtype=float16)


eval_mlm:  45%|████▌     | 14/31 [00:06<00:07,  2.26it/s]

tf.Tensor(2448.0, shape=(), dtype=float16)


eval_mlm:  48%|████▊     | 15/31 [00:06<00:07,  2.26it/s]

tf.Tensor(2672.0, shape=(), dtype=float16)


eval_mlm:  52%|█████▏    | 16/31 [00:07<00:06,  2.26it/s]

tf.Tensor(2472.0, shape=(), dtype=float16)


eval_mlm:  55%|█████▍    | 17/31 [00:07<00:06,  2.26it/s]

tf.Tensor(2514.0, shape=(), dtype=float16)


eval_mlm:  58%|█████▊    | 18/31 [00:07<00:05,  2.26it/s]

tf.Tensor(2410.0, shape=(), dtype=float16)


eval_mlm:  61%|██████▏   | 19/31 [00:08<00:05,  2.26it/s]

tf.Tensor(2488.0, shape=(), dtype=float16)


eval_mlm:  65%|██████▍   | 20/31 [00:08<00:04,  2.26it/s]

tf.Tensor(2408.0, shape=(), dtype=float16)


eval_mlm:  68%|██████▊   | 21/31 [00:09<00:04,  2.27it/s]

tf.Tensor(2496.0, shape=(), dtype=float16)


eval_mlm:  71%|███████   | 22/31 [00:09<00:03,  2.27it/s]

tf.Tensor(2836.0, shape=(), dtype=float16)


eval_mlm:  74%|███████▍  | 23/31 [00:10<00:03,  2.26it/s]

tf.Tensor(2144.0, shape=(), dtype=float16)


eval_mlm:  77%|███████▋  | 24/31 [00:10<00:03,  2.26it/s]

tf.Tensor(2472.0, shape=(), dtype=float16)


eval_mlm:  81%|████████  | 25/31 [00:11<00:02,  2.26it/s]

tf.Tensor(2192.0, shape=(), dtype=float16)


eval_mlm:  84%|████████▍ | 26/31 [00:11<00:02,  2.27it/s]

tf.Tensor(2592.0, shape=(), dtype=float16)


eval_mlm:  87%|████████▋ | 27/31 [00:11<00:01,  2.27it/s]

tf.Tensor(2322.0, shape=(), dtype=float16)


eval_mlm:  90%|█████████ | 28/31 [00:12<00:01,  2.26it/s]

tf.Tensor(2812.0, shape=(), dtype=float16)


eval_mlm:  94%|█████████▎| 29/31 [00:12<00:00,  2.26it/s]

tf.Tensor(2732.0, shape=(), dtype=float16)


eval_mlm:  97%|█████████▋| 30/31 [00:13<00:00,  2.27it/s]

tf.Tensor(2226.0, shape=(), dtype=float16)


eval_mlm: 100%|██████████| 31/31 [00:13<00:00,  2.28it/s]

tf.Tensor(1521.0, shape=(), dtype=float16)
Average Cross-Entropy Loss: 2.004753828048706
Average Perplexity: 7.424266338348389
Average Accuracy: 0.6953235864639282



