# Model Fine Tuning

# Notebook Environment

For a unified research environment, enable the flags below:

In [1]:
UPGRADE_PY = False
INSTALL_DEPS = False
if INSTALL_DEPS:
  # !pip install -q tensorboard==2.15.2
  # !pip install -q tensorflow[and-cuda]==2.15.1
  # !pip install -q tensorflow==2.15.0
  # !pip install -q tensorflow-io-gcs-filesystem==0.36.0
  # !pip install -q tensorflow-text==2.15.0
  # !pip install -q tf_keras==2.15.1
  # !pip install -q tokenizers==0.15.2
  # !pip install -q torch==2.2.0+cpu
  # !pip install -q torch-xla==2.2.0+libtpu
  # !pip install -q torchdata==0.7.1
  !pip install -q transformers==4.38.2

if UPGRADE_PY:
    !mamba create -n py311 -y
    !source /opt/conda/bin/activate py312 && mamba install python=3.11 jupyter mamba -y

    !sudo rm /opt/conda/bin/python3
    !sudo ln -sf /opt/conda/envs/py312/bin/python3 /opt/conda/bin/python3
    !sudo rm /opt/conda/bin/python3.10
    !sudo ln -sf /opt/conda/envs/py312/bin/python3 /opt/conda/bin/python3.10
    !sudo rm /opt/conda/bin/python
    !sudo ln -sf /opt/conda/envs/py312/bin/python3 /opt/conda/bin/python

!python --version

Python 3.10.13


In [2]:
import os
import sys
import warnings
warnings.filterwarnings("ignore")

# Transformers cannot use keras3
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['TF_USE_LEGACY_KERAS'] = '1'
IN_KAGGLE = IN_COLAB = False
!export CUDA_LAUNCH_BLOCKING=1
!export XLA_FLAGS=--xla_cpu_verbose=0

try:
  # https://www.tensorflow.org/install/pip#windows-wsl2
  import google.colab
  from google.colab import drive
  drive.mount('/content/drive')
  DATA_PATH = "/content/drive/MyDrive/EDT dataset"
  IN_COLAB = True
  print('Colab!')
except:
  IN_COLAB = False
if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ and not IN_COLAB:
    print('Running in Kaggle...')
    for dirname, _, filenames in os.walk('/kaggle/input'):
        for filename in filenames:
            print(os.path.join(dirname, filename))
    DATA_PATH = "/kaggle/input/uscorpactionnews"
    IN_KAGGLE = True
    print('Kaggle!')
elif not IN_COLAB and not IN_KAGGLE:
    IN_KAGGLE = False
    DATA_PATH = "./data/"
    print('Normal!')

MODEL_PATH = "google-bert/bert-base-cased"

Running in Kaggle...
/kaggle/input/uscorpactionnews/Event_detection/train.txt
/kaggle/input/uscorpactionnews/Event_detection/dev.txt
/kaggle/input/uscorpactionnews/Trading_benchmark/evaluate_news.json
/kaggle/input/uscorpactionnews/Domain_adapation/train.txt
/kaggle/input/uscorpactionnews/Domain_adapation/dev.txt
Kaggle!


# Accelerators Configuration

If you have a GPU, TPU or in one of the collaborative notebooks. Configure your setup below:

In [3]:
import numpy as np
import math
import shutil
import pandas as pd

from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras import mixed_precision

print(f'Tensorflow version: [{tf.__version__}]')

tf.get_logger().setLevel('INFO')

#tf.config.set_soft_device_placement(True)
#tf.config.experimental.enable_op_determinism()
#tf.random.set_seed(1)
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()

    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
except Exception as e:
    # Not an exception, just no TPUs available, GPU is fallback
    # https://www.tensorflow.org/guide/mixed_precision
    print(e)
    policy = mixed_precision.Policy('mixed_float16')
    mixed_precision.set_global_policy(policy)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if len(gpus) > 0:
        
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, False)
            tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=12288)])
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            strategy = tf.distribute.MirroredStrategy()

            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
        except RuntimeError as e:
            print(e)
        finally:
            print("Running on", len(tf.config.list_physical_devices('GPU')), "GPU(s)")
    else:
        # CPU is final fallback
        strategy = tf.distribute.get_strategy()
        print("Running on CPU")

def is_tpu_strategy(strategy):
    return isinstance(strategy, tf.distribute.TPUStrategy)

print("Number of accelerators:", strategy.num_replicas_in_sync)
os.getcwd()

2024-04-27 07:01:53.038320: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-27 07:01:53.038425: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-27 07:01:53.176568: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Tensorflow version: [2.15.0]
Please provide a TPU Name to connect to.
1 Physical GPUs, 1 Logical GPUs
Running on 1 GPU(s)
Number of accelerators: 1


'/kaggle/working'

# Fine-Tuning with Masked Models

In [4]:
from transformers import BertTokenizerFast,TFBertForMaskedLM

# https://huggingface.co/transformers/v3.0.2/model_doc/bert.html#berttokenizerfast
tokenizer = BertTokenizerFast.from_pretrained(MODEL_PATH)
MASK = tokenizer.mask_token

masked_text = [f"Jim Cramer is consistently bullish when it comes to {MASK}. What this means in practicality is that Cramer routinely recommends buying stocks, and he rarely offers up a sell call. Analysis of his recommendations between 2016 and 2022 (via the data project Jim Cramer's Recommendations: A Six-Year Analysis) shows a 10.32% distribution of {MASK} recommendations alongside 61.27% buys, plus a smattering of positive or negative commentary without a formal buy or sell recommendation attached."]

inputs = tokenizer(masked_text, return_tensors="tf", padding=True, truncation=True)

model = TFBertForMaskedLM.from_pretrained(MODEL_PATH)
logits = model(**inputs).logits
mask_token_idxs = tf.where(inputs["input_ids"] == tokenizer.mask_token_id)
print(mask_token_idxs)
print(logits)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


tf.Tensor(
[[ 0 13]
 [ 0 81]], shape=(2, 2), dtype=int64)
tf.Tensor(
[[[ -7.363  -7.258  -7.37  ...  -6.348  -6.03   -6.34 ]
  [ -7.21   -7.266  -6.906 ...  -6.49   -5.54   -6.582]
  [-11.97  -11.59  -10.73  ...  -9.36   -8.42  -12.26 ]
  ...
  [ -5.293  -5.055  -5.56  ...  -5.207  -5.664  -4.508]
  [-10.17  -10.64  -10.67  ...  -9.55   -8.18  -10.38 ]
  [-10.1   -10.61  -10.6   ...  -9.55   -8.195 -10.32 ]]], shape=(1, 110, 28996), dtype=float16)


In [5]:
mask_logits = tf.gather_nd(logits, mask_token_idxs)
top_5 = tf.math.top_k(mask_logits, k=5)
[tokenizer.decode([idx]) for idx in top_5.indices.numpy().flatten()]
for i in range(5):
    new_text = masked_text[0]
    for j in range(2):
        token_idx = top_5.indices[j, i]
        top5_logits = top_5.values[j]

        proba = tf.nn.softmax(top5_logits)
        predicted_token = tokenizer.decode([token_idx])
        new_text = new_text.replace(MASK, f'[{predicted_token}:{proba[i].numpy()*100.:.01f}%]', 1)
    print(new_text)

Jim Cramer is consistently bullish when it comes to [buying:54.0%]. What this means in practicality is that Cramer routinely recommends buying stocks, and he rarely offers up a sell call. Analysis of his recommendations between 2016 and 2022 (via the data project Jim Cramer's Recommendations: A Six-Year Analysis) shows a 10.32% distribution of [his:56.5%] recommendations alongside 61.27% buys, plus a smattering of positive or negative commentary without a formal buy or sell recommendation attached.
Jim Cramer is consistently bullish when it comes to [sales:14.3%]. What this means in practicality is that Cramer routinely recommends buying stocks, and he rarely offers up a sell call. Analysis of his recommendations between 2016 and 2022 (via the data project Jim Cramer's Recommendations: A Six-Year Analysis) shows a 10.32% distribution of [sales:17.4%] recommendations alongside 61.27% buys, plus a smattering of positive or negative commentary without a formal buy or sell recommendation a

# Financial Conditioning

In [6]:
from tqdm import tqdm

adapt_train_file = os.path.join(DATA_PATH, 'Domain_adapation/train.txt')
adapt_test_file = os.path.join(DATA_PATH, 'Domain_adapation/dev.txt')
def text_dataset(tokenizer, file_path):
    def generator():
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in tqdm(file, desc="text_dataset"):
                tokens = tokenizer(line.strip(),
                                   add_special_tokens=True,
                                   truncation=False,
                                   padding=False)
                yield {
                    'input_ids': tf.ragged.constant([tokens['input_ids']]),
                    'attention_mask': tf.ragged.constant([tokens['attention_mask']])
                }

    return tf.data.Dataset.from_generator(
        generator,
        output_signature={
            'input_ids': tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int32),
            'attention_mask': tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int32)
        })

train_dataset = text_dataset(tokenizer, adapt_train_file)
eval_dataset = text_dataset(tokenizer, adapt_test_file)
for example in train_dataset.take(3):
    inputs = example['input_ids'].numpy()[0]
    print(f"Input IDs (len: {len(inputs)}):", inputs)
    print("Attention Mask:", example['attention_mask'].numpy())

text_dataset: 0it [00:00, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1056 > 512). Running this sequence through the model will result in indexing errors
text_dataset: 2it [00:00, 62.78it/s]

Input IDs (len: 1056): [ 101 1327 1110 ... 5927  119  102]
Attention Mask: [[1 1 1 ... 1 1 1]]
Input IDs (len: 2): [101 102]
Attention Mask: [[1 1]]
Input IDs (len: 792): [  101  1327  2372  1975   138   118   156 22705  1116   136  1975   138
   118  6117  1132  1103  4482  6117  1104  8684  1975   118  1359  2557
  1115  2597  1113  1103  1160  1922  4482 17755   117  1103  7962  9924
  7855   113  6663  2036   114  1105  1103 26197 21821  9924  7855   113
   156  5301 12649   114   119 14630   117  1975   138   118  6117  1127
  1178  1907  1111  4779  1118  8684  4037  1496  1106  1975   112   188
  9118  1113  2880  5151   119  1438   117  1290  1581   117  8247  2880
  4300  1138  1151  1682  1106  4779  1292  6117  1194  1103 10862  4201
 11127  1348  1130  5710  2772   113   154 17675  2240   114  1449   119
 14633  1107  1617   117  1103   154 17675  2240  1788  3643  9467  6825
  1835  9660  1106  4417  1105  4582  1113  8684  1975   112   188  4482
 17755   119   138   118  




The MLM needs chunked sequences which are comprised of the whole corpus concatenated. Chunks are sized on the given hardware or the max dictionary the  tokenizer has - in general 128 is a good number for modern hardward.

As we concatenate, we add a lable column on which the MLM can use as a ground truth

In [8]:
def chunked_text_dataset(tokenizer, file_path, chunk_len=512):
    all_tokens = []
    all_attention_masks = []
    all_special_tokens_masks = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in tqdm(file, desc="Reading file lines"):
            tokens = tokenizer(line.strip(),
                               truncation=True,
                               add_special_tokens=True,
                               return_special_tokens_mask=True,
                               padding=False)
            all_tokens.extend(tokens['input_ids'])
            all_attention_masks.extend(tokens['attention_mask'])
            all_special_tokens_masks.extend(tokens['special_tokens_mask'])

    def generator():
        num_chunks = len(all_tokens) // chunk_len
        for i in tqdm(range(num_chunks), "chunking..."):
            start = i * chunk_len
            end = start + chunk_len
            input_ids_chunk = all_tokens[start:end]
            attention_mask_chunk = all_attention_masks[start:end]
            special_tokens_mask_chunk = all_special_tokens_masks[start:end]

            yield {
                'input_ids': tf.convert_to_tensor(input_ids_chunk, dtype=tf.int32),
                'attention_mask': tf.convert_to_tensor(attention_mask_chunk, dtype=tf.int32),
                'labels': tf.convert_to_tensor(input_ids_chunk, dtype=tf.int32),
                'special_tokens_mask': tf.convert_to_tensor(special_tokens_mask_chunk, dtype=tf.int32)
            }

    return tf.data.Dataset.from_generator(
        generator,
        output_signature={
            'input_ids': tf.TensorSpec(shape=(chunk_len,), dtype=tf.int32),
            'attention_mask': tf.TensorSpec(shape=(chunk_len,), dtype=tf.int32),
            'labels': tf.TensorSpec(shape=(chunk_len,), dtype=tf.int32),
            'special_tokens_mask': tf.TensorSpec(shape=(chunk_len,), dtype=tf.int32)
        })


train_dataset = chunked_text_dataset(tokenizer, adapt_train_file)
for example in train_dataset.take(1):
    inputs = example['input_ids'].numpy()
    print(f"Input IDs (len: {len(inputs)}):", inputs)
    print("Decoded IDs:", tokenizer.decode(inputs))

Reading file lines: 15463it [00:25, 599.54it/s]
chunking...:   0%|          | 0/7675 [00:00<?, ?it/s]

Input IDs (len: 512): [  101  1327  1110  1126   138   118   139  4623   136  1760   138   118
   139  3496  1110   170  4091  3496  1687  1118   170  1597  2337  1111
  1103  3007  1104  8715 25596  3327  7538   119  1760   138   118   139
  3496  1110   170  3496  1115 22646  1154  1160  1852  1103  1473  1104
  1103  1148 20846   119  1135  1110  1824  1114  1296 20846  6544  6661
  1107  1103  3496  1105 10505  1112  1103  1509 26181 11470 27989  3113
  1251  6736  1825  2589  1103  1168 20846   119  1109  3496  3370  1157
  1271  1121  1103  1864  1115  1122 22141  1154  1160  1852  1103  1148
 20846   112   188  1473  3496   138  1137  1103 17544   112   188  3496
   117  1105  3496   139  1137  1103  1260 19482  2227   112   188  3496
   119  1731  1126   138   118   139  4623  5853  1258  1103  1473  1104
  1126  2510   117  1117  3327  1110  3641  1174  3777  1196  1117 26181
 11470 27989  5927  3531  1122   119  1370  1859   117   170  1597  2337
  1144  1126  3327  3869   10




For MLMs huggingface offers a specific data collector that does the masking. Although we can mask random tokens using the `[MASK]` special token at random intervals, as long as there is a labals column with the ground truth.

In [9]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="tf")
batched_dataset = train_dataset.batch(1).take(1)

for batch in tqdm(batched_dataset, desc="batched_dataset"):
    batch = {k: v.numpy() for k, v in batch.items()}
    examples = [{k: v[i] for k, v in batch.items()} for i in range(batch['input_ids'].shape[0])]
    print(examples)
    collated_batch = data_collator(examples)
    for input_ids, labels in tqdm(zip(collated_batch['input_ids'], collated_batch['labels']), desc="tokenizing batches"):
        masked_text = tokenizer.decode(input_ids)
        original_text = tokenizer.decode([label if label != -100 else input_id for label, input_id in zip(labels, input_ids)])

        print(f"Masked: {masked_text}")
        print(f"Original: {original_text}")

    logits = model(**collated_batch)
    print(f"logits: {logits}")

batched_dataset: 0it [00:00, ?it/s]
chunking...:   0%|          | 0/7675 [00:00<?, ?it/s][A


[{'input_ids': array([  101,  1327,  1110,  1126,   138,   118,   139,  4623,   136,
        1760,   138,   118,   139,  3496,  1110,   170,  4091,  3496,
        1687,  1118,   170,  1597,  2337,  1111,  1103,  3007,  1104,
        8715, 25596,  3327,  7538,   119,  1760,   138,   118,   139,
        3496,  1110,   170,  3496,  1115, 22646,  1154,  1160,  1852,
        1103,  1473,  1104,  1103,  1148, 20846,   119,  1135,  1110,
        1824,  1114,  1296, 20846,  6544,  6661,  1107,  1103,  3496,
        1105, 10505,  1112,  1103,  1509, 26181, 11470, 27989,  3113,
        1251,  6736,  1825,  2589,  1103,  1168, 20846,   119,  1109,
        3496,  3370,  1157,  1271,  1121,  1103,  1864,  1115,  1122,
       22141,  1154,  1160,  1852,  1103,  1148, 20846,   112,   188,
        1473,  3496,   138,  1137,  1103, 17544,   112,   188,  3496,
         117,  1105,  3496,   139,  1137,  1103,  1260, 19482,  2227,
         112,   188,  3496,   119,  1731,  1126,   138,   118,   139,
     


tokenizing batches: 0it [00:00, ?it/s][A
tokenizing batches: 1it [00:00,  2.15it/s][A

Masked: [CLS] What is an A - B Trust? An A - B trust is [MASK] [MASK] trust created by a married couple for the purpose of minimizing estate taxes [MASK] An A - B trust is a trust that divides into two upon [MASK] death of the first spouse. It is formed with each spouse placing assets in the trust and naming as the final benefici [MASK] any [MASK] person except the other spouse. The trust gets its name from the fact that it splits into two upon the first spouse's death trust A or the survivor's trust [MASK] and trust B or the de [MASK]nt's trust. How an A - B Trust Works After the [MASK] of [MASK] individual, his estate is taxed [MASK] before his benefici [MASK] receive [MASK]. For example, a married [MASK] has an estate worth $ 3 million by the time one of the spouses die. The surviving spouse is left with $ 3 million which is not taxed due [MASK] the unlimited marital deduction for assets flowing from a deceased [MASK] to a surviving [MASK] [MASK] However [MASK] if the other spouse d


batched_dataset: 1it [00:02,  2.29s/it]

logits: TFMaskedLMOutput(loss=<tf.Tensor: shape=(1,), dtype=float16, numpy=array([2.033], dtype=float16)>, logits=<tf.Tensor: shape=(1, 512, 28996), dtype=float16, numpy=
array([[[ -7.277,  -7.207,  -7.344, ...,  -6.164,  -5.977,  -6.35 ],
        [ -6.574,  -6.71 ,  -6.39 , ...,  -5.64 ,  -5.062,  -5.68 ],
        [-12.875, -13.   , -11.66 , ...,  -7.508,  -8.61 ,  -8.805],
        ...,
        [ -6.844,  -6.574,  -6.855, ...,  -5.504,  -6.625,  -6.17 ],
        [ -9.03 ,  -9.3  ,  -9.49 , ...,  -7.28 ,  -6.297,  -7.508],
        [-14.33 , -14.75 , -14.6  , ..., -12.84 , -12.984, -13.07 ]]],
      dtype=float16)>, hidden_states=None, attentions=None)





Add everything together

In [11]:
MAX_LEN = 512 # Default 256, MAX 512
def mlm_text_dataset(file_path, tokenizer, data_collator, chunk_len=MAX_LEN):
    all_tokens = []
    all_attention_masks = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in tqdm(file, desc="Processing file..."):
            tokens = tokenizer(line.strip(),
                               truncation=True,
                               add_special_tokens=True,
                               return_special_tokens_mask=True,
                               padding=False)
            all_tokens.extend(tokens['input_ids'])
            all_attention_masks.extend(tokens['attention_mask'])

    num_chunks = len(all_tokens) // chunk_len
    tokens_chunks = []
    attention_mask_chunks = []
    label_chunks = []
    for i in tqdm(range(num_chunks), desc="Chunking..."):
        start = i * chunk_len
        end = start + chunk_len
        input_ids_chunk = all_tokens[start:end]
        attention_mask_chunk = all_attention_masks[start:end]

        masked_chunks = data_collator([{
                'input_ids': tf.convert_to_tensor(input_ids_chunk, dtype=tf.int32),
                'attention_mask': tf.convert_to_tensor(attention_mask_chunk, dtype=tf.int32)}])
        tokens_chunks.extend(masked_chunks['input_ids'])
        label_chunks.extend(masked_chunks['labels'])
        attention_mask_chunks.extend(masked_chunks['attention_mask'])
    return tf.data.Dataset.from_tensor_slices((
        {
            'input_ids': tokens_chunks,
            'attention_mask': attention_mask_chunks,
            'labels': label_chunks
        },
    ))

with strategy.scope():
    tokenizer = BertTokenizerFast.from_pretrained(MODEL_PATH)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="np")
    mlm_train_dataset = mlm_text_dataset(adapt_train_file, tokenizer, data_collator)
    mlm_test_dataset = mlm_text_dataset(adapt_test_file, tokenizer, data_collator)

for example in train_dataset.take(1):
    inputs = example['input_ids']
    print(f"Input IDs (len: {len(inputs)}):", inputs)
    print("Decoded IDs:", tokenizer.decode(inputs))

Processing file...: 15463it [00:25, 613.91it/s]
Chunking...: 100%|██████████| 7675/7675 [00:34<00:00, 219.50it/s]
Processing file...: 999it [00:02, 416.03it/s]
Chunking...: 100%|██████████| 491/491 [00:02<00:00, 218.22it/s]
chunking...:   0%|          | 0/7675 [00:00<?, ?it/s]

Input IDs (len: 512): tf.Tensor(
[  101  1327  1110  1126   138   118   139  4623   136  1760   138   118
   139  3496  1110   170  4091  3496  1687  1118   170  1597  2337  1111
  1103  3007  1104  8715 25596  3327  7538   119  1760   138   118   139
  3496  1110   170  3496  1115 22646  1154  1160  1852  1103  1473  1104
  1103  1148 20846   119  1135  1110  1824  1114  1296 20846  6544  6661
  1107  1103  3496  1105 10505  1112  1103  1509 26181 11470 27989  3113
  1251  6736  1825  2589  1103  1168 20846   119  1109  3496  3370  1157
  1271  1121  1103  1864  1115  1122 22141  1154  1160  1852  1103  1148
 20846   112   188  1473  3496   138  1137  1103 17544   112   188  3496
   117  1105  3496   139  1137  1103  1260 19482  2227   112   188  3496
   119  1731  1126   138   118   139  4623  5853  1258  1103  1473  1104
  1126  2510   117  1117  3327  1110  3641  1174  3777  1196  1117 26181
 11470 27989  5927  3531  1122   119  1370  1859   117   170  1597  2337
  1144  1126  3327




In [12]:
MAX_LEN = 512 # Default 256, MAX 512
LEARN_RATE=5e-5 # 5e-5
LR_FACTOR=0.1
LR_MINDELTA=1e-4
EPOCHS=30
PATIENCE=10
BATCH_SIZE = 8 * strategy.num_replicas_in_sync # Default 8

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import AdamW
from transformers import BertConfig

with strategy.scope():
    config = BertConfig.from_pretrained(MODEL_PATH)
    model = TFBertForMaskedLM.from_pretrained(MODEL_PATH, config=config)

    # https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/EarlyStopping
    early_stopping = EarlyStopping(mode='min', patience=PATIENCE, start_from_epoch=1)
    lr_reducer = ReduceLROnPlateau(factor=LR_FACTOR,
                                    patience=0,
                                    min_delta=LR_MINDELTA,
                                    min_lr=LEARN_RATE/10.)
    #tf.debugging.enable_check_numerics() # - Assert if no Infs or NaNs go through. not for TPU!
    #tf.config.run_functions_eagerly(not is_tpu_strategy(strategy)) # - Easy debugging
    # https://www.tensorflow.org/api_docs/python/tf/keras/Model#fit
    train_dataset = (mlm_train_dataset.shuffle(buffer_size=10000)
                                    .batch(BATCH_SIZE)
                                    .cache()
                                    .prefetch(tf.data.experimental.AUTOTUNE))
    test_dataset = (mlm_test_dataset.shuffle(buffer_size=10000)
                                    .batch(BATCH_SIZE)
                                    .cache()
                                    .prefetch(tf.data.experimental.AUTOTUNE))
    model.compile(optimizer=AdamW(learning_rate=LEARN_RATE))
    history = model.fit(train_dataset,
                        epochs=EPOCHS,
                        callbacks=[lr_reducer, early_stopping],
                        verbose="auto",
                        validation_data=test_dataset)

model.save_pretrained("./models")

All PyTorch model weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


Epoch 1/30
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <gast.gast.Expr object at 0x7d9e5d6419f0>


I0000 00:00:1714201684.624054     147 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


212/960 [=====>........................] - ETA: 8:11 - loss: 2.9002

In [None]:
train_loss = history.history["loss"][-1]
try:
    train_perplexity = math.exp(train_loss)
except OverflowError:
    train_perplexity = math.inf
validation_loss = history.history["val_loss"][-1]
try:
    validation_perplexity = math.exp(validation_loss)
except OverflowError:
    validation_perplexity = math.inf
results_dict = {}
results_dict["train_loss"] = train_loss
results_dict["train_perplexity"] = train_perplexity
results_dict["eval_loss"] = validation_loss
results_dict["eval_perplexity"] = validation_perplexity

results_dict