# BERT Financial Conditioning

# Notebook Environment

In [None]:
UPGRADE_PY = False
INSTALL_DEPS = False
if INSTALL_DEPS:
  # !pip install -q tensorboard==2.15.2
  # !pip install -q tensorflow[and-cuda]==2.15.1
  # !pip install -q tensorflow==2.15.0
  # !pip install -q tensorflow-io-gcs-filesystem==0.36.0
  # !pip install -q tensorflow-text==2.15.0
  # !pip install -q tf_keras==2.15.1
  # !pip install -q tokenizers==0.15.2
  # !pip install -q torch==2.2.0+cpu
  # !pip install -q torch-xla==2.2.0+libtpu
  # !pip install -q torchdata==0.7.1
  !pip install -q transformers==4.38.2

if UPGRADE_PY:
    !mamba create -n py311 -y
    !source /opt/conda/bin/activate py312 && mamba install python=3.11 jupyter mamba -y

    !sudo rm /opt/conda/bin/python3
    !sudo ln -sf /opt/conda/envs/py312/bin/python3 /opt/conda/bin/python3
    !sudo rm /opt/conda/bin/python3.10
    !sudo ln -sf /opt/conda/envs/py312/bin/python3 /opt/conda/bin/python3.10
    !sudo rm /opt/conda/bin/python
    !sudo ln -sf /opt/conda/envs/py312/bin/python3 /opt/conda/bin/python

!python --version

In [None]:
import os
import sys
import warnings
warnings.filterwarnings("ignore")

# Transformers cannot use keras3
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['TF_USE_LEGACY_KERAS'] = '1'
IN_KAGGLE = IN_COLAB = False
!export CUDA_LAUNCH_BLOCKING=1
!export XLA_FLAGS=--xla_cpu_verbose=0

try:
    # https://www.tensorflow.org/install/pip#windows-wsl2
    import google.colab
    from google.colab import drive
    drive.mount('/content/drive')
    DATA_PATH = "/content/drive/MyDrive/EDT dataset"
    MODEL_PATH = "/content/drive/MyDrive/models"
    IN_COLAB = True
    print('Colab!')
except:
    IN_COLAB = False
if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ and not IN_COLAB:
    print('Running in Kaggle...')
    for dirname, _, filenames in os.walk('/kaggle/input'):
        for filename in filenames:
            print(os.path.join(dirname, filename))
    MODEL_PATH = "./models"
    DATA_PATH = "/kaggle/input"
    IN_KAGGLE = True
    print('Kaggle!')
elif not IN_COLAB and not IN_KAGGLE:
    IN_KAGGLE = False
    MODEL_PATH = "./models"
    DATA_PATH = "./data"
    print('Normal!')

MODEL_CONDITIONED_PATH = f"{MODEL_PATH}/model"
MODEL_BASE_CASED = "google-bert/bert-base-cased"
MODEL_BASE_UNCASED = "google-bert/bert-base-uncased"

# Accelerators Configuration

In [None]:
import numpy as np
import math
import shutil
import pandas as pd

from tqdm import tqdm

import torch
import tensorflow as tf
from tensorflow.keras import mixed_precision

print(f'Tensorflow version: [{tf.__version__}]')

tf.get_logger().setLevel('INFO')

#tf.config.set_soft_device_placement(True)
#tf.config.experimental.enable_op_determinism()
#tf.random.set_seed(1)

# Pick a distribution strategy: TPU first, then GPU(s), then the default
# (CPU) strategy. `strategy` is guaranteed to be bound on every path.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()

    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
except Exception as tpu_error:
    # Not an exception, just no TPUs available, GPU is fallback
    # https://www.tensorflow.org/guide/mixed_precision
    print(tpu_error)
    policy = mixed_precision.Policy('mixed_float16')
    mixed_precision.set_global_policy(policy)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if len(gpus) > 0:

        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, False)
            # Cap the first GPU at 12 GiB through a single virtual device.
            tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=12288)])
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            strategy = tf.distribute.MirroredStrategy()

            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
        except RuntimeError as gpu_error:
            # Device configuration failed. The original left `strategy`
            # unbound here, which surfaced later as a NameError; fall back to
            # the default strategy so the notebook keeps running.
            print(gpu_error)
            strategy = tf.distribute.get_strategy()
        finally:
            print("Running on", len(tf.config.list_physical_devices('GPU')), "GPU(s)")
    else:
        # CPU is final fallback
        strategy = tf.distribute.get_strategy()
        print("Running on CPU")

def is_tpu_strategy(strategy):
    """Return True when *strategy* is a ``tf.distribute.TPUStrategy`` instance."""
    tpu_strategy_cls = tf.distribute.TPUStrategy
    return isinstance(strategy, tpu_strategy_cls)

# Number of replicas the selected strategy keeps in sync (used later to scale BATCH_SIZE).
print("Number of accelerators:", strategy.num_replicas_in_sync)
# Last expression of the cell: the notebook displays the current working directory.
os.getcwd()

# Conditioning with Masked Models

In [None]:
from transformers import BertTokenizerFast,TFBertForMaskedLM

# Smoke test of the un-conditioned cased BERT on a finance sentence with two [MASK] slots.
# https://huggingface.co/transformers/v3.0.2/model_doc/bert.html#berttokenizerfast
tokenizer = BertTokenizerFast.from_pretrained(MODEL_BASE_CASED)
MASK = tokenizer.mask_token

# One example text; the mask token string is spliced in twice via the f-string.
masked_text = [f"Jim Cramer is consistently bullish when it comes to {MASK}. What this means in practicality is that Cramer routinely recommends buying stocks, and he rarely offers up a sell call. Analysis of his recommendations between 2016 and 2022 (via the data project Jim Cramer's Recommendations: A Six-Year Analysis) shows a 10.32% distribution of {MASK} recommendations alongside 61.27% buys, plus a smattering of positive or negative commentary without a formal buy or sell recommendation attached."]

inputs = tokenizer(masked_text, return_tensors="tf", padding=True, truncation=True)

model = TFBertForMaskedLM.from_pretrained(MODEL_BASE_CASED)
logits = model(**inputs).logits
# (row, position) index pairs of every [MASK] token in the batch.
mask_token_idxs = tf.where(inputs["input_ids"] == tokenizer.mask_token_id)
print(mask_token_idxs)
print(logits)

In [None]:
# Fill every [MASK] in the demo sentence with its i-th best candidate and print
# the resulting sentence, once per rank.
TOP_K = 5
mask_logits = tf.gather_nd(logits, mask_token_idxs)
top_5 = tf.math.top_k(mask_logits, k=TOP_K)
# Softmax over the top-k logits only: the percentages are relative to the
# k shown candidates, not to the full vocabulary. Computed once for all masks.
top_5_probas = tf.nn.softmax(top_5.values, axis=-1)
# One row per [MASK] token found, instead of assuming exactly two.
num_masks = top_5.indices.shape[0]
for i in range(TOP_K):
    new_text = masked_text[0]
    for j in range(num_masks):
        predicted_token = tokenizer.decode([top_5.indices[j, i]])
        # Replace only the first remaining mask so slots are filled left to right.
        new_text = new_text.replace(MASK, f'[{predicted_token}:{top_5_probas[j, i].numpy()*100.:.01f}%]', 1)
    print(new_text)

In [None]:
# Plain-text corpora used for domain adaptation (train / dev split).
# NOTE(review): 'Domain_adapation' presumably mirrors the directory name on disk;
# verify against the dataset before "fixing" the spelling.
adapt_train_file = os.path.join(DATA_PATH, 'Domain_adapation/train.txt')
adapt_test_file = os.path.join(DATA_PATH, 'Domain_adapation/dev.txt')
def text_dataset(tokenizer, file_path):
    """Build a lazily-tokenised ``tf.data.Dataset`` with one element per line.

    Each line of *file_path* is stripped and tokenised without truncation or
    padding, then emitted as a dict of single-row ragged tensors
    (``input_ids`` and ``attention_mask``).
    """
    signature = {
        'input_ids': tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int32),
        'attention_mask': tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int32),
    }

    def generator():
        # The file is re-opened every time the dataset is iterated.
        with open(file_path, 'r', encoding='utf-8') as handle:
            for raw_line in tqdm(handle, desc="text_dataset"):
                encoded = tokenizer(raw_line.strip(),
                                    add_special_tokens=True,
                                    truncation=False,
                                    padding=False)
                ids_row = tf.ragged.constant([encoded['input_ids']])
                mask_row = tf.ragged.constant([encoded['attention_mask']])
                yield {'input_ids': ids_row, 'attention_mask': mask_row}

    return tf.data.Dataset.from_generator(generator, output_signature=signature)

# Line-per-element datasets for both splits.
train_dataset = text_dataset(tokenizer, adapt_train_file)
eval_dataset = text_dataset(tokenizer, adapt_test_file)

# Peek at the first dev example; [0] selects the single row of the ragged element.
iterator = iter(eval_dataset.as_numpy_iterator())
example = next(iterator)
inputs = example['input_ids'][0]
print(f"Input IDs (len: {len(inputs)}):", inputs)
print("Attention Mask:", example['attention_mask'])

In [None]:
def chunked_text_dataset(tokenizer, file_path, chunk_len=512):
    """Tokenise a whole file and expose it as fixed-length chunks.

    Every line is tokenised (with special tokens, truncated by the tokenizer)
    and the results are concatenated into flat streams. The streams are then
    cut into consecutive windows of *chunk_len* tokens; the trailing tokens
    that do not fill a complete window are dropped. ``labels`` is a copy of
    ``input_ids``.
    """
    flat_ids = []
    flat_attention = []
    flat_special = []

    # Tokenisation happens eagerly, once, when this function is called.
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in tqdm(handle, desc="Reading file lines", position=0, leave=True):
            encoded = tokenizer(raw_line.strip(),
                                truncation=True,
                                add_special_tokens=True,
                                return_special_tokens_mask=True,
                                padding=False)
            flat_ids.extend(encoded['input_ids'])
            flat_attention.extend(encoded['attention_mask'])
            flat_special.extend(encoded['special_tokens_mask'])

    def _as_int32(values):
        return tf.convert_to_tensor(values, dtype=tf.int32)

    def generator():
        num_chunks = len(flat_ids) // chunk_len
        for chunk_no in tqdm(range(num_chunks), desc= "chunking...", position=0, leave=True):
            window = slice(chunk_no * chunk_len, (chunk_no + 1) * chunk_len)
            ids_window = flat_ids[window]
            yield {
                'input_ids': _as_int32(ids_window),
                'attention_mask': _as_int32(flat_attention[window]),
                'labels': _as_int32(ids_window),
                'special_tokens_mask': _as_int32(flat_special[window]),
            }

    feature_names = ('input_ids', 'attention_mask', 'labels', 'special_tokens_mask')
    signature = {
        feature: tf.TensorSpec(shape=(chunk_len,), dtype=tf.int32)
        for feature in feature_names
    }
    return tf.data.Dataset.from_generator(generator, output_signature=signature)


train_dataset = chunked_text_dataset(tokenizer, adapt_train_file)
# Inspect the dataset that was just built. The original iterated the
# line-based `eval_dataset` from the previous cell, so the chunking was never
# actually shown. Chunked elements are flat (chunk_len,) vectors, hence no [0].
iterator = iter(train_dataset.as_numpy_iterator())
example = next(iterator)
inputs = example['input_ids']
print(f"Input IDs (len: {len(inputs)}):", inputs)
print("Decoded IDs:", tokenizer.decode(inputs)[:50])

In [None]:
from transformers import DataCollatorForLanguageModeling, BertConfig

# Demonstrates what the MLM collator does to one example: 15% of the tokens are
# selected for masking and `labels` carries -100 everywhere else.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="tf")
# NOTE(review): batched_dataset is built here but not read anywhere below.
batched_dataset = train_dataset.batch(1).take(1)

batch = next(iter(eval_dataset.as_numpy_iterator()))
# NOTE(review): this comprehension copies the dict unchanged.
batch = {k: v for k, v in batch.items()}
# Split the batch into one feature dict per row, the input format the collator expects.
examples = [{k: v[i] for k, v in batch.items()} for i in range(batch['input_ids'].shape[0])]
print(examples)
collated_batch = data_collator(examples)
for input_ids, labels in tqdm(zip(collated_batch['input_ids'], collated_batch['labels']), desc="tokenizing batches"):
    masked_text = tokenizer.decode(input_ids)
    # Rebuild the unmasked text: take the label where one exists, else the input id.
    original_text = tokenizer.decode([label if label != -100 else input_id for label, input_id in zip(labels, input_ids)])

    print(f"Masked: {masked_text[:50]}")
    print(f"Labels: {labels[:50]}")
    print(f"Original: {original_text[:50]}")
# Last expression of the cell: display the collated batch.
collated_batch

In [None]:
MAX_LEN = 512 # Default 256, MAX 512
def mlm_text_dataset(file_path, tokenizer, data_collator, chunk_len=MAX_LEN):
    """Build an in-memory masked-language-modelling dataset from a text file.

    Every line of *file_path* is tokenised and concatenated into one token
    stream, which is cut into consecutive windows of *chunk_len* tokens
    (trailing tokens that do not fill a window are dropped). Each window is
    passed through *data_collator* once, so the masking is fixed at build time.

    Args:
        file_path: UTF-8 text file, one document/sentence per line.
        tokenizer: callable tokenizer returning ``input_ids``,
            ``attention_mask`` and ``special_tokens_mask``.
        data_collator: MLM collator called with a one-example list; it must
            return ``input_ids``, ``labels`` and ``attention_mask``.
        chunk_len: number of tokens per example.

    Returns:
        A ``tf.data.Dataset`` whose elements are 1-tuples holding a dict with
        ``input_ids``, ``attention_mask`` and ``labels``.
    """
    all_tokens = []
    all_attention_masks = []
    all_special_tokens_masks = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in tqdm(file, position=0, leave=True, desc="Processing file..."):
            tokens = tokenizer(line.strip(),
                               truncation=True,
                               add_special_tokens=True,
                               return_special_tokens_mask=True,
                               padding=False)
            all_tokens.extend(tokens['input_ids'])
            all_attention_masks.extend(tokens['attention_mask'])
            all_special_tokens_masks.extend(tokens['special_tokens_mask'])

    num_chunks = len(all_tokens) // chunk_len
    tokens_chunks = []
    attention_mask_chunks = []
    label_chunks = []
    for i in tqdm(range(num_chunks), position=0, leave=True, desc="Chunking..."):
        start = i * chunk_len
        end = start + chunk_len

        # The special-tokens mask is only an input to the collator. The
        # original also kept an accumulator for it, but that list was
        # overwritten by the per-chunk slice and never returned, so it is gone.
        masked_chunks = data_collator([{
                'input_ids': tf.convert_to_tensor(all_tokens[start:end], dtype=tf.int32),
                'attention_mask': tf.convert_to_tensor(all_attention_masks[start:end], dtype=tf.int32),
                'special_tokens_mask': tf.convert_to_tensor(all_special_tokens_masks[start:end], dtype=tf.int32),}])
        # The collator returns a batch of one; extend() unpacks that single row.
        tokens_chunks.extend(masked_chunks['input_ids'])
        label_chunks.extend(masked_chunks['labels'])
        attention_mask_chunks.extend(masked_chunks['attention_mask'])

    # Wrapped in a 1-tuple on purpose: eval_mlm iterates over the tuple.
    return tf.data.Dataset.from_tensor_slices((
        {
            'input_ids': tokens_chunks,
            'attention_mask': attention_mask_chunks,
            'labels': label_chunks,
        },
    ))

# Build the masked train/dev datasets with the cased base tokenizer. These
# module-level names (tokenizer, mlm_train_dataset, mlm_test_dataset) are read
# as globals by condition_model further down.
with strategy.scope():
    tokenizer = BertTokenizerFast.from_pretrained(MODEL_BASE_CASED)
    # return_tensors="np": the collator hands back NumPy arrays.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="np")
    mlm_train_dataset = mlm_text_dataset(adapt_train_file, tokenizer, data_collator)
    mlm_test_dataset = mlm_text_dataset(adapt_test_file, tokenizer, data_collator)

# Display the first dev element as a sanity check.
iterex = iter(mlm_test_dataset.as_numpy_iterator())
next(iterex)

## BERT Conditioning Training Loops

Recommended fine-tuning hyperparameters from the BERT paper:

* Batch size: 16, 32
* Learning rate (Adam): 5e-5, 3e-5, 2e-5
* Number of epochs: 2, 3, 4

In [None]:
# Training hyperparameters shared by every conditioning run below.
MAX_LEN = 512 # Default 256, MAX 512
LEARN_RATE=5e-5 # 5e-5
PATIENCE=10  # epochs without improvement tolerated by early stopping
EPOCHS=50    # upper bound on training epochs

# NOTE(review): the six constants below are not referenced anywhere else in the
# visible notebook — presumably left over from a warm-up schedule; confirm.
TOTAL_STEPS = 100000
WARM_STEPS = 10000
INIT_LR = 1e-4
BETA_1 = 0.9
BETA_2 = 0.999
WEIGHT_DECAY = 0.01

# Global batch size: 16 examples per replica.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync # Default 8
BUFFER_SIZE = 10000  # shuffle buffer size

In [None]:
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, TerminateOnNaN
from tensorflow.keras.optimizers import AdamW

import zipfile

import matplotlib.pyplot as plt

def eval_mlm(model, batched_dataset):
    """Print cross-entropy, perplexity and accuracy over the masked tokens.

    Args:
        model: masked-LM model called as ``model(input_ids, attention_mask=...)``
            and returning an object with a ``logits`` attribute.
        batched_dataset: iterable of batches; every batch is itself iterable
            (the datasets built by ``mlm_text_dataset`` yield 1-tuples) and
            yields dicts with ``input_ids``, ``attention_mask`` and ``labels``,
            where ``labels`` is -100 at positions that were not masked.

    Only positions whose label differs from -100 contribute. Nothing is
    returned; the three averages are printed.
    """
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
    total_loss = 0.
    total_accuracy = 0.
    total_examples = 0.

    # TODO: convert this to a TF function for distributed strat.
    for batch in tqdm(batched_dataset, desc="eval_mlm", position=0, leave=True):
        for dataset_output in batch:
            input_ids = dataset_output['input_ids']
            attention_mask = dataset_output['attention_mask']
            labels = dataset_output['labels']

            outputs = model(input_ids, attention_mask=attention_mask)
            # Under a mixed-precision policy the logits may be float16. Sum the
            # per-token losses in float32 so the batch total neither overflows
            # nor loses precision.
            logits = tf.cast(outputs.logits, tf.float32)

            mask = (labels != -100)
            masked_logits = tf.boolean_mask(logits, mask)
            masked_labels = tf.boolean_mask(labels, mask)
            batch_loss = loss_fn(masked_labels, masked_logits)
            # Match the label dtype so tf.equal works for int32 and int64 labels.
            predictions = tf.cast(tf.argmax(masked_logits, axis=-1), masked_labels.dtype)
            batch_accuracy = tf.reduce_sum(tf.cast(tf.equal(predictions, masked_labels), dtype=tf.float32))

            total_loss += tf.cast(batch_loss,tf.float32)
            total_accuracy += batch_accuracy
            total_examples += tf.size(masked_labels, out_type=tf.float32)

    # An empty dataset, or one without any masked position, would otherwise
    # divide by zero below.
    if float(total_examples) == 0.0:
        print("eval_mlm: no masked tokens found, nothing to evaluate.")
        return

    avg_loss = total_loss / total_examples
    avg_perplexity = tf.exp(avg_loss).numpy()
    avg_accuracy = total_accuracy / total_examples

    print(f"Average Cross-Entropy Loss: {avg_loss.numpy()}")
    print(f"Average Perplexity: {avg_perplexity}")
    print(f"Average Accuracy: {avg_accuracy.numpy()}")



def condition_model(model_path, models_log_dir=MODEL_PATH, from_pt=False):
    """Continue MLM pre-training of a BERT checkpoint on the adaptation corpus.

    Reads the module-level ``mlm_train_dataset``, ``mlm_test_dataset``,
    ``tokenizer`` and ``strategy``, plus the hyperparameter constants
    (``LEARN_RATE``, ``EPOCHS``, ``PATIENCE``, ``BATCH_SIZE``, ``BUFFER_SIZE``).
    The datasets must have been built with the tokenizer that belongs to
    *model_path*.

    Args:
        model_path: hub id or local directory of the checkpoint to load.
        models_log_dir: base directory for the TensorBoard log directory.
        from_pt: load PyTorch weights into the TensorFlow model.

    Returns:
        ``(cond_model, history)``: the trained model and the Keras ``History``.

    Side effects:
        Writes the model to ``{MODEL_PATH}/model``, the config to
        ``{MODEL_PATH}`` and the tokenizer to ``{MODEL_PATH}/tokenizer``.
        Every call writes to the same locations.
    """
    with strategy.scope():
        # https://huggingface.co/transformers/v3.0.2/_modules/transformers/configuration_bert.html#BertConfig
        config = BertConfig.from_pretrained(model_path, from_pt=from_pt)
        cond_model = TFBertForMaskedLM.from_pretrained(model_path, config=config, from_pt=from_pt)

        # https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/TensorBoard
        # NOTE(review): this callback is constructed but not passed to fit();
        # kept as in the original. Add it to `callbacks` if logging is wanted.
        tensorboard_callback = TensorBoard(log_dir=f"{models_log_dir}/logs",
                                            histogram_freq=2,
                                            embeddings_freq=2)
        # https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/EarlyStopping
        # restore_best_weights: without it, the weights that get saved are the
        # ones from PATIENCE epochs after the best validation loss.
        early_stopping = EarlyStopping(mode='min',
                                       patience=PATIENCE,
                                       start_from_epoch=1,
                                       restore_best_weights=True)
        #tf.debugging.enable_check_numerics() # - Assert if no Infs or NaNs go through. not for TPU!
        #tf.config.run_functions_eagerly(not is_tpu_strategy(strategy)) # - Easy debugging
        # https://www.tensorflow.org/api_docs/python/tf/keras/Model#fit
        # cache() comes BEFORE shuffle(): caching a shuffled, batched pipeline
        # replays the first epoch's order and batches for every later epoch.
        train_dataset = (mlm_train_dataset.cache()
                                        .shuffle(buffer_size=BUFFER_SIZE)
                                        .batch(BATCH_SIZE)
                                        .prefetch(tf.data.AUTOTUNE))
        # Validation needs no shuffling.
        test_dataset = (mlm_test_dataset.batch(BATCH_SIZE)
                                        .cache()
                                        .prefetch(tf.data.AUTOTUNE))
        # No loss is passed: presumably the model's built-in MLM loss, driven
        # by the 'labels' entry of each element, is used — confirm against the
        # Transformers Keras documentation.
        cond_model.compile(optimizer=AdamW(learning_rate=LEARN_RATE))
        history = cond_model.fit(train_dataset,
                            epochs=EPOCHS,
                            callbacks=[early_stopping, TerminateOnNaN()],
                            verbose="auto",
                            validation_data=test_dataset)

        cond_model.save_pretrained(f"{MODEL_PATH}/model")
        config.save_pretrained(f"{MODEL_PATH}")
        tokenizer.save_pretrained(f"{MODEL_PATH}/tokenizer")

        return cond_model, history


def _loss_to_perplexity(loss):
    """Return ``exp(loss)``, or ``inf`` when the exponential overflows a float."""
    try:
        return math.exp(loss)
    except OverflowError:
        return float('inf')


def plot_training_metrics(history):
    """Plot loss and perplexity per epoch and return the final values.

    Args:
        history: Keras ``History`` whose ``history`` dict holds ``'loss'`` and,
            when validation data was used, ``'val_loss'``.

    Returns:
        Dict with ``train_loss``, ``eval_loss``, ``train_perplexity`` and
        ``eval_perplexity`` of the last epoch. The two ``eval_*`` entries are
        ``None`` when no validation loss was recorded.
    """
    train_loss = history.history['loss']
    # Validation metrics are optional. The original read them with .get() in
    # one place and by direct indexing in two others, so a run without
    # validation data raised KeyError.
    val_loss = history.history.get('val_loss', [])
    epochs = range(1, len(train_loss) + 1)

    train_perplexity = [_loss_to_perplexity(loss) for loss in train_loss]
    validation_perplexity = [_loss_to_perplexity(loss) for loss in val_loss]

    fig, axs = plt.subplots(1, 2, figsize=(12, 4))

    axs[0].plot(epochs, train_loss, 'bo-', label='Training Loss')
    if val_loss:
        axs[0].plot(epochs, val_loss, 'ro-', label='Validation Loss')
    axs[0].set_title('Training and Validation Loss')
    axs[0].set_xlabel('Epochs')
    axs[0].set_ylabel('Loss')
    axs[0].legend()

    axs[1].plot(epochs,train_perplexity, 'bo-', label='Training Perplexity')
    if validation_perplexity:
        axs[1].plot(epochs, validation_perplexity, 'ro-', label='Validation Perplexity')
    axs[1].set_title('Training and Validation Perplexity')
    axs[1].set_xlabel('Epochs')
    axs[1].set_ylabel('Perplexity')
    axs[1].legend()

    plt.tight_layout()
    plt.show()

    return {
        "train_loss": train_loss[-1],
        "eval_loss": val_loss[-1] if val_loss else None,
        "train_perplexity": train_perplexity[-1],
        "eval_perplexity": validation_perplexity[-1] if validation_perplexity else None,
    }


def zip_models(directory, output_filename, compression_level = 9):
    """Write every file below *directory* into a DEFLATE-compressed zip archive.

    Archive member names are relative to the parent of *directory*, so the
    directory's own name is the top-level folder inside the archive.
    """
    archive_root = os.path.join(directory, '..')
    with zipfile.ZipFile(output_filename,
                         mode='w',
                         compression=zipfile.ZIP_DEFLATED,
                         compresslevel=compression_level) as archive:
        for current_dir, _subdirs, filenames in os.walk(directory):
            for filename in filenames:
                source_path = os.path.join(current_dir, filename)
                member_name = os.path.relpath(source_path, archive_root)
                archive.write(source_path, member_name)

In [None]:
# Base Eval
config = BertConfig.from_pretrained(MODEL_BASE_UNCASED)
model = TFBertForMaskedLM.from_pretrained(MODEL_BASE_UNCASED, config=config)
test_dataset = mlm_test_dataset.shuffle(buffer_size=BUFFER_SIZE).batch(BATCH_SIZE).cache().prefetch(tf.data.experimental.AUTOTUNE)
eval_mlm(model, test_dataset)

## Base Uncased Conditioned

In [None]:
cond_model, history = condition_model(MODEL_BASE_UNCASED)
plot_training_metrics(history)
eval_mlm(cond_model, test_dataset)

In [None]:
eval_mlm(cond_model, test_dataset)

## Base Cased

In [None]:
# Base Eval
config = BertConfig.from_pretrained(MODEL_BASE_CASED)
model = TFBertForMaskedLM.from_pretrained(MODEL_BASE_CASED, config=config)
test_dataset = mlm_test_dataset.shuffle(buffer_size=BUFFER_SIZE).batch(BATCH_SIZE).cache().prefetch(tf.data.experimental.AUTOTUNE)
eval_mlm(model, test_dataset)

## Base Cased Conditioned

In [None]:
# Condition the cased base checkpoint, plot the curves, then re-evaluate on the dev batches.
cond_model, history = condition_model(MODEL_BASE_CASED)
plot_training_metrics(history)
eval_mlm(cond_model, test_dataset)

# Evaluate FinBERT

## Cased FinVocab

In [None]:
# https://huggingface.co/yiyanghkust/finbert-pretrain
# https://github.com/yya518/FinBERT?tab=readme-ov-file
# Checkpoint identifier; the trailing comment keeps the local-directory alternative.
FINBERT_MODEL_CASED_PATH = "radmada/FinBERT-FinVocab-Cased" # f"{MODEL_PATH}/FinBERT-FinVocab-Cased"

with strategy.scope():
    # Rebind the module-level tokenizer and datasets to this checkpoint's
    # vocabulary; condition_model reads them as globals.
    tokenizer = BertTokenizerFast.from_pretrained(FINBERT_MODEL_CASED_PATH, from_pt=True)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="np")
    mlm_train_dataset = mlm_text_dataset(adapt_train_file, tokenizer, data_collator)
    mlm_test_dataset = mlm_text_dataset(adapt_test_file, tokenizer, data_collator)

    config = BertConfig.from_pretrained(FINBERT_MODEL_CASED_PATH, from_pt=True)
    # from_pt=True: the TensorFlow model is initialised from PyTorch weights.
    model = TFBertForMaskedLM.from_pretrained(FINBERT_MODEL_CASED_PATH, config=config, from_pt=True)
    test_dataset = mlm_test_dataset.shuffle(buffer_size=BUFFER_SIZE).batch(BATCH_SIZE).cache().prefetch(tf.data.experimental.AUTOTUNE)

    # Baseline numbers before any conditioning.
    eval_mlm(model, test_dataset)

## Cased FinVocab Conditioned

In [None]:
# Condition the cased FinVocab checkpoint, plot the curves, then re-evaluate on the dev batches.
cond_model, history = condition_model(FINBERT_MODEL_CASED_PATH, from_pt=True)
plot_training_metrics(history)
eval_mlm(cond_model, test_dataset)

## Uncased FinVocab

In [None]:
# Local directory of the uncased FinVocab checkpoint.
FINBERT_MODEL_UNCASED_PATH = f"{MODEL_PATH}/FinBERT-FinVocab-Uncased"

with strategy.scope():
    # Rebind the module-level tokenizer and datasets to this checkpoint's
    # vocabulary; condition_model reads them as globals.
    tokenizer = BertTokenizerFast.from_pretrained(FINBERT_MODEL_UNCASED_PATH, from_pt=True)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="np")
    mlm_train_dataset = mlm_text_dataset(adapt_train_file, tokenizer, data_collator)
    mlm_test_dataset = mlm_text_dataset(adapt_test_file, tokenizer, data_collator)

    config = BertConfig.from_pretrained(FINBERT_MODEL_UNCASED_PATH, from_pt=True)
    # from_pt=True: the TensorFlow model is initialised from PyTorch weights.
    model = TFBertForMaskedLM.from_pretrained(FINBERT_MODEL_UNCASED_PATH, config=config, from_pt=True)
    test_dataset = mlm_test_dataset.shuffle(buffer_size=BUFFER_SIZE).batch(BATCH_SIZE).cache().prefetch(tf.data.experimental.AUTOTUNE)

    # Baseline numbers before any conditioning.
    eval_mlm(model, test_dataset)

## Uncased FinVocab Conditioned

In [None]:
# Condition the uncased FinVocab checkpoint, plot the curves, then re-evaluate on the dev batches.
cond_model, history = condition_model(FINBERT_MODEL_UNCASED_PATH, from_pt=True)
plot_training_metrics(history)
eval_mlm(cond_model, test_dataset)

## Cased Base

In [None]:
# Local directory of the cased BaseVocab FinBERT checkpoint.
FINBERT_BASEMODEL_CASED_PATH = f"{MODEL_PATH}/FinBERT-BaseVocab-Cased"

with strategy.scope():
    # Rebind the module-level tokenizer and datasets to this checkpoint's
    # vocabulary; condition_model reads them as globals.
    tokenizer = BertTokenizerFast.from_pretrained(FINBERT_BASEMODEL_CASED_PATH, from_pt=True)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="np")
    mlm_train_dataset = mlm_text_dataset(adapt_train_file, tokenizer, data_collator)
    mlm_test_dataset = mlm_text_dataset(adapt_test_file, tokenizer, data_collator)

    config = BertConfig.from_pretrained(FINBERT_BASEMODEL_CASED_PATH, from_pt=True)
    # from_pt=True: the TensorFlow model is initialised from PyTorch weights.
    model = TFBertForMaskedLM.from_pretrained(FINBERT_BASEMODEL_CASED_PATH, config=config, from_pt=True)
    test_dataset = mlm_test_dataset.shuffle(buffer_size=BUFFER_SIZE).batch(BATCH_SIZE).cache().prefetch(tf.data.experimental.AUTOTUNE)

    # Baseline numbers before any conditioning.
    eval_mlm(model, test_dataset)

## Cased Base Conditioned

In [None]:
# Condition the cased BaseVocab checkpoint, plot the curves, then re-evaluate on the dev batches.
cond_model, history = condition_model(FINBERT_BASEMODEL_CASED_PATH, from_pt=True)
plot_training_metrics(history)
eval_mlm(cond_model, test_dataset)

## Uncased Base

In [None]:
# Local directory of the uncased BaseVocab FinBERT checkpoint.
FINBERT_BASEMODEL_UNCASED_PATH = f"{MODEL_PATH}/FinBERT-BaseVocab-Uncased"

with strategy.scope():
    # Rebind the module-level tokenizer and datasets to this checkpoint's
    # vocabulary; condition_model reads them as globals.
    tokenizer = BertTokenizerFast.from_pretrained(FINBERT_BASEMODEL_UNCASED_PATH, from_pt=True)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="np")
    mlm_train_dataset = mlm_text_dataset(adapt_train_file, tokenizer, data_collator)
    mlm_test_dataset = mlm_text_dataset(adapt_test_file, tokenizer, data_collator)

    config = BertConfig.from_pretrained(FINBERT_BASEMODEL_UNCASED_PATH, from_pt=True)
    # from_pt=True: the TensorFlow model is initialised from PyTorch weights.
    model = TFBertForMaskedLM.from_pretrained(FINBERT_BASEMODEL_UNCASED_PATH, config=config, from_pt=True)
    test_dataset = mlm_test_dataset.shuffle(buffer_size=BUFFER_SIZE).batch(BATCH_SIZE).cache().prefetch(tf.data.experimental.AUTOTUNE)

    # Baseline numbers before any conditioning.
    eval_mlm(model, test_dataset)

## Uncased Base Conditioned

In [None]:
# Condition the uncased BaseVocab checkpoint, plot the curves, then re-evaluate on the dev batches.
cond_model, history = condition_model(FINBERT_BASEMODEL_UNCASED_PATH, from_pt=True)
plot_training_metrics(history)
eval_mlm(cond_model, test_dataset)

# Combining Vocabs


## Cased

In [None]:
# Merge the cased FinBERT vocabulary with the cased BERT base vocabulary.
finbert_tokenizer = BertTokenizerFast.from_pretrained(FINBERT_MODEL_CASED_PATH)
bert_cased_tokenizer = BertTokenizerFast.from_pretrained(MODEL_BASE_CASED)

finbert_vocab = finbert_tokenizer.get_vocab()
bert_cased_vocab = bert_cased_tokenizer.get_vocab()
finbert_vocab_set = set(finbert_vocab.keys())
bert_cased_vocab_set = set(bert_cased_vocab.keys())

# A vocab file maps line number -> token id. Writing the union alphabetically
# (as before) gave every token a new id, unrelated to the embedding rows of
# the FinBERT checkpoint. Keep the FinBERT tokens first, in their original id
# order (assumes those ids are contiguous from 0), and append the tokens that
# only BERT knows after them.
combined_vocab = sorted(finbert_vocab, key=finbert_vocab.get)
combined_vocab += sorted(bert_cased_vocab_set - finbert_vocab_set, key=bert_cased_vocab.get)
print(combined_vocab[:25]) # check the vocab

COMBI_CASED_VOCAB_PATH = f"{MODEL_PATH}/vocabs/vocab.txt"

os.makedirs(f"{MODEL_PATH}/vocabs", exist_ok=True)
with open(COMBI_CASED_VOCAB_PATH, 'w', encoding='utf-8') as file:
    file.writelines(f"{token}\n" for token in combined_vocab)
    print(f"Saved to {COMBI_CASED_VOCAB_PATH}")

In [None]:
# Evaluate the cased FinBERT checkpoint with the combined cased vocabulary.
with strategy.scope():
    tokenizer = BertTokenizerFast(vocab_file=COMBI_CASED_VOCAB_PATH, do_lower_case=False)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="np")
    mlm_train_dataset = mlm_text_dataset(adapt_train_file, tokenizer, data_collator)
    mlm_test_dataset = mlm_text_dataset(adapt_test_file, tokenizer, data_collator)

    config = BertConfig.from_pretrained(FINBERT_MODEL_CASED_PATH)

    model = TFBertForMaskedLM.from_pretrained(FINBERT_MODEL_CASED_PATH, config=config, from_pt=True)
    # The combined vocabulary has more entries than the checkpoint's embedding
    # matrix; without resizing, ids beyond the original vocabulary size index
    # past its end. Rows for the added tokens are newly initialised.
    model.resize_token_embeddings(len(tokenizer))
    test_dataset = mlm_test_dataset.shuffle(buffer_size=BUFFER_SIZE).batch(BATCH_SIZE).cache().prefetch(tf.data.experimental.AUTOTUNE)
    eval_mlm(model, test_dataset)

## Uncased

In [None]:
# Merge the uncased FinBERT vocabulary with the uncased BERT base vocabulary.
finbert_tokenizer = BertTokenizerFast.from_pretrained(FINBERT_MODEL_UNCASED_PATH)
bert_uncased_tokenizer = BertTokenizerFast.from_pretrained(MODEL_BASE_UNCASED)

finbert_vocab = finbert_tokenizer.get_vocab()
bert_uncased_vocab = bert_uncased_tokenizer.get_vocab()
finbert_vocab_set = set(finbert_vocab.keys())
bert_uncased_vocab_set = set(bert_uncased_vocab.keys())

# Keep the FinBERT tokens first, in their original id order (assumes those ids
# are contiguous from 0), then append the BERT-only tokens. An alphabetical
# union would give every token an id unrelated to the checkpoint's embeddings.
combined_vocab = sorted(finbert_vocab, key=finbert_vocab.get)
combined_vocab += sorted(bert_uncased_vocab_set - finbert_vocab_set, key=bert_uncased_vocab.get)
print(combined_vocab[:25]) # check the vocab

# Own file name: the original reused the cased path and silently overwrote
# the cased vocabulary.
COMBI_UNCASED_VOCAB_PATH = f"{MODEL_PATH}/vocabs/vocab_uncased.txt"

os.makedirs(f"{MODEL_PATH}/vocabs", exist_ok=True)
with open(COMBI_UNCASED_VOCAB_PATH, 'w', encoding='utf-8') as file:
    file.writelines(f"{token}\n" for token in combined_vocab)
    print(f"Saved to {COMBI_UNCASED_VOCAB_PATH}")

In [None]:
# Evaluate the uncased FinBERT checkpoint with the combined uncased vocabulary.
# The original cell was a verbatim copy of the cased one: cased vocab path,
# do_lower_case=False and the cased checkpoint.
with strategy.scope():
    tokenizer = BertTokenizerFast(vocab_file=COMBI_UNCASED_VOCAB_PATH, do_lower_case=True)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="np")
    mlm_train_dataset = mlm_text_dataset(adapt_train_file, tokenizer, data_collator)
    mlm_test_dataset = mlm_text_dataset(adapt_test_file, tokenizer, data_collator)

    config = BertConfig.from_pretrained(FINBERT_MODEL_UNCASED_PATH)

    model = TFBertForMaskedLM.from_pretrained(FINBERT_MODEL_UNCASED_PATH, config=config, from_pt=True)
    # The combined vocabulary has more entries than the checkpoint's embedding
    # matrix; without resizing, ids beyond the original vocabulary size index
    # past its end. Rows for the added tokens are newly initialised.
    model.resize_token_embeddings(len(tokenizer))
    test_dataset = mlm_test_dataset.shuffle(buffer_size=BUFFER_SIZE).batch(BATCH_SIZE).cache().prefetch(tf.data.experimental.AUTOTUNE)
    eval_mlm(model, test_dataset)

# Save Best Model

In [None]:
# Opt-in export: bundle everything under MODEL_PATH into a single archive.
SAVE_ZIP = False

if SAVE_ZIP:
    zip_models(MODEL_PATH, './cond_bert.zip')