## Terminology improvement analysis through language modeling

### Notebook for fine-tuning BERT with masked language modeling (MLM) loss using the HuggingFace Trainer API

In [None]:
# Import necessary libraries
from datasets import load_dataset
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForMaskedLM, DataCollatorForLanguageModeling
import torch

#### Load pre-processed dataset
Note that, for fine-tuning, our original training data is further split into two sets: training_bert (80%) and validation_bert (20%).

In [None]:
# Plain-text input files: one for the "train" split, one for the "validation" split.
training_data = 'training_testing_data/with_name_objective/training_data_stopwords_bert.txt'
validation_data = 'training_testing_data/with_name_objective/validation_data_stopwords_bert.txt'

# Load both files with the "text" builder; later cells read the resulting
# "text" column of each split.
bert_datasets = load_dataset(
    "text",
    data_files={"train": training_data, "validation": validation_data},
)

In [None]:
# Name of the pre-trained checkpoint that gets fine-tuned. Uncomment one of the
# alternatives below to switch models.
# NOTE: tokenize_function carries its own checkpoint name; keep the two in sync.

# model_checkpoint = 'bert-base-uncased'
# model_checkpoint = 'distilbert-base-uncased'
model_checkpoint = 'bert-large-uncased-whole-word-masking'

# Tokenizer belonging to the chosen checkpoint (fast implementation requested).
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)

# Per-process memo of loaded tokenizers, keyed by checkpoint name, so that a
# tokenizer is built once per process instead of once per mapped batch.
_TOKENIZER_CACHE = {}


def tokenize_function(sentences, model_checkpoint='bert-large-uncased-whole-word-masking'):
    """Tokenize the "text" column of a batch of sentences.

    Args:
        sentences: Mapping with a "text" entry holding the raw sentences of
            the batch.
        model_checkpoint: Name of the pre-trained checkpoint whose tokenizer
            is used. Other checkpoints used with this notebook are
            'bert-base-uncased' and 'distilbert-base-uncased'.

    Returns:
        Whatever the tokenizer returns for ``sentences["text"]``.
    """
    cached_tokenizer = _TOKENIZER_CACHE.get(model_checkpoint)
    if cached_tokenizer is None:
        # Imported inside the function (as before) so the function does not
        # rely on names imported elsewhere in the notebook.
        from transformers import AutoTokenizer

        cached_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
        _TOKENIZER_CACHE[model_checkpoint] = cached_tokenizer
    return cached_tokenizer(sentences["text"])

In [None]:
# Tokenize every split batch-wise with 4 worker processes; the raw "text"
# column is removed from the result.
tokenized_datasets = bert_datasets.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)

In [None]:
def group_texts(sentences, block_size=128):
    """Concatenate a batch of tokenized sentences and cut it into equal blocks.

    Every column of the batch is flattened into one long list and then split
    into consecutive chunks of ``block_size`` items. Items left over at the
    end that do not fill a whole block are dropped.

    Args:
        sentences: Mapping from column name to a list of per-sentence lists.
            Must contain an "input_ids" column.
        block_size: Number of items per block (default 128).

    Returns:
        A dict with the same columns, each a list of blocks, plus a "labels"
        column that is a (shallow) copy of the "input_ids" blocks.

    Raises:
        ValueError: If ``block_size`` is not positive.
        KeyError: If the batch has no "input_ids" column.
    """
    from itertools import chain

    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")

    # Flatten each column in linear time (sum(lists, []) re-copies the
    # accumulated list on every addition, which is quadratic).
    concatenated = {
        key: list(chain.from_iterable(sentences[key])) for key in sentences.keys()
    }

    # The length of the first column decides how many full blocks fit.
    first_column = next(iter(concatenated.values()), [])
    total_length = (len(first_column) // block_size) * block_size

    # Split every column into chunks of block_size
    result = {
        key: [
            tokens[start : start + block_size]
            for start in range(0, total_length, block_size)
        ]
        for key, tokens in concatenated.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Block length of the grouped examples.
# NOTE: this variable is not passed to the map call below; group_texts
# determines the block length itself.
block_size = 128

# Regroup the tokenized splits into fixed-length blocks, handing group_texts
# 1000 rows at a time and using 4 worker processes.
lm_datasets = tokenized_datasets.map(group_texts, batched=True, batch_size=1000, num_proc=4)

In [None]:
# Load the pre-trained masked-language-modeling model for the chosen checkpoint
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [None]:
# Last path component of the checkpoint name (drops any "owner/" prefix).
# NOTE(review): model_name is not used by the training arguments below.
model_name = model_checkpoint.rsplit("/", 1)[-1]

# Training configuration: outputs go to "test-clm", evaluation uses the
# "epoch" strategy.
training_args = TrainingArguments(
    output_dir="test-clm",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
)

In [None]:
# Collator for masked language modeling; tokens are selected for masking with
# probability 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
)

In [None]:
# Build the Trainer from the model, the training arguments, the MLM data
# collator and the grouped train/validation splits.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)

In [None]:
# Run fine-tuning with the Trainer instance created earlier in the notebook
trainer.train()

In [None]:
# Save the fine-tuned model with the HuggingFace API. There is one target
# directory per checkpoint variant; the commented-out lines belong to the
# other variants.
# NOTE(review): the paths start with '/', i.e. they are absolute — confirm this
# is intended rather than a directory relative to the notebook.
# trainer.model.save_pretrained('/fine_tuned_bert_models/my_bert_base_stopwords')
# trainer.model.save_pretrained('/fine_tuned_bert_models/my_bert_distilbert_stopwords')
trainer.model.save_pretrained('/fine_tuned_bert_models/my_bert_whole_word_stopwords')

# Alternative (disabled): save the whole model object with PyTorch
# torch.save(trainer.model, '/fine_tuned_bert_models/my_bert_base_stopwords')
# torch.save(trainer.model, '/fine_tuned_bert_models/my_bert_distilbert_stopwords')
# torch.save(trainer.model, '/fine_tuned_bert_models/my_bert_whole_word_stopwords')