In [None]:
#__author__ = "Zafar Hussain (University of Helsinki, IVVES Project)"

import os
import pandas as pd
from transformers import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask
from transformers import BertTokenizer, BertForMaskedLM, BertTokenizerFast, BertConfig
import json
from transformers import Trainer, TrainingArguments
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from tokenizers import ByteLevelBPETokenizer, BertWordPieceTokenizer
from pathlib import Path
import torch
from datasets import load_dataset
from datasets import Dataset
from transformers import pipeline

In [None]:
# --- Tokenizer / preprocessing configuration --------------------------------

# Text file(s) the WordPiece tokenizer is trained on. Only one training file is
# listed; add further paths (e.g. a test split) to train on those as well.
files = ["text_file.txt"]

# Requested vocabulary size. For reference, BERT's default vocabulary has
# 30,522 entries; the value used here is larger and can be tuned.
vocab_size = 50_522

# Tokens reserved in the vocabulary: BERT's five standard special tokens
# followed by the two extra markers "<S>" and "<T>". The order is kept as-is.
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<S>", "<T>"]

# Maximum sequence length in tokens. Lowering it speeds up training
# (it leaves room for a larger batch size).
max_length = 512

# True  -> every sample is truncated/padded to `max_length` on its own.
# False -> samples are tokenized in full and later regrouped into chunks.
truncate_longer_samples = True

In [None]:
# Fit a fresh WordPiece tokenizer on the corpus files, reserving the special tokens.
tokenizer = BertWordPieceTokenizer()
tokenizer.train(
    files=files,
    vocab_size=vocab_size,
    special_tokens=special_tokens,
)
# Cap every encoding produced by this tokenizer at `max_length` tokens.
tokenizer.enable_truncation(max_length=max_length)

In [None]:
# Directory that receives the trained tokenizer's vocabulary and settings.
tokenizer_path = "your_path"
# makedirs(..., exist_ok=True) also creates missing parent directories and is a
# no-op when the directory already exists, so this cell can be re-run safely.
os.makedirs(tokenizer_path, exist_ok=True)
# Write the trained tokenizer's model files into that directory.
tokenizer.save_model(tokenizer_path)
# Settings stored next to the vocabulary: lower-casing, the special tokens and
# the maximum sequence length.
tokenizer_cfg = {
    "do_lower_case": True,
    "unk_token": "[UNK]",
    "sep_token": "[SEP]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "mask_token": "[MASK]",
    "model_max_length": max_length,
    "max_len": max_length,
}
# NOTE(review): transformers tokenizers normally read their settings from
# "tokenizer_config.json"; confirm that "config.json" is actually picked up by
# the loader used later in this notebook.
with open(os.path.join(tokenizer_path, "config.json"), "w") as f:
    json.dump(tokenizer_cfg, f)

In [None]:
# Load the raw samples from CSV into a DataFrame, then wrap the frame as a
# Hugging Face Dataset so it can be tokenized with `map`.
data = pd.read_csv("commands.csv")
dataset = Dataset.from_pandas(df=data)

In [None]:
# Reload the saved tokenizer files as a transformers BertTokenizerFast.
# This rebinds `tokenizer`, which until this cell referred to the
# BertWordPieceTokenizer that was trained and saved above.
tokenizer = BertTokenizerFast.from_pretrained(tokenizer_path)

In [None]:
def encode_with_truncation(examples):
  """Tokenize ``examples["text"]`` to a fixed length.

  The text is truncated and padded to the notebook-level ``max_length``, and
  the special-tokens mask is requested alongside the regular tokenizer output.
  Intended as a mapping function for ``Dataset.map``.
  """
  tokenizer_kwargs = dict(
      truncation=True,
      padding="max_length",
      max_length=max_length,
      return_special_tokens_mask=True,
  )
  return tokenizer(examples["text"], **tokenizer_kwargs)

def encode_without_truncation(examples):
  """Tokenize ``examples["text"]`` at full length.

  No truncation or padding arguments are passed; only the special-tokens mask
  is requested in addition to the regular tokenizer output. Intended as a
  mapping function for ``Dataset.map``.
  """
  texts = examples["text"]
  encoded = tokenizer(texts, return_special_tokens_mask=True)
  return encoded

# Pick the mapping function according to `truncate_longer_samples`.
encode = encode_with_truncation if truncate_longer_samples else encode_without_truncation
# Tokenize the training dataset. batched=True passes whole batches of rows to
# the tokenizer instead of one row per call, which produces the same columns
# but is considerably faster with a fast tokenizer.
train_dataset = dataset.map(encode, batched=True)
if truncate_longer_samples:
  # Expose only input_ids and attention_mask, returned as PyTorch tensors.
  train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
else:
  # Expose these three columns as plain Python lists so that they can still be
  # concatenated and re-chunked by `group_texts` later on.
  train_dataset.set_format(columns=["input_ids", "attention_mask", "special_tokens_mask"])

In [None]:

# Drop the columns that are not needed for training. Only columns that are
# still present are removed, so re-running this cell (or loading a CSV without
# one of them) does not raise.
columns_to_drop = [
    name
    for name in ("token_type_ids", "text", "index_ids")
    if name in train_dataset.column_names
]
if columns_to_drop:
    train_dataset = train_dataset.remove_columns(columns_to_drop)

In [None]:
from itertools import chain
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
# grabbed from: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py
def group_texts(examples, chunk_size=None):
    """Concatenate tokenized samples and re-split them into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of per-sample sequences,
            i.e. a batch as passed by ``Dataset.map(..., batched=True)``.
            The length of the first column is used for all columns, so every
            column is assumed to have the same total length.
        chunk_size: Length of each output chunk. When omitted, the
            notebook-level ``max_length`` is used.

    Returns:
        A dict with the same keys whose values are lists of chunks. When the
        concatenated length is at least ``chunk_size``, the trailing remainder
        is dropped so that every chunk has exactly ``chunk_size`` items. A
        batch shorter than ``chunk_size`` is returned as one shorter chunk.
        An empty mapping yields an empty dict.
    """
    if chunk_size is None:
        chunk_size = max_length
    columns = list(examples.keys())
    if not columns:
        # Nothing to group; avoids indexing into an empty key list below.
        return {}
    # Flatten every column into one long sequence.
    concatenated_examples = {k: list(chain(*examples[k])) for k in columns}
    total_length = len(concatenated_examples[columns[0]])
    # Round down to a whole number of chunks (only when at least one full
    # chunk is available; shorter batches are kept as they are).
    if total_length >= chunk_size:
        total_length = (total_length // chunk_size) * chunk_size
    # Slice each flattened column into consecutive windows of `chunk_size`.
    return {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }

# Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
# remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
# might be slower to preprocess.
#
# The call below runs in a single process. To speed it up, pass `num_proc` to `map` to enable
# multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
# Regrouping only applies when samples were tokenized without truncation.
if not truncate_longer_samples:
    train_dataset = train_dataset.map(
        group_texts,
        batched=True,
        desc=f"Grouping texts in chunks of {max_length}",
    )
    # From here on, return the columns as torch tensors instead of lists.
    train_dataset.set_format(type="torch")
  

In [None]:
# Build a BERT masked-LM from a fresh configuration (no pretrained weights are
# loaded). The vocabulary size and the number of position embeddings follow
# the tokenizer settings chosen above.
model_config = BertConfig(
    vocab_size=vocab_size,
    max_position_embeddings=max_length,
)
model = BertForMaskedLM(model_config)

In [None]:
# Collator for masked language modelling: masking is enabled (mlm=True) with a
# masking probability of 0.2.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.2,
)

In [None]:
# Directory for checkpoints and the final model. It is defined in this cell
# because `output_dir` needs it; without it the cell raises NameError on a
# fresh kernel (Restart & Run All).
model_path = "model_path"
training_args = TrainingArguments(
    output_dir=model_path,          # output directory to where save model checkpoint
    #evaluation_strategy="steps",    # evaluate each `logging_steps` steps
    overwrite_output_dir=True,      # reuse the output directory even if it already has content
    num_train_epochs=10,            # number of training epochs, feel free to tweak
    per_device_train_batch_size=8,  # the training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=8,  # accumulating the gradients before updating the weights
    #logging_steps=1000,             # evaluate, log and save model checkpoints every 1000 step
    #save_steps=1000,
    # load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    # save_total_limit=3,           # keep only 3 checkpoints on disk if space is limited
)

In [None]:
# Wire the model, training arguments, tokenized dataset and MLM collator
# into a Trainer. No evaluation dataset is passed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

In [None]:
# Run the training loop using the model, arguments, collator and dataset
# that were passed to the Trainer.
trainer.train()

In [None]:
# Write the trained model to the `model_path` directory so that it can be
# reloaded by path afterwards.
model_path = "model_path"
trainer.save_model(output_dir=model_path)

In [None]:
# Load the saved model into a fill-mask pipeline to try it out.
# The tokenizer is loaded explicitly as BertTokenizerFast (the same class used
# for training) rather than passed as a directory path. Given a path, the
# pipeline has to infer the tokenizer class from the files in that directory.
# NOTE(review): that directory holds only the files written by the tokenizer
# cells above, and its config.json has no model type, so the inference
# presumably fails; loading the class directly avoids relying on it.
fill_mask = pipeline(
    "fill-mask",
    model=model_path,
    tokenizer=BertTokenizerFast.from_pretrained(tokenizer_path),
)

In [None]:
# Query the pipeline with a sample command line whose final token is replaced
# by [MASK]; as the last expression of the cell, the result is displayed.
fill_mask("C: Windows Microsoft.NET Framework64 v4.0.30319 ngen.exe uninstall c:windows assembly nativeimages_v4.0 [MASK]")