In [None]:
from transformers import (
    BertForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    LineByLineTextDataset,
    Trainer,
    TrainingArguments,
)
import torch
import pandas as pd
from pathlib import Path

In [None]:
raw_descriptions = Path("data/train_descriptions.txt")

# Build the one-description-per-line corpus only once; later runs reuse the file.
if not raw_descriptions.exists():
    train_frame = pd.read_csv("data/train.tsv", sep="\t", header=None)
    # Column 0 becomes the key and column 1 the text. Going through a dict keeps
    # a single description per key (for a repeated key the last row wins).
    descriptions = train_frame.set_index(0).to_dict()

    # pandas returns missing cells as NaN (a float) and may parse purely numeric
    # cells as numbers; both would make `value + "\n"` raise TypeError. Skip the
    # missing ones and stringify the rest.
    corpus_lines = [
        str(value) for value in descriptions[1].values() if not pd.isna(value)
    ]

    # Write to a sibling file and rename at the end, so an interrupted run never
    # leaves a truncated corpus that the `exists()` guard would accept next time.
    # UTF-8 is set explicitly so the output does not depend on the platform's
    # default encoding.
    partial_path = raw_descriptions.with_name(raw_descriptions.name + ".partial")
    with open(partial_path, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in corpus_lines)
    partial_path.replace(raw_descriptions)

In [None]:
model_name = "bert-base-uncased"

# Masked-language-model head on top of the pretrained BERT encoder.
model = BertForMaskedLM.from_pretrained(model_name)
# Use the GPU when one is present, otherwise fall back to CPU so the cell does
# not crash on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# CLS token is added by the tokenizer; inputs are capped at 512 tokens.
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)

# One training example per line of the descriptions file built earlier.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer, file_path=raw_descriptions, block_size=512
)
# Masked-LM collation: 15% of the tokens are selected for masking per batch.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

training_args = TrainingArguments(
    output_dir="./tune",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=200,  # checkpoint every 200 optimizer steps
    save_total_limit=2,  # keep only the two most recent checkpoints
    logging_steps=50,
    report_to="none",  # no external experiment tracker
)

trainer = Trainer(
    model=model, args=training_args, data_collator=data_collator, train_dataset=dataset
)

In [None]:
# Run the masked-LM fine-tuning configured above. Checkpoints go to ./tune
# every 200 steps (two most recent kept); the call's return value is shown as
# the cell output.
# NOTE(review): nothing in this cell saves the final model explicitly — confirm
# whether a later cell does, or whether the last periodic checkpoint is enough.
trainer.train()