In [None]:
!pip install transformers
!pip install tokenizers

In [None]:
from google.colab import drive

# Mount Google Drive at an absolute path. A relative mount point is resolved
# against the current working directory, which a later cell changes to a
# folder inside the mounted Drive; re-running this cell would then target a
# different location. The absolute path keeps the cell safe to re-run and
# matches the /content/gdrive prefix used when changing directory afterwards.
drive.mount('/content/gdrive')

In [None]:
cd /content/gdrive/My\ Drive/DeepLearning

In [None]:
from transformers import RobertaConfig
from transformers import RobertaTokenizerFast
from transformers import RobertaForMaskedLM

# Architecture of the model to be trained from scratch.
config = RobertaConfig(
    vocab_size=32000,             # must cover every id the tokenizer can emit
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

# Load the tokenizer files stored in ./pretrained. `model_max_length` is the
# documented name for the maximum sequence length (the older `max_len`
# spelling is only kept as a backward-compatibility fallback).
tokenizer = RobertaTokenizerFast.from_pretrained("./pretrained", model_max_length=512)

# Fail early with a clear message if the tokenizer can produce ids outside the
# embedding table, instead of hitting an opaque index error during training.
if len(tokenizer) > config.vocab_size:
    raise ValueError(
        f"Tokenizer has {len(tokenizer)} tokens but config.vocab_size is "
        f"{config.vocab_size}; increase vocab_size or retrain the tokenizer."
    )

# Freshly initialised (untrained) masked-language model built from the config.
model = RobertaForMaskedLM(config=config)

In [None]:
# Display the model's parameter count as a quick sanity check of the
# configuration it was built from.
model.num_parameters()

In [None]:
%%time
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./samples/corpus.txt",
    block_size=128,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters and checkpointing policy for the pre-training run.
training_args = TrainingArguments(
    output_dir="./logs",
    overwrite_output_dir=True,
    num_train_epochs=1,
    # `per_gpu_train_batch_size` is deprecated; `per_device_train_batch_size`
    # is its documented replacement and carries the same per-device meaning.
    per_device_train_batch_size=16,
    save_steps=10000,
    save_total_limit=2,
    # This flag lives on TrainingArguments; current Trainer releases no longer
    # accept it as a constructor keyword (passing it there raises TypeError).
    prediction_loss_only=True,
)

# Wire together the model, arguments, collator and dataset built in the
# earlier cells. No evaluation dataset is supplied.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
%%time
# Run the training loop; the %%time cell magic above reports how long this
# cell took, so readers know the cost of a full re-run.
trainer.train()

In [None]:
# Save the trained model into ./pretrained -- the same directory the tokenizer
# was loaded from.
trainer.save_model("./pretrained")