In [1]:
import logging
# Logging is configured before `transformers` is imported; the cell output
# shows an INFO record emitted by transformers.file_utils at import time.
logging.basicConfig(level=logging.INFO)

from transformers import (
    CamembertTokenizer,
    RobertaConfig,
    RobertaForMaskedLM,
    DataCollatorForLanguageModeling,
    Trainer, 
    TrainingArguments
)

# thai2transformers: the repository checkout is added to sys.path so that the
# `thai2transformers` package can be imported below. The append must run
# before that import.
# NOTE(review): `os` is imported but not used in the visible cells.
import os, sys
sys.path.append('/workspace/thai2transformers/')

from thai2transformers.datasets import MLMDataset


INFO:transformers.file_utils:PyTorch version 1.5.0a0+8f84ded available.


In [2]:
# !pip install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension

In [3]:

# Build the tokenizer from the SentencePiece BPE model file.
# The path is absolute; adjust it if the repository lives elsewhere.
tokenizer = CamembertTokenizer(
    vocab_file='/workspace/thai2transformers/dataset/spm/th-wiki_only_20.7.2020_small_sentencepiece_for_camembert_16k/sentencepiece.bpe.model',
)


In [5]:
print(tokenizer.vocab_size)
# Model configuration, sized like RoBERTa-base.
# The keyword must be spelled `num_attention_heads` (plural). The earlier
# spelling `num_attention_head` is not a RobertaConfig parameter, so the value
# was never applied to the attention layers and the library default was used.
config = RobertaConfig(
    vocab_size=tokenizer.vocab_size,
    type_vocab_size=1,
    # roberta base as default
    num_hidden_layers=12,
    hidden_size=768,
    intermediate_size=3072,
    num_attention_heads=12,
    # roberta large (use these INSTEAD of the four values above):
    # num_hidden_layers=24,
    # hidden_size=1024,
    # intermediate_size=4096,
    # num_attention_heads=16,
)


16005


In [6]:

# Create the masked-LM model directly from `config`; no pretrained
# checkpoint is loaded in this cell.
model = RobertaForMaskedLM(config)


In [7]:

# Training split, tokenized with `tokenizer`.
# NOTE(review): 512 is presumably the maximum sequence length; confirm
# against the MLMDataset signature in thai2transformers.datasets.
train_dataset = MLMDataset(
    tokenizer,
    '../dataset/split/th-wiki_only_20.7.2020_small/train',
    512,
)



HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=90004), HTML(value='')))




In [8]:
# Validation split, built with the same tokenizer and the same third
# argument (512) as the training split.
eval_dataset = MLMDataset(
    tokenizer,
    '../dataset/split/th-wiki_only_20.7.2020_small/val',
    512,
)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=9999), HTML(value='')))




In [19]:
# Inspect the `bs` attribute of the validation dataset.
# NOTE(review): the meaning of `bs` is defined by MLMDataset in
# thai2transformers.datasets; confirm there before relying on it.
eval_dataset.bs

10000

In [23]:

# Masked-language-modeling collator (mlm=True, mlm_probability=0.15).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Training arguments, grouped by concern. Checkpoints are written to
# ./test_ckp/, and both saving and evaluation happen every 10 steps.
training_args = TrainingArguments(
    # reproducibility / precision
    seed=123,
    fp16=False,
    # schedule and batching
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    dataloader_drop_last=True,
    # optimizer
    learning_rate=4e-6,
    warmup_steps=500,
    weight_decay=0.01,
    adam_epsilon=1e-6,
    max_grad_norm=1.0,
    # checkpointing
    output_dir='./test_ckp/',
    save_steps=10,
    save_total_limit=1,
    # evaluation
    evaluate_during_training=True,
    eval_steps=10,
)

# Wire the model, arguments, collator and both datasets into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Training is left disabled in this cell; uncomment to start it.
# trainer.train()


INFO:transformers.training_args:PyTorch: setting up devices
INFO:transformers.trainer:Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


INFO:wandb.run_manager:file/dir created: /workspace/thai2transformers/notebooks/wandb/run-20200721_062303-3f6rf1gk/wandb-metadata.json
INFO:wandb.run_manager:system metrics and metadata threads started
INFO:wandb.run_manager:checking resume status, waiting at most 10 seconds
INFO:wandb.run_manager:resuming run from id: UnVuOnYxOjNmNnJmMWdrOmh1Z2dpbmdmYWNlOmxhbGl0YWw=
INFO:wandb.run_manager:upserting run before process can begin, waiting at most 10 seconds
INFO:wandb.run_manager:saving pip packages
INFO:wandb.run_manager:initializing streaming files api
INFO:wandb.run_manager:unblocking file change observer, beginning sync with W&B servers
INFO:wandb.run_manager:shutting down system stats and metadata service
INFO:wandb.run_manager:file/dir modified: /workspace/thai2transformers/notebooks/wandb/run-20200721_062303-3f6rf1gk/config.yaml
INFO:wandb.run_manager:file/dir created: /workspace/thai2transformers/notebooks/wandb/run-20200721_062303-3f6rf1gk/wandb-summary.json
INFO:wandb.run_manag

In [24]:
# Display the Trainer object's repr as a quick check that it exists.
trainer

<transformers.trainer.Trainer at 0x7fc12842bf60>

In [None]:

# Saving the model is currently disabled:
# trainer.save_model('test_lm')

# Run evaluation through the trainer. The log below reports 9999 examples
# at a batch size of 64.
trainer.evaluate()


INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 9999
INFO:transformers.trainer:  Batch size = 64


HBox(children=(IntProgress(value=0, description='Evaluation', max=156, style=ProgressStyle(description_width='…