### Imports

In [1]:
import os
import sys
from collections import OrderedDict
from pathlib import Path

import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments

# Add the parent directory to sys.path so the local `src` package below can be imported.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import src.domlm as model
import src.dataset as dataset
from src.data_collator import DataCollatorForDOMNodeMask

### Set up the model and load RoBERTa weights

In [2]:
# Pretrained RoBERTa tokenizer and encoder; the encoder's config and weights seed DOMLM.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
roberta = AutoModel.from_pretrained("roberta-base")
roberta_config = roberta.config

# Start from RoBERTa's config and override only the identifying fields.
roberta_config_dict = roberta_config.to_dict()
roberta_config_dict.update(
    _name_or_path="domlm",
    architectures=["DOMLMForMaskedLM"],
)
domlm_config = model.DOMLMConfig.from_dict(roberta_config_dict)
# Uncomment to persist the derived config:
# domlm_config.save_pretrained("../domlm-config/")
domlm = model.DOMLMForMaskedLM(domlm_config)

# Re-key the RoBERTa weights under the "domlm." prefix before loading them into DOMLM.
state_dict = OrderedDict(
    (f"domlm.{param_name}", weights)
    for param_name, weights in roberta.state_dict().items()
)
# strict=False: keys that do not line up are reported in the returned
# _IncompatibleKeys (shown as this cell's output) instead of raising.
domlm.load_state_dict(state_dict, strict=False)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


_IncompatibleKeys(missing_keys=['domlm.embeddings.tree_embeddings.node_embeddings.weight', 'domlm.embeddings.tree_embeddings.parent_embeddings.weight', 'domlm.embeddings.tree_embeddings.sibling_embeddings.weight', 'domlm.embeddings.tree_embeddings.depth_embeddings.weight', 'domlm.embeddings.tree_embeddings.tag_embeddings.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'], unexpected_keys=['domlm.pooler.dense.weight', 'domlm.pooler.dense.bias'])

### Prepare SWDE data

In [3]:
# Directory holding the processed SWDE subset, relative to this notebook.
dataset_path = "../../dataset/SWDE_Dataset/processed_subset/"
print(f"Loading datasets from {dataset_path}...")

# Both datasets read from the same directory; the second one requests split="test".
train_ds = dataset.SWDEDataset(dataset_path)
test_ds = dataset.SWDEDataset(dataset_path, split="test")

# tokenizer.pad_token = tokenizer.eos_token # why do we need this?

# Collator used for both the manual DataLoader and the Trainer.
data_collator = DataCollatorForDOMNodeMask(
    tokenizer=tokenizer,
    mlm_probability=0.15,
)

Loading datasets from ../../dataset/SWDE_Dataset/processed_subset/...


In [4]:
# DataLoader is imported in the imports cell at the top of the notebook.
# Small standalone loader (batch_size=2) for inspecting what the collator produces.
train_dataloader = DataLoader(train_ds, collate_fn=data_collator, batch_size=2)

In [6]:
# Pull the first collated batch from the loader for manual inspection.
_batch_iter = iter(train_dataloader)
batch = next(_batch_iter)

### Train in Masked LM fashion

In [None]:
# TODO: add evaluation metrics (ppl, etc.)

# fp16 mixed precision is only available on CUDA devices; asking for it on a
# CPU-only machine makes TrainingArguments raise. Enable it only when a GPU is
# present so the cell also runs (in full precision) without one.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    # optim="adamw_apex_fused", # only with apex installed
    weight_decay=0.01,
    num_train_epochs=5,
    warmup_ratio=0.1,
    learning_rate=1e-4,
    # 16 samples per device x 4 accumulation steps = 64 samples per optimizer step (per device).
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    # gradient_checkpointing=True, # vram is enough without checkpointing
    fp16=use_fp16,  # If Ampere: bf16 = True
    # tf32 = True, # Ampere Only
    dataloader_num_workers=8,
    dataloader_pin_memory=True,
)

# The Trainer uses the same collator as the manual DataLoader cell.
trainer = Trainer(
    model=domlm,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator=data_collator,
)

trainer.train()

