In [1]:
import json
import os
from datasets import load_dataset, load_metric
from transformers import PreTrainedTokenizerFast
from transformers import RobertaConfig, RobertaModel, RobertaForCausalLM, EncoderDecoderModel
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
import numpy as np

# Restrict this process to the GPU with index 2. The variable is set here,
# before any cell touches CUDA, so that the restriction is in place when
# the device is first queried.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [2]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

In [3]:
# JSON files for the two splits; the records are read from the top-level
# "data" field of each file.
data_files = {
    'train': 'indomain_data/trial_en_es_train.json',
    'valid': 'indomain_data/trial_en_es_valid.json',
}
raw_datasets = load_dataset('json', data_files=data_files, field='data')

# sacreBLEU scorer, consumed by compute_metrics during evaluation.
metric = load_metric("sacrebleu")

Using custom data configuration default-f4cee673dc41c6a4
Reusing dataset json (/ldap_home/aurelio.prahara/.cache/huggingface/datasets/json/default-f4cee673dc41c6a4/0.0.0/793d004298099bd3c4e61eb7878475bcf1dc212bf2e34437d85126758720d7f9)


In [4]:
# Special-token attributes shared by the source (English) and target
# (Spanish) tokenizers, applied in this order.
special_tokens = {
    "unk_token": '[UNK]',
    "cls_token": '[CLS]',
    "sep_token": '[SEP]',
    "pad_token": '[PAD]',
    "mask_token": '[MASK]',
}

# NOTE(review): the two tokenizer files live in different directories
# ("en_pt_tokenizers" vs "bpe_en_es") - confirm this pairing is intended.
en_tokenizer = PreTrainedTokenizerFast(tokenizer_file="en_pt_tokenizers/en_tok.json")
es_tokenizer = PreTrainedTokenizerFast(tokenizer_file='bpe_en_es/es_tok.json')

for tokenizer in (en_tokenizer, es_tokenizer):
    for attribute, token in special_tokens.items():
        setattr(tokenizer, attribute, token)

In [5]:
# Fixed token lengths that source and target sequences are padded/truncated to.
encoder_max_length, decoder_max_length = 256, 256

# Keys of each "translation" record: the source text and the target text.
source_lang, target_lang = 'en', 'es'

In [6]:
def process_data_to_model_inputs(batch):
  """Tokenize a batch of translation pairs into encoder/decoder model inputs.

  Args:
    batch: mapping with a "translation" entry holding a list of records,
      each indexable by `source_lang` and `target_lang`.

  Returns:
    The same `batch` object with "input_ids", "attention_mask",
    "decoder_input_ids", "decoder_attention_mask" and "labels" added.
    Source text is tokenized with `en_tokenizer`, target text with
    `es_tokenizer`; both are padded/truncated to the configured max lengths.
  """
  source_texts = [ex[source_lang] for ex in batch["translation"]]
  target_texts = [ex[target_lang] for ex in batch["translation"]]
  inputs = en_tokenizer(source_texts, padding="max_length", truncation=True, max_length=encoder_max_length)
  outputs = es_tokenizer(target_texts, padding="max_length", truncation=True, max_length=decoder_max_length)

  batch["input_ids"] = inputs.input_ids
  batch["attention_mask"] = inputs.attention_mask
  batch["decoder_input_ids"] = outputs.input_ids
  batch["decoder_attention_mask"] = outputs.attention_mask

  # Labels start out identical to `decoder_input_ids`.
  # NOTE(review): this relies on the decoder shifting the labels internally,
  # as the original comment states - confirm for the installed transformers
  # version.
  # Padding positions are replaced by -100 so the loss ignores them. The
  # labels are produced by the *target* tokenizer, so its pad id is the one
  # to compare against (the source tokenizer's pad id may differ).
  target_pad_id = es_tokenizer.pad_token_id
  batch["labels"] = [
    [-100 if token == target_pad_id else token for token in label_ids]
    for label_ids in outputs.input_ids
  ]

  return batch

In [7]:
# Tokenize every split in batches of 16 examples; the raw "translation"
# column is dropped once the model inputs have been derived from it.
tokenized_datasets = raw_datasets.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=16,
    remove_columns=['translation'],
)

Loading cached processed dataset at /ldap_home/aurelio.prahara/.cache/huggingface/datasets/json/default-f4cee673dc41c6a4/0.0.0/793d004298099bd3c4e61eb7878475bcf1dc212bf2e34437d85126758720d7f9/cache-5f6c21a4c87d26ec.arrow
Loading cached processed dataset at /ldap_home/aurelio.prahara/.cache/huggingface/datasets/json/default-f4cee673dc41c6a4/0.0.0/793d004298099bd3c4e61eb7878475bcf1dc212bf2e34437d85126758720d7f9/cache-00fa88aa90adcba6.arrow


In [8]:
# Columns handed to the model; they are returned as torch tensors.
model_columns = [
    "input_ids",
    "attention_mask",
    "decoder_input_ids",
    "decoder_attention_mask",
    "labels",
]
tokenized_datasets.set_format(type="torch", columns=model_columns)

In [9]:
# Sanity check: padded label positions should show up as -100.
train_labels = tokenized_datasets['train']['labels']
print(train_labels)

tensor([[ 345,  431,   15,  ..., -100, -100, -100],
        [ 408,  817,   17,  ..., -100, -100, -100],
        [ 293,  103, 1261,  ..., -100, -100, -100],
        ...,
        [ 758,  169,  626,  ..., -100, -100, -100],
        [ 270,   15, 2451,  ..., -100, -100, -100],
        [ 293, 1367,   15,  ..., -100, -100, -100]])


In [10]:
# NOTE(review): `vocabsize` is not referenced anywhere else in the visible
# notebook - confirm whether it is still needed.
vocabsize = 30000

# Load the encoder-decoder model from the saved checkpoint directory.
checkpoint_dir = "roberta_pt_checkpoints/checkpoint-115416"
model = EncoderDecoderModel.from_pretrained(checkpoint_dir)

Some weights of EncoderDecoderModel were not initialized from the model checkpoint at roberta_pt_checkpoints/checkpoint-115416 and are newly initialized: ['encoder.encoder.layer.1.crossattention.self.query.bias', 'encoder.encoder.layer.1.crossattention.self.query.weight', 'encoder.encoder.layer.2.crossattention.self.value.bias', 'encoder.encoder.layer.6.crossattention.output.dense.bias', 'encoder.encoder.layer.4.crossattention.output.dense.bias', 'encoder.encoder.layer.1.crossattention.self.value.bias', 'encoder.encoder.layer.7.crossattention.output.dense.bias', 'encoder.encoder.layer.2.crossattention.output.LayerNorm.bias', 'encoder.encoder.layer.4.crossattention.self.query.weight', 'encoder.encoder.layer.0.crossattention.self.query.bias', 'encoder.encoder.layer.0.crossattention.self.key.weight', 'encoder.encoder.layer.3.crossattention.output.LayerNorm.weight', 'encoder.encoder.layer.6.crossattention.self.value.bias', 'encoder.encoder.layer.0.crossattention.self.key.bias', 'encoder.en

In [11]:
# freeze all of the layers in the encoder 
# for param in model.encoder.parameters():
    # param.requires_grad = False

In [12]:
# freeze the embedding only
# for param in model.encoder.embeddings.parameters():
    # param.requires_grad = False

In [13]:
# freeze the first n layers in the encoder 
# n = 1
# for param in model.encoder.encoder.layer[:n].parameters():
    # param.requires_grad = False

In [14]:
# Check that the trainable-parameter count matches what is expected after
# any (optional) freezing done in the cells above.
total_params = model.num_parameters()
trainable_params = model.num_parameters(only_trainable=True)
print("number of total parameters:", total_params)
print("number of trainable parameters: ", trainable_params)

number of total parameters: 200100144
number of trainable parameters:  200100144


In [15]:
# Move the model to the selected device. This is the last expression of the
# cell, so the returned module is echoed as the cell output (the printed
# architecture below).
model.to(device)

EncoderDecoderModel(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=1)
      (position_embeddings): Embedding(1024, 768, padding_idx=1)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerN

In [16]:
# Special-token ids used on the decoder side. The decoder emits
# target-language (Spanish) ids, so they are read from `es_tokenizer`
# rather than from the source tokenizer, whose ids may differ.
model.config.decoder_start_token_id = es_tokenizer.cls_token_id
model.config.eos_token_id = es_tokenizer.sep_token_id
model.config.pad_token_id = es_tokenizer.pad_token_id
# The output vocabulary is the decoder's, not the encoder's.
model.config.vocab_size = model.config.decoder.vocab_size

In [17]:
# Per-device batch size shared by training and evaluation.
batch_size = 24

# Training configuration, collected as keyword arguments first.
# NOTE(review): `save_strategy="no"` together with `save_total_limit=10`
# looks contradictory - confirm whether checkpoints are meant to be saved.
training_options = dict(
    output_dir="roberta_pt_es_checkpoints/",
    evaluation_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # learning_rate=6e-04,
    weight_decay=1e-5,
    max_grad_norm=1.0,
    warmup_steps=4000,
    # label_smoothing_factor=0.1,
    num_train_epochs=20,
    predict_with_generate=True,
    fp16=True,
    save_strategy="no",
    save_total_limit=10,
)
args = Seq2SeqTrainingArguments(**training_options)

In [18]:
def postprocess_text(preds, labels):
    """Strip whitespace from decoded texts and wrap each label in a list.

    Args:
        preds: iterable of predicted strings.
        labels: iterable of reference strings.

    Returns:
        A `(preds, labels)` tuple: the stripped predictions as a list of
        strings, and the stripped references as a list of one-element lists.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels

In [19]:
def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for an evaluation pass.

    Args:
        eval_preds: object exposing `predictions` (generated token ids, or a
            tuple whose first element holds them) and `label_ids`
            (reference token ids with -100 at ignored positions).

    Returns:
        dict with "bleu" (the metric's "score") and "gen_len" (mean number
        of non-pad tokens per prediction), both rounded to 4 decimals.
    """
    pad_id = es_tokenizer.pad_token_id

    pred_ids = eval_preds.predictions
    if isinstance(pred_ids, tuple):
        # Some models return extra outputs; the token ids come first.
        pred_ids = pred_ids[0]

    # -100 is not a valid token id, so swap it for the pad id before decoding.
    # np.where builds new arrays, leaving the caller's arrays untouched
    # (the previous in-place assignment modified `eval_preds.label_ids`).
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)
    labels_ids = np.where(eval_preds.label_ids != -100, eval_preds.label_ids, pad_id)

    pred_str = es_tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = es_tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # postprocessing
    pred_str, label_str = postprocess_text(pred_str, label_str)

    score = metric.compute(predictions=pred_str, references=label_str)
    result = {"bleu": score["score"]}

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in pred_ids]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [20]:
# Build the trainer from the model, arguments, datasets and metric function.
# The target tokenizer is passed because, during generation-based
# evaluation, the trainer pads generated sequences and labels, and those
# hold target-language (Spanish) ids.
# NOTE(review): assumes the trainer takes its padding id from this tokenizer -
# confirm for the installed transformers version.
# Inputs are already padded to a fixed length in
# process_data_to_model_inputs, so no source-side padding is needed here.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    tokenizer=es_tokenizer,
    compute_metrics=compute_metrics
)

Using amp fp16 backend


In [21]:
# Start training with the configured trainer. In the recorded run below the
# cell was interrupted manually (KeyboardInterrupt) after two evaluated epochs.
trainer.train()

***** Running training *****
  Num examples = 21690
  Num Epochs = 20
  Instantaneous batch size per device = 24
  Total train batch size (w. parallel, distributed & accumulation) = 24
  Gradient Accumulation steps = 1
  Total optimization steps = 18080


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,11.2772,6.090669,0.0554,20.0
2,5.7577,5.614186,0.0532,20.0


***** Running Evaluation *****
  Num examples = 2409
  Batch size = 24
***** Running Evaluation *****
  Num examples = 2409
  Batch size = 24


KeyboardInterrupt: 