In [None]:
# !pip install -U adapter-transformers
# !conda install -y -c conda-forge tensorboard
# !conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch


In [1]:
from datasets import load_dataset

scierc_name = 'nsusemiehl/SciERC'
scierc_dataset = load_dataset(scierc_name)
print(scierc_dataset.num_rows)

Using custom data configuration nsusemiehl--SciERC-f57c64a52b9c80c0
Reusing dataset json (C:\Users\The Doctor\.cache\huggingface\datasets\json\nsusemiehl--SciERC-f57c64a52b9c80c0\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)
100%|██████████| 3/3 [00:00<00:00, 999.04it/s]

{'train': 3219, 'test': 974, 'validation': 455}





In [2]:
scierc_dataset['train'][255]

{'text': 'We present two [[ methods ]] for capturing << nonstationary chaos >> , then present a few examples including biological signals , ocean waves and traffic flow .',
 'label': 'USED-FOR',
 'metadata': [3, 3, 6, 7]}

In [3]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def encode_batch(batch):
  """Encodes a batch of input data using the model tokenizer."""
  return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

# Encode the input data
# NOTE: num_proc does not seem to work, for some reason it can't find the tokenizer
scierc_dataset = scierc_dataset.map(encode_batch, 
                                    batched=True, 
                                    remove_columns=scierc_dataset['train'].column_names, 
                                    )

# Main data processing function that will concatenate all texts from
# our dataset and generate chunks of block_size.

# NOTE: We may not need this due to our data being complete sentences
# block_size = 128
# def group_texts(examples):
#     concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
#     total_length = len(concatenated_examples[list(examples.keys())[0]])
#     result = {
#         k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
#         for k, t in concatenated_examples.items()
#     }
#     result["labels"] = result["input_ids"].copy()
#     return result

# NOTE: num_proc does not seem to work, for some reason it can't find the tokenizer  
# scierc_dataset = scierc_dataset.map(group_texts, batched=True)

def add_labels(examples):
  examples["labels"] = examples["input_ids"].copy()
  return examples
  
scierc_dataset = scierc_dataset.map(add_labels, batched=True)

Loading cached processed dataset at C:\Users\The Doctor\.cache\huggingface\datasets\json\nsusemiehl--SciERC-f57c64a52b9c80c0\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b\cache-322d59869b37225b.arrow
100%|██████████| 1/1 [00:00<00:00,  2.68ba/s]
100%|██████████| 1/1 [00:00<00:00,  6.45ba/s]
Loading cached processed dataset at C:\Users\The Doctor\.cache\huggingface\datasets\json\nsusemiehl--SciERC-f57c64a52b9c80c0\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b\cache-11adc37878be1af7.arrow
100%|██████████| 1/1 [00:00<00:00,  6.32ba/s]
100%|██████████| 1/1 [00:00<00:00, 13.32ba/s]


In [4]:
scierc_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


In [5]:
# Collater adds padding in the form of EOS tokens, makes data augmentations of random masking ('mlm_probability)
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [6]:
# creating model
from transformers import RobertaConfig
from transformers import RobertaAdapterModel

config = RobertaConfig.from_pretrained(
    "roberta-base",
    # num_labels=num_of_labels,
)
model = RobertaAdapterModel.from_pretrained(
    "roberta-base",
    config=config,
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaAdapterModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaAdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaAdapterModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Add new adapter
model.add_adapter("sci_erc")
# Add a matching language model head
model.add_masked_lm_head(
    "sci_erc",
)
# Activate the adapter
# model.set_active_adapters("sci_erc")
model.train_adapter('sci_erc')


In [12]:
import numpy as np
from transformers import TrainingArguments, AdapterTrainer, EvalPrediction

training_args = TrainingArguments(
    learning_rate=2e-5,
    num_train_epochs=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=10,
    output_dir="./training_output/pretraining",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=True,
    evaluation_strategy = 'steps',
    # load_best_model_at_end = True,
)


# Adding Support for Tensorboard, supposedly you don't have to do this, but I find that it doesn't work
from torch.utils.tensorboard import SummaryWriter
from transformers.integrations import TensorBoardCallback
writer = SummaryWriter()
writer = TensorBoardCallback(writer)

trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=scierc_dataset["train"],
    eval_dataset=scierc_dataset["validation"],
    data_collator=data_collator,
     
)

using `logging_steps` to initialize `eval_steps` to 10
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [13]:
trainer.train()

***** Running training *****
  Num examples = 3219
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 10100
  0%|          | 6/10100 [00:07<1:58:57,  1.41it/s] 

KeyboardInterrupt: 

In [None]:
trainer.evaluate(scierc_dataset['test'])