In [None]:
# Install the notebook's dependencies (PyTorch, adapter-transformers, TensorBoard, Optuna).
# NOTE(review): apart from cudatoolkit=11.3 no versions are pinned; consider pinning for reproducibility.
%conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch
%pip install -U adapter-transformers
%conda install -y -c conda-forge tensorboard
%pip install optuna

In [None]:
from datasets import load_dataset

# Hub identifier of the SciERC dataset used throughout the notebook.
scierc_name = 'nsusemiehl/SciERC'
scierc_dataset = load_dataset(path=scierc_name)
# Show how many rows were loaded.
print(scierc_dataset.num_rows)

In [None]:
# Inspect a single training example (row 255) to see the available fields.
scierc_dataset['train'][255]

This block creates the dataset for task-adaptive pretraining (TAPT).

In [None]:
from transformers import RobertaTokenizer

# Pretrained roberta-base tokenizer; the encoding helpers below read it as a global.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def encode_batch_pretraining(batch):
  """Tokenize the ``text`` field of a batch for pretraining.

  Every sequence is truncated and padded to exactly 128 tokens using the
  notebook-level ``tokenizer``; the tokenizer's output is returned as-is.
  """
  tokenizer_options = dict(truncation=True, padding="max_length", max_length=128)
  texts = batch["text"]
  return tokenizer(texts, **tokenizer_options)

# Tokenize the data for pretraining; the original columns are dropped so that
# only the tokenizer outputs remain.
# NOTE: num_proc does not seem to work, for some reason it can't find the tokenizer
scierc_dataset_pretraining = scierc_dataset.map(
    encode_batch_pretraining,
    remove_columns=scierc_dataset['train'].column_names,
    batched=True,
)

def add_labels(examples):
  """Add a ``labels`` entry that is a (shallow) copy of ``input_ids``.

  The batch is modified in place and the same object is returned.
  """
  token_ids = examples["input_ids"]
  examples["labels"] = token_ids.copy()
  return examples
  
# Attach the label column, then expose only the model inputs as PyTorch tensors.
scierc_dataset_pretraining = scierc_dataset_pretraining.map(function=add_labels, batched=True)
scierc_dataset_pretraining.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)



In [None]:
# The collator builds masked-language-modeling batches: it randomly masks
# tokens (`mlm_probability` = 15%) each time a batch is assembled.
from transformers import DataCollatorForLanguageModeling

# The roberta-base tokenizer already defines its own <pad> token, so the pad
# token is deliberately NOT overridden with EOS here. Overriding it would mutate
# the shared tokenizer and make data tokenized after this cell use a different
# padding id than data tokenized before it.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

Here we create the dataset for task fine-tuning.

In [None]:
# Collect the distinct label values of the training split and count them.
import numpy as np

labels = np.unique(scierc_dataset['train']['label'])
num_of_labels = len(labels)

print(labels)
print(num_of_labels)

In [None]:
# encoding the labels
def encode_labels(dataset, label_names=None):
    """Replace an example's ``label`` value with its index in the label vocabulary.

    Args:
        dataset: A single example (mapping) with a ``'label'`` entry. It is
            modified in place.
        label_names: Optional ordered sequence of label values; the index of
            the matching entry becomes the new label. When omitted, the
            notebook-level ``labels`` (first ``num_of_labels`` entries) is used.

    Returns:
        The same example object. If the label is not found in the vocabulary
        it is left unchanged.
    """
    if label_names is None:
        # Fall back to the vocabulary computed from the training split.
        label_names = labels[:num_of_labels]

    current = dataset['label']
    for index, name in enumerate(label_names):
        if current == name:
            dataset['label'] = index
            # Stop at the first match: continuing would compare the freshly
            # assigned integer against the remaining entries and could remap
            # it a second time when the vocabulary itself contains integers.
            break
    return dataset

# Apply the label encoding example by example.
# NOTE(review): this rebinds `scierc_dataset`; re-running the label-discovery
# cell afterwards would see the encoded labels rather than the original ones.
scierc_dataset = scierc_dataset.map(encode_labels)
# Inspect the first training example after encoding.
scierc_dataset['train'][0]

In [None]:
def encode_batch_finetuning(batch):
  """Tokenize the ``text`` field of a batch for fine-tuning.

  Every sequence is truncated and padded to exactly 80 tokens using the
  notebook-level ``tokenizer``; the tokenizer's output is returned as-is.
  """
  tokenizer_options = dict(max_length=80, truncation=True, padding="max_length")
  texts = batch["text"]
  return tokenizer(texts, **tokenizer_options)

# Tokenize the input data, then rename the target column: the transformers
# model expects the class column to be called "labels".
scierc_dataset_finetuning = (
    scierc_dataset
    .map(encode_batch_finetuning, batched=True)
    .rename_column("label", 'labels')
)
# Return PyTorch tensors and expose only the columns the model needs.
scierc_dataset_finetuning.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

# Model Creation

In [None]:
from transformers import RobertaConfig
from transformers import RobertaAdapterModel

def model_init(adapter_name = 'default_adapter', 
               num_lables = 0, 
               pretraining = False):
    """Build a roberta-base adapter model with one new adapter and a matching head.

    Args:
        adapter_name: Name given to both the new adapter and its prediction head.
        num_lables: Number of classes for the classification head (the
            misspelled name is kept so existing keyword calls keep working).
            ``0`` (the default) falls back to the notebook-level
            ``num_of_labels``. Ignored when ``pretraining`` is true.
        pretraining: When true a masked-language-modeling head is attached;
            otherwise a classification head is attached.

    Returns:
        The ``RobertaAdapterModel`` after ``add_adapter``, the head creation
        and ``train_adapter`` have been called for ``adapter_name``.
    """
    config_kwargs = {}
    if not pretraining:
        # Honour an explicit class count; previously this argument was ignored
        # and the global was always used.
        label_count = num_lables if num_lables else num_of_labels
        config_kwargs["num_labels"] = label_count

    config = RobertaConfig.from_pretrained("roberta-base", **config_kwargs)
    model = RobertaAdapterModel.from_pretrained("roberta-base", config=config)

    # Add new adapter
    model.add_adapter(adapter_name)

    if pretraining:
        # Add a matching masked-language-modeling head
        model.add_masked_lm_head(adapter_name)
    else:
        scierc_id2label = {0:'COMPARE', 1:'CONJUNCTION', 2:'EVALUATE-FOR', 
                           3:'FEATURE-OF', 4:'HYPONYM-OF', 5:'PART-OF', 6:'USED-FOR'}
        # Add a matching classification head. The SciERC names are only passed
        # when they line up with the requested number of classes.
        model.add_classification_head(
            adapter_name,
            num_labels=label_count,
            id2label=scierc_id2label if label_count == len(scierc_id2label) else None,
        )

    # Select this adapter for training
    model.train_adapter(adapter_name)

    return model

Pretraining Block

In [None]:
from transformers import TrainingArguments, AdapterTrainer
from datasets import load_metric
from torch.utils.tensorboard import SummaryWriter
from transformers.integrations import TensorBoardCallback

def pretraining_loop(num_models, training_args, dataset, data_collator, adapter_name):
    """Train and save ``num_models`` masked-LM adapter models, one after another.

    Args:
        num_models: How many models to train.
        training_args: ``TrainingArguments`` shared by every run.
            NOTE(review): all runs therefore share one ``output_dir``.
        dataset: Mapping with ``"train"`` and ``"validation"`` splits.
        data_collator: Collator passed to each ``AdapterTrainer``.
        adapter_name: Base name; run ``i`` uses ``f"{adapter_name}_{i}"`` for
            the adapter, the TensorBoard file suffix and the save directory.
    """
    for i in range(num_models):
        # Use a separate variable: reassigning `adapter_name` itself made the
        # suffixes accumulate across iterations ("x_0", "x_0_1", "x_0_1_2", ...).
        run_name = f"{adapter_name}_{i}"
        model = model_init(adapter_name = run_name, pretraining=True)

        # `filename_suffix` is an option of SummaryWriter; TensorBoardCallback
        # only takes the writer, so passing the suffix to it raised a TypeError.
        tb_writer = SummaryWriter(filename_suffix = run_name)
        tb_callback = TensorBoardCallback(tb_writer)

        trainer = AdapterTrainer(
            model=model,
            args=training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["validation"],
            data_collator=data_collator,  
            callbacks=[tb_callback] 
        )

        trainer.train()

        model.save_pretrained(run_name)

DAPT Training

In [2]:
# Hyperparameters for the adapter pretraining runs.
training_args = TrainingArguments(
    learning_rate=2e-5,
    num_train_epochs=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=10,
    output_dir="./training_output/pretraining/DAPT",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model:
    # the trainer must not strip columns such as "labels" before batching.
    remove_unused_columns=False,
    evaluation_strategy = 'steps',
    # load_best_model_at_end = True,
    save_steps = 100
)

NameError: name 'TrainingArguments' is not defined

In [None]:
# `dataset` is a required argument of pretraining_loop; leaving it commented
# out made this call fail with a TypeError.
# NOTE(review): `DAPT_dataset` is not defined in the visible cells, so the
# tokenized SciERC pretraining set is used here. Swap in the domain corpus once
# it is available — TODO confirm.
pretraining_loop(num_models = 5, 
                 training_args = training_args, 
                 dataset = scierc_dataset_pretraining, 
                 data_collator = data_collator, 
                 adapter_name = "DAPT_sci-erc")

Fine Tuning Models

In [None]:
from transformers import TrainingArguments, AdapterTrainer
from datasets import load_metric

# Hyperparameters for fine-tuning the classification adapter.
training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=10,
    # Dedicated directory: with overwrite_output_dir enabled, writing into the
    # pretraining folder would clobber the pretraining checkpoints.
    output_dir="./training_output/finetuning",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model:
    # the trainer must not strip the "labels" column before batching.
    remove_unused_columns=False,
    evaluation_strategy = 'steps',
    # load_best_model_at_end = True,
)


# F1 scorer used by `compute_metric`.
metric = load_metric('f1')

def compute_metric(EvalPrediction):
  """Compute macro-averaged F1 for one evaluation pass.

  Args:
    EvalPrediction: Object that unpacks into ``(logits, labels)``.

  Returns:
    The result of ``metric.compute`` for the arg-max predictions.
  """
  scores, gold = EvalPrediction
  predicted_ids = np.argmax(scores, axis=-1)
  return metric.compute(predictions=predicted_ids, references=gold, average= 'macro')

# Build the classification model explicitly so this cell also works on a fresh
# kernel (no `model` variable is created at notebook level before this point).
# NOTE(review): this starts from a new adapter; load a pretrained adapter here
# instead if that was the intent — TODO confirm.
model = model_init(pretraining=False)

trainer = AdapterTrainer(
    model=model,
    args=training_args,
    # Use the tokenized dataset: the raw `scierc_dataset` has no input_ids /
    # attention_mask columns.
    train_dataset=scierc_dataset_finetuning["train"],
    eval_dataset=scierc_dataset_finetuning["validation"],
    # No data_collator: the masked-LM collator from pretraining would mask the
    # inputs and replace the class labels with token labels.
    # The keyword is `compute_metrics` (plural).
    compute_metrics=compute_metric,
)

In [9]:
# Run fine-tuning with the trainer built in the previous cell.
trainer.train()



In [None]:
# Evaluate on the test split of the tokenized dataset; the raw `scierc_dataset`
# does not contain the input_ids / attention_mask columns the model needs.
trainer.evaluate(scierc_dataset_finetuning['test'])

In [1]:
# NOTE(review): scratch loop that only prints 0; it looks unrelated to the
# training pipeline and can probably be removed.
for i in range(1):
    print(i)

0
