In [None]:
#Loading Libraries
# %conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch
# %pip install -U adapter-transformers
# %conda install -y -c conda-forge tensorboard
# %pip install optuna
# %pip install tqdm
# from tqdm.notebook import tqdm

In [1]:
# Loading dataset
from datasets import load_dataset

# Hugging Face Hub identifier of the dataset to download.
# dataset_id = 'nsusemiehl/SciERC'
dataset_id = 'zapsdcn/citation_intent'

dataset = load_dataset(dataset_id)

# Short name (Hub namespace stripped); later cells use it to build adapter names.
# Kept separate from `dataset_id` so one variable never means two different things.
dataset_name = dataset_id.split('/')[-1]

print(dataset.num_rows)

Using custom data configuration zapsdcn--citation_intent-0b0f6658161cc990
Reusing dataset json (C:\Users\The Doctor\.cache\huggingface\datasets\json\zapsdcn--citation_intent-0b0f6658161cc990\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)
100%|██████████| 3/3 [00:00<00:00, 1498.68it/s]

{'train': 1688, 'test': 139, 'validation': 114}





In [2]:
dataset['train'][255]

{'text': 'There have been several efforts aimed at developing a domain-independent method for generating responses from a frame representation of user requests ( Bobrow et al. , 1977 ; Chu-Carroll , 1999 ) .',
 'label': 'Future',
 'metadata': {}}

This block creates the dataset used for pretraining.

In [3]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# Tokenize the set for the transformer
def encode_batch_pretraining(batch):
    """Tokenize the "text" entries of a batch with the notebook-level tokenizer.

    Sequences are truncated and padded so that each one is exactly 128 tokens long.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, max_length=128, padding="max_length", truncation=True)
    return encoded

# Tokenize every split; the original columns are dropped so only tokenizer outputs remain.
# NOTE: num_proc does not seem to work, for some reason it can't find the tokenizer
raw_column_names = dataset['train'].column_names
print(raw_column_names)
dataset_pretraining = dataset.map(
    encode_batch_pretraining,
    batched=True,
    remove_columns=raw_column_names,
)

# We make the labels the same as the input as this is language learning 
def add_labels(examples):
    """Attach a "labels" entry that is a shallow copy of "input_ids".

    The batch is modified in place and the same object is returned.
    """
    token_ids = examples["input_ids"]
    examples["labels"] = token_ids.copy()
    return examples
  
# Add the "labels" column to every split; `dataset_pretraining` is re-bound to the mapped result.
dataset_pretraining = dataset_pretraining.map(add_labels, batched=True)
# Return PyTorch tensors and expose only the three columns listed here.
dataset_pretraining.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])



['text', 'label', 'metadata']


Loading cached processed dataset at C:\Users\The Doctor\.cache\huggingface\datasets\json\zapsdcn--citation_intent-0b0f6658161cc990\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b\cache-69c4b714d94b638b.arrow
100%|██████████| 1/1 [00:00<00:00,  9.34ba/s]
100%|██████████| 1/1 [00:00<00:00, 15.14ba/s]
Loading cached processed dataset at C:\Users\The Doctor\.cache\huggingface\datasets\json\zapsdcn--citation_intent-0b0f6658161cc990\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b\cache-c2962fc358ac477f.arrow
100%|██████████| 1/1 [00:00<00:00, 39.96ba/s]
100%|██████████| 1/1 [00:00<00:00, 47.58ba/s]


In [4]:
# The collator performs the masked-language-modelling augmentation: it randomly selects
# tokens to mask (`mlm_probability`) and builds the matching labels for each batch.
from transformers import DataCollatorForLanguageModeling

# NOTE: the tokenizer's own pad token is kept on purpose. The sequences were already
# padded with it during tokenization, so re-pointing `pad_token` to the EOS token would
# stop the collator from treating those padding positions as special tokens, making
# them candidates for masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

Here we create the dataset used for task fine-tuning.

In [5]:
# Finding the number of labels
import numpy as np
# Distinct values of the training split's "label" column, as returned by np.unique.
train_label_column = np.array(dataset['train']['label'])
labels = np.unique(train_label_column)
num_of_labels = labels.size

print(labels, num_of_labels, sep='\n')

['Background' 'CompareOrContrast' 'Extends' 'Future' 'Motivation' 'Uses']
6


In [None]:
# encoding the labels
def encode_labels(dataset, label_names=None):
    """Replace the string class name of a single example with its integer id.

    Args:
        dataset (dict): One example (a row, not a whole dataset) holding a 'label' entry.
        label_names (sequence of str, optional): Class names ordered by id. Defaults to
            the notebook-level `labels` array.

    Returns:
        dict: The same example object, with 'label' set to the index of its class name.
            Values that are not a known class name (for instance integer ids written
            by an earlier run of this cell) are left untouched.
    """
    names = labels if label_names is None else label_names

    # First occurrence wins, matching a front-to-back scan of the names.
    label_to_id = {}
    for index, name in enumerate(names):
        label_to_id.setdefault(str(name), index)

    current = dataset['label']
    # Only strings are looked up, so an already-encoded integer id is never compared
    # against the class names.
    if isinstance(current, str):
        dataset['label'] = label_to_id.get(current, current)
    return dataset

# Encode the labels of every split. The unused "metadata" column is dropped whenever
# the dataset has one. Checking the columns (instead of comparing `dataset_name`
# against a Hub id) keeps this working when `dataset_name` holds only the short name.
if "metadata" in dataset['train'].column_names:
    dataset = dataset.map(encode_labels, remove_columns=["metadata"])
else:
    dataset = dataset.map(encode_labels)
dataset['train'][0]

In [None]:
def encode_batch_finetuning(batch):
    """Tokenize the "text" entries of a batch with the notebook-level tokenizer.

    Sequences are truncated and padded so that each one is exactly 128 tokens long.
    """
    tokenizer_options = {"max_length": 128, "truncation": True, "padding": "max_length"}
    return tokenizer(batch["text"], **tokenizer_options)

# Tokenize every split, then rename the target column: the transformers model
# expects the class ids under "labels".
dataset_finetuning = dataset.map(encode_batch_finetuning, batched=True).rename_column("label", 'labels')
# Return PyTorch tensors restricted to the required columns.
model_input_columns = ["input_ids", "attention_mask", "labels"]
dataset_finetuning.set_format(type="torch", columns=model_input_columns)

100%|██████████| 2/2 [00:00<00:00,  3.82ba/s]
100%|██████████| 1/1 [00:00<00:00, 21.26ba/s]
100%|██████████| 1/1 [00:00<00:00, 27.00ba/s]


# Model Creation

In [None]:
from transformers import RobertaConfig
from transformers import RobertaAdapterModel

def model_init(adapter_name = 'default_adapter', 
               num_lables = 0, 
               pretraining = False,
               load_adapter = False,
               adapter_dir = 'path',
               id2label = None):
    """Build a roberta-base adapter model with one trainable adapter and a matching head.

    Args:
        adapter_name (str): Name under which the adapter and its prediction head are
            registered. Defaults to 'default_adapter'.
        num_lables (int, optional): Number of classes for the classification head.
            Not used when pretraining is True. Defaults to 0.
        pretraining (bool, optional): If True a masked-language-modelling head is added,
            otherwise a classification head. Defaults to False.
        load_adapter (bool, optional): If True the adapter is loaded from adapter_dir
            instead of being created from scratch. Defaults to False.
        adapter_dir (str, optional): Location of the saved adapter; only read when
            load_adapter is True. Defaults to 'path'.
        id2label (dict, optional): Mapping from class id to class name for the
            classification head. Defaults to the six citation-intent classes.

    Returns:
        RobertaAdapterModel: The model, with `adapter_name` set up for adapter training.
    """
    # Both modes start from the same pretrained backbone; only the classification
    # setup puts the label count into the config.
    if pretraining:
        config = RobertaConfig.from_pretrained("roberta-base")
    else:
        config = RobertaConfig.from_pretrained("roberta-base", num_labels=num_lables)

    model = RobertaAdapterModel.from_pretrained("roberta-base", config=config)

    if load_adapter:
        # Register the saved adapter under `adapter_name`, so the head and the
        # train_adapter call below refer to it even if it was saved under another name.
        model.load_adapter(adapter_dir, load_as=adapter_name)
    else:
        # Create a fresh adapter.
        model.add_adapter(adapter_name)

    if pretraining:
        # Masked-language-modelling head named after the adapter.
        model.add_masked_lm_head(adapter_name)
    else:
        if id2label is None:
            id2label = {0:'Background', 1:'CompareOrContrast', 2:'Extends',
                        3:'Future', 4:'Motivation', 5:'Uses'}
        # Classification head named after the adapter; overwrite_ok lets it replace
        # a head of the same name if one is already present.
        model.add_classification_head(
            adapter_name,
            num_labels=num_lables,
            id2label=id2label,
            overwrite_ok = True)

    # Activate the adapter for training.
    model.train_adapter(adapter_name)

    return model

Pretraining Block

In [None]:
from transformers import TrainingArguments, AdapterTrainer
from datasets import load_metric
from torch.utils.tensorboard import SummaryWriter
from transformers.integrations import TensorBoardCallback

import json

def pretraining_loop(num_models, training_args, dataset, 
                     data_collator, adapter_name, 
                    #  DAPT_n_TAPT, TAPT_dataset
                     ):
    """Pretrain and evaluate `num_models` adapter models to account for run-to-run variance.

    For each index i, a fresh model with an adapter named f"{adapter_name}_{i}" is
    trained on the 'train' split (evaluated on 'validation' during training) and then
    scored on the 'test' split. The test metrics are appended as one line to
    "evaulations.txt" inside `training_args.output_dir`, and the adapter is saved
    (without its head) to the same directory.

    Args:
        num_models (int): Number of models to loop through
        training_args (transformers.TrainingArguments): The arguments to pass to the trainer
        dataset (dataset): The dataset to train on; must provide 'train', 'validation' and 'test'
        data_collator (data_collator): The data collator for the trainer to use
        adapter_name (str): Prefix of the adapter names to create
    """
    import os  # local import so this cell does not depend on another cell importing it

    for i in range(num_models):
        adapter = f"{adapter_name}_{i}"
        model = model_init(adapter_name = adapter, pretraining=True)

        # One TensorBoard log directory per model.
        tb_callback = TensorBoardCallback(SummaryWriter(log_dir= f'runs/{adapter}'))

        trainer = AdapterTrainer(
            model=model,
            args=training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["validation"],
            data_collator=data_collator,
            callbacks=[tb_callback]
        )

        trainer.train()

        # Evaluate before touching the results file, so a failing evaluation cannot
        # leave a half-written line or an open file handle behind.
        test_metrics = trainer.evaluate(dataset['test'])

        os.makedirs(training_args.output_dir, exist_ok=True)
        with open(f"{training_args.output_dir}/evaulations.txt", "a") as f:
            f.write(adapter)
            f.write(json.dumps(test_metrics))
            f.write('\n')

        model.save_all_adapters(training_args.output_dir, with_head=False)
        # model.save_pretrained(f"{adapter_name}")
        
        # if DAPT_n_TAPT:
        #     trainer = AdapterTrainer(
        #         model=model,
        #         args=training_args,
        #         train_dataset=TAPT_dataset["train"],
        #         eval_dataset=TAPT_dataset["validation"],
        #         data_collator=data_collator,  
        #         callbacks=[writer] 
        #     )
            
        #     trainer.train()
        
        #     f = open("DAPT_TAPT_evaulations.txt", "a")
        #     f.write(adapter_name)
        #     f.write(trainer.evaluate(TAPT_dataset['test']))
        #     f.write('\n')
        #     f.close()
            
        #     model.save_pretrained(f"{adapter_name}_DAPT_TAPT")

DAPT Training

In [None]:
# training_args = TrainingArguments(
#     learning_rate=5e-4,
#     num_train_epochs=1,
#     per_device_train_batch_size=32,
#     per_device_eval_batch_size=32,
#     logging_steps=100,
#     output_dir="./training_output/pretraining/DAPT",
#     overwrite_output_dir=True,
#     # The next line is important to ensure the dataset labels are properly passed to the model
#     remove_unused_columns=True,
#     evaluation_strategy = 'steps',
#     # load_best_model_at_end = True,
#     save_steps = 100,
#     gradient_accumulation_steps = 64,
#     warmup_ratio = 0.06,
#     weight_decay=0.01,
#     adam_epsilon = 1e-6,
# )

In [None]:
# pretraining_loop(num_models = 5, 
#                  training_args = training_args, 
#                 #  dataset = DAPT_dataset, TODO: Need to add DAPT training set
#                  data_collator = data_collator, 
#                  adapter_name = "DAPT_sci-erc")

DAPT+TAPT Training

TAPT Training

In [7]:
training_args = TrainingArguments(
    # Output location
    output_dir="./training_output/pretraining/TAPT",
    overwrite_output_dir=True,
    # Schedule and batch sizes
    num_train_epochs=100,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=8,
    # Optimizer settings
    learning_rate=1e-4,
    warmup_ratio=0.06,
    weight_decay=0.01,
    adam_epsilon=1e-6,
    # Logging, evaluation and checkpoint cadence
    logging_steps=10,
    evaluation_strategy='steps',
    save_steps=100,
    # load_best_model_at_end=True,
    remove_unused_columns=True,
)

NameError: name 'TrainingArguments' is not defined

In [11]:
# Task-adaptive pretraining: two runs, adapters named "TAPT_<dataset_name>_<i>".
pretraining_loop(
    num_models=2,
    training_args=training_args,
    dataset=dataset_pretraining,
    data_collator=data_collator,
    adapter_name=f"TAPT_{dataset_name}",
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaAdapterModel: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaAdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaAdapterModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You

KeyboardInterrupt: 

Fine Tuning Models

In [None]:
from datasets import load_metric
metric = load_metric('f1')

def compute_metric(EvalPrediction):
    """Score predictions with the notebook-level `metric`, macro-averaged.

    `EvalPrediction` is unpacked into (logits, references); the predicted class of
    each row is the argmax over the last axis of the logits.
    """
    scores, references = EvalPrediction
    predicted_classes = np.argmax(scores, axis=-1)
    result = metric.compute(predictions=predicted_classes, references=references, average= 'macro')
    return result

In [None]:
def finetuning_loop(num_models, training_args, dataset, adapter_name, num_labels, load_adapter = False, adapter_dir = 'Path'):
    """Fine-tune and evaluate `num_models` classification models to account for run-to-run variance.

    For each index i, a model with an adapter named f"{adapter_name}_{i}" is built
    (loaded from f"{adapter_dir}/{adapter_name}_{i}" when load_adapter is True),
    trained on the 'train' split (evaluated on 'validation' during training) and then
    scored on the 'test' split. The test metrics are appended as one line to
    "evaulations.txt" inside `training_args.output_dir`, and the adapter is saved to
    the same directory.

    Args:
        num_models (int): Number of models to loop through
        training_args (transformers.TrainingArguments): The arguments to pass to the trainer
        dataset (dataset): The dataset to train on; must provide 'train', 'validation' and 'test'
        adapter_name (str): Prefix of the adapter names to create/load
        num_labels (int): Number of labels for classification task
        load_adapter (bool, optional): Whether to load the adapter instead of creating it. Defaults to False.
        adapter_dir (str, optional): Directory holding one sub-directory per adapter to load
            when load_adapter is True. Defaults to 'Path'.
    """
    import os  # local import so this cell does not depend on another cell importing it

    for i in range(num_models):
        adapter = f"{adapter_name}_{i}"
        model = model_init(adapter_name = adapter, num_lables = num_labels, pretraining=False, load_adapter = load_adapter, adapter_dir = f"{adapter_dir}/{adapter}")

        # One TensorBoard log directory per model.
        tb_callback = TensorBoardCallback(SummaryWriter(log_dir= f'runs/{adapter}'))

        trainer = AdapterTrainer(
            model=model,
            args=training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["validation"],
            callbacks=[tb_callback],
            compute_metrics = compute_metric
        )

        trainer.train()

        # Evaluate before touching the results file, so a failing evaluation cannot
        # leave a half-written line or an open file handle behind.
        test_metrics = trainer.evaluate(dataset['test'])

        os.makedirs(training_args.output_dir, exist_ok=True)
        with open(f"{training_args.output_dir}/evaulations.txt", "a") as f:
            f.write(adapter)
            f.write(json.dumps(test_metrics))
            f.write('\n')

        # model.save_pretrained(f"{adapter_name}")
        model.save_all_adapters(training_args.output_dir)

        trainer.remove_callback(tb_callback)

DAPT Finetuning

In [None]:
# training_args = TrainingArguments(
#     learning_rate=1e-4,
#     num_train_epochs=50,
#     per_device_train_batch_size=32,
#     per_device_eval_batch_size=32,
#     logging_steps=100,
#     output_dir="./training_output/finetuning/DAPT",
#     overwrite_output_dir=True,
#     # The next line is important to ensure the dataset labels are properly passed to the model
#     remove_unused_columns=False,
#     evaluation_strategy = 'epoch',
#     # load_best_model_at_end = True,
#     save_steps = 100
# )

In [None]:
# finetuning_loop(num_models = 5, 
#                  training_args = training_args, 
#                  dataset = scierc_dataset_finetuning,  
#                  adapter_name = "DAPT_sci-erc",
#                  load_adapter = True)

DAPT+TAPT Finetuning

In [None]:
# training_args = TrainingArguments(
#     learning_rate=1e-4,
#     num_train_epochs=50,
#     per_device_train_batch_size=32,
#     per_device_eval_batch_size=32,
#     logging_steps=10,
#     output_dir="./training_output/finetuning/DAPT_TAPT",
#     overwrite_output_dir=True,
#     # The next line is important to ensure the dataset labels are properly passed to the model
#     remove_unused_columns=False,
#     evaluation_strategy = 'epoch',
#     # load_best_model_at_end = True,
#     save_steps = 100
# )

In [None]:
# finetuning_loop(num_models = 5, 
#                  training_args = training_args, 
#                  dataset = scierc_dataset_finetuning,  
#                  adapter_name = "DAPT_TAPT_sci-erc",
#                  load_adapter = True,
#                  adapter_dir = "./training_output/pretraining/DAPT_TAPT",
#                  num_labels = num_of_labels)

TAPT Finetuning

In [None]:
# training_args = TrainingArguments(
#     learning_rate=2e-5,
#     num_train_epochs=50,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     logging_steps=100,
#     output_dir="./training_output/finetuning/TAPT",
#     overwrite_output_dir=True,
#     # The next line is important to ensure the dataset labels are properly passed to the model
#     remove_unused_columns=False,
#     evaluation_strategy = 'epoch',
#     # load_best_model_at_end = True,
#     save_steps = 100,
#     lr_scheduler_type = 'constant',
#     log_level  = 'error'
    
# )

training_args = TrainingArguments(
    # Output location
    output_dir="./training_output/finetuning/TAPT",
    overwrite_output_dir=True,
    # Schedule and batch sizes
    num_train_epochs=50,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # Optimizer settings
    learning_rate=1e-4,
    # lr_scheduler_type='constant',
    # Logging, evaluation and checkpoint cadence
    logging_steps=100,
    evaluation_strategy='epoch',
    save_steps=100,
    # load_best_model_at_end=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

In [None]:
# Fine-tune starting from the TAPT adapters. The TAPT pretraining run names its
# adapters f"TAPT_{dataset_name}_{i}", and finetuning_loop looks for
# f"{adapter_dir}/{adapter_name}_{i}", so the same "TAPT_" prefix has to be used here.
# NOTE(review): finetuning_loop logs to f"runs/{adapter_name}_{i}", which is then the
# same TensorBoard directory the pretraining run used — confirm that is acceptable.
finetuning_loop(num_models = 2, 
                 training_args = training_args, 
                 dataset = dataset_finetuning,  
                 adapter_name = f"TAPT_{dataset_name}",
                 load_adapter = True,
                 adapter_dir = "./training_output/pretraining/TAPT",
                 num_labels = num_of_labels)

Only Finetuning

In [None]:
training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=50,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    logging_steps=100,
    output_dir="./training_output/finetuning/No_Pretrain",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model.
    # It must be False, as in the other fine-tuning configurations: with True the trainer
    # may drop the "labels" column (depends on the library version — verify if changed).
    remove_unused_columns=False,
    evaluation_strategy = 'steps',
    # load_best_model_at_end = True,
    save_steps = 100,
    # lr_scheduler_type = 'constant',
)

In [None]:
# Baseline: fine-tune freshly created adapters, without loading any pretrained one.
finetuning_loop(
    num_models=2,
    training_args=training_args,
    dataset=dataset_finetuning,
    adapter_name=dataset_name,
    load_adapter=False,
    num_labels=num_of_labels,
)