In [None]:
#Loading Libraries
# %conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch
%pip install -U adapter-transformers
%conda install -y -c conda-forge tensorboard
%pip install optuna

In [1]:
# Loading dataset
from datasets import load_dataset

scierc_name = 'nsusemiehl/SciERC'
scierc_dataset = load_dataset(scierc_name)
print(scierc_dataset.num_rows)

Using custom data configuration nsusemiehl--SciERC-f57c64a52b9c80c0
Reusing dataset json (C:\Users\The Doctor\.cache\huggingface\datasets\json\nsusemiehl--SciERC-f57c64a52b9c80c0\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)
100%|██████████| 3/3 [00:00<00:00, 999.12it/s]

{'train': 3219, 'test': 974, 'validation': 455}





In [2]:
scierc_dataset['train'][255]

{'text': 'We present two [[ methods ]] for capturing << nonstationary chaos >> , then present a few examples including biological signals , ocean waves and traffic flow .',
 'label': 'USED-FOR',
 'metadata': [3, 3, 6, 7]}

This block creates dataset for pretraining

In [3]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# Tokenize the set for the transformer
def encode_batch_pretraining(batch):
  """Encodes a batch of input data using the model tokenizer."""
  return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

# Encode the input data
# NOTE: num_proc does not seem to work, for some reason it can't find the tokenizer
scierc_dataset_pretraining = scierc_dataset.map(encode_batch_pretraining, 
                                    batched=True, 
                                    remove_columns=scierc_dataset['train'].column_names, 
                                    )

# We make the labels the same as the input as this is language learning 
def add_labels(examples):
  examples["labels"] = examples["input_ids"].copy()
  return examples
  
scierc_dataset_pretraining = scierc_dataset_pretraining.map(add_labels, batched=True)
scierc_dataset_pretraining.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])



Loading cached processed dataset at C:\Users\The Doctor\.cache\huggingface\datasets\json\nsusemiehl--SciERC-f57c64a52b9c80c0\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b\cache-74c87c445e04c867.arrow
Loading cached processed dataset at C:\Users\The Doctor\.cache\huggingface\datasets\json\nsusemiehl--SciERC-f57c64a52b9c80c0\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b\cache-876e0d37d233bb91.arrow
Loading cached processed dataset at C:\Users\The Doctor\.cache\huggingface\datasets\json\nsusemiehl--SciERC-f57c64a52b9c80c0\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b\cache-ecd8f07822814f57.arrow
Loading cached processed dataset at C:\Users\The Doctor\.cache\huggingface\datasets\json\nsusemiehl--SciERC-f57c64a52b9c80c0\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b\cache-b972bbb5f00d5d6e.arrow
Loading cached processed dataset at C:\Users\The Doctor\.cache\huggingface\datasets\json\nsusemiehl-

In [4]:
# Collater adds padding in the form of EOS tokens, makes data augmentations of random masking ('mlm_probability)
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

Here we are creating the dataset for task finetuning

In [5]:
# Finding the number of labels
import numpy as np
labels = np.unique(np.array(scierc_dataset['train']['label']))
num_of_labels = labels.size

print(labels)
print(num_of_labels)

['COMPARE' 'CONJUNCTION' 'EVALUATE-FOR' 'FEATURE-OF' 'HYPONYM-OF'
 'PART-OF' 'USED-FOR']
7


In [6]:
# encoding the labels
def encode_labels(dataset):
    for i in range(num_of_labels):
        if dataset['label'] == labels[i]:
            dataset['label'] = i
    return dataset

scierc_dataset = scierc_dataset.map(encode_labels)
scierc_dataset['train'][0]

Loading cached processed dataset at C:\Users\The Doctor\.cache\huggingface\datasets\json\nsusemiehl--SciERC-f57c64a52b9c80c0\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b\cache-3fa4decd4606a523.arrow
Loading cached processed dataset at C:\Users\The Doctor\.cache\huggingface\datasets\json\nsusemiehl--SciERC-f57c64a52b9c80c0\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b\cache-4d2e52dbe4cdbad6.arrow
Loading cached processed dataset at C:\Users\The Doctor\.cache\huggingface\datasets\json\nsusemiehl--SciERC-f57c64a52b9c80c0\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b\cache-214c6724dd02783d.arrow


{'text': 'The agreement in question involves number in [[ nouns ]] and << reflexive pronouns >> and is syntactic rather than semantic in nature because grammatical number in English , like grammatical gender in languages such as French , is partly arbitrary .',
 'label': 1,
 'metadata': [7, 7, 9, 10]}

In [7]:
def encode_batch_finetuning(batch):
  """Encodes a batch of input data using the model tokenizer."""
  return tokenizer(batch["text"], max_length=80, truncation=True, padding="max_length")

# Encode the input data
scierc_dataset_finetuning = scierc_dataset.map(encode_batch_finetuning, batched=True)
# The transformers model expects the target class column to be named "labels"
scierc_dataset_finetuning = scierc_dataset_finetuning.rename_column("label", 'labels')
# Transform to pytorch tensors and only output the required columns
scierc_dataset_finetuning.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Loading cached processed dataset at C:\Users\The Doctor\.cache\huggingface\datasets\json\nsusemiehl--SciERC-f57c64a52b9c80c0\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b\cache-6c719cda162c2a70.arrow
Loading cached processed dataset at C:\Users\The Doctor\.cache\huggingface\datasets\json\nsusemiehl--SciERC-f57c64a52b9c80c0\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b\cache-a2e89e74a8a70442.arrow
Loading cached processed dataset at C:\Users\The Doctor\.cache\huggingface\datasets\json\nsusemiehl--SciERC-f57c64a52b9c80c0\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b\cache-1e077601566683c7.arrow


# Model Creation

In [8]:
from transformers import RobertaConfig
from transformers import RobertaAdapterModel

def model_init(adapter_name = 'default_adapter', 
               num_lables = 0, 
               pretraining = False,
               load_adapter = False,
               adapter_dir = 'path'):
    """Creates a new roBERTa model with the given name for its adapter.

    Args:
        adapter_name (str): The name of the adapter to load/create. Defaults to 'default_adapter'.
        num_lables (int, optional): The number of labels for classification task. Defaults to 0.
        pretraining (bool, optional): Whether to create a model for pretraining or classification. Defaults to False.
        load_adapter (bool, optional): Whether to load an adapter with the adapter_name given or create a new one. Defaults to False.
        adapter_dir (str, optional): Directory to load the adapter. If load_adapter you need to specify this.  Defaults to 'path'.

    Returns:
        RobertaAdapterModel: A roBERTA model with an adapter added to it.
    """
    
    if pretraining:
        config = RobertaConfig.from_pretrained(
            "roberta-base",
            # num_labels=num_of_labels,*-8536.22.03
        )
        model = RobertaAdapterModel.from_pretrained(
            "roberta-base",
            config=config,
        )
        if load_adapter:
            # Add new adapter
            model.load_adapter(adapter_dir)

        else:
            # Add new adapter
            model.add_adapter(adapter_name)
            
        # Add a matching classification head
        model.add_masked_lm_head(adapter_name)
            
    else:
        config = RobertaConfig.from_pretrained(
            "roberta-base",
            num_labels=num_lables,
        )
        model = RobertaAdapterModel.from_pretrained(
            "roberta-base",
            config=config,
        )
        
        if load_adapter:
            # Add new adapter
            model.load_adapter(adapter_dir)

        else:
            # Add new adapter
            model.add_adapter(adapter_name)
            
        # Add a matching classification head
        model.add_classification_head(
                adapter_name,
                num_labels=num_lables,
                id2label={0:'COMPARE', 1:'CONJUNCTION', 2:'EVALUATE-FOR', 
                        3:'FEATURE-OF', 4:'HYPONYM-OF', 5:'PART-OF', 6:'USED-FOR'},
                overwrite_ok = True)
            
    # Activate the adapter
    model.train_adapter(adapter_name)    
     
    return model

Pretraining Block

In [9]:
from transformers import TrainingArguments, AdapterTrainer
from datasets import load_metric
from torch.utils.tensorboard import SummaryWriter
from transformers.integrations import TensorBoardCallback

import json

def pretraining_loop(num_models, training_args, dataset, 
                     data_collator, adapter_name, 
                    #  DAPT_n_TAPT, TAPT_dataset
                     ):
    """_summary_

    Args:
        num_models (_type_): _description_
        training_args (_type_): _description_
        dataset (_type_): _description_
        data_collator (_type_): _description_
        adapter_name (_type_): _description_
    """

    for i in range(num_models):
        adapter = f"{adapter_name}_{i}"
        model = model_init(adapter_name = adapter, pretraining=True)
        
        writer = SummaryWriter(log_dir= f'runs/{adapter}')
        writer = TensorBoardCallback(writer)

        trainer = AdapterTrainer(
            model=model,
            args=training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["validation"],
            data_collator=data_collator,  
            callbacks=[writer] 
        )
        
        trainer.train()
        
        f = open(f"{training_args.output_dir}/evaulations.txt", "a")
        f.write(adapter)
        f.write(json.dumps(trainer.evaluate(dataset['test'])))
        f.write('\n')
        f.close()
        
        model.save_all_adapters(training_args.output_dir, with_head=False)
        # model.save_pretrained(f"{adapter_name}")
        
        # if DAPT_n_TAPT:
        #     trainer = AdapterTrainer(
        #         model=model,
        #         args=training_args,
        #         train_dataset=TAPT_dataset["train"],
        #         eval_dataset=TAPT_dataset["validation"],
        #         data_collator=data_collator,  
        #         callbacks=[writer] 
        #     )
            
        #     trainer.train()
        
        #     f = open("DAPT_TAPT_evaulations.txt", "a")
        #     f.write(adapter_name)
        #     f.write(trainer.evaluate(TAPT_dataset['test']))
        #     f.write('\n')
        #     f.close()
            
        #     model.save_pretrained(f"{adapter_name}_DAPT_TAPT")

DAPT Training

In [10]:
training_args = TrainingArguments(
    learning_rate=5e-4,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=100,
    output_dir="./training_output/pretraining/DAPT",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=True,
    evaluation_strategy = 'steps',
    # load_best_model_at_end = True,
    save_steps = 100,
    gradient_accumulation_steps = 64,
    warmup_ratio = 0.06
)

In [None]:
pretraining_loop(num_models = 5, 
                 training_args = training_args, 
                #  dataset = DAPT_dataset, TODO: Need to add DAPT training set
                 data_collator = data_collator, 
                 adapter_name = "DAPT_sci-erc")

DAPT+TAPT Training

TAPT Training

In [10]:
training_args = TrainingArguments(
    learning_rate=0.0001,
    num_train_epochs=100,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=10,
    output_dir="./training_output/pretraining/TAPT",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=True,
    evaluation_strategy = 'steps',
    # load_best_model_at_end = True,
    save_steps = 100,
    gradient_accumulation_steps = 8,
    warmup_ratio = 0.06,
    # load_best_model_at_end = True
)

In [11]:
pretraining_loop(num_models = 1, 
                 training_args = training_args, 
                 dataset = scierc_dataset_pretraining, 
                 data_collator = data_collator, 
                 adapter_name = "TAPT_sci-erc")

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaAdapterModel: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaAdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaAdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaAdapterModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You

{'loss': 17.8218, 'learning_rate': 1.388888888888889e-05, 'epoch': 0.79}


                                                   
  1%|          | 10/1200 [00:35<1:02:40,  3.16s/it]

{'eval_loss': 17.37076759338379, 'eval_runtime': 2.6434, 'eval_samples_per_second': 172.127, 'eval_steps_per_second': 5.675, 'epoch': 0.79}


  2%|▏         | 20/1200 [01:07<1:02:08,  3.16s/it]***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 18.0503, 'learning_rate': 2.777777777777778e-05, 'epoch': 1.63}


                                                   
  2%|▏         | 20/1200 [01:10<1:02:08,  3.16s/it]

{'eval_loss': 16.33852767944336, 'eval_runtime': 2.6634, 'eval_samples_per_second': 170.833, 'eval_steps_per_second': 5.632, 'epoch': 1.63}


  2%|▎         | 30/1200 [01:43<1:02:30,  3.21s/it]***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 16.847, 'learning_rate': 4.166666666666667e-05, 'epoch': 2.48}


                                                   
  2%|▎         | 30/1200 [01:45<1:02:30,  3.21s/it]

{'eval_loss': 15.088282585144043, 'eval_runtime': 2.6544, 'eval_samples_per_second': 171.413, 'eval_steps_per_second': 5.651, 'epoch': 2.48}


  3%|▎         | 40/1200 [02:18<1:03:55,  3.31s/it]***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 15.319, 'learning_rate': 5.555555555555556e-05, 'epoch': 3.32}


                                                   
  3%|▎         | 40/1200 [02:21<1:03:55,  3.31s/it]

{'eval_loss': 13.4595947265625, 'eval_runtime': 2.6694, 'eval_samples_per_second': 170.449, 'eval_steps_per_second': 5.619, 'epoch': 3.32}


  4%|▍         | 50/1200 [02:53<1:07:10,  3.51s/it]***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 13.3959, 'learning_rate': 6.944444444444444e-05, 'epoch': 4.16}


                                                   
  4%|▍         | 50/1200 [02:56<1:07:10,  3.51s/it]

{'eval_loss': 11.486201286315918, 'eval_runtime': 2.6589, 'eval_samples_per_second': 171.121, 'eval_steps_per_second': 5.641, 'epoch': 4.16}


  5%|▌         | 60/1200 [03:27<59:38,  3.14s/it]  ***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 10.8082, 'learning_rate': 8.333333333333334e-05, 'epoch': 4.95}


                                                 
  5%|▌         | 60/1200 [03:30<59:38,  3.14s/it]

{'eval_loss': 9.695045471191406, 'eval_runtime': 2.7129, 'eval_samples_per_second': 167.718, 'eval_steps_per_second': 5.529, 'epoch': 4.95}


  6%|▌         | 70/1200 [04:02<59:06,  3.14s/it]  ***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 9.8157, 'learning_rate': 9.722222222222223e-05, 'epoch': 5.79}


                                                 
  6%|▌         | 70/1200 [04:05<59:06,  3.14s/it]

{'eval_loss': 8.364571571350098, 'eval_runtime': 2.6625, 'eval_samples_per_second': 170.891, 'eval_steps_per_second': 5.634, 'epoch': 5.79}


  7%|▋         | 80/1200 [04:38<58:57,  3.16s/it]  ***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 8.6135, 'learning_rate': 9.929078014184398e-05, 'epoch': 6.63}


                                                 
  7%|▋         | 80/1200 [04:40<58:57,  3.16s/it]

{'eval_loss': 7.573639869689941, 'eval_runtime': 2.6624, 'eval_samples_per_second': 170.897, 'eval_steps_per_second': 5.634, 'epoch': 6.63}


  8%|▊         | 90/1200 [05:13<59:32,  3.22s/it]  ***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 8.0005, 'learning_rate': 9.840425531914894e-05, 'epoch': 7.48}


                                                 
  8%|▊         | 90/1200 [05:16<59:32,  3.22s/it]

{'eval_loss': 7.161333084106445, 'eval_runtime': 2.6665, 'eval_samples_per_second': 170.634, 'eval_steps_per_second': 5.625, 'epoch': 7.48}


  8%|▊         | 100/1200 [05:48<1:00:29,  3.30s/it]***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 7.5853, 'learning_rate': 9.75177304964539e-05, 'epoch': 8.32}


                                                    
  8%|▊         | 100/1200 [05:51<1:00:29,  3.30s/it]Saving model checkpoint to ./training_output/pretraining/TAPT\checkpoint-100
Configuration saved in ./training_output/pretraining/TAPT\checkpoint-100\TAPT_sci-erc_0\adapter_config.json
Module weights saved in ./training_output/pretraining/TAPT\checkpoint-100\TAPT_sci-erc_0\pytorch_adapter.bin
Configuration saved in ./training_output/pretraining/TAPT\checkpoint-100\TAPT_sci-erc_0\head_config.json


{'eval_loss': 6.86968994140625, 'eval_runtime': 2.666, 'eval_samples_per_second': 170.669, 'eval_steps_per_second': 5.626, 'epoch': 8.32}


Module weights saved in ./training_output/pretraining/TAPT\checkpoint-100\TAPT_sci-erc_0\pytorch_model_head.bin
Configuration saved in ./training_output/pretraining/TAPT\checkpoint-100\TAPT_sci-erc_0\head_config.json
Module weights saved in ./training_output/pretraining/TAPT\checkpoint-100\TAPT_sci-erc_0\pytorch_model_head.bin
Configuration saved in ./training_output/pretraining/TAPT\checkpoint-100\TAPT_sci-erc_0\head_config.json
Module weights saved in ./training_output/pretraining/TAPT\checkpoint-100\TAPT_sci-erc_0\pytorch_model_head.bin
  9%|▉         | 110/1200 [06:25<1:03:43,  3.51s/it]***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 7.3055, 'learning_rate': 9.663120567375887e-05, 'epoch': 9.16}


                                                    
  9%|▉         | 110/1200 [06:27<1:03:43,  3.51s/it]

{'eval_loss': 6.635702610015869, 'eval_runtime': 2.6589, 'eval_samples_per_second': 171.122, 'eval_steps_per_second': 5.641, 'epoch': 9.16}


 10%|█         | 120/1200 [06:58<56:06,  3.12s/it]  ***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 6.696, 'learning_rate': 9.574468085106384e-05, 'epoch': 9.95}


                                                  
 10%|█         | 120/1200 [07:01<56:06,  3.12s/it]

{'eval_loss': 6.582831859588623, 'eval_runtime': 2.6634, 'eval_samples_per_second': 170.833, 'eval_steps_per_second': 5.632, 'epoch': 9.95}


 11%|█         | 130/1200 [07:33<56:07,  3.15s/it]  ***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 6.9756, 'learning_rate': 9.485815602836879e-05, 'epoch': 10.79}


                                                  
 11%|█         | 130/1200 [07:36<56:07,  3.15s/it]

{'eval_loss': 6.478057384490967, 'eval_runtime': 2.6639, 'eval_samples_per_second': 170.799, 'eval_steps_per_second': 5.631, 'epoch': 10.79}


 12%|█▏        | 140/1200 [08:09<55:44,  3.15s/it]  ***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 6.8633, 'learning_rate': 9.397163120567376e-05, 'epoch': 11.63}


                                                  
 12%|█▏        | 140/1200 [08:11<55:44,  3.15s/it]

{'eval_loss': 6.315354347229004, 'eval_runtime': 2.6504, 'eval_samples_per_second': 171.672, 'eval_steps_per_second': 5.66, 'epoch': 11.63}


 12%|█▎        | 150/1200 [08:44<56:01,  3.20s/it]  ***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 6.7847, 'learning_rate': 9.308510638297873e-05, 'epoch': 12.48}


                                                  
 12%|█▎        | 150/1200 [08:47<56:01,  3.20s/it]

{'eval_loss': 6.320107936859131, 'eval_runtime': 2.6564, 'eval_samples_per_second': 171.283, 'eval_steps_per_second': 5.647, 'epoch': 12.48}


 13%|█▎        | 160/1200 [09:19<57:13,  3.30s/it]  ***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 6.6586, 'learning_rate': 9.21985815602837e-05, 'epoch': 13.32}


                                                  
 13%|█▎        | 160/1200 [09:22<57:13,  3.30s/it]

{'eval_loss': 6.201133728027344, 'eval_runtime': 2.6669, 'eval_samples_per_second': 170.607, 'eval_steps_per_second': 5.624, 'epoch': 13.32}


 14%|█▍        | 170/1200 [09:55<1:00:02,  3.50s/it]***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 6.632, 'learning_rate': 9.131205673758866e-05, 'epoch': 14.16}


                                                    
 14%|█▍        | 170/1200 [09:57<1:00:02,  3.50s/it]

{'eval_loss': 6.158675193786621, 'eval_runtime': 2.6644, 'eval_samples_per_second': 170.769, 'eval_steps_per_second': 5.63, 'epoch': 14.16}


 15%|█▌        | 180/1200 [10:28<52:59,  3.12s/it]  ***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 6.1577, 'learning_rate': 9.042553191489363e-05, 'epoch': 14.95}


                                                  
 15%|█▌        | 180/1200 [10:31<52:59,  3.12s/it]

{'eval_loss': 6.166477680206299, 'eval_runtime': 2.6594, 'eval_samples_per_second': 171.09, 'eval_steps_per_second': 5.64, 'epoch': 14.95}


 16%|█▌        | 190/1200 [11:03<52:45,  3.13s/it]  ***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 6.4852, 'learning_rate': 8.953900709219858e-05, 'epoch': 15.79}


                                                  
 16%|█▌        | 190/1200 [11:06<52:45,  3.13s/it]

{'eval_loss': 6.042835235595703, 'eval_runtime': 2.6664, 'eval_samples_per_second': 170.641, 'eval_steps_per_second': 5.626, 'epoch': 15.79}


 17%|█▋        | 200/1200 [11:39<52:32,  3.15s/it]  ***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 6.4219, 'learning_rate': 8.865248226950354e-05, 'epoch': 16.63}


                                                  
 17%|█▋        | 200/1200 [11:41<52:32,  3.15s/it]Saving model checkpoint to ./training_output/pretraining/TAPT\checkpoint-200
Configuration saved in ./training_output/pretraining/TAPT\checkpoint-200\TAPT_sci-erc_0\adapter_config.json
Module weights saved in ./training_output/pretraining/TAPT\checkpoint-200\TAPT_sci-erc_0\pytorch_adapter.bin
Configuration saved in ./training_output/pretraining/TAPT\checkpoint-200\TAPT_sci-erc_0\head_config.json


{'eval_loss': 5.978287220001221, 'eval_runtime': 2.6724, 'eval_samples_per_second': 170.257, 'eval_steps_per_second': 5.613, 'epoch': 16.63}


Module weights saved in ./training_output/pretraining/TAPT\checkpoint-200\TAPT_sci-erc_0\pytorch_model_head.bin
Configuration saved in ./training_output/pretraining/TAPT\checkpoint-200\TAPT_sci-erc_0\head_config.json
Module weights saved in ./training_output/pretraining/TAPT\checkpoint-200\TAPT_sci-erc_0\pytorch_model_head.bin
Configuration saved in ./training_output/pretraining/TAPT\checkpoint-200\TAPT_sci-erc_0\head_config.json
Module weights saved in ./training_output/pretraining/TAPT\checkpoint-200\TAPT_sci-erc_0\pytorch_model_head.bin
 18%|█▊        | 210/1200 [12:15<52:59,  3.21s/it]  ***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 6.4007, 'learning_rate': 8.77659574468085e-05, 'epoch': 17.48}


                                                  
 18%|█▊        | 210/1200 [12:18<52:59,  3.21s/it]

{'eval_loss': 5.98807954788208, 'eval_runtime': 2.6624, 'eval_samples_per_second': 170.897, 'eval_steps_per_second': 5.634, 'epoch': 17.48}


 18%|█▊        | 220/1200 [12:50<53:51,  3.30s/it]  ***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 6.33, 'learning_rate': 8.687943262411347e-05, 'epoch': 18.32}


                                                  
 18%|█▊        | 220/1200 [12:53<53:51,  3.30s/it]

{'eval_loss': 5.9161553382873535, 'eval_runtime': 2.6865, 'eval_samples_per_second': 169.367, 'eval_steps_per_second': 5.584, 'epoch': 18.32}


 19%|█▉        | 230/1200 [13:26<56:20,  3.49s/it]  ***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 6.3319, 'learning_rate': 8.599290780141844e-05, 'epoch': 19.16}


                                                  
 19%|█▉        | 230/1200 [13:28<56:20,  3.49s/it]

{'eval_loss': 5.889645576477051, 'eval_runtime': 2.6654, 'eval_samples_per_second': 170.705, 'eval_steps_per_second': 5.628, 'epoch': 19.16}


 20%|██        | 240/1200 [13:59<49:54,  3.12s/it]  ***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 5.8787, 'learning_rate': 8.510638297872341e-05, 'epoch': 19.95}


                                                  
 20%|██        | 240/1200 [14:02<49:54,  3.12s/it]

{'eval_loss': 5.9296722412109375, 'eval_runtime': 2.6614, 'eval_samples_per_second': 170.961, 'eval_steps_per_second': 5.636, 'epoch': 19.95}


 21%|██        | 250/1200 [14:34<49:39,  3.14s/it]  ***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 6.2291, 'learning_rate': 8.421985815602838e-05, 'epoch': 20.79}


                                                  
 21%|██        | 250/1200 [14:37<49:39,  3.14s/it]

{'eval_loss': 5.8891072273254395, 'eval_runtime': 2.6474, 'eval_samples_per_second': 171.866, 'eval_steps_per_second': 5.666, 'epoch': 20.79}


 22%|██▏       | 260/1200 [15:10<49:28,  3.16s/it]  ***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 6.2064, 'learning_rate': 8.333333333333334e-05, 'epoch': 21.63}


                                                  
 22%|██▏       | 260/1200 [15:12<49:28,  3.16s/it]

{'eval_loss': 5.852653980255127, 'eval_runtime': 2.6624, 'eval_samples_per_second': 170.897, 'eval_steps_per_second': 5.634, 'epoch': 21.63}


 22%|██▎       | 270/1200 [15:45<49:38,  3.20s/it]  ***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 6.1534, 'learning_rate': 8.244680851063831e-05, 'epoch': 22.48}


                                                  
 22%|██▎       | 270/1200 [15:48<49:38,  3.20s/it]

{'eval_loss': 5.81445837020874, 'eval_runtime': 2.6504, 'eval_samples_per_second': 171.672, 'eval_steps_per_second': 5.66, 'epoch': 22.48}


 23%|██▎       | 280/1200 [16:20<50:32,  3.30s/it]  ***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 6.125, 'learning_rate': 8.156028368794327e-05, 'epoch': 23.32}


                                                  
 23%|██▎       | 280/1200 [16:23<50:32,  3.30s/it]

{'eval_loss': 5.849323749542236, 'eval_runtime': 2.6634, 'eval_samples_per_second': 170.833, 'eval_steps_per_second': 5.632, 'epoch': 23.32}


 24%|██▍       | 290/1200 [16:55<52:54,  3.49s/it]  ***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 6.1219, 'learning_rate': 8.067375886524823e-05, 'epoch': 24.16}


                                                  
 24%|██▍       | 290/1200 [16:58<52:54,  3.49s/it]

{'eval_loss': 5.7037858963012695, 'eval_runtime': 2.6879, 'eval_samples_per_second': 169.274, 'eval_steps_per_second': 5.58, 'epoch': 24.16}


 25%|██▌       | 300/1200 [17:29<47:23,  3.16s/it]  ***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 5.6887, 'learning_rate': 7.978723404255319e-05, 'epoch': 24.95}


                                                  
 25%|██▌       | 300/1200 [17:32<47:23,  3.16s/it]Saving model checkpoint to ./training_output/pretraining/TAPT\checkpoint-300
Configuration saved in ./training_output/pretraining/TAPT\checkpoint-300\TAPT_sci-erc_0\adapter_config.json
Module weights saved in ./training_output/pretraining/TAPT\checkpoint-300\TAPT_sci-erc_0\pytorch_adapter.bin
Configuration saved in ./training_output/pretraining/TAPT\checkpoint-300\TAPT_sci-erc_0\head_config.json


{'eval_loss': 5.7752509117126465, 'eval_runtime': 2.6756, 'eval_samples_per_second': 170.058, 'eval_steps_per_second': 5.606, 'epoch': 24.95}


Module weights saved in ./training_output/pretraining/TAPT\checkpoint-300\TAPT_sci-erc_0\pytorch_model_head.bin
Configuration saved in ./training_output/pretraining/TAPT\checkpoint-300\TAPT_sci-erc_0\head_config.json
Module weights saved in ./training_output/pretraining/TAPT\checkpoint-300\TAPT_sci-erc_0\pytorch_model_head.bin
Configuration saved in ./training_output/pretraining/TAPT\checkpoint-300\TAPT_sci-erc_0\head_config.json
Module weights saved in ./training_output/pretraining/TAPT\checkpoint-300\TAPT_sci-erc_0\pytorch_model_head.bin
 26%|██▌       | 310/1200 [18:06<46:57,  3.17s/it]  ***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 6.0436, 'learning_rate': 7.890070921985815e-05, 'epoch': 25.79}


                                                  
 26%|██▌       | 310/1200 [18:08<46:57,  3.17s/it]

{'eval_loss': 5.671374320983887, 'eval_runtime': 2.6644, 'eval_samples_per_second': 170.769, 'eval_steps_per_second': 5.63, 'epoch': 25.79}


 27%|██▋       | 320/1200 [18:41<46:32,  3.17s/it]  ***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 6.0292, 'learning_rate': 7.801418439716312e-05, 'epoch': 26.63}


                                                  
 27%|██▋       | 320/1200 [18:44<46:32,  3.17s/it]

{'eval_loss': 5.668488025665283, 'eval_runtime': 2.4452, 'eval_samples_per_second': 186.077, 'eval_steps_per_second': 6.134, 'epoch': 26.63}


 28%|██▊       | 330/1200 [19:16<46:18,  3.19s/it]***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 5.9657, 'learning_rate': 7.712765957446809e-05, 'epoch': 27.48}


                                                  
 28%|██▊       | 330/1200 [19:19<46:18,  3.19s/it]

{'eval_loss': 5.674651145935059, 'eval_runtime': 2.6684, 'eval_samples_per_second': 170.513, 'eval_steps_per_second': 5.621, 'epoch': 27.48}


 28%|██▊       | 340/1200 [19:51<47:23,  3.31s/it]***** Running Evaluation *****
  Num examples = 455
  Batch size = 32


{'loss': 5.9914, 'learning_rate': 7.624113475177306e-05, 'epoch': 28.32}


                                                  
 28%|██▊       | 340/1200 [19:54<47:23,  3.31s/it]

{'eval_loss': 5.573325157165527, 'eval_runtime': 2.6539, 'eval_samples_per_second': 171.445, 'eval_steps_per_second': 5.652, 'epoch': 28.32}


 28%|██▊       | 342/1200 [20:00<53:40,  3.75s/it]

Fine Tuning Models

In [None]:
from datasets import load_metric
metric = load_metric('f1')

def compute_metric(EvalPrediction):
  
  logits, labels = EvalPrediction
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels, average= 'macro')

In [None]:
def finetuning_loop(num_models, training_args, dataset, adapter_name, num_labels, load_adapter = False, adapter_dir = 'Path'):

    for i in range(num_models):
        adapter = f"{adapter_name}_{i}"
        model = model_init(adapter_name = adapter, num_lables = num_labels, pretraining=False, load_adapter = load_adapter, adapter_dir = f"{adapter_dir}/{adapter}")
        
        writer = SummaryWriter(log_dir= f'runs/{adapter}')
        writer = TensorBoardCallback(writer)

        trainer = AdapterTrainer(
            model=model,
            args=training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["validation"],
            callbacks=[writer],
            compute_metrics = compute_metric 
        )
        
        trainer.train()
        
        f = open(f"{training_args.output_dir}/evaulations.txt", "a")
        f.write(adapter)
        f.write(json.dumps(trainer.evaluate(dataset['test'])))
        f.write('\n')
        f.close()
        
        # model.save_pretrained(f"{adapter_name}")
        model.save_all_adapters(training_args.output_dir)
        
        trainer.remove_callback(writer)

DAPT Finetuning

In [12]:
training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=50,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=100,
    output_dir="./training_output/finetuning/DAPT",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
    evaluation_strategy = 'epoch',
    # load_best_model_at_end = True,
    save_steps = 100
)

In [None]:
finetuning_loop(num_models = 5, 
                 training_args = training_args, 
                 dataset = scierc_dataset_finetuning,  
                 adapter_name = "DAPT_sci-erc",
                 load_adapter = True)

DAPT+TAPT Finetuning

In [None]:
training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=50,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=10,
    output_dir="./training_output/finetuning/DAPT_TAPT",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
    evaluation_strategy = 'epoch',
    # load_best_model_at_end = True,
    save_steps = 100
)

In [None]:
finetuning_loop(num_models = 5, 
                 training_args = training_args, 
                 dataset = scierc_dataset_finetuning,  
                 adapter_name = "DAPT_TAPT_sci-erc",
                 load_adapter = True,
                 adapter_dir = "./training_output/pretraining/DAPT_TAPT",
                 num_labels = num_of_labels)

TAPT Finetuning

In [None]:
training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=50,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    logging_steps=25,
    output_dir="./training_output/finetuning/TAPT",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
    evaluation_strategy = 'epoch',
    # load_best_model_at_end = True,
    save_steps = 100
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
finetuning_loop(num_models = 1, 
                 training_args = training_args, 
                 dataset = scierc_dataset_finetuning,  
                 adapter_name = "TAPT_sci-erc",
                 load_adapter = True,
                 adapter_dir = "./training_output/pretraining/TAPT",
                 num_labels = num_of_labels)

Only Finetuning

In [None]:
training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=50,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    logging_steps=100,
    output_dir="./training_output/finetuning/No_Pretrain",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=True,
    evaluation_strategy = 'steps',
    # load_best_model_at_end = True,
    save_steps = 100
)

using `logging_steps` to initialize `eval_steps` to 100
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
finetuning_loop(num_models = 1, 
                 training_args = training_args, 
                 dataset = scierc_dataset_finetuning,  
                 adapter_name = "sci-erc",
                 load_adapter = False,
                 num_labels = num_of_labels)