# Adapters

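Notebook for training an adapter (bottleneck, prefix-tuning, or prompt-tuning) on top of `distilroberta-base` for a single GLUE task.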

## User input

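- `task_param` options: cola, sst2, mrpc, stsb, qqp, mnli_matched, mnli_mismatched, qnli, rte, wnli
- `adapter_type` options: bottleneck, prefix, prompt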

In [16]:
# GLUE task to train on; must be one of the keys of `task_configs`.
task_param = "cola"
# Adapter flavour to attach to the model: "bottleneck", "prefix" or "prompt".
adapter_type = "bottleneck"

# Hyper-parameters forwarded to `TrainingArguments` in the trainer setup cell.
training_params = dict(
    learning_rate=1e-4,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    output_dir='adapter-distillroberta-base',
)

## Setup

### Modules

In [17]:
import os
import numpy as np
import torch
from collections import namedtuple

# Dataset
from datasets import load_dataset
from transformers import RobertaTokenizer, RobertaConfig, TrainingArguments, EvalPrediction, default_data_collator
from adapters import AutoAdapterModel, AdapterTrainer, ConfigUnion, BnConfig, ParBnConfig, PrefixTuningConfig, PromptTuningConfig

### Device

In [18]:
# Pick the accelerator to run on. `device_name` is always a plain string so it
# can be printed, logged, or compared consistently; `device` is the matching
# `torch.device` object used when moving the model.
device_name = "cpu"  # Default device is CPU
if torch.cuda.is_available():
    if 'COLAB_GPU' in os.environ:  # Detects if notebook is being run in a colab environment
        print("colab environment")
        device_name = "cuda"  # if you're using a T4 GPU on Colab, the device name should be "cuda"
    else:
        device_name = "cuda:0"  # CUDA for NVIDIA GPU
else:
    # `torch.backends.mps` is looked up defensively so that torch builds
    # without the MPS backend fall back to the CPU instead of raising.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        device_name = "mps"  # Metal Performance Shaders for Apple M-series GPU

device = torch.device(device_name)
print(device_name)

mps


### Task

In [19]:
# Per-task metadata:
#   sentence_type - "one" or "two" input text columns
#   class_type    - "BC", "MC" or "R" (the other cells treat "R" as regression)
#   num_classes   - number of labels, None when class_type is "R"
#   col_names     - dataset column names holding the input text
TaskConfig = namedtuple(
    "TaskConfig",
    ["sentence_type", "class_type", "num_classes", "col_names"],
)

task_configs = {
    "cola": TaskConfig(
        sentence_type="one", class_type="BC", num_classes=2,
        col_names=['sentence']),
    "sst2": TaskConfig(
        sentence_type="one", class_type="BC", num_classes=2,
        col_names=['sentence']),
    "mrpc": TaskConfig(
        sentence_type="two", class_type="BC", num_classes=2,
        col_names=['sentence1', 'sentence2']),
    "stsb": TaskConfig(
        sentence_type="two", class_type="R", num_classes=None,
        col_names=['sentence1', 'sentence2']),
    "qqp": TaskConfig(
        sentence_type="two", class_type="BC", num_classes=2,
        col_names=['question1', 'question2']),
    "mnli_matched": TaskConfig(
        sentence_type="two", class_type="MC", num_classes=3,
        col_names=['premise', 'hypothesis']),
    "mnli_mismatched": TaskConfig(
        sentence_type="two", class_type="MC", num_classes=3,
        col_names=['premise', 'hypothesis']),
    "qnli": TaskConfig(
        sentence_type="two", class_type="BC", num_classes=2,
        col_names=['question', 'sentence']),
    "rte": TaskConfig(
        sentence_type="two", class_type="BC", num_classes=2,
        col_names=['sentence1', 'sentence2']),
    "wnli": TaskConfig(
        sentence_type="two", class_type="BC", num_classes=2,
        col_names=['sentence1', 'sentence2']),
}

# Configuration of the task selected in the user-input cell.
task_config = task_configs[task_param]

## Dataset

### Load dataset

In [20]:
# Both MNLI variants share the "mnli" GLUE subset and differ only in the
# suffix of their validation / test split names; every other task maps
# directly onto a subset with plain "validation" / "test" splits.
if task_param in ("mnli_matched", "mnli_mismatched"):
    glue_subset = "mnli"
    split_suffix = task_param[len("mnli"):]  # "_matched" or "_mismatched"
else:
    glue_subset = task_param
    split_suffix = ""

data = load_dataset("glue", glue_subset)
val_key = "validation" + split_suffix
test_key = "test" + split_suffix

data

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

### Process dataset

In [21]:
tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')

max_len = 512 # TODO: How is this value decided?

def tokenize(examples):
    """Tokenize a batch of single-sentence examples.

    Reads the first column named in `task_config.col_names`, pads every
    example to `max_len` tokens and truncates anything longer.
    """
    return tokenizer(examples[task_config.col_names[0]],
                     add_special_tokens=True,
                     padding='max_length',
                     truncation=True,
                     max_length=max_len,
                     return_tensors='pt')

def tokenize_double(examples):
    """Tokenize a batch of sentence-pair examples.

    Reads both columns named in `task_config.col_names` and encodes them as a
    pair, with the same padding and truncation settings as `tokenize`.
    """
    return tokenizer(examples[task_config.col_names[0]],
                     examples[task_config.col_names[1]],
                     add_special_tokens=True,
                     padding='max_length',
                     truncation=True,
                     max_length=max_len,
                     return_tensors='pt')

# Pick the tokenization function that matches the task: two-sentence tasks
# must encode both text columns, otherwise the second column is ignored.
tokenize_fn = tokenize_double if task_config.sentence_type == "two" else tokenize

# Tokenize the input
data = data.map(tokenize_fn, batched=True)
# The transformers model expects the target class column to be named "labels"
data = data.rename_column(original_column_name="label", new_column_name="labels")
# Transform to pytorch tensors and only output the required columns
data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

## Adapter

### Model setup

In [22]:
# Only classification tasks pass an explicit label count to the config;
# regression ("R") tasks load the checkpoint's config without overrides.
config_overrides = {}
if task_config.class_type != "R":
    config_overrides["num_labels"] = task_config.num_classes
config = RobertaConfig.from_pretrained("distilroberta-base", **config_overrides)

# Load the pretrained backbone and move it to the selected device.
model = AutoAdapterModel.from_pretrained("distilroberta-base", config=config)
model = model.to(device)

# Count the parameters that are trainable before any adapter is attached.
pytorch_total_params = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)
print(f"total trainable parameters for raw model: {pytorch_total_params}")

Some weights of RobertaAdapterModel were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


total trainable parameters for raw model: 82760793


### Adapter setup

In [23]:
# Build the adapter configuration for the adapter type chosen in the
# user-input cell.
if adapter_type == "bottleneck":
    adapter_config = BnConfig(mh_adapter=True,
                            output_adapter=True,
                            reduction_factor=2,
                            non_linearity='relu')
elif adapter_type == "prefix":
    adapter_config = PrefixTuningConfig()
elif adapter_type == "prompt":
    adapter_config = PromptTuningConfig()
else:
    # Fail here with a clear message rather than with a NameError on
    # `adapter_config` further down.
    raise ValueError(
        f"Unknown adapter_type {adapter_type!r}; "
        "expected 'bottleneck', 'prefix' or 'prompt'")

# Other adapter config options: 
# ParBnConfig(reduction_factor=4)
# PrefixTuningConfig(flat=False, prefix_length=30)
# ConfigUnion(
#     PrefixTuningConfig(prefix_length=20),
#     ParBnConfig(reduction_factor=4),)

# Add a new adapter
model.add_adapter(task_param, config=adapter_config)

# Add a matching prediction head
if task_config.class_type == "R":
    model.add_regression_head(task_param)
else:
    model.add_classification_head(
        task_param,
        num_labels=task_config.num_classes)

# Freeze all weights in the model except for those of the adapter
model.train_adapter(task_param)

# Activate the adapter
model.set_active_adapters(task_param)

# Count the parameters that remain trainable once the adapter is set up.
pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"total trainable parameters for fine-tuning method: {pytorch_total_params}")

total trainable parameters for fine-tuning method: 8326235


## Training

### Trainer setup

In [9]:
def compute_accuracy(p: EvalPrediction):
    """Return the fraction of predictions whose argmax equals the label.

    Takes the argmax of `p.predictions` along axis 1 and compares it with
    `p.label_ids`; the result is returned as `{"acc": <float>}`.
    """
    # NOTE(review): argmax-based accuracy is not a meaningful metric for the
    # regression task ("stsb"); a dedicated metric is presumably needed there.
    preds = np.argmax(p.predictions, axis=1)
    return {"acc": (preds == p.label_ids).mean()}

training_args = TrainingArguments(
    learning_rate=training_params['learning_rate'],
    num_train_epochs=training_params['num_train_epochs'],
    per_device_train_batch_size=training_params['per_device_train_batch_size'],
    per_device_eval_batch_size=training_params['per_device_eval_batch_size'],
    output_dir=training_params['output_dir'],
    overwrite_output_dir=True,
    remove_unused_columns=False) # Ensures dataset labels are properly passed to the model

trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    # Use the split name chosen in the dataset-loading cell: the MNLI tasks
    # have no plain "validation" split, only "validation_matched" /
    # "validation_mismatched".
    eval_dataset=data[val_key],
    compute_metrics=compute_accuracy,
    data_collator=default_data_collator)

### Training loop

In [10]:
# Start training with the `trainer` built in the trainer setup cell.
trainer.train()

  0%|          | 0/268 [00:00<?, ?it/s]

KeyboardInterrupt: 