# Adapter Training

## Setup

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import itertools
import pathlib
import csv
from datetime import datetime
from collections import namedtuple
from tqdm.notebook import tqdm

# Dataset
from datasets import load_dataset
from transformers import RobertaTokenizer, RobertaConfig, TrainingArguments, EvalPrediction, default_data_collator
from adapters import AutoAdapterModel, AdapterTrainer, BnConfig, PrefixTuningConfig, PromptTuningConfig, ConfigUnion,  ParBnConfig

In [None]:
# Select the compute device: CUDA if available, otherwise Apple MPS, otherwise CPU.
# `device_name` is always a plain string so it can be printed/logged consistently;
# `device` is the corresponding torch.device object used to place the model.
device_name = "cpu"  # Default device is CPU
if torch.cuda.is_available():
    if 'COLAB_GPU' in os.environ:  # Detects if notebook is being run in a colab environment
        print("colab environment")
        device_name = "cuda"  # if you're using a T4 GPU on Colab, the device name should be "cuda"
    else:
        device_name = "cuda:0"  # CUDA for NVIDIA GPU
elif torch.backends.mps.is_available():
    # Keep the name a string here too (it used to be a torch.device object in this branch only)
    device_name = "mps"  # Metal Performance Shaders for Apple M-series GPU

device = torch.device(device_name)
print(device_name)

## User input

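`adapter_type` options: `bottleneck`, `prefix`, `prompt`. Note that no literature hyperparameter grid is defined for `prompt` in the cell below, so it can only be used together with `running_test_set = True`.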

In [None]:
# --- User-configurable run settings ---
task_param = "sst2"            # GLUE task key; must be a key of the task config table
adapter_type = "bottleneck"    # one of: "bottleneck", "prefix", "prompt"
running_test_set = False       # True -> use the single-combination smoke-test grid
output_dir = 'adapter-distillroberta-base'

# Fail early on a typo instead of surfacing an unrelated error deep inside training.
supported_adapter_types = ("bottleneck", "prefix", "prompt")
if adapter_type not in supported_adapter_types:
    raise ValueError(
        f"Unknown adapter_type {adapter_type!r}; expected one of {supported_adapter_types}")

# Full search grids; every combination (cartesian product) is trained.
param_grid_bottleneck_literature = {
    'learning_rate': [3e-5, 3e-4, 3e-3],
    'num_train_epochs': [3, 20],
    'per_device_train_batch_size': [32],
    'per_device_eval_batch_size': [32],
    'bn_reduction_factor': [8, 64, 256],
}

param_grid_prefix_literature = {
    'learning_rate': [5e-3, 7e-3, 1e-2, 1e-4],
    'num_train_epochs': [3, 20],
    'per_device_train_batch_size': [32],
    'per_device_eval_batch_size': [32],
    'prefix_length': [20, 60, 100],
}

# Single-combination grid for quick smoke tests.
param_grid_test = {
    'learning_rate': [0.0001],
    'num_train_epochs': [3],
    'per_device_train_batch_size': [32],
    'per_device_eval_batch_size': [32],
}
# Add the adapter-specific hyperparameter that the selected adapter type reads,
# so test mode works for the bottleneck adapter as well as for prefix tuning.
if adapter_type == "bottleneck":
    param_grid_test['bn_reduction_factor'] = [8]
elif adapter_type == "prefix":
    param_grid_test['prefix_length'] = [60]

literature_grids = {
    "bottleneck": param_grid_bottleneck_literature,
    "prefix": param_grid_prefix_literature,
}

if running_test_set:
    param_grid = param_grid_test
elif adapter_type in literature_grids:
    param_grid = literature_grids[adapter_type]
else:
    # "prompt" is a supported adapter type but has no literature grid defined here.
    raise ValueError(
        f"No literature hyperparameter grid is defined for adapter_type {adapter_type!r}; "
        "set running_test_set = True or add a grid for it")

# Expand the grid into a list of dicts, one per hyperparameter combination.
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
print(f"{len(all_params)} hyperparameter combinations")

### Task

In [None]:
# Per-task settings:
#   sentence_type - "one" or "two" input text columns
#   class_type    - "R" selects the regression head/config downstream; "BC"/"MC" are
#                   presumably binary / multi-class classification (verify with the author)
#   num_classes   - number of labels for classification tasks, None for regression
#   col_names     - dataset column name(s) holding the input text
TaskConfig = namedtuple("TaskConfig", "sentence_type class_type num_classes col_names")

task_configs = {
    "cola": TaskConfig(
        sentence_type="one", class_type="BC", num_classes=2,
        col_names=['sentence']),
    "sst2": TaskConfig(
        sentence_type="one", class_type="BC", num_classes=2,
        col_names=['sentence']),
    "mrpc": TaskConfig(
        sentence_type="two", class_type="BC", num_classes=2,
        col_names=['sentence1', 'sentence2']),
    "stsb": TaskConfig(
        sentence_type="two", class_type="R", num_classes=None,
        col_names=['sentence1', 'sentence2']),
    "qqp": TaskConfig(
        sentence_type="two", class_type="BC", num_classes=2,
        col_names=['question1', 'question2']),
    "mnli_matched": TaskConfig(
        sentence_type="two", class_type="MC", num_classes=3,
        col_names=['premise', 'hypothesis']),
    "mnli_mismatched": TaskConfig(
        sentence_type="two", class_type="MC", num_classes=3,
        col_names=['premise', 'hypothesis']),
    "qnli": TaskConfig(
        sentence_type="two", class_type="BC", num_classes=2,
        col_names=['question', 'sentence']),
    "rte": TaskConfig(
        sentence_type="two", class_type="BC", num_classes=2,
        col_names=['sentence1', 'sentence2']),
    "wnli": TaskConfig(
        sentence_type="two", class_type="BC", num_classes=2,
        col_names=['sentence1', 'sentence2']),
}

# Settings for the task selected in the user-input cell
task_config = task_configs[task_param]

## Dataset

### Load dataset

In [None]:
# Both MNLI variants share the same "mnli" GLUE configuration and differ only in the
# suffix of their validation/test split names; every other task uses the plain names.
if task_param in ("mnli_matched", "mnli_mismatched"):
    mnli_variant = task_param.split("_", 1)[1]  # "matched" or "mismatched"
    data = load_dataset("glue", "mnli")
    val_key = f"validation_{mnli_variant}"
    test_key = f"test_{mnli_variant}"
else:
    data = load_dataset("glue", task_param)
    val_key, test_key = "validation", "test"

# Display the loaded dataset as the cell output
data

### Process dataset

In [None]:
tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')

# Every example is padded/truncated to exactly this many tokens.
# NOTE(review): presumably chosen to match the model's maximum input length - TODO confirm;
# a smaller value would reduce padding cost for short-sentence tasks.
max_len = 512

def tokenize(examples):
    """Tokenize a batch of single-sentence examples (first column of the task config)."""
    return tokenizer(examples[task_config.col_names[0]],
                     add_special_tokens=True,
                     padding='max_length',
                     truncation=True,
                     max_length=max_len,
                     return_tensors='pt')

def tokenize_double(examples):
    """Tokenize a batch of sentence-pair examples (both columns of the task config)."""
    return tokenizer(examples[task_config.col_names[0]],
                     examples[task_config.col_names[1]],
                     add_special_tokens=True,
                     padding='max_length',
                     truncation=True,
                     max_length=max_len,
                     return_tensors='pt')

# Tokenize the input. Two-sentence tasks must encode both text columns; previously the
# single-sentence tokenizer was always applied, silently dropping the second sentence.
tokenize_fn = tokenize_double if task_config.sentence_type == "two" else tokenize
data = data.map(tokenize_fn, batched=True)
# The transformers model expects the target class column to be named "labels"
data = data.rename_column(original_column_name="label", new_column_name="labels")
# Transform to pytorch tensors and only output the required columns
data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

## Model & Adapter Setup

### Model setup

In [None]:
# Classification tasks pass their label count to the config; the regression task ("R")
# uses the pretrained config unchanged.
config_kwargs = {}
if task_config.class_type != "R":
    config_kwargs["num_labels"] = task_config.num_classes
config = RobertaConfig.from_pretrained("distilroberta-base", **config_kwargs)

# Load the pretrained model with adapter support and move it to the selected device
model = AutoAdapterModel.from_pretrained("distilroberta-base", config=config)
model = model.to(device)

# Count parameters that currently require gradients (before any adapter is added)
num_trainable_params = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)
print(f"total trainable parameters for raw model: {num_trainable_params}")

In [None]:
def compute_accuracy(prediction_object):
    """Return {"accuracy": fraction of rows whose arg-max prediction equals the label}.

    `prediction_object` must expose `predictions` (scores with classes along axis 1)
    and `label_ids` (the reference labels).
    """
    predicted_classes = np.argmax(prediction_object.predictions, axis=1)
    is_correct = predicted_classes == prediction_object.label_ids
    return {"accuracy": is_correct.mean()}

def load_adapter_hyperparameters(model, params):
    """Attach a freshly configured adapter and head to `model` and build its trainer.

    Reads the notebook-level settings `adapter_type`, `task_config`, `output_dir`,
    `data` and `val_key`.

    Args:
        model: adapter-capable model; it is modified in place (the adapter and head
            named "default" are replaced, and only the adapter is left trainable).
        params: one hyperparameter combination. Must contain 'learning_rate',
            'num_train_epochs', 'per_device_train_batch_size' and
            'per_device_eval_batch_size', plus 'bn_reduction_factor' for the
            bottleneck adapter or 'prefix_length' for the prefix adapter.

    Returns:
        An AdapterTrainer set up with the train split and the task's validation split.

    Raises:
        ValueError: if `adapter_type` is not "bottleneck", "prefix" or "prompt".
    """
    if adapter_type == "bottleneck":
        adapter_config = BnConfig(mh_adapter=True,
                                  output_adapter=True,
                                  reduction_factor=params['bn_reduction_factor'],
                                  non_linearity='relu')
    elif adapter_type == "prefix":
        adapter_config = PrefixTuningConfig(prefix_length=params['prefix_length'])
    elif adapter_type == "prompt":
        adapter_config = PromptTuningConfig()
    else:
        # Without this branch an unknown type surfaced later as an UnboundLocalError
        raise ValueError(f"Unknown adapter_type: {adapter_type!r}")

    # Other adapter config options:
    # ParBnConfig(reduction_factor=4)
    # PrefixTuningConfig(flat=False, prefix_length=30)
    # ConfigUnion(
    #     PrefixTuningConfig(prefix_length=20),
    #     ParBnConfig(reduction_factor=4),)

    default_name = "default"

    # Drop the adapter left over from a previous hyperparameter combination, then add a
    # new one with the current configuration.
    # NOTE(review): presumably a no-op on the first call when no adapter exists - TODO confirm
    model.delete_adapter(default_name)
    model.add_adapter(default_name, config=adapter_config)

    # Add a matching prediction head (replacing any head from a previous call)
    if task_config.class_type == "R":
        model.add_regression_head(default_name,
                                  overwrite_ok=True)
    else:
        model.add_classification_head(
            default_name,
            num_labels=task_config.num_classes,
            overwrite_ok=True)

    # Freeze all weights in the model except for those of the adapter
    model.train_adapter(default_name)

    # Activate the adapter
    model.set_active_adapters(default_name)

    training_args = TrainingArguments(
        learning_rate=params['learning_rate'],
        num_train_epochs=params['num_train_epochs'],
        per_device_train_batch_size=params['per_device_train_batch_size'],
        per_device_eval_batch_size=params['per_device_eval_batch_size'],
        output_dir=output_dir,
        overwrite_output_dir=True,
        remove_unused_columns=False)  # Ensures dataset labels are properly passed to the model

    trainer = AdapterTrainer(
        model=model,
        args=training_args,
        train_dataset=data["train"],
        # Use the task-specific validation split name: the MNLI variants have no plain
        # "validation" split, only "validation_matched" / "validation_mismatched".
        eval_dataset=data[val_key],
        compute_metrics=compute_accuracy,
        data_collator=default_data_collator)

    return trainer

## Training Loop

In [None]:
# Results location: one CSV per run, identified by the run's start timestamp
results_folder = pathlib.Path(f"../../fast-results/results/{adapter_type}/{task_param}")
results_folder.mkdir(parents=True, exist_ok=True)
save_file_id = datetime.now().strftime("%Y%m%d_%H%M%S")
results_file = results_folder / f"val_{save_file_id}.csv"

# Create the CSV with the metric columns followed by the hyperparameter names
metric_columns = ["eval_accuracy", "training_time", "eval_time"]
with open(results_file, 'w', newline='') as csvfile:
    csv.writer(csvfile).writerow(metric_columns + list(all_params[0]))

display_best = float("-inf")  # best validation accuracy seen so far (progress bar only)

bar = tqdm(enumerate(all_params), total=len(all_params))
for i, params in bar:
    # Train and evaluate one hyperparameter combination
    trainer = load_adapter_hyperparameters(model, params)
    train_stats = trainer.train()
    eval_result = trainer.evaluate()

    display_recent = eval_result['eval_accuracy']
    if display_recent > display_best:
        display_best = display_recent
    bar.set_description(f"Best: {display_best:.5f}, Last: {display_recent:.5f}")

    # Training time is normalised per epoch; evaluation time is the single evaluate() call
    eval_accuracy = display_recent
    training_time_per_epoch = train_stats.metrics['train_runtime'] / params['num_train_epochs']
    eval_time_per_epoch = eval_result['eval_runtime']

    # Append this combination's row right away so finished results are already on disk
    row = [eval_accuracy, training_time_per_epoch, eval_time_per_epoch, *params.values()]
    with open(results_file, 'a', newline='') as csvfile:
        csv.writer(csvfile).writerow(row)

## Generate Test Set Predictions

In [None]:
# Predict on the task-specific test split: the MNLI variants have no plain "test" split,
# only "test_matched" / "test_mismatched", so the key chosen at load time must be used.
# NOTE(review): `trainer` is the one from the LAST hyperparameter combination of the
# training loop, not necessarily the best-scoring one - confirm this is intended.
predictions = trainer.predict(data[test_key])

test_y_pred_file = results_folder / f"y_pred_{save_file_id}.tsv"
# Predicted class = index of the highest score for each example
# NOTE(review): arg-max only makes sense for classification heads; the regression task
# ("stsb") would need the raw scores instead - TODO confirm/handle.
y_pred = np.argmax(predictions.predictions, axis=1)
# Two-column table: running example index and predicted class
y_pred_df = pd.DataFrame({"index": np.arange(len(y_pred)), "prediction": y_pred})
y_pred_df.to_csv(test_y_pred_file, sep='\t', index=False, header=True)
print(f"saving predictions to ./{test_y_pred_file}")