In [None]:
!pip3 install datasets
!pip3 install evaluate
!pip3 install transformers
!pip3 install adapters
!pip3 install accelerate==0.30

In [None]:
import logging
import os
import random
import sys
import time
from dataclasses import dataclass, field
from typing import Optional

import datasets
import numpy as np
from datasets import load_dataset, Dataset, DatasetDict

import evaluate
import transformers
from transformers import (
    AutoConfig,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from adapters import AdapterArguments, AdapterTrainer, AutoAdapterModel, setup_adapter_training, AdapterConfig
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version

In [None]:
# Maps each task name to the dataset column names that are later unpacked into
# (sentence1_key, sentence2_key) and passed to the tokenizer.
# The trailing remarks are the author's own run notes for each task.
task_to_keys = dict(
    boolq=("question", "passage"),  # works
    cb=("hypothesis", "premise"),  # works (the tags should be swapped)
    copa=("premise", "choice1", "choice2"),  # nope
    multirc=("paragraph", "question"),  # nope
    record=("passage", "query"),  # nope
    rte=("premise", "hypothesis"),  # works
    wic=("sentence1", "sentence2"),  # nope
    wsc=("text", None),
)

In [None]:
# boolq = load_dataset("super_glue", "boolq")
# cb = load_dataset("super_glue", "cb")
# copa = load_dataset("super_glue", "copa")
# multirc = load_dataset("super_glue", "multirc")
# record = load_dataset("super_glue", "record")
# rte = load_dataset("super_glue", "rte")
# wic = load_dataset("super_glue", "wic")
# wsc = load_dataset("super_glue", "wsc")

In [None]:
# boolq["train"][0] #question : passage : label 
# cb["train"][0] #premise : hypothesis: label
# copa["train"][1]
# multirc["train"][0]

{'paragraph': 'While this process moved along, diplomacy continued its rounds. Direct pressure on the Taliban had proved unsuccessful. As one NSC staff note put it, "Under the Taliban, Afghanistan is not so much a state sponsor of terrorism as it is a state sponsored by terrorists." In early 2000, the United States began a high-level effort to persuade Pakistan to use its influence over the Taliban. In January 2000, Assistant Secretary of State Karl Inderfurth and the State Department\'s counterterrorism coordinator, Michael Sheehan, met with General Musharraf in Islamabad, dangling before him the possibility of a presidential visit in March as a reward for Pakistani cooperation. Such a visit was coveted by Musharraf, partly as a sign of his government\'s legitimacy. He told the two envoys that he would meet with Mullah Omar and press him on  Bin Laden. They left, however, reporting to Washington that Pakistan was unlikely in fact to do anything," given what it sees as the benefits of 

In [None]:
def filter_unused_args(args):
    """Return the entries of ``args`` that should reach the argument parser.

    An entry is dropped when it starts with ``-f`` or when it ends with
    ``.json`` or ``.py``; all other entries are kept in their original order.

    Args:
        args: Iterable of command-line strings (the caller passes ``sys.argv``).

    Returns:
        A new list containing only the entries that were not dropped.
    """
    # NOTE(review): presumably this strips the kernel launcher script and its
    # ``-f <connection>.json`` option when running inside Jupyter - confirm.
    ignored_suffixes = (".json", ".py")
    return [
        candidate
        for candidate in args
        if not (candidate.startswith("-f") or candidate.endswith(ignored_suffixes))
    ]
@dataclass
class DataTrainingArguments:
    """Options describing which data is used for training and evaluation.

    Every attribute is declared through ``field`` and carries its help text in
    the field metadata. ``__post_init__`` validates that a known task name, a
    dataset name, or a pair of csv/json data files has been supplied.
    """

    task_name: Optional[str] = field(
        metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys)},
        default="boolq",
    )
    dataset_name: Optional[str] = field(
        metadata={"help": "The name of the dataset to use (via the datasets library)."},
        default="super_glue",
    )
    dataset_config_name: Optional[str] = field(
        metadata={"help": "The configuration name of the dataset to use (via the datasets library)."},
        default=None,
    )
    max_seq_length: int = field(
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
        default=128,
    )
    overwrite_cache: bool = field(
        metadata={"help": "Overwrite the cached preprocessed datasets or not."},
        default=False,
    )
    pad_to_max_length: bool = field(
        metadata={
            "help": "Whether to pad all samples to `max_seq_length`. "
            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
        },
        default=True,
    )
    max_train_samples: Optional[int] = field(
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
        default=None,
    )
    max_eval_samples: Optional[int] = field(
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
            "value if set."
        },
        default=None,
    )
    max_predict_samples: Optional[int] = field(
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
            "value if set."
        },
        default=None,
    )
    train_file: Optional[str] = field(
        metadata={"help": "A csv or a json file containing the training data."},
        default=None,
    )
    validation_file: Optional[str] = field(
        metadata={"help": "A csv or a json file containing the validation data."},
        default=None,
    )
    test_file: Optional[str] = field(
        metadata={"help": "A csv or a json file containing the test data."},
        default=None,
    )

    def __post_init__(self):
        """Normalise ``task_name`` and validate the data source selection.

        Raises:
            ValueError: If ``task_name`` is not a key of ``task_to_keys``, or if
                no task, no dataset name and no train/validation file pair is given.
            AssertionError: If the data files are not csv/json or their
                extensions differ.
        """
        # A task name takes precedence: lower-case it and check it is known.
        if self.task_name is not None:
            self.task_name = self.task_name.lower()
            if self.task_name not in task_to_keys:
                raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys()))
            return

        # A dataset name alone needs no further validation.
        if self.dataset_name is not None:
            return

        # Otherwise both local files are required.
        if self.train_file is None or self.validation_file is None:
            raise ValueError("Need either a GLUE task, a training/validation file or a dataset name.")

        train_extension = self.train_file.split(".")[-1]
        assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file."
        validation_extension = self.validation_file.split(".")[-1]
        assert (
            validation_extension == train_extension
        ), "`validation_file` should have the same extension (csv or json) as `train_file`."


@dataclass
class ModelArguments:
    """Options selecting the pretrained model, config and tokenizer to load.

    Every attribute is declared through ``field`` and carries its help text in
    the field metadata.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"},
        default="distilbert/distilbert-base-uncased",
    )
    config_name: Optional[str] = field(
        metadata={"help": "Pretrained config name or path if not the same as model_name"},
        default=None,
    )
    tokenizer_name: Optional[str] = field(
        metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"},
        default=None,
    )
    cache_dir: Optional[str] = field(
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
        default=None,
    )
    use_fast_tokenizer: bool = field(
        metadata={"help": "Whether to use one of the fast tokenizers (backed by the tokenizers library) or not."},
        default=True,
    )
    model_revision: str = field(
        metadata={"help": "The specific model version to use (can be a branch name, tag name, or commit id)."},
        default="main",
    )
    use_auth_token: bool = field(
        metadata={
            "help": "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
            "with private models)."
        },
        default=False,
    )
    ignore_mismatched_sizes: bool = field(
        metadata={"help": "Will enable loading a pretrained model whose head dimensions are different."},
        default=False,
    )
@dataclass
class TrainingArguments(transformers.TrainingArguments):
    """``transformers.TrainingArguments`` extended with notebook-specific options.

    Adds a default ``output_dir`` plus two adapter-related switches that the
    rest of the notebook reads from ``training_args``.
    """

    output_dir: str = field(
        default="./results",
        metadata={"help": "The output directory where the model predictions and checkpoints will be written."}
    )
    # Declared as bool (it was annotated ``str`` while defaulting to True): with a
    # ``str`` annotation a command-line value such as "False" would be kept as a
    # non-empty, truthy string and adapter training could not be switched off.
    adapter: bool = field(
        default=True,
        metadata={"help": "Whether you wanna train adapter or fine-tune"}
    )
    # Declared explicitly because the notebook assigns and reads
    # ``training_args.adapter_type``; the default matches the value it assigns.
    adapter_type: str = field(
        default="seq_bn",
        metadata={"help": "Name of the adapter configuration to load when `adapter` is enabled."}
    )



# One parser that fills all three argument dataclasses at once.
parser = HfArgumentParser(
    (ModelArguments, DataTrainingArguments, TrainingArguments)
)
# sys.argv is filtered first so that only entries kept by filter_unused_args
# are handed to the parser.
filtered_args = filter_unused_args(sys.argv)
(model_args, data_args, training_args) = parser.parse_args_into_dataclasses(
    args=filtered_args,
)

In [None]:
# Set up logging: timestamped records written to stdout.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)
# INFO when local_rank is -1 or 0, WARNING otherwise.
# ``logging.WARNING`` is the documented name of the level (``WARN`` is an alias).
logger.setLevel(logging.INFO if training_args.local_rank in [-1, 0] else logging.WARNING)
# Seed from the parsed arguments instead of a hard-coded 42, so a seed supplied
# through ``training_args`` is honoured from this point on.
# NOTE(review): relies on ``training_args.seed`` defaulting to 42 to keep the
# previous behaviour when no seed is given - confirm for the installed version.
set_seed(training_args.seed)

In [None]:
### Set Hyperparams Here
# Model checkpoint and dataset/task selection. These are assigned directly on
# the parsed dataclasses, so DataTrainingArguments.__post_init__ is not re-run.
model_args.model_name_or_path = "distilbert/distilbert-base-uncased"
data_args.dataset_name = "super_glue"
data_args.task_name = "record"

# Training mode: ``adapter`` selects between the adapter model/trainer and the
# plain sequence-classification model/trainer further down; ``adapter_type`` is
# only read when ``adapter`` is truthy.
training_args.adapter = False
training_args.adapter_type = "seq_bn"
training_args.save_strategy = "no"

In [None]:
# Load the dataset named by data_args, selecting the task as its configuration.
raw_datasets = load_dataset(
    path=data_args.dataset_name,
    name=data_args.task_name,
    use_auth_token=model_args.use_auth_token,
    cache_dir=model_args.cache_dir,
)



In [None]:
if data_args.task_name == "record":

    def preprocess_dataset(data):
        """Expand each row into one binary example per candidate entity.

        For every entity listed in a row, ``@placeholder`` in the query is
        replaced by that entity (with backslashes removed) and the example is
        labelled True when the entity is one of the row's answers.

        Args:
            data: Iterable of rows providing ``passage``, ``query``, ``answers``,
                ``entities`` and ``idx``.

        Returns:
            A ``Dataset`` with columns ``passage``, ``query``, ``label`` and
            ``idx`` where ``label`` has been class-encoded.
        """
        passages, queries, labels, idxs = [], [], [], []

        for row in data:
            # Every listed answer counts as correct (previously only the first
            # one did, and a row without answers raised IndexError). Answers are
            # cleaned the same way as entities so they compare like-for-like.
            correct_answers = {answer.replace("\\", "") for answer in row["answers"]}

            for entity in row["entities"]:
                cleaned_entity = entity.replace("\\", "")
                passages.append(row["passage"])
                # Plain string replacement: both the placeholder and the entity
                # are literal text, so no regular expression is needed.
                queries.append(row["query"].replace("@placeholder", cleaned_entity))
                labels.append(cleaned_entity in correct_answers)
                idxs.append(row["idx"])

        dataset = Dataset.from_dict({
            "passage": passages,
            "query": queries,
            "label": labels,
            "idx": idxs
        })
        return dataset.class_encode_column("label")

    train_data = preprocess_dataset(raw_datasets["train"])
    validation_data = preprocess_dataset(raw_datasets["validation"])

    # Only the train and validation splits are kept from here on.
    raw_datasets = DatasetDict({'train': train_data, 'validation': validation_data})


Stringifying the column:   0%|          | 0/1179400 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/1179400 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/113236 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/113236 [00:00<?, ? examples/s]

In [None]:
# Only the task name "stsb" is treated as regression; everything else is
# handled as classification with the label names taken from the train split.
is_regression = data_args.task_name == "stsb"
if is_regression:
    num_labels = 1
else:
    label_list = raw_datasets["train"].features["label"].names
    num_labels = len(label_list)
    print(label_list)

# Configuration and tokenizer come from the same pretrained checkpoint.
config = AutoConfig.from_pretrained(
    model_args.model_name_or_path,
    finetuning_task=data_args.task_name,
    num_labels=num_labels,
    use_auth_token=model_args.use_auth_token,
    cache_dir=model_args.cache_dir,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_args.model_name_or_path,
    use_fast=model_args.use_fast_tokenizer,
    use_auth_token=model_args.use_auth_token,
    cache_dir=model_args.cache_dir,
)

if not training_args.adapter:
    # Full fine-tuning: a standard sequence-classification model.
    model = transformers.AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        config=config,
        ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
        use_auth_token=model_args.use_auth_token,
        cache_dir=model_args.cache_dir,
    )
else:
    # Adapter training: load the adapter-capable model, attach a classification
    # head (named after the dataset) and an adapter (named after the task).
    model = AutoAdapterModel.from_pretrained(
        model_args.model_name_or_path,
        config=config,
        from_tf=".ckpt" in model_args.model_name_or_path,
        ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
        use_auth_token=model_args.use_auth_token,
        cache_dir=model_args.cache_dir,
    )
    model.add_classification_head(
        data_args.dataset_name,
        num_labels=num_labels,
        id2label=None if is_regression else dict(enumerate(label_list)),
    )
    adapter_config_kwargs = {}
    adapter_load_kwargs = {}
    adapter_config = AdapterConfig.load(training_args.adapter_type, **adapter_config_kwargs)
    model.add_adapter(data_args.task_name, config=adapter_config)
    model.train_adapter([data_args.task_name])
    model.set_active_adapters(data_args.task_name)


['False', 'True']


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Ad-hoc inspection: displays the value of the same condition that the
# label-mapping `if` in the following cell tests, i.e. whether the model's
# label2id differs from the default mapping for `num_labels` labels while a
# classification task is selected.
model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id and data_args.task_name is not None and not is_regression

In [None]:
# Column names fed to the tokenizer, and the padding strategy.
# Note: the unpacking requires the task's entry in task_to_keys to have exactly
# two elements.
sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
padding = "max_length" if data_args.pad_to_max_length else False

# label_to_id stays None unless the model ships a non-default label mapping
# whose names match the dataset's labels, or no task name is set.
label_to_id = None
if (
    model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id
    and data_args.task_name is not None
    and not is_regression
):
    print(f"pretrainconfig labels mapping : {PretrainedConfig(num_labels=num_labels).label2id}, model label mappings : {model.config.label2id}")
    label_name_to_id = {k: v for k, v in model.config.label2id.items()}
    if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
        label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)}
    else:
        # The message is a single concatenated string. Previously a stray comma
        # passed the second part as a %-format argument to a message without
        # placeholders, which makes the logging module report a formatting error.
        logger.warning(
            "Your model seems to have been trained with labels, but they don't match the dataset: "
            f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
            "\nIgnoring the model labels as a result."
        )
elif data_args.task_name is None and not is_regression:
    label_to_id = {v: i for i, v in enumerate(label_list)}
if label_to_id is not None:
    model.config.label2id = label_to_id
    # NOTE(review): this inverts `config.label2id`, not `model.config.label2id`;
    # left as-is because which mapping is intended here is unclear - confirm.
    model.config.id2label = {id: label for label, id in config.label2id.items()}
elif data_args.task_name is not None and not is_regression:
    model.config.label2id = {l: i for i, l in enumerate(label_list)}
    # Build id2label from the same label list as label2id. It used to invert
    # `config.label2id`, which is stale whenever `model.config` is a separate
    # copy of `config`, leaving the two mappings inconsistent.
    model.config.id2label = {i: l for i, l in enumerate(label_list)}

# Never exceed what the tokenizer reports as its maximum length.
max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples and optionally remap its labels.

    The text column(s) named by ``sentence1_key`` / ``sentence2_key`` are passed
    to ``tokenizer`` with truncation enabled, using the module-level ``padding``
    and ``max_seq_length`` settings.

    Args:
        examples: Mapping from column name to a list of values for the batch.

    Returns:
        The tokenizer output; when ``label_to_id`` is set and the batch has a
        ``label`` column, a ``label`` entry is added in which every label other
        than -1 is translated through ``label_to_id`` (-1 is kept unchanged).
    """
    texts = [examples[sentence1_key]]
    if sentence2_key is not None:
        texts.append(examples[sentence2_key])

    result = tokenizer(*texts, padding=padding, max_length=max_seq_length, truncation=True)

    if label_to_id is not None and "label" in examples:
        result["label"] = [
            -1 if label == -1 else label_to_id[label]
            for label in examples["label"]
        ]
    return result

# Tokenize every split in batches; cached results are reused unless
# overwrite_cache is set.
raw_datasets = raw_datasets.map(
    preprocess_function,
    desc="Running tokenizer on dataset",
    load_from_cache_file=not data_args.overwrite_cache,
    batched=True,
)

# Training split, optionally truncated to the first max_train_samples rows.
train_dataset = raw_datasets["train"]
if data_args.max_train_samples is not None:
    max_train_samples = min(len(train_dataset), data_args.max_train_samples)
    train_dataset = train_dataset.select(range(max_train_samples))

# Evaluation split ("validation_matched" only for the task name "mnli"),
# optionally truncated to the first max_eval_samples rows.
if data_args.task_name == "mnli":
    eval_dataset = raw_datasets["validation_matched"]
else:
    eval_dataset = raw_datasets["validation"]
if data_args.max_eval_samples is not None:
    max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
    eval_dataset = eval_dataset.select(range(max_eval_samples))

# predict_dataset = raw_datasets["test_matched" if data_args.task_name == "mnli" else "test"]
# if data_args.max_predict_samples is not None:
#     max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
#     predict_dataset = predict_dataset.select(range(max_predict_samples))

# metric = evaluate.load("super_glue", data_args.task_name)

# def compute_metrics(p: EvalPrediction):
#     preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
#     preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
#     if data_args.task_name is not None:
#         result = metric.compute(predictions=preds, references=p.label_ids)
#         if len(result) > 1:
#             result["combined_score"] = np.mean(list(result.values())).item()
#         return result
#     elif is_regression:
#         return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
#     else:
#         return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}

# data_collator = default_data_collator

Running tokenizer on dataset:   0%|          | 0/1179400 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/113236 [00:00<?, ? examples/s]

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

metric = evaluate.load("super_glue", data_args.task_name)


def compute_metrics(p: EvalPrediction):
    """Turn model predictions into a dictionary of evaluation metrics.

    Args:
        p: Object exposing ``predictions`` (an array, or a tuple whose first
            element is the array) and ``label_ids``.

    Returns:
        * task ``record``: ``accuracy`` and weighted ``f1`` computed with
          scikit-learn on the arg-maxed predictions;
        * any other task name: the result of ``metric.compute`` plus a
          ``combined_score`` (mean of the values) when it reports several;
        * no task name: ``mse`` for regression, otherwise ``accuracy``.
    """
    # Some models return a tuple; the first element holds the predictions.
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

    if data_args.task_name == "record":
        if preds.ndim == 2:  # This means we have logits
            preds = np.argmax(preds, axis=1)
        return {
            "accuracy": accuracy_score(p.label_ids, preds),
            "f1": f1_score(p.label_ids, preds, average='weighted'),  # Adjust 'average' as needed
        }

    # Previously this branch only *defined* a nested function and fell through,
    # so every non-record task returned None instead of a metrics dict.
    preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
    if data_args.task_name is not None:
        result = metric.compute(predictions=preds, references=p.label_ids)
        if len(result) > 1:
            result["combined_score"] = np.mean(list(result.values())).item()
        return result
    elif is_regression:
        return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
    else:
        return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}


data_collator = default_data_collator

In [None]:
# Adapter training needs the adapter-aware trainer; otherwise use the stock one.
trainer_class = AdapterTrainer if training_args.adapter else Trainer
print(trainer_class)
trainer = trainer_class(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

start_time = time.time()

# Pick the checkpoint to resume from: an explicit `resume_from_checkpoint` wins,
# otherwise the last checkpoint found in `output_dir` (None when there is none).
# The directory check avoids calling get_last_checkpoint on a missing folder.
if training_args.resume_from_checkpoint is not None:
    checkpoint = training_args.resume_from_checkpoint
elif os.path.isdir(training_args.output_dir):
    checkpoint = get_last_checkpoint(training_args.output_dir)
else:
    checkpoint = None
if checkpoint is not None:
    print(f"Resuming training from checkpoint: {checkpoint}")
# The checkpoint used to be computed but never handed to `train()`.
train_result = trainer.train(resume_from_checkpoint=checkpoint)

end_time = time.time()
training_time = end_time - start_time

# Enrich the training metrics with the sample count and wall-clock duration.
metrics = train_result.metrics
max_train_samples = data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
metrics["train_samples"] = min(max_train_samples, len(train_dataset))
metrics["training_time"] = training_time

trainer.save_model()  # Saves the tokenizer too for easy upload

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

In [None]:
# Show the metrics collected during training.
metrics = train_result.metrics
print("Training metrics:", metrics)

# Run evaluation on the validation split and show everything it reports.
eval_metrics = trainer.evaluate(eval_dataset=eval_dataset)
print("Evaluation metrics:", eval_metrics)

# `Trainer.evaluate` prefixes metric names with "eval_" by default, so the
# accuracy is reported as "eval_accuracy"; the bare key is kept as a fallback.
accuracy = eval_metrics.get("eval_accuracy", eval_metrics.get("accuracy"))
if accuracy is not None:
    print(f"Accuracy: {accuracy:.4f}")