In [16]:
import os
from copy import copy
from dataclasses import dataclass
from math import inf
from typing import Optional, Union

import numpy as np
import pandas as pd
import torch
from adapters import init,AutoAdapterModel
from adapters import AdapterTrainer
from adapters import AdapterSetup, AutoAdapterModel
from adapters.composition import Fuse
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForMultipleChoice
from transformers import TrainingArguments, EvalPrediction
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy


In [17]:
def preprocess_function(examples):
    """Tokenize a batch of two-choice examples for a multiple-choice model.

    Each ``startphrase`` is paired once with its ``ending1`` and once with its
    ``ending2``. The pairs are tokenized in a single call to the module-level
    ``tokenizer`` and then regrouped so every example holds one entry per choice.

    Args:
        examples: Batched mapping with equally long ``"startphrase"``,
            ``"ending1"`` and ``"ending2"`` columns.

    Returns:
        dict mapping every key produced by the tokenizer to a list with one
        item per example, each item being a list of ``num_choices`` encodings.
    """
    num_choices = 2

    # Repeat each prompt once per candidate ending.
    first_sentences = [
        context
        for context in examples["startphrase"]
        for _ in range(num_choices)
    ]
    # Interleave the endings so they line up with the repeated prompts:
    # [ending1[0], ending2[0], ending1[1], ending2[1], ...].
    # Built directly as a flat list; `sum(nested, [])` copies the accumulator
    # on every step and is quadratic in the batch size.
    second_sentences = [
        ending
        for pair in zip(examples["ending1"], examples["ending2"])
        for ending in pair
    ]

    # Tokenize all (prompt, ending) pairs at once.
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)

    # Un-flatten: regroup consecutive encodings into one list per example.
    return {
        k: [v[i : i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.

    Each incoming feature holds, per key, one encoding per choice plus a
    ``"label"`` or ``"labels"`` entry. The collator flattens the choices, pads
    them together through ``tokenizer.pad`` and reshapes every padded tensor to
    ``(batch_size, num_choices, -1)``.
    """

    # Object whose ``pad`` method is used to pad the flattened encodings.
    tokenizer: AutoTokenizer
    # The remaining options are forwarded unchanged to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of per-example feature dicts into one padded batch.

        Args:
            features: Non-empty list of dicts. Every dict has ``"input_ids"``
                (one sequence per choice) and a ``"label"``/``"labels"`` entry;
                all other values are indexed per choice as well.

        Returns:
            dict of tensors shaped ``(batch_size, num_choices, -1)`` plus an
            int64 ``"labels"`` tensor built from the per-example labels.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels without popping them, so the caller's dicts are
        # left untouched and the same features can be collated again.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One flat entry per (example, choice) pair, built in a single pass.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Restore the (example, choice) structure after padding.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch
    
def compute_accuracy(p: EvalPrediction):
    """Compute accuracy from the scores and labels of an evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-example scores, one column per
            choice, or a tuple whose first item holds them) and ``label_ids``.

    Returns:
        dict with a single ``"acc"`` entry: the fraction of examples whose
        highest-scoring choice equals the label.
    """
    # Some models return several outputs; by convention the scores come first.
    scores = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(scores, axis=1)
    return {"acc": (preds == p.label_ids).mean()}

In [18]:
# All dialect identifiers used in this notebook.
new_dialects = ["std-dia", "aus", "hon", "nig", "col", "wel"]
# Dialect(s) kept out of the training list.
test_dialects = ["wel"]
# Every remaining dialect, in the order given above.
train_dialects = [name for name in new_dialects if name not in test_dialects]

In [37]:

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")

# Multiple-choice model; `init` enables adapter support on it.
model = AutoModelForMultipleChoice.from_pretrained("FacebookAI/xlm-roberta-base")
init(model)

# Load one adapter per training dialect, without prediction heads.
# The loop variable is deliberately not called `dialect`, so it cannot
# clobber the target-dialect variable used by later cells.
# NOTE(review): assumes each adapter was saved under its dialect name, since
# the fusion setup below refers to the adapters by those names - confirm.
for train_dialect in train_dialects:
    model.load_adapter(f"/l/users/abdelrahman.sadallah/dialectal_gen/{train_dialect}-adapter-figqa-xlmr", config='seq_bn', with_head=False)

# Use an explicit Fuse block for the fusion setup. A plain flat list passed to
# `train_adapter_fusion` is not treated as a fusion setup, so the fusion layer
# would be neither activated nor unfrozen for training.
fusion_setup = Fuse(*train_dialects)
model.add_adapter_fusion(fusion_setup, "dynamic")
model.train_adapter_fusion(fusion_setup)




Some weights of XLMRobertaAdapterModel were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
# Display the model's module tree to inspect the result of the setup above.
model

XLMRobertaAdapterModel(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttentionWithAdapters(
              (query): LoRALinearTorch(
                in_features=768, out_features=768, bias=True
                (loras): ModuleDict()
              )
              (key): LoRALinearTorch(
                in_features=768, out_features=768, bias=True
                (loras): ModuleDict()
              )
              (value): LoRALinearTorch(
                in_features=768, out_features=768, bias=

In [39]:
# Target dialect: selects the dataset configuration loaded below and names the
# output directory, saved fusion and results file. Same value as the single
# entry of `test_dialects`.
dialect = 'wel'

In [None]:

# Load FigQA for the selected dialect; "std-dia" uses the original dataset.
if dialect != "std-dia":
    ds = load_dataset("ashabrawy/dia_figqa", dialect)
else:
    ds = load_dataset("nightingal3/fig-qa")

# `load_dataset` without a split returns a DatasetDict, whose `column_names`
# is a mapping of split -> columns. Read the column list from the train split
# in both cases (the previous "std-dia" branch called `.remove` on that mapping).
columns = list(ds["train"].column_names)
# Keep the labels; every other raw column is replaced by the tokenized fields.
# `remove` raises ValueError if the dataset has no 'labels' column.
columns.remove('labels')
ds = ds.map(preprocess_function, batched=True, remove_columns=columns)
    # eval_ds = eval_ds.rename_column(original_column_name="label", new_column_name="labels")

In [36]:
# Peek at the first training example: one token-id sequence per choice.
# Index the row before the column so only that example is read, rather than
# the whole `input_ids` column.
ds['train'][0]['input_ids']

[[0,
  1840,
  19,
  2565,
  111,
  140147,
  6664,
  1902,
  90254,
  5,
  2,
  2,
  1840,
  19,
  103036,
  7,
  831,
  186,
  10,
  9,
  14473,
  13,
  4126,
  5,
  2],
 [0,
  1840,
  19,
  2565,
  111,
  140147,
  6664,
  1902,
  90254,
  5,
  2,
  2,
  1840,
  103036,
  7,
  53418,
  186,
  63207,
  297,
  5,
  2]]

In [40]:



# No extra prediction head is added here: the model was loaded through
# AutoModelForMultipleChoice.
# NOTE(review): the ValueError recorded under this cell appears to come from an
# earlier run with a different model class (the recorded outputs mention
# XLMRobertaAdapterModel) - re-run from a fresh kernel to confirm.
training_args = TrainingArguments(
    output_dir=f"/l/users/abdelrahman.sadallah/dialectal_gen-adapterfusion-{dialect}",
    overwrite_output_dir=True,
    num_train_epochs=6,
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    logging_steps=200,
    # Keep every dataset column so the labels are passed through to the model.
    remove_unused_columns=False,
)

trainer = AdapterTrainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    compute_metrics=compute_accuracy,
)
trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.204, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


ValueError: Expected input batch_size (1152) to match target batch_size (16).

In [None]:
# Save the fusion layer for the training dialects.
# NOTE(review): `with_head=True` presumably expects a head named after the
# fusion; no such head is added in this notebook - confirm this call succeeds.
model.save_adapter_fusion(
    f"/l/users/abdelrahman.sadallah/dialectal_gen/{dialect}-adapterfusion-figqa-xlmr",
    train_dialects,
    with_head=True,
)

# Evaluate on the validation split and tag the metrics with the dialect.
eval_metrics = {**trainer.evaluate(ds["validation"]), 'test_set': dialect}

# One-row table of the metrics, printed and written next to the notebook.
results = pd.DataFrame(eval_metrics, index=[0])
print(results)
results.to_csv(f"{dialect}-adapterfusion-figqa-xlmr.csv")

In [None]:
# Display the model's module tree again, after the training and saving cells.
model