In [1]:
#!pip install "adapter-transformers@git+https://github.com/akufeldt/adapter-transformers.git@debug#egg=adapter-transformers&subdirectory=adapter-transformers"

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn import Softmax

from typing import List, Optional, Tuple, Union, Dict, Any

from datasets import load_dataset, Dataset, DatasetDict, load_metric, load_from_disk
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, EarlyStoppingCallback
from transformers import PreTrainedModel, TrainingArguments
from transformers.adapters import AdapterTrainer

import pandas as pd
import numpy as np
import evaluate

import random
import math
import time
from tqdm import tqdm
import os
import json

In [3]:
from datasets import concatenate_datasets

In [4]:
seed = 42
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
_numpy_rng = np.random.default_rng(seed)
random.seed(seed)
np.random.seed(seed)
torch.use_deterministic_algorithms(False)
os.environ['PYTHONHASHSEED'] = str(seed)

In [5]:
os.environ["WANDB_DISABLED"] = "true"

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load in model

In [7]:
model_name = 'm2m100_418M'
experiment = 'en-ha-lang-adapter-1'
dataset_name = 'data/en-ha'

In [8]:
model = M2M100ForConditionalGeneration.from_pretrained(f"facebook/{model_name}")
# model = torch.nn.DataParallel(model, device_ids=[2, 3, 4])
model = model.to(device)
tokenizer = M2M100Tokenizer.from_pretrained(f"facebook/{model_name}")

# Create adapters

In [9]:
# NOTE : also try with original_ln_after=False, which is more theoretically correct but may not result in best performance
enc_config = "pfeiffer[output_adapter=False,monolingual_enc_adapter=True]"
dec_config = "pfeiffer[output_adapter=False,monolingual_dec_adapter=True]"

# Add lang adapters
model.add_adapter("enc_en", config=enc_config)
model.add_adapter("dec_ha", config=dec_config)

In [10]:
"""
# Add lang adapters
model.add_adapter("enc_indo_euro", config=enc_config)
model.add_adapter("dec_afro_asiatic", config=dec_config"""

'model.add_adapter("enc_indo_euro", config=enc_config)\nmodel.add_adapter("dec_afro_asiatic", config=dec_config'

# Prepare data

In [11]:
src_lang = 'en'
tgt_lang = 'ha'
tokenizer.src_lang = "en"
tokenizer.tgt_lang = "ha"

In [12]:
dataset = DatasetDict({'train':Dataset.from_pandas(pd.read_csv(f'{dataset_name}/cleaned_train.csv')),
                        'validation':Dataset.from_pandas(pd.read_csv(f'{dataset_name}/cleaned_dev.csv')),
                        'test':Dataset.from_pandas(pd.read_csv(f'{dataset_name}/cleaned_test.csv'))})

In [13]:
dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'ha'],
        num_rows: 9818
    })
    validation: Dataset({
        features: ['en', 'ha'],
        num_rows: 1113
    })
})

In [14]:
def preprocess_function(examples):
    inputs = [example for example in examples[src_lang]]
    targets = [example for example in examples[tgt_lang]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=256, truncation=True, padding="max_length")
    return model_inputs

In [15]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names['train'])

Map:   0%|          | 0/9818 [00:00<?, ? examples/s]

Map:   0%|          | 0/1113 [00:00<?, ? examples/s]

In [16]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9818
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1113
    })
})

# Training Setup

In [17]:
sacrebleu = evaluate.load("sacrebleu")

In [18]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    labels = eval_preds.label_ids
    pred_ids = eval_preds.predictions
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]
    
    preds = np.argmax(pred_ids, axis=-1)
    
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Removeme
    import warnings
    warnings.warn(f"preds: {decoded_preds[0]}\n)")
    warnings.warn(f"labels: {decoded_labels[0]}\n)")

    result = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result


In [19]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [21]:
import transformers.adapters.composition as ac

# Activate lang adapters
model.train_adapter_pair(ac.Pair("enc_en","dec_ha"))

"""
# Activate family adapters
encoder_adapters = ac.Stack("enc_indo_euro","enc_en")
decoder_adapters = ac.Stack("dec_afro_asiatic","dec_ha")
model.train_adapter_pair(ac.Pair(encoder_adapters,decoder_adapters))
"""

In [23]:
training_args = TrainingArguments(
    f"./lang_adapters/{experiment}/model",
    # evaluation_strategy="steps",
    evaluation_strategy="epoch",
    learning_rate=5e-4,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=20,
    warmup_steps=1000,
    # lr_scheduler_type='constant',
    # gradient_accumulation_steps=4,
    eval_accumulation_steps=16,
    # gradient_checkpointing=True,
    # predict_with_generate=True,
    fp16=True,
    do_train=True,
    do_eval=True,
    logging_steps=5,
    # eval_steps=5,
    save_strategy="epoch",
    metric_for_best_model="bleu",
    load_best_model_at_end=True,
)

trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    #optimizers=(optimizer, lr_scheduler),
    compute_metrics=compute_metrics,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using cuda_amp half precision backend


In [24]:
print(getattr(model.base_model, "model_frozen", False))

True


In [None]:
trainer.train()

***** Running training *****
  Num examples = 9818
  Num Epochs = 20
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 3080
  Number of trainable parameters = 4757760


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,6.9151,6.831612,0.5808,256.0


***** Running Evaluation *****
  Num examples = 1113
  Batch size = 16
)
)
Saving model checkpoint to ./lang_adapters/en-ha-lang-adapter-1/model/checkpoint-154
Configuration saved in ./lang_adapters/en-ha-lang-adapter-1/model/checkpoint-154/enc_en/adapter_config.json
Module weights saved in ./lang_adapters/en-ha-lang-adapter-1/model/checkpoint-154/enc_en/pytorch_adapter.bin
Configuration saved in ./lang_adapters/en-ha-lang-adapter-1/model/checkpoint-154/enc_en/head_config.json
Module weights saved in ./lang_adapters/en-ha-lang-adapter-1/model/checkpoint-154/enc_en/pytorch_model_head.bin
Configuration saved in ./lang_adapters/en-ha-lang-adapter-1/model/checkpoint-154/dec_ha/adapter_config.json
Module weights saved in ./lang_adapters/en-ha-lang-adapter-1/model/checkpoint-154/dec_ha/pytorch_adapter.bin
Configuration saved in ./lang_adapters/en-ha-lang-adapter-1/model/checkpoint-154/dec_ha/head_config.json
Module weights saved in ./lang_adapters/en-ha-lang-adapter-1/model/checkpoint-154/de

In [None]:
# Save adapters
if not os.path.exists(f'./lang_adapters/{experiment}'):
    os.mkdir(f'./lang_adapters/{experiment}')
    
model.save_adapter(f"./lang_adapters/{experiment}/encoder_english", "enc_en")
model.save_adapter(f"./lang_adapters/{experiment}/decoder_hausa", "dec_ha")

In [None]:
loader = MyCustomWeightsLoader(model)
loader.save(f"./lang_adapters/{experiment}/encoder_english", "enc_en")
loader.save(f"./lang_adapters/{experiment}/decoder_hausa", "dec_ha")

In [None]:
# Evaluate performance
src_lang = 'en'
tgt_lang = 'ha'
tokenizer.src_lang = "en"
tokenizer.tgt_lang = "ha"

In [None]:
test_outputs = trainer.predict(tokenized_dataset['test'], forced_bos_token_id=tokenizer.get_lang_id("ha"))
test_output_texts = tokenizer.batch_decode(torch.LongTensor(test_outputs.predictions), skip_special_tokens=True)

In [None]:
test_outputs.metrics

In [None]:
if not os.path.exists(f'./lang_adapters/{experiment}'):
    os.mkdir(f'./lang_adapters/{experiment}')

with open(f'./lang_adapters/{experiment}/predictions', 'w') as fp:
    for translation in test_output_texts:
        fp.write(translation + '\n')
fp.close()

json.dump(test_outputs.metrics, open(f'./lang_adapters/{experiment}/metrics', 'w'))


In [None]:
# Load in adapters (for future reference)

#model.load_adapter(f"/lang_adapters/{experiment}/encoder_english", config=enc_config)
#model.load_adapter(f"/lang_adapters/{experiment}/decoder_hausa", config=dec_config)