In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn import Softmax

from typing import List, Optional, Tuple, Union, Dict, Any

from datasets import load_dataset, Dataset, DatasetDict, load_metric, load_from_disk
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer, \
AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, EarlyStoppingCallback
from transformers.adapters import TrainingArguments, AdapterTrainer

import numpy as np
import evaluate

import random
import math
import time
from tqdm import tqdm
import os
import json

: 

In [2]:
from datasets import concatenate_datasets

In [3]:
seed = 42
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
_numpy_rng = np.random.default_rng(seed)
random.seed(seed)
np.random.seed(seed)
torch.use_deterministic_algorithms(False)
os.environ['PYTHONHASHSEED'] = str(seed)

In [4]:
os.environ["WANDB_DISABLED"] = "true"

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load in model

In [6]:
model_name = 'm2m100_418M'
experiment = 'en-ha-lang-adapter'
# dataset_name = 'eng-hau-unclean-train-valid-0.1'
# dataset_name = 'eng-hau-partial-unclean-train-valid-0.1'
dataset_name = 'lafand-mt-main'

In [7]:
model = M2M100ForConditionalGeneration.from_pretrained(f"facebook/{model_name}")
# model = torch.nn.DataParallel(model, device_ids=[2, 3, 4])
model = model.to(device)
tokenizer = M2M100Tokenizer.from_pretrained(f"facebook/{model_name}")

# Create adapters

In [None]:
# NOTE : also try with original_ln_after=False, which is more theoretically correct but may not result in best performance
enc_config = "pfeiffer[output_adapters=False,monolingual_enc_adapter=True]"
dec_config = "pfeiffer[output_adapters=False,monolingual_dec_adapter=True]"

model.add_adapter("enc_eng", config=enc_config)
model.add_adapter("dec_hau", config=dec_config)

# Prepare data

In [8]:
src_lang = 'eng'
tgt_lang = 'hau'
tokenizer.src_lang = "en"
tokenizer.tgt_lang = "ha"

In [9]:
# dataset = load_dataset(f'shreevigneshs/capstone-{dataset_name}')
dataset = load_from_disk(f'../data/{dataset_name}')

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['eng', 'hau'],
        num_rows: 10212
    })
    validation: Dataset({
        features: ['eng', 'hau'],
        num_rows: 1135
    })
    test: Dataset({
        features: ['eng', 'hau'],
        num_rows: 1012
    })
})

In [11]:
def preprocess_function(examples):
    inputs = [example for example in examples[src_lang]]
    targets = [example for example in examples[tgt_lang]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=256, truncation=True, padding="max_length")
    return model_inputs

In [12]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names['train'])

Loading cached processed dataset at /projects/nlpuser/vigneshs/projects/continous_learning/data/combined/eng-hau-partial-unclean-train-valid-0.1/train/cache-7179a74d05b82025.arrow
Loading cached processed dataset at /projects/nlpuser/vigneshs/projects/continous_learning/data/combined/eng-hau-partial-unclean-train-valid-0.1/validation/cache-30c444646ebf200f.arrow
Loading cached processed dataset at /projects/nlpuser/vigneshs/projects/continous_learning/data/combined/eng-hau-partial-unclean-train-valid-0.1/test/cache-06b050ae8e1cf422.arrow


In [13]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10212
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1135
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1012
    })
})

# Training Setup

In [21]:
sacrebleu = evaluate.load("sacrebleu")

In [22]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result


In [23]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# Activate adapters for training
model.train_adapter("enc_eng")
model.train_adapter("dec_hau")

# Not sure what diff is btwn train_adapter and set_active_adapters
model.set_active_adapters(["enc_eng","dec_hau"])

In [24]:
training_args = TrainingArguments(
    f"{model_name}-{experiment}-{dataset_name}",
    # evaluation_strategy="steps",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,    #should be 10 but testing at 2
    predict_with_generate=True,
    fp16=True,
    do_train=True,
    do_eval=True,
    # logging_steps=5,
    # eval_steps=5,
    save_strategy="epoch",
    metric_for_best_model="bleu",
    load_best_model_at_end=True,
)

trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using cuda_amp half precision backend


In [25]:
trainer.train()

***** Running training *****
  Num examples = 20424
  Num Epochs = 10
  Instantaneous batch size per device = 12
  Total train batch size (w. parallel, distributed & accumulation) = 36
  Gradient Accumulation steps = 1
  Total optimization steps = 5680
  Number of trainable parameters = 483905536


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,2.0851,0.39171,13.9038,39.0432
2,0.3744,0.338693,16.5652,41.237
3,0.3223,0.314235,19.1846,40.5207
4,0.2941,0.299902,20.2916,38.0599
5,0.2706,0.291179,21.6486,38.174
6,0.2533,0.284668,22.4833,38.4304
7,0.2406,0.278965,23.0087,38.9119
8,0.2187,0.276425,23.1987,37.9
9,0.2145,0.27437,23.5913,38.5449
10,0.2081,0.273545,23.7186,38.0427


***** Running Evaluation *****
  Num examples = 2270
  Batch size = 12
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "max_length": 200,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "max_length": 200,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "max_length": 200,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "max_length": 200,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

G

TrainOutput(global_step=5680, training_loss=0.42134734543276503, metrics={'train_runtime': 13904.7348, 'train_samples_per_second': 14.689, 'train_steps_per_second': 0.408, 'total_flos': 1.1065236098383872e+17, 'train_loss': 0.42134734543276503, 'epoch': 10.0})

In [26]:
# Test src to tgt

In [27]:
src_lang = 'eng'
tgt_lang = 'hau'
tokenizer.src_lang = "en"
tokenizer.tgt_lang = "ha"

In [28]:
test_outputs = trainer.predict(tokenized_dataset_en_ha['test'], forced_bos_token_id=tokenizer.get_lang_id("ha"))
test_output_texts = tokenizer.batch_decode(torch.LongTensor(test_outputs.predictions), skip_special_tokens=True)


***** Running Prediction *****
  Num examples = 1012
  Batch size = 12
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "max_length": 200,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "max_length": 200,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "max_length": 200,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "max_length": 200,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "max_length": 200,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start

In [29]:
test_outputs.metrics

{'test_loss': 0.5302733778953552,
 'test_bleu': 7.9644,
 'test_gen_len': 51.8162,
 'test_runtime': 289.769,
 'test_samples_per_second': 3.492,
 'test_steps_per_second': 0.293}

In [30]:
if not os.path.exists(f'./data/flores-200/en-ha/{model_name}'):
    os.mkdir(f'./data/flores-200/en-ha/{model_name}')
if not os.path.exists(f'./data/flores-200/en-ha/{model_name}/{experiment}'):
    os.mkdir(f'./data/flores-200/en-ha/{model_name}/{experiment}')


with open(f'./data/flores-200/en-ha/{model_name}/{experiment}/flores.predictions.en-ha.ha', 'w') as fp:
    for translation in test_output_texts:
        fp.write(translation + '\n')
fp.close()

json.dump(test_outputs.metrics, open(f'data/flores-200/en-ha/{model_name}/{experiment}/flores.prediction.en-ha.ha.metrics', 'w'))


In [31]:
# Test tgt to src

In [32]:
tokenizer.src_lang = "ha"
tokenizer.tgt_lang = "en"

In [33]:
reverse_test_outputs = trainer.predict(tokenized_dataset_ha_en['test'], forced_bos_token_id=tokenizer.get_lang_id("en"))
reverse_test_output_texts = tokenizer.batch_decode(torch.LongTensor(reverse_test_outputs.predictions), skip_special_tokens=True)


***** Running Prediction *****
  Num examples = 1012
  Batch size = 12
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "max_length": 200,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "max_length": 200,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "max_length": 200,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "max_length": 200,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "max_length": 200,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start

In [34]:
reverse_test_outputs.metrics

{'test_loss': 0.3550068140029907,
 'test_bleu': 12.4294,
 'test_gen_len': 37.5227,
 'test_runtime': 202.2103,
 'test_samples_per_second': 5.005,
 'test_steps_per_second': 0.42}

In [35]:
if not os.path.exists(f'./data/flores-200/ha-en/{model_name}'):
    os.mkdir(f'./data/flores-200/ha-en/{model_name}')
if not os.path.exists(f'./data/flores-200/ha-en/{model_name}/{experiment}'):
    os.mkdir(f'./data/flores-200/ha-en/{model_name}/{experiment}')


with open(f'./data/flores-200/ha-en/{model_name}/{experiment}/flores.predictions.ha-en.en', 'w') as fp:
    for translation in reverse_test_output_texts:
        fp.write(translation + '\n')
fp.close()

json.dump(reverse_test_outputs.metrics, open(f'data/flores-200/ha-en/{model_name}/{experiment}/flores.prediction.ha-en.en.metrics', 'w'))
