In [None]:
!pip install requirements.txt

In [None]:
import os
from getpass import getpass

# Weights & Biases API key for experiment tracking.
# An already exported WANDB_API_KEY is left untouched; otherwise the key is
# requested interactively so it never has to be saved inside the notebook.
if not os.environ.get("WANDB_API_KEY"):
    os.environ["WANDB_API_KEY"] = getpass("Weights & Biases API key: ")

# Hugging Face Hub token used for saving the model and tokenizer.
# Read from the HF_TOKEN environment variable when it is set, otherwise
# requested interactively.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face Hub token: ")

In [None]:
from huggingface_hub import login
import wandb

# Authenticate against the Hugging Face Hub with the token held in `hf_token`.
login(token=hf_token)

# Authenticate against Weights & Biases; no key is passed explicitly here.
wandb.login()

In [None]:
# Hyper-parameters, checkpoint name and output locations for the run.
CONFIG = {
    'batch_size': 32,
    'lr': 1e-4,
    'epochs': 10,
    'seed': 42,
    'weight_decay': 0.01,
    'model_name': 'google-t5/t5-small',
    'model_path': './hf_best_model',
    'tokenizer_path': './hf_tokenizer',
    'max_length': 32,       # token limit applied to both source and target text
    'warmup_ratio': 0.2,
    'run_name': 'baseline',
}

# Keys of the two languages inside each "translation" record.
SRC_LANG = "dyu"
TRG_LANG = "fr"
HF_USERNAME = ""  # replace with HuggingFace hub username
HF_REPO_NAME = ""  # replace with HuggingFace repo name

# Characters replaced by a space during text cleaning:
#   ! " & ( ) , - . / : ; = ? + newline [ ]
# Written as a raw string so the backslashes reach the regex engine without
# triggering Python's invalid-escape-sequence warning, and with the hyphen
# escaped so it is an explicit member of the class rather than a range.
CHARS_TO_REMOVE_REGEX = r'[!"&(),\-./:;=?+\n\[\]]'

# Task prefix prepended to every source sentence before tokenization.
PREFIX = "translate Dyula to French: "


In [None]:
import re
import evaluate
import numpy as np
from datasets import load_dataset



def remove_special_characters(text):
    """Lower-case `text`, blank out unwanted characters and trim the ends.

    Every character matched by CHARS_TO_REMOVE_REGEX is replaced by a single
    space; leading and trailing whitespace is then stripped.
    """
    lowered = text.lower()
    return re.sub(CHARS_TO_REMOVE_REGEX, " ", lowered).strip()

def clean_text(batch):
    """Clean the source and target sentences of one translation record.

    The record is modified in place (source language first, then target) and
    returned.
    """
    for lang in (SRC_LANG, TRG_LANG):
        batch["translation"][lang] = remove_special_characters(batch["translation"][lang])
    return batch

def preprocess_function(examples):
    """Tokenize a batch of translation pairs into model inputs and labels.

    Source sentences get PREFIX prepended; both sides are truncated to
    CONFIG['max_length'] tokens. The target token ids are stored under the
    "labels" key of the returned encoding.
    """
    pairs = examples["translation"]
    sources = [PREFIX + pair[SRC_LANG] for pair in pairs]
    targets = [pair[TRG_LANG] for pair in pairs]

    limit = CONFIG['max_length']
    encoded = tokenizer(sources, max_length=limit, truncation=True)
    target_encoding = tokenizer(text_target=targets, max_length=limit, truncation=True)
    encoded["labels"] = target_encoding["input_ids"]
    return encoded



def postprocess_text(preds, labels):
    """Strip whitespace from decoded texts and wrap each label in a list.

    Returns a tuple `(preds, labels)` where `preds` is a list of stripped
    strings and `labels` is a list of one-element lists, i.e. one reference
    per prediction.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length for one evaluation pass.

    Args:
        eval_preds: pair `(preds, labels)` of token-id arrays. `preds` may
            also arrive as a tuple, in which case its first element is used.

    Returns:
        dict with "bleu" (the metric's "score") and "gen_len" (mean number of
        non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    # Some models hand back a tuple of outputs; the generated ids come first.
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks positions to ignore and is not a real vocabulary id, so it
    # has to be swapped for the pad token before decoding. This is done for
    # predictions as well as labels because gathered predictions can be
    # padded with -100 too.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": bleu["score"]}

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
from transformers import pipeline,AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments, GenerationConfig

# Load the pretrained tokenizer and sequence-to-sequence model for the
# checkpoint named in CONFIG['model_name'].
model_name = CONFIG['model_name']
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [None]:
# Decoding settings handed to the trainer for generation during evaluation.
# A GenerationConfig built from a plain dict does not inherit the checkpoint's
# special tokens, so they are copied from the model config explicitly. Without
# `eos_token_id` generation has no end-of-sequence stop condition and always
# runs to `max_new_tokens`.
GEN_CONFIG = {
    "do_sample": False,  # greedy decoding
    # Same token budget as the one used to truncate the labels.
    "max_new_tokens": CONFIG['max_length'],
    "temperature": 1.0,
    'decoder_start_token_id': model.config.decoder_start_token_id,
    'eos_token_id': model.config.eos_token_id,
    'pad_token_id': model.config.pad_token_id,
}
gen_config = GenerationConfig(**GEN_CONFIG)

In [None]:
# Load the Dyula-French corpus, clean both sides of every pair, then tokenize.
# Each stage gets its own name so that re-running the cell always starts from
# the raw data; `dataset` is the fully processed result used further down.
raw_dataset = load_dataset("uvci/Koumankan_mt_dyu_fr")
clean_dataset = raw_dataset.map(clean_text)
dataset = clean_dataset.map(preprocess_function, batched=True)

# The collator expects the model object (not the checkpoint name string): it
# uses the model to derive decoder inputs from the labels.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# SacreBLEU scorer used by compute_metrics.
metric = evaluate.load("sacrebleu")

In [None]:
# Give each split of the processed dataset its own name.
train_dataset, val_dataset, test_dataset = (
    dataset["train"],
    dataset["validation"],
    dataset["test"],
)

In [None]:
# Training configuration. Every tunable value comes from CONFIG so that the
# values declared there (including weight decay and the seed) are the ones
# actually used by the run.
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=CONFIG['epochs'],
    per_device_train_batch_size=CONFIG['batch_size'],
    per_device_eval_batch_size=CONFIG['batch_size'],
    learning_rate=CONFIG['lr'],
    weight_decay=CONFIG['weight_decay'],
    warmup_ratio=CONFIG['warmup_ratio'],
    seed=CONFIG['seed'],
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,  # higher BLEU is better
    push_to_hub=True,
    gradient_accumulation_steps=4,
    # Run generation during evaluation so compute_metrics receives token ids.
    predict_with_generate=True,
    run_name=CONFIG['run_name'],
    hub_model_id=f"{HF_USERNAME}/{HF_REPO_NAME}",
    generation_config=gen_config,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()