# Translator Project

This notebook provides an easy way to load and test the model. It follows the code provided by Dekel.

## Load Dependencies

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import torch
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset, load_metric, Dataset, DatasetDict, load_from_disk, concatenate_datasets
import os

# Directory (as a plain string) under the current working directory that is
# handed to the training arguments as the output location.
OUTPUT_DIRECTORY = str(Path.cwd() / 'output')

## Organize the data

In [17]:
# Preprocessing is done in a separate notebook; this cell only loads the
# already-prepared per-source datasets from disk and merges their splits.
# `load_from_disk`, `concatenate_datasets` and `DatasetDict` are provided by
# the import cell at the top of the notebook, so nothing is re-imported here.

# Single place to repoint the notebook at another machine or data location.
DATA_ROOT = '/home/azureuser/translator/tr_data'
# Corpora to merge; the order here is the concatenation order of each split.
DATA_SOURCES = ('teheran', 'fauda', 'inss', 'wiki')

prepared_datasets = {
    source: load_from_disk(os.path.join(DATA_ROOT, source, 'prepared_dataset'))
    for source in DATA_SOURCES
}

# Per-source names are kept so any other cell referring to them keeps working.
tokenized_datasets_teheran = prepared_datasets['teheran']
tokenized_datasets_fauda = prepared_datasets['fauda']
tokenized_datasets_inss = prepared_datasets['inss']
tokenized_datasets_wiki = prepared_datasets['wiki']

# Merge each split across all sources.
train_dataset = concatenate_datasets(
    [prepared_datasets[source]['train'] for source in DATA_SOURCES]
)
validation_dataset = concatenate_datasets(
    [prepared_datasets[source]['validation'] for source in DATA_SOURCES]
)

# Combined dataset consumed by the trainer cells below.
tokenized_datasets = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
})

In [23]:
# Write the merged train/validation DatasetDict to a path relative to the
# current working directory, so the exact data used for this run is kept.
tokenized_datasets.save_to_disk('./tr_data/tokenized_dataset_used')

Saving the dataset (0/1 shards):   0%|          | 0/24693 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8232 [00:00<?, ? examples/s]

## Load the Model

In [16]:
# Translation direction, expressed as NLLB language codes (Hebrew -> English).
src_lang, tgt_lang = "heb_Hebr", "eng_Latn"

# Checkpoint to start from. To use a local checkpoint instead, point this at
# its directory, e.g.
# "/data2/translation/nllb/nllb-200-distilled-600M-he-en/checkpoint-124998/"
model_checkpoint = "facebook/nllb-200-distilled-1.3B"

# Model and tokenizer are both loaded from the same checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
tokenizer = NllbTokenizer.from_pretrained(
    model_checkpoint,
    src_lang=src_lang,
    tgt_lang=tgt_lang,
)

# Batch collator built from the tokenizer and model loaded above.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

## Load the Metric

In [19]:
# History of the F1 value from every evaluation pass; `compute_metrics`
# appends to this list each time it runs.
f1_scores = []

# SacreBLEU scorer used by `compute_metrics`.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package - confirm against the installed
# version before upgrading.
metric = load_metric("sacrebleu")

def compute_metrics(eval_preds):
    """Score generated translations with SacreBLEU and a sentence-level F1.

    Args:
        eval_preds: ``(predictions, labels)`` pair. ``predictions`` is an
            array of generated token ids, or a tuple whose first element is
            that array. ``labels`` is an array of reference token ids in which
            ``-100`` marks positions to ignore.

    Returns:
        dict with two entries: ``"sacrebleu"`` (the ``"score"`` field of the
        SacreBLEU result) and ``"f1_score"`` (weighted F1 from
        ``sklearn.metrics.f1_score``). The F1 is computed on whole decoded
        sentences, so every distinct sentence acts as its own class and only
        exact sentence matches count as correct.

    Side effects:
        Appends the F1 value to the module-level ``f1_scores`` list.
    """
    preds, labels = eval_preds
    # In case the model returns more than the generated token ids.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a real token id, so it cannot be decoded. Predictions may
    # carry it as padding when batches of different lengths are gathered,
    # exactly like the labels, so both arrays are mapped to the pad token.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    reference_texts = [label.strip() for label in decoded_labels]

    # f1_score works on flat 1-D label sequences; passing them flat avoids
    # relying on implicit column-vector coercion.
    f1 = f1_score(reference_texts, decoded_preds, average='weighted')

    # SacreBLEU takes a list of references per prediction (one each here).
    sacrebleu_result = metric.compute(
        predictions=decoded_preds,
        references=[[reference] for reference in reference_texts],
    )

    # Keep the F1 history for inspection after training.
    f1_scores.append(f1)

    return {"sacrebleu": sacrebleu_result["score"], "f1_score": f1}


## Set Up the Training Environment

In [20]:
# Training configuration; every value is spelled out by keyword.
args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIRECTORY,
    # What to run, and how often to evaluate / checkpoint.
    do_train=True,
    do_eval=True,
    num_train_epochs=2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    # Optimisation.
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,
    # Batching.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_accumulation_steps=3,
    # Evaluation output and publishing.
    predict_with_generate=True,
    push_to_hub=False,
)

# Trainer wiring: model, merged datasets, collator, tokenizer and metric hook.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [21]:
# Run training with the trainer configured above. This stays the cell's last
# bare expression so the returned TrainOutput is displayed under the cell.
trainer.train()

Epoch,Training Loss,Validation Loss,Sacrebleu,F1 Score
1,0.2578,0.242549,75.56379,0.429968
2,0.0939,0.262334,75.537487,0.433248


TrainOutput(global_step=1544, training_loss=0.16169172000390877, metrics={'train_runtime': 1252.8723, 'train_samples_per_second': 39.418, 'train_steps_per_second': 1.232, 'total_flos': 1.0506107625037824e+16, 'train_loss': 0.16169172000390877, 'epoch': 2.0})

In [24]:
# Save the trained model through the trainer to a directory relative to the
# current working directory.
# NOTE(review): the directory name spells "tranlator" - presumably a typo for
# "translator". Left unchanged because renaming it changes where the model is
# written; confirm nothing reads this path before fixing.
trainer.save_model('./models/tranlator-1.3-all-data')