# Translation (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
from transformers import pipeline, AutoTokenizer, MT5Model, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM
from datasets import Dataset
from torch import cuda

import pandas as pd
import numpy as np
import evaluate
import csv
import os

# SECURITY: an access token must never be written into the notebook source.
# The token that used to be hardcoded on this line has to be treated as leaked
# and revoked in the Hugging Face account settings.
# Reuse HF_TOKEN when the environment already provides it; otherwise ask for it
# interactively so the value is never stored in the saved notebook.
if not os.environ.get('HF_TOKEN'):
    from getpass import getpass

    os.environ['HF_TOKEN'] = getpass('Hugging Face access token: ')

In [3]:
# Set the git author e-mail and name via `git config --global`, i.e. in the
# user-level git configuration rather than in a single repository.
# NOTE(review): a personal e-mail address and user name are hardcoded here;
# anyone re-running the notebook overwrites their own global git identity.
!git config --global user.email "sultanbenjamin12@gmail.com"
!git config --global user.name "benjaminsul"

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login (huggingface_hub.notebook_login).
# NOTE(review): presumably required because training is configured with
# push_to_hub=True further down — confirm it is still needed when HF_TOKEN is set.
notebook_login()

In [None]:
from datasets import load_dataset
# Read the local file "datasett.csv" with the "csv" loader. Later cells use
# raw_datasets["train"] and the columns "he" (source) and "tanach" (target).
raw_datasets = load_dataset("csv", data_files="datasett.csv")

In [7]:
# Keep 90% of the rows for training (train_size=0.9); the fixed seed makes the
# split reproducible. The held-out part is read back under the "test" key below.
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=29)

In [8]:
# Rename the held-out "test" split to "validation".
# Guarded so that re-running this cell does not raise KeyError once the
# "test" key has already been popped.
if "test" in split_datasets:
    split_datasets["validation"] = split_datasets.pop("test")

In [None]:
from torch import cuda

# Cell output: whether torch.cuda reports an available device.
cuda.is_available()

In [None]:
# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)
model_checkpoint = 'google/mt5-large'
# `from_pretrained` has no `return_tensors` / `to_device` options: the former
# belongs to the tokenizer *call*, the latter does not exist. Passing them here
# had no effect on tokenization or device placement, so they are removed.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [11]:
max_length = 9024


def preprocess_function(examples):
    """Tokenize one batch of sentence pairs.

    The ``he`` column supplies the source texts and the ``tanach`` column the
    target texts; the tokenizer is called with truncation enabled and
    ``max_length`` as the limit. Returns whatever the tokenizer returns.
    """
    source_texts = list(examples['he'])
    target_texts = list(examples['tanach'])
    return tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=max_length,
        truncation=True,
    )


In [None]:
# Run preprocess_function over split_datasets in batches. The columns that the
# "train" split had before tokenization are removed from the result.
tokenized_datasets = split_datasets.map(
    function=preprocess_function,
    remove_columns=split_datasets["train"].column_names,
    batched=True,
)

In [None]:
# Load the `model_checkpoint` weights into a sequence-to-sequence LM.
# The model is left on its default device here; no explicit .to(device) call is made.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [14]:
# Batch builder for training/evaluation, given both the tokenizer and the model.
# NOTE(review): presumably pads each batch and fills label padding with -100
# (compute_metrics below maps -100 back to the pad token) — confirm in the docs.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [15]:
# Smoke test: collate the training examples at indices 1 and 2 into one batch.
# `batch` is not read by any of the later cells shown in this notebook.
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])

In [None]:
# Load the "sacrebleu" metric; compute_metrics reports its "score" field as "bleu".
metric = evaluate.load("sacrebleu")

In [19]:
def compute_metrics(eval_preds):
    """Score decoded predictions against decoded labels with ``metric``.

    ``eval_preds`` unpacks into ``(predictions, labels)``; when the
    predictions arrive as a tuple only its first element is used. Returns
    ``{"bleu": <score>}`` where the score is the metric's ``"score"`` field.
    """

    def _decode(token_ids):
        # -100 cannot be decoded, so it is swapped for the pad token id first.
        token_ids = np.where(token_ids != -100, token_ids, tokenizer.pad_token_id)
        texts = tokenizer.batch_decode(token_ids, skip_special_tokens=True)
        return [text.strip() for text in texts]

    predictions, references = eval_preds
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    hypotheses = _decode(predictions)
    # Each reference string is wrapped in its own single-element list.
    wrapped_references = [[reference] for reference in _decode(references)]

    scores = metric.compute(predictions=hypotheses, references=wrapped_references)
    return {"bleu": scores["score"]}

In [None]:
# Training configuration for the fine-tuning run.
args = Seq2SeqTrainingArguments(
    output_dir="fine_tuned_he_to_tanach",
    # optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    fp16=False,
    # no evaluation during training; evaluate() is called explicitly instead
    evaluation_strategy="no",
    predict_with_generate=True,
    # one checkpoint per epoch, at most three kept
    save_strategy="epoch",
    save_total_limit=3,
    push_to_hub=True,
)

In [21]:
class CustomTrainer(Seq2SeqTrainer):
    """Seq2SeqTrainer that makes every weight tensor contiguous before saving."""

    def _save(self, output_dir=None, _internal_call=False, state_dict=None):
        """Save the model after replacing non-contiguous tensors by contiguous copies.

        Args:
            output_dir: Target directory. ``None`` is resolved by the parent
                implementation instead of being handed to ``save_pretrained``.
            _internal_call: Kept only for backward compatibility; unused.
            state_dict: Optional pre-gathered weights. ``Trainer.save_model``
                may pass this as a keyword, which the previous signature
                rejected with a ``TypeError``.
        """
        if state_dict is None:
            state_dict = self.model.state_dict()
        # Iterate over a snapshot so the mapping can be updated in place.
        for name, tensor in list(state_dict.items()):
            if not tensor.is_contiguous():
                state_dict[name] = tensor.contiguous()
        # Delegate to the parent so everything else it writes next to the
        # weights (tokenizer files, training arguments) is still produced;
        # calling save_pretrained directly skipped those.
        super()._save(output_dir, state_dict=state_dict)

In [None]:
# Wire the model, training arguments, tokenized splits, collator, tokenizer and
# metric function into the trainer.
trainer = CustomTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Baseline: evaluate on the validation split before any fine-tuning.
# NOTE(review): max_length (9024) is forwarded to evaluate(); with
# predict_with_generate=True this presumably caps generation length — confirm,
# a value this large can make evaluation very slow.
trainer.evaluate(max_length=max_length)

In [None]:
# Fine-tune with the settings in `args` (3 epochs, checkpoint saved every epoch).
trainer.train()

In [None]:
# Same evaluation call as before training, for a before/after comparison.
trainer.evaluate(max_length=max_length)

In [None]:
# Upload the result to the Hugging Face Hub with the "translation" tag and the
# given commit message.
trainer.push_to_hub(tags="translation", commit_message="Training complete")

In [None]:
# Try the model on one sample sentence.
text = """לזכר הניסים הללו אנו מקיימים את חג החנוכה, בו אנו מדליקים חנוכיה ומשחקים בסביבונים."""


# After training the model may live on the GPU while freshly tokenized inputs
# are created on the CPU; generate() then fails with a device mismatch.
# Move the encoded inputs to whatever device the model is on.
inputs = tokenizer(text, return_tensors='pt').to(model.device)
summary_ids = model.generate(**inputs, min_length=20, max_length=150, early_stopping=True, no_repeat_ngram_size=2, length_penalty=1.5)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary)