In [None]:
!pip install transformers datasets evaluate

In [None]:
! huggingface-cli login

In [None]:
import pandas as pd

# Read the en-fr parquet file of the Helsinki-NLP/opus_books dataset into a pandas DataFrame.
books = pd.read_parquet("hf://datasets/Helsinki-NLP/opus_books/en-fr/train-00000-of-00001.parquet")

In [None]:
from datasets import Dataset
# Convert the pandas DataFrame into a `datasets.Dataset`.
# Note: this rebinds `books`, so re-running this cell alone passes a Dataset to from_pandas.
books = Dataset.from_pandas(books)

In [None]:

# Show the Dataset as the cell output.
books

In [None]:
# Print the dataset's `shape` attribute.
print(books.shape)

In [None]:
# Print the first entry of `shape` — presumably the number of rows; confirm against the output above.
print(books.shape[0])

In [None]:
# Hold out 20% of the examples for evaluation. The fixed seed keeps the
# train/test partition identical across kernel restarts, so evaluation
# scores from different runs are comparable.
books = books.train_test_split(test_size=0.2, seed=42)

In [None]:
# Peek at the first five examples of the training split.
books["train"][:5]

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer belonging to the pretrained T5-small checkpoint.
# `checkpoint` is reused later when the model itself is loaded.
checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Language codes used as keys inside each "translation" record.
source_lang = "en"
target_lang = "fr"
# Text prepended to every source sentence before tokenization.
prefix = "translate English to French: "


def preprocess_function(examples):
    """Tokenize source and target text for use with `Dataset.map`.

    Works with both calling conventions of `map`:

    * unbatched (default): ``examples["translation"]`` is a single mapping
      keyed by language code;
    * ``batched=True``: ``examples["translation"]`` is a list of such
      mappings.

    Args:
        examples: one example or one batch, containing a "translation" field.

    Returns:
        The tokenizer output for the prefixed source text, with the target
        text passed as ``text_target``; both are truncated to 128 tokens.
    """
    translations = examples["translation"]

    if isinstance(translations, (list, tuple)):
        # Batched call: build parallel lists of sources and targets.
        inputs = [prefix + pair[source_lang] for pair in translations]
        targets = [pair[target_lang] for pair in translations]
    else:
        # Single example: same behaviour as before.
        inputs = prefix + translations[source_lang]
        targets = translations[target_lang]

    return tokenizer(inputs, text_target=targets, max_length=128, truncation=True)

In [None]:
# Tokenize the splits in `books` by applying preprocess_function through `map`
# (no `batched` argument is passed, so the function receives one example per call).
tokenized_books = books.map(preprocess_function)


In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator handed to the trainer to assemble batches of tokenized examples.
# NOTE(review): `model` receives the checkpoint name (a string), not a loaded model object — confirm this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

## Defining the evaluation metric and preparing the evaluation

In [None]:
!pip install sacrebleu

In [None]:
import evaluate

# Load the SacreBLEU metric; compute_metrics calls `metric.compute` on decoded text.
metric = evaluate.load("sacrebleu")

In [None]:
import numpy as np

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple: a list of stripped predictions, and a
        list in which every stripped reference is wrapped in its own
        single-element list.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        # One-element list per reference.
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
  """Compute BLEU and the mean generated length for one evaluation pass.

  Args:
    eval_preds: a ``(preds, labels)`` pair of token-id arrays. ``preds`` may
      be a tuple, in which case its first element is used.

  Returns:
    A dict with ``"bleu"`` (the metric's ``"score"``) and ``"gen_len"`` (mean
    number of non-pad tokens per prediction), both rounded to 4 decimals.
  """
  preds, labels = eval_preds
  if isinstance(preds, tuple):
    preds = preds[0]

  # -100 is a placeholder, not a real token id, so it cannot be decoded.
  # Swap it for the pad id in both arrays before turning ids back into text.
  pad_id = tokenizer.pad_token_id
  preds = np.where(preds != -100, preds, pad_id)
  labels = np.where(labels != -100, labels, pad_id)

  # The keyword must be `skip_special_tokens` (plural) for both calls;
  # otherwise pad/end tokens stay in the reference text and distort BLEU.
  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

  bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)
  result = {"bleu": bleu["score"]}

  # Length of each prediction, ignoring padding.
  prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
  result["gen_len"] = np.mean(prediction_lens)

  return {k: round(v, 4) for k, v in result.items()}



*   Training the T5 model



In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained seq2seq model from the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
# Fine-tuning configuration; checkpoints are written to `output_dir`.
training_args = Seq2SeqTrainingArguments(
    output_dir="bechir_opus_books_model",
    # Optimisation settings
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    # Evaluate once per epoch on generated sequences
    eval_strategy="epoch",
    predict_with_generate=True,
    # Checkpoint retention and publishing
    save_total_limit=3,
    push_to_hub=True,
)

# Wire the model, data, collator and metric function into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_books["train"],
    eval_dataset=tokenized_books["test"],
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
from transformers import AutoModelForSeq2SeqLM ,AutoTokenizer

# Translate one sentence with the model and tokenizer stored under
# "bechirzammouri/bechir_opus_books_model".
# Note: this rebinds the notebook-level `tokenizer` and `model`.
text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."

tokenizer = AutoTokenizer.from_pretrained("bechirzammouri/bechir_opus_books_model")
model = AutoModelForSeq2SeqLM.from_pretrained("bechirzammouri/bechir_opus_books_model")

inputs = tokenizer(text, return_tensors="pt")["input_ids"]

# Sampling is enabled, so the generated translation can differ between runs.
outputs = model.generate(
    inputs,
    max_new_tokens=40,
    do_sample=True,
    top_k=30,
    top_p=0.95,
)

tokenizer.decode(outputs[0], skip_special_tokens=True)