In [None]:
# Install the necessary libraries
!pip install transformers datasets==2.4.0 evaluate sacrebleu


This notebook uses the `opus_books` dataset to fine-tune a translation model that translates from English to French. You need to authorize access to your Google Drive account, because the model is saved to a Google Drive directory.

In [None]:
# import the library
import os
import numpy as np
from google.colab import drive
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from datasets import load_dataset, load_metric
from transformers import AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer


# Load the English-French configuration of the opus_books dataset
books = load_dataset("opus_books", "en-fr")

# Split the "train" split into train (80%) and test (20%).
# A fixed seed keeps the split identical across kernel restarts, so evaluation
# scores from different runs are comparable.
books = books["train"].train_test_split(test_size=0.2, seed=42)

# Load the tokenizer that belongs to the T5 checkpoint (t5-small) so text is
# converted into the token ids this model expects.
checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Preprocessing settings: language keys inside each "translation" record and
# the task prefix prepended to every source sentence.
source_lang = "en"
target_lang = "fr"
# Use the same casing as the prefix stored in the T5 checkpoint's
# `translation_en_to_fr` task settings, so the prompt seen during fine-tuning
# matches the one the translation pipeline prepends at inference time.
prefix = "translate English to French: "
def preprocess_function(datas):
    """Tokenize a batch of translation pairs.

    Args:
        datas: A batch mapping whose "translation" entry is a sequence of
            records indexed by `source_lang` and `target_lang`.

    Returns:
        Whatever `tokenizer` returns for the prefixed source sentences with
        the target sentences passed as `text_target`, truncated to 64 tokens.
    """
    sources = []
    references = []
    for pair in datas["translation"]:
        # The task prefix is prepended to the source side only
        sources.append(prefix + pair[source_lang])
        references.append(pair[target_lang])
    return tokenizer(sources, text_target=references, max_length=64, truncation=True)

# Run preprocess_function over every split, one batch of examples at a time,
# so the data is ready for training and evaluation.
tokenized_books = books.map(
    preprocess_function,
    batched=True,
)

# DataCollatorForSeq2Seq pads the sequences of each batch dynamically, instead
# of padding the whole dataset to one global length.
# NOTE(review): `model` receives the checkpoint name (a string) here, not a
# loaded model object -- confirm this is intended.
data_collator = DataCollatorForSeq2Seq(
    model=checkpoint,
    tokenizer=tokenizer,
)

# SacreBLEU is the metric used to score the generated translations.
metric = load_metric("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and labels.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        A `(preds, labels)` tuple: a flat list of stripped predictions, and a
        list in which every stripped label is wrapped in its own one-element
        list.
    """
    stripped_preds = []
    for prediction in preds:
        stripped_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        # One reference per prediction, each wrapped in its own list
        wrapped_labels.append([reference.strip()])

    return stripped_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute the SacreBLEU score and mean generated length for an evaluation pass.

    Args:
        eval_preds: A `(preds, labels)` pair of token-id arrays. If `preds` is
            a tuple, only its first element is used.

    Returns:
        dict with two entries, both rounded to 4 decimals:
            "bleu": the "score" value reported by `metric`.
            "gen_len": mean number of non-padding tokens per prediction.
    """
    preds, labels = eval_preds
    # If preds is a tuple, take the first element
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is not a valid token id, so it cannot be decoded. Map it back to the
    # pad id in predictions as well as labels before decoding, which also keeps
    # such positions out of the generated-length count below.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Postprocess the decoded results
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Calculate the metric
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Average prediction length, counting only non-padding positions
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    # Round results
    return {k: round(v, 4) for k, v in result.items()}

# Load the pre-trained T5 weights for `checkpoint`, so fine-tuning starts from
# the pre-trained model instead of from scratch.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Mount Google Drive and define the base output directory inside it
drive.mount('/content/drive')
base_output_dir = "/content/drive/My Drive/my_llm_model"

# If the directory already exists, append _1, _2, ... until an unused path is
# found, so an earlier run's directory is never reused.
output_dir = base_output_dir
counter = 1
while os.path.exists(output_dir):
    output_dir = f"{base_output_dir}_{counter}"
    counter += 1

# Hyperparameters and runtime settings for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    # Where outputs are written and how many checkpoints are kept
    output_dir=output_dir,
    save_total_limit=3,
    # Optimisation settings
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=64,
    gradient_accumulation_steps=2,
    fp16=True,
    # Evaluation settings: evaluate once per epoch using generated sequences
    eval_strategy="epoch",
    per_device_eval_batch_size=64,
    predict_with_generate=True,
)


# Build the trainer from the hyperparameters, model, tokenized dataset splits,
# tokenizer, data collator and metrics function defined earlier.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_books["train"],
    eval_dataset=tokenized_books["test"],
)

# Run the fine-tuning loop
trainer.train()

# After training, write the model first and then the tokenizer into the same
# directory.
model_dir = "./my_finetuned_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_dir)


In [None]:
from transformers import pipeline
# Directory holding the fine-tuned model and tokenizer
model_dir = './my_finetuned_model'

# Build an English -> French translation pipeline backed by that directory
translator = pipeline(task="translation_en_to_fr", model=model_dir)

# Ask the user for the sentence to translate
text = input("Enter the text you want to translate to French: ")

# Run the pipeline and print the translated text of the first result
translated_result = translator(text)
french_text = translated_result[0]['translation_text']
print("Translation Result:", french_text)
