# Fine-tuning the SeamlessM4T v2 Model on Romanian Common Voice
## Import Libraries and Prepare Environment

In [38]:
# Install necessary packages
!pip install transformers datasets evaluate

# Import required libraries
from transformers import AutoProcessor, SeamlessM4Tv2Model, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset, DatasetDict
from huggingface_hub import notebook_login
import torch
import evaluate

# Log into the Hugging Face Hub
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Load and Preprocess the Romanian Dataset

In [42]:
from datasets import Audio

# Identifiers reused by the loading calls below
DATASET_ID = "mozilla-foundation/common_voice_13_0"
DATASET_CONFIG = "ro"
MODEL_CHECKPOINT = "facebook/seamless-m4t-v2-large"

# Romanian Common Voice: train and validation splits are merged for training,
# the test split is kept separate
common_voice = DatasetDict(
    {
        "train": load_dataset(DATASET_ID, DATASET_CONFIG, split="train+validation"),
        "test": load_dataset(DATASET_ID, DATASET_CONFIG, split="test"),
    }
)

# Processor and model are loaded from the same checkpoint
processor = AutoProcessor.from_pretrained(MODEL_CHECKPOINT)
model = SeamlessM4Tv2Model.from_pretrained(MODEL_CHECKPOINT)

# Resample the audio column to the rate the feature extractor reports
sampling_rate = processor.feature_extractor.sampling_rate
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=sampling_rate))

# Define a data preparation function
def prepare_dataset(example):
    """Turn one dataset row into model inputs plus training labels.

    Args:
        example: A single row of the dataset. Reads ``example["audio"]["array"]``
            and ``example["sentence"]``.

    Returns:
        The processor output for the audio, extended with a ``"labels"`` entry
        holding the token ids of the transcript.
    """
    audio = example["audio"]
    model_inputs = processor(
        audios=audio["array"],
        sampling_rate=sampling_rate,
        return_tensors="pt"
    )
    # The data collator reads feature["labels"]; the processor call above only
    # produces audio features, so the transcript has to be tokenized here.
    # Assumes the transcript is in the `sentence` column (the text column that
    # the subsequent `map` call does not drop) — TODO confirm.
    # NOTE(review): "ron" is the SeamlessM4T code for Romanian; confirm that the
    # special tokens the tokenizer adds match what the model expects for labels.
    # padding=False leaves all padding to the collator.
    model_inputs["labels"] = processor.tokenizer(
        text_target=example["sentence"],
        tgt_lang="ron",
        padding=False,
    ).input_ids
    return model_inputs

# Metadata columns dropped from the dataset while preprocessing every row
unused_columns = [
    "client_id",
    "path",
    "up_votes",
    "down_votes",
    "age",
    "gender",
    "accent",
    "locale",
    "segment",
    "variant",
]
common_voice = common_voice.map(prepare_dataset, remove_columns=unused_columns)


Using the latest cached version of the module from C:\Users\afrca\.cache\huggingface\modules\datasets_modules\datasets\mozilla-foundation--common_voice_13_0\2506e9a8950f5807ceae08c2920e814222909fd7f477b74f5d225802e9f04055 (last modified on Tue May  7 21:00:21 2024) since it couldn't be found locally at mozilla-foundation/common_voice_13_0, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from C:\Users\afrca\.cache\huggingface\modules\datasets_modules\datasets\mozilla-foundation--common_voice_13_0\2506e9a8950f5807ceae08c2920e814222909fd7f477b74f5d225802e9f04055 (last modified on Tue May  7 21:00:21 2024) since it couldn't be found locally at mozilla-foundation/common_voice_13_0, or remotely on the Hugging Face Hub.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/8949 [00:00<?, ? examples/s]

Map:   0%|          | 0/3861 [00:00<?, ? examples/s]

## Define a Data Collator and Evaluation Metrics

In [43]:
# Define the data collator
class DataCollatorSpeechSeq2SeqWithPadding:
    """Batch collator that pads audio features and label ids independently.

    The audio side is padded by ``processor.feature_extractor`` and the label
    side by ``processor.tokenizer``; padded label positions are replaced by
    -100 before the labels are attached to the batch.
    """

    def __init__(self, processor):
        # Object exposing `.feature_extractor` and `.tokenizer`
        self.processor = processor

    def __call__(self, features):
        """Build one padded batch from a list of feature dicts.

        Args:
            features: Sequence of dicts, each with ``"input_features"`` (the
                first element along its leading axis is used) and ``"labels"``.

        Returns:
            The padded audio batch with an added ``"labels"`` entry.
        """
        # Audio side: take element 0 of each item's input_features, then pad
        audio_items = [{"input_features": item["input_features"][0]} for item in features]
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # Label side: padded separately by the tokenizer
        text_items = [{"input_ids": item["labels"]} for item in features]
        padded_labels = self.processor.tokenizer.pad(text_items, return_tensors="pt")

        # Positions whose attention mask is not 1 are padding -> mark with -100
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # Drop the first column only when every row starts with the BOS token
        every_row_starts_with_bos = (
            (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item()
        )
        if every_row_starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

# Collator instance built around the shared processor
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# Word error rate metric from the `evaluate` library
metric = evaluate.load(path="wer")

# Define the function to compute metrics
def compute_metrics(pred):
    """Compute the word error rate between decoded predictions and labels.

    Args:
        pred: Object exposing ``predictions`` (token ids, or a tuple whose first
            element holds the token ids) and ``label_ids``.

    Returns:
        A dict ``{"wer": value}`` where ``value`` is what ``metric.compute``
        returns for the decoded strings.
    """
    # Local import: numpy is not imported in the visible part of this notebook.
    import numpy as np

    pred_ids = pred.predictions
    # Some models return several outputs; the token ids are then the first one.
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    pad_id = processor.tokenizer.pad_token_id

    # -100 is the marker the collator puts on padded label positions. It is not
    # a valid token id, so it is swapped for the pad token before decoding.
    # np.where builds new arrays, so `pred.label_ids` is not modified in place.
    # Predictions get the same treatment in case they also carry -100 padding.
    pred_ids = np.asarray(pred_ids)
    label_ids = np.asarray(pred.label_ids)
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)
    label_ids = np.where(label_ids != -100, label_ids, pad_id)

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)
    return {"wer": metric.compute(predictions=pred_str, references=label_str)}


## Configure Training and Launch

In [65]:
# Placeholder cell: the only statement that runs is the print below.
# The training configuration that follows is commented out, so nothing is
# configured or launched by this cell.
print("test")
#
# # Define training arguments
# NOTE(review): compute_metrics decodes token ids with processor.batch_decode, so
# `predict_with_generate=True` is presumably needed when this is re-enabled — confirm.
# NOTE(review): `push_to_hub=True` uploads checkpoints to the Hugging Face Hub;
# make sure that is intended before uncommenting.
# training_args = Seq2SeqTrainingArguments(
#     output_dir="./seamless-m4t-ro",
#     per_device_train_batch_size=8,
#     gradient_accumulation_steps=2,
#     learning_rate=1e-5,
#     num_train_epochs=5,
#     evaluation_strategy="steps",
#     eval_steps=100,
#     save_steps=100,
#     logging_steps=50,
#     report_to=["tensorboard"],
#     load_best_model_at_end=True,
#     metric_for_best_model="wer",
#     greater_is_better=False,
#     push_to_hub=True
# )