In [None]:
from datasets import load_dataset, Audio 

# Path of the CSV file inside the Kaggle input dataset; it is read with pandas
# further down and provides the `audio_path` column that gets rewritten.
csv_path = '/kaggle/input/technical-terms-dataset/data_technical_terms.csv'

In [None]:
import pandas as pd

In [None]:
def _to_absolute_audio_path(x):
    """Prefix a path value taken from the CSV with the dataset's audio directory."""
    return f"/kaggle/input/technical-terms-dataset/audio_output/{x}"


# Read the CSV and point every `audio_path` entry at its location under the
# Kaggle input directory.
df = pd.read_csv(csv_path)
df['audio_path'] = df['audio_path'].apply(_to_absolute_audio_path)

# Write the rewritten table to the working directory (without the index column)
# so it can be loaded as a CSV dataset afterwards.
df.to_csv('updated_file.csv', index=False)

In [None]:
import os.path
# Sanity check: does one specific recording exist at this location?
# NOTE(review): the path contains "audio_output/audio_output/", so the CSV's
# `audio_path` values presumably already start with "audio_output/" — confirm.
os.path.isfile('/kaggle/input/technical-terms-dataset/audio_output/audio_output/ad_10_20241019_162519.wav')

In [None]:
# Load the rewritten CSV as a Hugging Face dataset.
dataset = load_dataset("csv", data_files="./updated_file.csv")

# Treat the `audio_path` column as audio decoded at 16 kHz. The sampling rate
# is given as an integer number of Hz (the original float literal 16e3 made the
# decoded examples carry a float sampling rate).
dataset = dataset.cast_column("audio_path", Audio(sampling_rate=16_000))

In [None]:
# Keep only the "train" entry of the object returned by `load_dataset`.
# NOTE: this rebinds `dataset`, so re-running this cell without first
# re-running the loading cell indexes the already-selected split again.
dataset = dataset['train']

In [None]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech


# Load the pretrained SpeechT5 text-to-speech processor and model.
# The processor is created without a `device` argument: it is not a parameter
# of the processor and would only be forwarded to its sub-components as a
# stray keyword.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
# NOTE(review): `device_map='cuda'` assumes a GPU runtime; the speaker-model
# cell later in the notebook falls back to CPU, this line does not.
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts", device_map='cuda')

In [None]:
# Shortcut to the processor's tokenizer; used for the vocabulary comparison
# and for decoding token ids in later cells.
tokenizer = processor.tokenizer

In [None]:
def extract_all_chars(batch):
    """Collect the distinct characters used by a batch of sentences.

    Args:
        batch: Mapping whose "sentence" entry is an iterable of strings.

    Returns:
        A dict with two single-element lists: "vocab" holds the list of
        distinct characters (in no particular order) and "all_text" holds the
        sentences joined with single spaces.
    """
    joined_text = " ".join(batch["sentence"])
    unique_chars = list(set(joined_text))

    result = {}
    result["vocab"] = [unique_chars]
    result["all_text"] = [joined_text]
    return result

# Run the character extraction once over the whole dataset (batch_size=-1)
# and drop the original columns so only its "vocab" / "all_text" output remains.
vocabs = dataset.map(
    extract_all_chars,
    remove_columns=dataset.column_names,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
)

# Two character/token sets to compare: what the tokenizer knows and what the
# dataset's sentences actually contain.
tokenizer_vocab = set(tokenizer.get_vocab().keys())
dataset_vocab = set(vocabs["vocab"][0])

In [None]:
# Characters that occur in the dataset but are missing from the tokenizer's vocabulary.
dataset_vocab - tokenizer_vocab

In [None]:
import os
import torch
from speechbrain.pretrained import EncoderClassifier

# Run the speaker encoder on the GPU when one is available, otherwise on CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Pretrained SpeechBrain speaker-recognition checkpoint; its files are stored
# under /tmp/<checkpoint name>.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    savedir=os.path.join("/tmp", spk_model_name),
    run_opts={"device": device},
)

def create_speaker_embedding(waveform):
    """Compute a normalized speaker embedding for one waveform.

    Args:
        waveform: Audio samples accepted by ``torch.tensor`` (presumably a 1-D
            array of samples — confirm against the caller).

    Returns:
        A NumPy array: the speaker model's embedding, normalized along
        dimension 2, with singleton dimensions squeezed out.
    """
    # Inference only, so gradient tracking is disabled for the whole computation.
    with torch.no_grad():
        signal = torch.tensor(waveform)
        embedding = speaker_model.encode_batch(signal)
        embedding = torch.nn.functional.normalize(embedding, dim=2)
        result = embedding.squeeze().cpu().numpy()
    return result

In [None]:
def prepare_dataset(example):
    """Turn one raw dataset row into the features used for fine-tuning.

    Args:
        example: A row with a "sentence" text field and an "audio_path" field
            that provides "array" and "sampling_rate" entries.

    Returns:
        The processor output for the row, where "labels" has been replaced by
        its first element and a "speaker_embeddings" entry computed from the
        row's audio array has been added.
    """
    audio = example["audio_path"]
    # The former debug `print(audio)` was removed: when this function is mapped
    # over the dataset it printed every row's decoded audio dict.

    # Tokenize the text and convert the audio into the processor's target features.
    example = processor(
        text=example["sentence"],
        audio_target=audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_attention_mask=False,
    )

    # Keep only the first element of "labels" (presumably this strips a
    # leading batch dimension — confirm).
    example["labels"] = example["labels"][0]

    # Attach the speaker embedding computed from the same recording.
    example["speaker_embeddings"] = create_speaker_embedding(audio["array"])

    return example

In [None]:
# Print the first raw row to inspect its fields before preprocessing.
print(dataset[0])

In [None]:
# Run the preprocessing on a single row first, so its output can be inspected
# before it is applied to the whole dataset.
processed_example = prepare_dataset(dataset[0])

In [None]:
# List the feature names produced by the preprocessing.
list(processed_example.keys())

In [None]:
# Decode the sample's token ids back into text to check the tokenization.
tokenizer.decode(processed_example['input_ids'])

In [None]:
# Shape of the speaker embedding computed for the sample.
processed_example['speaker_embeddings'].shape

In [None]:
from transformers import SpeechT5HifiGan

# Load the pretrained SpeechT5 HiFi-GAN vocoder checkpoint.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Wrap the sample's "labels" feature in a tensor so it can be passed to the vocoder.
spectrogram = torch.tensor(processed_example['labels'])

In [None]:
# Run the vocoder on the sample's target features with gradient tracking disabled.
with torch.no_grad():
    speech = vocoder(spectrogram)

In [None]:
# Import the player under an alias: a bare `Audio` import here would overwrite
# the `datasets.Audio` feature imported in the first cell, which breaks the
# `cast_column(..., Audio(...))` cell if it is re-run in the same kernel.
from IPython.display import Audio as IPythonAudio

# Play back the vocoder output as 16 kHz audio.
IPythonAudio(speech.cpu().numpy(), rate=16000)

In [None]:
# Apply the preprocessing to every row and drop the original columns, leaving
# only the features returned by `prepare_dataset`.
# NOTE: this rebinds `dataset`; the raw rows are no longer reachable through it.
dataset = dataset.map( 
    prepare_dataset,
    remove_columns=dataset.column_names,
)

In [None]:
def is_not_too_long(input_ids, max_length=200):
    """Tell whether a token sequence is short enough to keep.

    Args:
        input_ids: Sized sequence of token ids.
        max_length: Exclusive upper bound on the sequence length. Defaults to
            200, the limit that was previously hard-coded.

    Returns:
        True when ``len(input_ids)`` is strictly less than ``max_length``.
    """
    return len(input_ids) < max_length

# Keep only the rows whose "input_ids" pass the length check.
dataset = dataset.filter(is_not_too_long, input_columns=["input_ids"])

In [None]:
# Hold out 10% of the rows for evaluation. The seed is fixed so the split (and
# therefore the positional row lookups in later cells and the reported
# evaluation loss) is the same on every run.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class TTSDataCollatorWithPadding:
    """Collate preprocessed TTS examples into one padded training batch.

    Each feature dict must provide "input_ids", "labels" and
    "speaker_embeddings".
    """

    # Object whose `pad(input_ids=..., labels=..., return_tensors="pt")` builds
    # the padded batch.
    processor: Any
    # Reduction factor used to trim the padded targets. When left as None it is
    # read from the notebook-level `model.config.reduction_factor`, which was
    # the only behaviour before this field existed.
    reduction_factor: Union[int, None] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad and combine ``features`` into a batch.

        Args:
            features: Examples with "input_ids", "labels" and
                "speaker_embeddings" entries.

        Returns:
            The padded batch with "labels" padding replaced by -100, without
            "decoder_attention_mask", and with "speaker_embeddings" added as a
            tensor.
        """
        input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
        label_features = [{"input_values": feature["labels"]} for feature in features]
        speaker_features = [feature["speaker_embeddings"] for feature in features]

        # Collate the inputs and targets into a batch. The processor handed to
        # this instance is used, not whatever global `processor` happens to exist.
        batch = self.processor.pad(
            input_ids=input_ids,
            labels=label_features,
            return_tensors="pt",
        )

        # Replace padding with -100 to ignore loss correctly.
        batch["labels"] = batch["labels"].masked_fill(
            batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100
        )

        # Not used during fine-tuning.
        del batch["decoder_attention_mask"]

        reduction_factor = self.reduction_factor
        if reduction_factor is None:
            reduction_factor = model.config.reduction_factor

        # Round each target length down to a multiple of the reduction factor
        # and trim the padded labels to the longest rounded length.
        if reduction_factor > 1:
            max_length = max(
                len(feature["input_values"]) - len(feature["input_values"]) % reduction_factor
                for feature in label_features
            )
            batch["labels"] = batch["labels"][:, :max_length]

        # Also add in the speaker embeddings.
        batch["speaker_embeddings"] = torch.tensor(speaker_features)

        return batch

In [None]:
# Collator instance that is handed to the trainer and used for the sample batch.
data_collator = TTSDataCollatorWithPadding(processor=processor)

In [None]:
# Collate three training rows by hand to check the collator's output.
# NOTE: index 20 requires the training split to hold at least 21 rows.
features = [
    dataset["train"][0], 
    dataset["train"][1], 
    dataset["train"][20], 
]

batch = data_collator(features)

In [None]:
# Show the shape of every entry in the collated batch.
{k:v.shape for k,v in batch.items()}

In [None]:
# Switch off the model's `use_cache` config flag before training.
# NOTE(review): presumably required because gradient checkpointing is enabled
# in the training arguments — confirm.
model.config.use_cache = False

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments


# Fine-tuning configuration, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # --- output, logging and publishing ---
    output_dir="final_technical_terms_t5_finetuned",
    report_to=["tensorboard"],
    push_to_hub=True,
    # --- optimisation ---
    learning_rate=5e-4,
    warmup_steps=100,
    optim="adamw_bnb_8bit",
    # Training length is set in epochs; the step-based limit stays disabled.
    # max_steps=8000,
    num_train_epochs=50,
    # --- batching ---
    per_device_train_batch_size=32,
    gradient_accumulation_steps=3,
    per_device_eval_batch_size=2,
    # --- memory / precision ---
    gradient_checkpointing=True,
    fp16=True,
    # --- evaluation and checkpointing ---
    evaluation_strategy="steps",
    eval_steps=200,
    save_steps=1000,
    load_best_model_at_end=True,
    greater_is_better=False,
    label_names=["labels"],
)

In [None]:
from transformers import Seq2SeqTrainer

# Build the trainer from the model, the two dataset splits and the custom collator.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    # Pass the full processor, not just its tokenizer: the trainer saves this
    # object with every checkpoint / hub push, and the feature-extractor
    # config is needed to reload a SpeechT5Processor from the result.
    tokenizer=processor,
)

In [None]:
# Start fine-tuning with the configuration defined in the training arguments.
trainer.train()