**This notebook was run on Kaggle. To reproduce the results or run it yourself, fork my notebook 👉 [tom/text_to_speech](https://www.kaggle.com/code/blessontomjoseph/text-to-speech/edit)**

In [None]:
# Install the extra dependencies this notebook needs.
# speechbrain supplies the EncoderClassifier imported below.
!pip install speechbrain
# transformers is installed straight from its GitHub repository (no version pin).
!pip install git+https://github.com/huggingface/transformers.git
# Upgrade accelerate to the newest available release.
!pip install --upgrade accelerate

In [None]:
from transformers import SpeechT5Processor
from transformers import SpeechT5ForTextToSpeech
from transformers import SpeechT5HifiGan
from datasets import load_dataset,Audio
import soundfile as sf

import os
import torch
import torchaudio
import datasets
import pandas as pd
from torch import manual_seed
from datasets import Dataset

from speechbrain.pretrained import EncoderClassifier
import warnings
warnings.simplefilter("ignore")

In [None]:
# Seed torch's random number generator.
manual_seed(32)


class Config:
    """Notebook-wide settings shared by the cells below."""

    # Sampling rate handed to the processor together with the target audio.
    sampling_rate = 16000
    # Use the GPU when torch can see one, otherwise fall back to the CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"

### Pretrained import

In [None]:
# Load the SpeechT5 text-to-speech processor and model from the same checkpoint.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
# Move the TTS model to the configured device.
model=model.to(Config.device)
# HiFi-GAN vocoder checkpoint; note that it is NOT moved to Config.device here.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan") 
# Pre-computed x-vectors: entry 0 is turned into a tensor with a leading axis
# of size 1. This name is reassigned again in the inference cell.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0) 

### Formatting data

In [None]:
# Lookup table from the abbreviated emotion codes to their full names.
emo_abv = {
    'fru': 'Frustration',
    'exc': 'Excitement',
    'neu': 'Neutral',
    'ang': 'Anger',
    'sad': 'Sadness',
    'hap': 'Happiness',
    'sur': 'Surprise',
    'fea': 'Fear',
    'oth': 'Other',
    'dis': 'Disgust',
}

# Read the transcription table, discard the unused columns and expand the
# emotion codes (an unknown code raises KeyError).
path = "/kaggle/input/iemocap-transcriptions-english-french/iemocapTrans.csv"
df = pd.read_csv(path)
df.drop(columns=['_id', 'translated'], inplace=True)
df['emotion'] = df['emotion'].apply(lambda code: emo_abv[code])
# NOTE(review): the columns are renamed purely by position, so this relies on
# the CSV's column order — confirm against the file.
df.columns = [
    'activation',
    'dominance',
    'emotion',
    'end_time',
    'start_time',
    'audiofile_name',
    'text',
    'valence',
]

audio_path = "/kaggle/input/iemocap-transcriptions-english-french/Iemocap_audio/iemocap_audio/IEMOCAP_wav"


def _load_waveform(file_stem):
    """Load ``<audio_path>/<file_stem>.wav`` and return only the waveform tensor."""
    # torchaudio.load returns a pair; element 0 is kept and the rest dropped.
    # NOTE(review): the sample rate is discarded — presumably the files already
    # match Config.sampling_rate; confirm.
    wav_file = os.path.join(audio_path, file_stem + ".wav")
    return torchaudio.load(wav_file)[0]


df['audio'] = df['audiofile_name'].apply(_load_waveform)
# Append the emotion name to each transcript behind an " [emotion] " marker.
df['text'] = df['text'] + " [emotion] " + df['emotion']

# Keep only the two columns the model needs and wrap them in a Dataset.
df_ = df[['text', 'audio']]
dataset = Dataset.from_dict({
    'text': df_['text'],
    'target': df_['audio'].to_list(),
})

### Tokenizing

In [None]:
# Pretrained SpeechBrain checkpoint used to compute a speaker embedding per utterance.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name, 
    # Run the encoder on the same device setting the TTS model uses.
    run_opts={"device": Config.device}, 
    # Save location for the checkpoint: a directory under /tmp named after the model.
    savedir=os.path.join("/tmp", spk_model_name)
)

def create_speaker_embedding(waveform):
    """Return a normalised speaker embedding for ``waveform`` as a NumPy array.

    ``waveform`` is converted with ``torch.tensor`` and encoded by the
    module-level ``speaker_model``. The result is normalised along axis 2,
    stripped of size-1 axes and moved to the CPU.
    """
    # Inference only: no gradients are needed for the encoder pass.
    with torch.no_grad():
        encoded = speaker_model.encode_batch(torch.tensor(waveform))
        unit_norm = torch.nn.functional.normalize(encoded, dim=2)
    return unit_norm.squeeze().cpu().numpy()

def prepare_dataset(single_data):
    """Turn one ``{'text', 'target'}`` record into model-ready features.

    The processor tokenises the text and converts the target audio into
    labels. A speaker embedding computed from the same audio is stored under
    ``"speaker_embeddings"``. Returns the processor output with those fields.
    """
    target_audio = single_data['target']
    example = processor(
        text=single_data["text"],
        audio_target=target_audio,
        sampling_rate=Config.sampling_rate,
        return_attention_mask=False,
    )
    # The processor returns the labels wrapped in an outer sequence; keep item 0.
    first_label = example["labels"][0]
    example["labels"] = first_label
    example["speaker_embeddings"] = create_speaker_embedding(target_audio)
    return example

# Run prepare_dataset over every record and remove the original columns, so
# only the features it returns remain.
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)

In [None]:
def is_not_too_long(input_ids, max_length: int = 200) -> bool:
    """Return True when ``input_ids`` is strictly shorter than ``max_length``.

    Args:
        input_ids: Any sized sequence (only ``len()`` is used).
        max_length: Exclusive upper bound on the length. It defaults to 200,
            so calling with a single argument behaves as before.
    """
    return len(input_ids) < max_length

# Drop records whose token sequence fails the is_not_too_long length check.
dataset = dataset.filter(is_not_too_long, input_columns=["input_ids"])
# Hold out 10% of the records. The split is seeded explicitly because seeding
# torch does not control this shuffle; without a seed the held-out records
# (and any fixed index into the test split) would differ between runs.
dataset = dataset.train_test_split(test_size=0.1, seed=32)

### Batching

In [None]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class TTSDataCollatorWithPadding:
    """Collate prepared examples into one padded batch.

    Each incoming feature must provide ``"input_ids"``, ``"labels"`` and
    ``"speaker_embeddings"``. The returned batch is whatever
    ``processor.pad`` produces, with ``"decoder_attention_mask"`` removed and
    ``"speaker_embeddings"`` added.
    """

    # Processor whose ``pad`` method batches the token ids and the labels.
    processor: Any

    def __call__(self, features: List[Dict[str, Any]]):
        input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
        label_features = [{"input_values": feature["labels"]} for feature in features]
        speaker_features = [feature["speaker_embeddings"] for feature in features]

        # Pad with the processor this collator was constructed with, instead
        # of silently relying on a module-level ``processor`` variable.
        batch = self.processor.pad(
            input_ids=input_ids,
            labels=label_features,
            return_tensors="pt",
        )

        # Overwrite padded label positions with -100 (presumably the value the
        # loss ignores — confirm against the model docs).
        padding_mask = batch.decoder_attention_mask.unsqueeze(-1).ne(1)
        batch["labels"] = batch["labels"].masked_fill(padding_mask, -100)
        del batch["decoder_attention_mask"]

        # NOTE: the reduction factor is read from the module-level ``model``.
        reduction_factor = model.config.reduction_factor
        if reduction_factor > 1:
            # Round each label length down to a multiple of the reduction
            # factor, then trim the batch to the longest rounded length.
            max_length = max(
                len(feature["input_values"]) - len(feature["input_values"]) % reduction_factor
                for feature in label_features
            )
            batch["labels"] = batch["labels"][:, :max_length]
        batch["speaker_embeddings"] = torch.tensor(speaker_features)

        return batch
    
# Collator instance built around the SpeechT5 processor; it is passed to the trainer below.
data_collator = TTSDataCollatorWithPadding(processor=processor)

### Training

In [None]:
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer

# Fine-tuning hyper-parameters.
training_args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working/out_dir",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    warmup_steps=500,
    # Training length is controlled by max_steps alone. num_train_epochs is
    # deliberately not set: a positive max_steps overrides it, so specifying
    # both was misleading.
    max_steps=4000,
    gradient_checkpointing=True,
    # NOTE(review): recent transformers releases rename this argument to
    # eval_strategy — confirm against the installed version.
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    # Checkpoints are saved and evaluated on the same 1000-step schedule.
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    greater_is_better=False,
    label_names=["labels"],
)


# Wire the model, the train/test splits and the collator into a Seq2SeqTrainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    tokenizer=processor.tokenizer)

# Start fine-tuning.
trainer.train()

### Infer

In [None]:
from IPython.display import Audio

# Take the speaker embedding of one held-out example to use as the inference voice.
# NOTE(review): index 304 is hard-coded and assumes the test split has at
# least 305 rows — confirm.
example = dataset["test"][304]
# Add a leading axis of size 1, then save the tensor to disk for later reuse.
speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
torch.save(speaker_embeddings,'speaker_embeddings.pt')

def gen(prompt,emo,model,speaker_embeddings):
    """Synthesize ``prompt`` spoken with emotion ``emo`` and return a playable clip.

    Args:
        prompt: Text to speak.
        emo: Emotion name appended to the prompt behind the " [emotion] " marker
            (the same format used when building the training text).
        model: Model exposing ``generate_speech``; it may live on any device.
        speaker_embeddings: Speaker embedding tensor passed to ``generate_speech``.

    Returns:
        An ``IPython.display.Audio`` object wrapping the vocoder output.
    """
    text = prompt + " [emotion] " + emo
    inputs = processor(text=text, return_tensors="pt")

    # Follow the model's own device instead of assuming it sits on the CPU,
    # so the function also works with a model that is still on the GPU.
    device = model.device
    with torch.no_grad():
        spectrogram = model.generate_speech(
            inputs["input_ids"].to(device),
            speaker_embeddings.to(device),
        )
        # The vocoder may live on a different device than the model.
        speech = vocoder(spectrogram.to(vocoder.device))
    # Play back at the same sampling rate used when preparing the data.
    return Audio(speech.cpu().numpy(), rate=Config.sampling_rate)

In [None]:
# Synthesize a sample sentence with the fine-tuned model (moved to the CPU)
# and the speaker embedding reloaded from disk.
audio=gen('where are you going you think','Anger',trainer.model.to('cpu'),torch.load('speaker_embeddings.pt'))
# Bare expression as the cell's last line so the notebook displays the audio object.
audio