<a href="https://colab.research.google.com/github/aalizelau/Text-to-Speech-Fine-Tuning/blob/main/StyleTTS_FT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Prepare Dataset

### Install packages

In [None]:
# Remove any already-installed PyTorch packages so the pinned build installed in the
# next cell starts from a clean slate. `%pip` (rather than `!pip`) guarantees the
# command targets the environment of the running kernel.
%pip uninstall -y torch torchvision torchaudio

In [None]:
# Install pinned torch / torchaudio 2.3.1 wheels from the CUDA 11.8 index.
# `%pip` (rather than `!pip`) guarantees the install targets the running kernel's environment.
%pip install torch==2.3.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu118

In [None]:
# WhisperX (installed from its git repository) produces the SRT transcripts below;
# the remaining packages are helper libraries, presumably used by the
# prepare_dataset scripts - confirm against those scripts.
# `%pip` (rather than `!pip`) guarantees the install targets the running kernel's environment.
%pip install git+https://github.com/m-bain/whisperx.git
%pip install phonemizer pydub pysrt tqdm

### Download Tool Sets

In [None]:
# Clone the dataset-preparation scripts into the current working directory.
# The following cell refers to /content/Text-to-Speech-Fine-Tuning, so this is
# expected to run from /content (the Colab default).
!git clone https://github.com/aalizelau/Text-to-Speech-Fine-Tuning.git

In [None]:
# Use the %cd magic, not !cd: `!cd` runs in a throwaway subshell and leaves the
# kernel's working directory unchanged, which would break the relative paths
# (audio/*.wav, srtsegmenter.py, ...) used by the cells below.
%cd /content/Text-to-Speech-Fine-Tuning/prepare_dataset

### Processing Audio Files

Before you start, upload your WAV audio file(s) to the `audio` directory!

In [None]:
# Create one sentence-level SRT transcript per WAV file found in the audio directory.
import glob

# Sorted so the files are processed in a stable, reproducible order.
wav_files = sorted(glob.glob("audio/*.wav"))
if not wav_files:
    # Make the "nothing to do" case visible instead of silently skipping the loop.
    print("No WAV files found in audio/ - upload your recordings before running this cell.")

for wav_path in wav_files:
    # --align_model is the full WhisperX option name; the previous spelling
    # (--align_mode) was only accepted through argparse prefix matching.
    !whisperx "{wav_path}" --model large-v3 --output_format srt \
    --condition_on_previous_text True --max_line_width 250 \
    --max_line_count 1 --segment_resolution sentence \
    --align_model WAV2VEC2_ASR_LARGE_LV60K_960H

In [None]:
# Create segmented audio clips.
# Runs srtsegmenter.py from the current working directory; presumably it cuts the
# WAV files according to the SRT timestamps generated above - confirm in the script.
!python srtsegmenter.py

In [None]:
# Append a short stretch of silence to the end of each audio clip.
# The padding length is decided inside add_padding.py (no arguments are passed here).
!python add_padding.py

In [None]:
# Install the espeak system package (presumably the phonemizer backend used by
# phonemized.py - confirm). `-y` answers apt's confirmation prompt so the cell
# cannot stall or abort when run non-interactively (e.g. "Run all").
!apt-get update && apt-get install -y espeak

In [None]:
# Create the train_list.txt and val_list.txt files.
# --language en-us requests US English; presumably forwarded to the phonemizer
# backend inside phonemized.py - confirm in the script.
!python phonemized.py --language en-us

# 2. Fine-tuning with StyleTTS2

### Install packages and download models

In [None]:
%%shell
# Fetch the StyleTTS2 code, install its Python dependencies, and download the
# pretrained StyleTTS2-LibriTTS checkpoint repository from Hugging Face.
# NOTE: this cell runs in its own shell process, so the `cd` below only applies to
# the commands in this cell; it does not change the notebook kernel's working
# directory for later cells.
git clone https://github.com/yl4579/StyleTTS2.git
cd StyleTTS2
pip install SoundFile munch pydub pyyaml librosa nltk matplotlib accelerate transformers phonemizer einops einops-exts tqdm typing-extensions git+https://github.com/resemble-ai/monotonic_align.git
# git-lfs is required here so the large model files are downloaded, not just pointers.
git-lfs clone https://huggingface.co/yl4579/StyleTTS2-LibriTTS
# Move the downloaded Models folder into the StyleTTS2 root (presumably where the
# fine-tuning config looks for the pretrained checkpoint - confirm in config_ft.yml).
mv StyleTTS2-LibriTTS/Models .

In [None]:
# Drop whatever torchvision build is currently installed so the pinned version
# installed in the next cell replaces it cleanly.
# `%pip` (rather than `!pip`) guarantees the command targets the running kernel's environment.
%pip uninstall -y torchvision

In [None]:
# Install the torchvision release that matches the pinned torch 2.3.1 / CUDA 11.8 build.
# `%pip` (rather than `!pip`) guarantees the install targets the running kernel's environment.
%pip install torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118

### Change the fine-tuning config

In [None]:
# Adjust the fine-tuning config in place. The values below depend on your GPU
# resources: lower batch_size / max_len further if you run out of memory.
import os

import yaml

config_path = "Configs/config_ft.yml"

# The `cd StyleTTS2` in the %%shell setup cell only affects that cell's own
# process, so the kernel may still be in the parent directory. Enter the
# checkout if needed so the relative paths used here and by the training cell
# resolve. The guard keeps this cell safe to re-run.
if not os.path.exists(config_path) and os.path.isdir("StyleTTS2"):
    os.chdir("StyleTTS2")

# Read through a context manager so the file handle is closed before rewriting.
with open(config_path) as infile:
    config = yaml.safe_load(infile)

config['data_params']['root_path'] = "Data/wavs"

config['batch_size'] = 2 # not enough RAM
config['max_len'] = 100 # not enough RAM
# NOTE(review): presumably set high so the joint-training stage starts late or
# not at all within the configured number of epochs - confirm against train_finetune.py.
config['loss_params']['joint_epoch'] = 110

with open(config_path, 'w') as outfile:
  yaml.dump(config, outfile, default_flow_style=True)

### Start fine-tuning

In [None]:
# Launch fine-tuning with the config edited in the previous cell.
# Both paths are relative, so the working directory must be the one that
# contains train_finetune.py and the Configs/ folder.
!python train_finetune.py --config_path ./Configs/config_ft.yml

# 3. Inference

In [None]:
# Text to synthesize in the inference cell below; replace it with your own sentence(s).
# The triple-quoted literal keeps the trailing newline as part of the string.
text = '''Maltby and Company would issue warrants on them deliverable to the importer, and the goods were then passed to be stored in neighboring warehouses.
'''

In [None]:
# Upload a reference audio clip and point `path` at it.
path = "Data/wavs/YOUR_REFERENCE_AUDIO.wav"
# This style vector ref_s can be saved as a parameter together with the model weights.
# NOTE(review): compute_style is not imported or defined in any visible cell of this
# notebook; it must already be in scope or this line raises NameError. Presumably it
# lives alongside `inference` in the inference module - confirm and import it.
ref_s = compute_style(path)

In [None]:
import time

import IPython.display as ipd
from inference import inference

# Time a single synthesis call so the real-time factor (RTF) can be reported.
started_at = time.time()
wav = inference(text, ref_s, alpha=0.9, beta=0.9, diffusion_steps=10, embedding_scale=1)
elapsed = time.time() - started_at

# RTF = processing time / audio duration; the duration is computed assuming
# 24000 samples per second, the same rate used for playback below.
audio_seconds = len(wav) / 24000
rtf = elapsed / audio_seconds
print(f"RTF = {rtf:5f}")

# Render an inline audio player for the generated waveform.
display(ipd.Audio(wav, rate=24000, normalize=False))