# Finetuning FastPitch for a new speaker

In this tutorial, we will finetune a single-speaker FastPitch (with alignment) model on a limited amount of a new speaker's data. We cover two finetuning techniques: 
1. We finetune the model parameters only on new speaker's text and speech pairs; 
2. We add a learnable speaker embedding layer to the model, and finetune on a mixture of original speaker's and new speaker's data.

We will first prepare filelists containing the audio paths and text of the samples on which we wish to finetune the model, then generate and run a training command to finetune FastPitch on 5 minutes of data, and finally synthesize audio from the trained checkpoint.

## Creating filelists for training

We will first create filelists of the audio on which we wish to finetune the FastPitch model. We will create two kinds of filelists: one that contains only the audio files of the new speaker, and one that mixes the audio files of the new speaker with those of the speaker used for training the pre-trained FastPitch checkpoint.

<div class="alert alert-block alert-warning">
    WARNING: This notebook requires downloading the HiFiTTS dataset, which is 41GB. We plan on reducing the download size.
</div>

In [None]:
import random
import os
import json
import torch
import IPython.display as ipd
from matplotlib.pyplot import imshow
from matplotlib import pyplot as plt

# User configuration: replace each <...> placeholder below with a quoted path string before running this cell.
# data_dir is where the speaker manifests are read from and the directory that audio paths are joined with.
data_dir = <ADD_PATH_TO_DIRECTORY_CONTAINING_HIFIGAN_DATASET> # Download the HiFiTTS dataset from https://www.openslr.org/109/. Specify path to Hi_Fi_TTS_v_0
# Directory where the filelists generated in this notebook are written.
filelist_dir = <ADD_PATH_TO_DIRECTORY_IN_WHICH_WE_WISH_TO_SAVE_FILELISTS> # will be created if it does not exist
# Base directory passed to the training command as the parent of each experiment directory.
exp_base_dir = <ADD_PATH_TO_BASE_EXPERIMENT_DIRECTORY_FOR_CHECKPOINTS_AND_LOGS> # will be created if it does not exist


def make_sub_file_list(speaker_id, clean_other, split, num_samples, total_duration_mins, seed=42):
    """
    Creates a subset of the data of a HiFiTTS speaker and saves it as a filelist in filelist_dir.

    The manifest "<speaker_id>_manifest_<clean_other>_<split>.json" is read from data_dir, shuffled
    with the given seed and then truncated according to num_samples or total_duration_mins. If both
    are None, all records are kept. Audio paths in the saved filelist are joined with data_dir.

    Arguments:
    speaker_id -- speaker id of the new HiFiTTS speaker
    clean_other -- "clean" or "other" depending on type of data of new HiFiTTS speaker
    split -- "train" or "dev"
    num_samples -- Number samples of new speaker (set None if specifying total_duration_mins)
    total_duration_mins -- Total duration of new speaker's data (set None if specifying num_samples)
    seed -- seed passed to random.seed before shuffling, so that the same subset is selected on every call

    Raises:
    NotImplementedError -- if both num_samples and total_duration_mins are specified
    """
    if num_samples is not None and total_duration_mins is not None:
        raise NotImplementedError("Specify either num_samples or total_duration_mins, not both.")

    def _to_wav_relative(path):
        # str.find returns -1 when "wav/" is absent; slicing from -1 would keep only the
        # last character of the path, so such paths are left untouched instead.
        wav_idx = path.find("wav/")
        return path[wav_idx:] if wav_idx >= 0 else path

    file_list_name = "{}_manifest_{}_{}.json".format(speaker_id, clean_other, split)
    with open(os.path.join(data_dir, file_list_name), 'r') as f:
        # One JSON record per line; blank lines are skipped.
        all_records = [json.loads(line) for line in f if line.strip()]
    for r in all_records:
        r['audio_filepath'] = _to_wav_relative(r['audio_filepath'])
    random.seed(seed)
    random.shuffle(all_records)

    if num_samples is not None:
        sub_records = all_records[:num_samples]
        fname_extension = "ns_{}".format(num_samples)
    elif total_duration_mins is not None:
        sub_record_duration = 0.0
        sub_records = []
        for r in all_records:
            # Stop before adding the record that would push the subset past the requested duration.
            sub_record_duration += r['duration']
            if sub_record_duration > total_duration_mins * 60.0:
                print ("Duration reached {} mins using {} records".format(total_duration_mins, len(sub_records)))
                break
            sub_records.append(r)
        fname_extension = "dur_{}_mins".format(int(round(total_duration_mins)))
    else:
        sub_records = all_records
        fname_extension = "ns_all"
    print ("num sub records", len(sub_records))

    os.makedirs(filelist_dir, exist_ok=True)

    target_fp = os.path.join(filelist_dir, "{}_mainifest_{}_{}_local.json".format(speaker_id, split, fname_extension))
    with open(target_fp, 'w') as f:
        for record in sub_records:
            # Write a copy so the in-memory records keep their data_dir-relative paths.
            out_record = dict(record)
            out_record['audio_filepath'] = os.path.join(data_dir, record['audio_filepath'])
            f.write(json.dumps(out_record) + "\n")

def mix_file_list(speaker_id, clean_other, split, num_samples, total_duration_mins, original_speaker_id, original_clean_other, n_orig=None, seed=42):
    """
    Creates a mixed dataset of new and original speaker. num_samples or total_duration_mins specifies the amount
    of new speaker data to be used and n_orig specifies the number of original speaker samples. Creates a balanced
    dataset with alternating original and new speaker samples: every original speaker record is followed by one
    new speaker record, and the new speaker records are cycled when there are fewer of them.
    Saves the filelist in the filelist_dir.

    Original speaker records are written with 'speaker' set to 0 and new speaker records with 'speaker' set to 1.
    The original speaker's manifest is always read from its "train" split.

    Arguments:
    speaker_id -- speaker id of the new HiFiTTS speaker
    clean_other -- "clean" or "other" depending on type of data of new HiFiTTS speaker
    split -- "train" or "dev" (split of the new speaker's manifest)
    num_samples -- Number samples of new speaker (set None if specifying total_duration_mins)
    total_duration_mins -- Total duration of new speaker's data (set None if specifying num_samples)
    original_speaker_id -- speaker id of the original HiFiTTS speaker (on which FastPitch was trained)
    original_clean_other -- "clean" or "other" depending on type of data of original HiFiTTS speaker
    n_orig -- Number of samples of old speaker to be mixed with new speaker (None uses all of them, unshuffled)
    seed -- seed passed to random.seed before each shuffle, so that the same records are selected on every call

    Raises:
    NotImplementedError -- if both num_samples and total_duration_mins are specified
    ValueError -- if no new speaker record is selected although there are original speaker records to pair
    """
    if num_samples is not None and total_duration_mins is not None:
        raise NotImplementedError("Specify either num_samples or total_duration_mins, not both.")

    def _to_wav_relative(path):
        # str.find returns -1 when "wav/" is absent; slicing from -1 would keep only the
        # last character of the path, so such paths are left untouched instead.
        wav_idx = path.find("wav/")
        return path[wav_idx:] if wav_idx >= 0 else path

    def _read_manifest(manifest_name):
        # One JSON record per line; blank lines are skipped.
        with open(os.path.join(data_dir, manifest_name), 'r') as f:
            records = [json.loads(line) for line in f if line.strip()]
        for r in records:
            r['audio_filepath'] = _to_wav_relative(r['audio_filepath'])
        return records

    all_records = _read_manifest("{}_manifest_{}_{}.json".format(speaker_id, clean_other, split))
    original_all_records = _read_manifest(
        "{}_manifest_{}_{}.json".format(original_speaker_id, original_clean_other, "train")
    )

    random.seed(seed)
    if n_orig is not None:
        random.shuffle(original_all_records)
        original_all_records = original_all_records[:n_orig]

    # Re-seed so the new speaker subset does not depend on whether the originals were shuffled.
    random.seed(seed)
    random.shuffle(all_records)

    if num_samples is not None:
        sub_records = all_records[:num_samples]
        fname_extension = "ns_{}".format(num_samples)
    elif total_duration_mins is not None:
        sub_record_duration = 0.0
        sub_records = []
        for r in all_records:
            # Stop before adding the record that would push the subset past the requested duration.
            sub_record_duration += r['duration']
            if sub_record_duration > total_duration_mins * 60.0:
                print ("Duration reached {} mins using {} records".format(total_duration_mins, len(sub_records)))
                break
            sub_records.append(r)
        fname_extension = "dur_{}_mins".format(int(round(total_duration_mins)))
    else:
        sub_records = all_records
        fname_extension = "ns_all"

    print(len(original_all_records))

    if original_all_records and not sub_records:
        raise ValueError(
            "No new speaker records were selected for speaker {}; cannot mix them with the original "
            "speaker's records.".format(speaker_id)
        )

    # Serialize every new speaker record once. The same record is reused many times below, and
    # rebuilding its path on each reuse would corrupt it whenever data_dir itself contains "wav/".
    new_speaker_lines = []
    for record in sub_records:
        new_speaker_record = dict(record)
        new_speaker_record['audio_filepath'] = os.path.join(data_dir, record['audio_filepath'])
        new_speaker_record['speaker'] = 1
        new_speaker_lines.append(json.dumps(new_speaker_record) + "\n")

    os.makedirs(filelist_dir, exist_ok=True)

    target_fp = os.path.join(filelist_dir, "{}_mainifest_{}_{}_local_mix_{}.json".format(speaker_id, split, fname_extension, original_speaker_id))
    with open(target_fp, 'w') as f:
        for ridx, record in enumerate(original_all_records):
            original_record = dict(record)
            original_record['audio_filepath'] = os.path.join(data_dir, record['audio_filepath'])
            original_record['speaker'] = 0
            f.write(json.dumps(original_record) + "\n")
            # Cycle through the (usually much smaller) new speaker subset.
            f.write(new_speaker_lines[ridx % len(new_speaker_lines)])

In [None]:
# 5 minutes of "clean" training data of the new speaker (id 92)
make_sub_file_list(92, "clean", "train", None, 5)
# The same 5 minutes of speaker 92, mixed with 5000 "other" training samples of the original speaker (id 8051)
mix_file_list(92, "clean", "train", None, 5, 8051, "other", n_orig=5000)
# All "clean" dev records of speaker 92 (neither num_samples nor total_duration_mins is given)
make_sub_file_list(92, "clean", "dev", None, None)

## Finetuning the model on filelists

To finetune the FastPitch model on the filelists created above, we use the `examples/tts/fastpitch2_finetune.py` script to train the models with the `fastpitch_align_44100.yaml` configuration. This configuration file has been defined for the 44100Hz audio of the HiFiTTS dataset. The function `generate_training_command` in this notebook can be used to generate a training command for a given speaker and finetuning technique.

In [None]:
# pitch statistics of the new speakers, keyed by HiFiTTS speaker id
# These can be computed from the pitch contours extracted using librosa yin
# Finetuning can still work without these, but we get better results using speaker specific pitch stats
# The four values of a speaker are passed to training as model.pitch_avg / pitch_std / pitch_fmin / pitch_fmax
# NOTE(review): the values are presumably in Hz -- confirm against the pitch extraction settings
pitch_stats = {
    92 : {
        'mean' : 214.5, # female speaker
        'std' : 30.9,
        'fmin' : 80,
        'fmax' : 512
    },
    6097 : {
        'mean' : 121.9, # male speaker
        'std' : 23.1,
        'fmin' : 30,
        'fmax' : 512
    }
}


def generate_training_command(new_speaker_id, duration_mins, mixing_enabled, original_speaker_id, ckpt, use_new_pitch_stats=False):
    """
    Generates the training command string to be run from the NeMo/ directory. Assumes we have created the finetuning filelists
    using the instructions given above.

    Arguments:
    new_speaker_id -- speaker id of the new HiFiTTS speaker
    duration_mins -- total minutes of the new speaker data (same as that used for creating the filelists);
                     an epoch estimate is only available for 5, 30 and 60 minutes
    mixing_enabled -- True or False depending on whether we want to mix the original speaker data or not
    original_speaker_id -- speaker id of the original HiFiTTS speaker
    ckpt -- Path to pretrained FastPitch checkpoint (a ".nemo" file or a PyTorch Lightning checkpoint)
    use_new_pitch_stats -- whether to use pitch_stats dictionary given above or not

    Returns:
    Training command string

    Raises:
    ValueError -- if there is no epoch estimate for duration_mins
    KeyError -- if use_new_pitch_stats is set but pitch_stats has no entry for new_speaker_id
    """
    # duration (mins) -> (estimated epochs without mixing, divisor applied to them when mixing).
    # With mixing the filelist holds one new speaker record per original speaker record, so each
    # epoch is much longer and far fewer epochs are needed.
    epoch_estimates = {5: (1000, 50), 30: (300, 10), 60: (150, 5)}
    if duration_mins not in epoch_estimates:
        raise ValueError(
            "No epoch estimate for duration_mins={!r}; supported values are {}.".format(
                duration_mins, sorted(epoch_estimates)
            )
        )
    base_epochs, mixing_divisor = epoch_estimates[duration_mins]
    if mixing_enabled:
        max_epochs = int(base_epochs / mixing_divisor + 1)
    else:
        max_epochs = int(base_epochs)

    if use_new_pitch_stats and new_speaker_id not in pitch_stats:
        raise KeyError("No pitch statistics available for speaker {!r} in pitch_stats.".format(new_speaker_id))

    # The checkpoint format decides which init_from_* override is added to the command.
    if ckpt.endswith(".nemo"):
        ckpt_arg_name = "init_from_nemo_model"
    else:
        ckpt_arg_name = "init_from_ptl_ckpt"

    # File names must match the ones written by make_sub_file_list / mix_file_list.
    val_dataset = "{}_mainifest_dev_ns_all_local.json".format(new_speaker_id)
    if not mixing_enabled:
        train_dataset = "{}_mainifest_train_dur_{}_mins_local.json".format(new_speaker_id, duration_mins)
        prior_folder = os.path.join(data_dir, "Priors{}".format(new_speaker_id))
        exp_dir = "{}_to_{}_no_mixing_{}_mins".format(original_speaker_id, new_speaker_id, duration_mins)
        n_speakers = 1
    else:
        train_dataset = "{}_mainifest_train_dur_{}_mins_local_mix_{}.json".format(new_speaker_id, duration_mins, original_speaker_id)
        prior_folder = os.path.join(data_dir, "Priors_{}_mix_{}".format(new_speaker_id, original_speaker_id))
        exp_dir = "{}_to_{}_mixing_{}_mins".format(original_speaker_id, new_speaker_id, duration_mins)
        n_speakers = 2
    train_dataset = os.path.join(filelist_dir, train_dataset)
    val_dataset = os.path.join(filelist_dir, val_dataset)
    exp_dir = os.path.join(exp_base_dir, exp_dir)

    config_name = "fastpitch_align_44100.yaml"

    training_command = "python examples/tts/fastpitch2_finetune.py --config-name={} train_dataset={} validation_datasets={} +{}={} trainer.max_epochs={} trainer.check_val_every_n_epoch=1 prior_folder={} model.train_ds.dataloader_params.batch_size=24 model.validation_ds.dataloader_params.batch_size=24 exp_manager.exp_dir={} model.n_speakers={}".format(
        config_name, train_dataset, val_dataset, ckpt_arg_name, ckpt, max_epochs, prior_folder, exp_dir, n_speakers)
    if use_new_pitch_stats:
        speaker_stats = pitch_stats[new_speaker_id]
        training_command += " model.pitch_avg={} model.pitch_std={} model.pitch_fmin={} model.pitch_fmax={}".format(
            speaker_stats['mean'],
            speaker_stats['std'],
            speaker_stats['fmin'],
            speaker_stats['fmax']
        )
    # Fixed learning rate; "~model.optim.sched" removes the scheduler entry from the config.
    training_command += " model.optim.lr=2e-4 ~model.optim.sched"
    return training_command
    

In [None]:
# Settings of the finetuning run; they must match the filelists created above.
new_speaker_id = 92
duration_mins = 5
mixing = False
original_speaker_id = 8051
# Replace the <...> placeholder with a quoted path to a ".nemo" file or a PyTorch Lightning checkpoint.
ckpt_path = <PATH_TO_PRETRAINED_FASTPITCH_CHECKPOINT>
# The last argument (True) adds the speaker specific pitch statistics to the command.
print(generate_training_command(new_speaker_id, duration_mins, mixing, original_speaker_id, ckpt_path, True))

The generated command should look something like this. We can of course tweak things like the number of epochs or the learning rate if we like.

`python examples/tts/fastpitch2_finetune.py --config-name=fastpitch_align_44100.yaml train_dataset=filelists/92_mainifest_train_dur_5_mins_local.json validation_datasets=filelists/92_mainifest_dev_ns_all_local.json +init_from_nemo_model=PreTrainedModels/FastPitch.nemo trainer.max_epochs=1000 trainer.check_val_every_n_epoch=1 prior_folder=Hi_Fi_TTS_v_0/Priors92 model.train_ds.dataloader_params.batch_size=24 model.validation_ds.dataloader_params.batch_size=24 exp_manager.exp_dir=FinetuningDemo/8051_to_92_no_mixing_5_mins model.n_speakers=1 model.pitch_avg=214.5 model.pitch_std=30.9 model.pitch_fmin=80 model.pitch_fmax=512 model.optim.lr=2e-4 ~model.optim.sched`

^ Run the above command from the terminal from the `NeMo/` directory to start finetuning a model. 

## Synthesize samples from finetuned checkpoints

Once we have finetuned our FastPitch model, we can synthesize the audio samples for given text using the following inference steps. We use a HiFiGAN vocoder trained on multiple speakers, get the trained checkpoint path for our trained model and synthesize audio for a given text as follows.

In [None]:
from nemo.collections.tts.models import HifiGanModel
from nemo.collections.tts.models import FastPitchModel

# Replace the <...> placeholder with a quoted path to the pretrained HiFiGAN checkpoint.
hifigan_ckpt_path =  <PATH_TO_PRETRAINED_HIFIGAN_CHECKPOINT>
vocoder = HifiGanModel.load_from_checkpoint(hifigan_ckpt_path)
# Switch to evaluation mode and move the vocoder to the GPU (a CUDA device is required).
vocoder.eval().cuda()

In [None]:
def infer(spec_gen_model, vocoder_model, str_input, speaker = None):
    """
    Synthesizes spectrogram and audio from a text string given a spectrogram synthesis and vocoder model.

    Arguments:
    spec_gen_model -- Instance of FastPitch model
    vocoder_model -- Instance of a vocoder model (HiFiGAN in our case)
    str_input -- Text input for the synthesis
    speaker -- Speaker number (in the case of a multi-speaker model -- in the mixing case)

    Returns:
    spectrogram, waveform of the synthesized audio. Tensors are converted to numpy arrays and the
    batch dimension of a 3-dimensional spectrogram is dropped.
    """
    with torch.no_grad():
        parsed = spec_gen_model.parse(str_input)
        if speaker is not None:
            # Put the speaker index on the same device as the parsed tokens instead of always
            # using the default CUDA device, so CPU inference and other GPUs work as well.
            # Falls back to "cuda" when the parsed tokens are not a tensor.
            target_device = parsed.device if isinstance(parsed, torch.Tensor) else "cuda"
            speaker = torch.tensor([speaker]).long().to(target_device)
        spectrogram = spec_gen_model.generate_spectrogram(tokens=parsed, speaker=speaker)
        audio = vocoder_model.convert_spectrogram_to_audio(spec=spectrogram)

    if spectrogram is not None:
        if isinstance(spectrogram, torch.Tensor):
            spectrogram = spectrogram.detach().to('cpu').numpy()
        if len(spectrogram.shape) == 3:
            # Drop the batch dimension so the spectrogram can be plotted directly.
            spectrogram = spectrogram[0]
    if isinstance(audio, torch.Tensor):
        audio = audio.detach().to('cpu').numpy()
    return spectrogram, audio

def get_best_ckpt(experiment_base_dir, new_speaker_id, duration_mins, mixing_enabled, original_speaker_id):
    """
    Gives the model checkpoint paths of an experiment we ran.

    The experiment directory is searched recursively for ".ckpt" files. The validation loss of a checkpoint
    is parsed from its file name (the text between "v_loss=" and "-epoch"). Checkpoints whose name does not
    contain a parsable validation loss are left out of the sorted list instead of aborting the search.

    Arguments:
    experiment_base_dir -- Base experiment directory (specified on top of this notebook as exp_base_dir)
    new_speaker_id -- Speaker id of new HiFiTTS speaker we finetuned FastPitch on
    duration_mins -- total minutes of the new speaker data
    mixing_enabled -- True or False depending on whether we want to mix the original speaker data or not
    original_speaker_id -- speaker id of the original HiFiTTS speaker

    Returns:
    List of (validation error, checkpoint path) tuples sorted by validation error, Last checkpoint path
    (None if no checkpoint with "last" in its file name was found)
    """
    if not mixing_enabled:
        exp_dir = "{}/{}_to_{}_no_mixing_{}_mins".format(experiment_base_dir, original_speaker_id, new_speaker_id, duration_mins)
    else:
        exp_dir = "{}/{}_to_{}_mixing_{}_mins".format(experiment_base_dir, original_speaker_id, new_speaker_id, duration_mins)

    ckpt_candidates = []
    last_ckpt = None
    for root, _dirs, files in os.walk(exp_dir):
        for file in files:
            if not file.endswith(".ckpt"):
                continue
            ckpt_path = os.path.join(root, file)
            if "last" in file:
                last_ckpt = ckpt_path
            name_parts = file.split("v_loss=")
            if len(name_parts) < 2:
                # No validation loss in the file name, so the checkpoint cannot be ranked.
                continue
            try:
                val_error = float(name_parts[1].split("-epoch")[0])
            except ValueError:
                continue
            ckpt_candidates.append((val_error, ckpt_path))
    ckpt_candidates.sort()

    return ckpt_candidates, last_ckpt

Specify the speaker id, duration mins and mixing variable to find the relevant checkpoint from the exp_base_dir and compare the synthesized audio with validation samples of the new speaker.

In [None]:
# Settings identifying the finetuning experiment whose checkpoint we want to load.
new_speaker_id = 92
duration_mins = 5
mixing = False
original_speaker_id = 8051


# The sorted checkpoint list is ignored here; only the checkpoint with "last" in its name is used.
# last_ckpt is None when no such checkpoint exists in the experiment directory.
_ ,last_ckpt = get_best_ckpt(exp_base_dir, new_speaker_id, duration_mins, mixing, original_speaker_id)
print(last_ckpt)

spec_model = FastPitchModel.load_from_checkpoint(last_ckpt)
spec_model.eval().cuda()
# In the mixed filelist the new speaker's records are written with speaker index 1.
_speaker=None
if mixing:
    _speaker = 1

# Number of validation records to compare against.
num_val = 2

# Read the first num_val records of the new speaker's dev filelist.
manifest_path = os.path.join(filelist_dir, "{}_mainifest_dev_ns_all_local.json".format(new_speaker_id))
val_records = []
with open(manifest_path, "r") as f:
    for i, line in enumerate(f):
        val_records.append( json.loads(line) )
        if len(val_records) >= num_val:
            break

# For each record: play the real recording, then the audio synthesized from the same text,
# and plot the synthesized spectrogram.
for val_record in val_records:
    print ("Real validation audio")
    ipd.display(ipd.Audio(val_record['audio_filepath'], rate=44100))
    print ("SYNTHESIZED FOR -- Speaker: {} | Dataset size: {} mins | Mixing:{} | Text: {}".format(new_speaker_id, duration_mins, mixing, val_record['text']))
    spec, audio = infer(spec_model, vocoder, val_record['text'], speaker = _speaker)
    ipd.display(ipd.Audio(audio, rate=44100))
    %matplotlib inline
    # NOTE(review): the spectrogram is plotted unconditionally; spec is assumed not to be None here.
    imshow(spec, origin="lower", aspect = "auto")
    plt.show()