In [1]:
import time

import torchaudio
from phonemizer.backend.espeak.wrapper import EspeakWrapper

from inference.tts.spec_denoiser import SpecDenoiserInfer, data_preprocess
from utils.commons.hparams import set_hparams

  torchaudio.set_audio_backend("soundfile")


# User defined directories

In [2]:
# Passed to SpecDenoiserInfer as the binary data directory (Windows-style separators).
binary_data_directory = 'data\\processed\\binary\\libritts'
# Location of the eSpeak NG shared library handed to EspeakWrapper.set_library.
# Raw string: the path contains backslashes (\P, \e, \l) that are invalid escape
# sequences in a normal string literal and trigger a SyntaxWarning on newer Pythons.
# The resulting value is unchanged.
Espeak_dll_directory = r'C:\Program Files\eSpeak NG\libespeak-ng.dll'
# Point the phonemizer eSpeak wrapper at the shared library configured above.
EspeakWrapper.set_library(Espeak_dll_directory)

# Passed to SpecDenoiserInfer as the whisperX model directory.
whisperX_model_directory = 'facebook'

# Montreal Forced Aligner resources; in this notebook they are only handed to
# data_preprocess when use_MFA is True.
mfa_dictionary_path = 'data/processed/libritts/mfa_dict.txt'
mfa_acoustic_model_path = 'data/processed/libritts/mfa_model.zip'

# Where to save the output .wav files.
inference_output_dir = 'inference/out'

# Checkpoint paths: the original model plus the two fine-tuned variants.
orig_ckpt_path = 'checkpoints/spec_denoiser/model_ckpt_steps_568000.ckpt'
ada_ckpt_path = 'checkpoints/spec_denoiser/model_ckpt_steps_621250.ckpt'
naive_ckpt_path = 'checkpoints/spec_denoiser/model_ckpt_steps_585750.ckpt'


# Several examples on different speakers without any fine-tuning

In [3]:
# Instantiating SpecDenoiserInfer loads all the models to the device; the
# resulting object performs the inference in the cells below.
hparams = set_hparams(exp_name='spec_denoiser')
infer_class_obj = SpecDenoiserInfer(
    hparams,
    binary_data_directory,
    orig_ckpt_path,
    whisperX_model_directory,
)

| Hparams chains:  []
| Hparams: 
[;33;maccumulate_grad_batches[0m: 1, [;33;madd_word_pos[0m: True, [;33;mamp[0m: False, [;33;maudio_num_mel_bins[0m: 80, [;33;maudio_sample_rate[0m: 22050, 
[;33;mbinarizer_cls[0m: data_gen.tts.base_binarizer.BaseBinarizer, [;33;mbinary_data_dir[0m: data/binary/libritts, [;33;mcheck_val_every_n_epoch[0m: 10, [;33;mclip_grad_norm[0m: 1, [;33;mclip_grad_value[0m: 0, 
[;33;mconv_use_pos[0m: False, [;33;mdebug[0m: False, [;33;mdec_dilations[0m: [1, 1, 1, 1], [;33;mdec_ffn_kernel_size[0m: 9, [;33;mdec_inp_add_noise[0m: False, 
[;33;mdec_kernel_size[0m: 5, [;33;mdec_layers[0m: 4, [;33;mdec_post_net_kernel[0m: 3, [;33;mdecoder_rnn_dim[0m: 0, [;33;mdecoder_type[0m: conv, 
[;33;mdetach_postflow_input[0m: True, [;33;mdiff_decoder_type[0m: wavenet, [;33;mdiff_loss_type[0m: l1, [;33;mdilation_cycle_length[0m: 1, [;33;mdisc_interval[0m: 1, 
[;33;mdisc_lr[0m: 0.0001, [;33;mdisc_norm[0m: in, [;33;mdisc_reduction[

In this example, the silent phoneme problem occurs. This is corrected in our version of the code.

In [4]:
total_time_start = time.time()

# Alignment backend: set use_MFA=True to align with MFA; otherwise whisperX is
# used for phoneme alignment, which is much faster.
use_MFA = False
# Spectrogram backend: True uses the original librosa method, False uses our torchaudio one.
use_librosa = False
# Whether or not to save the inferred audio to a file.
save_wav_bool = False
# Whether or not to display the inferred audio in the notebook.
disp_wav = True

# Non-negative integer: how many mel frames to the left to shift the end of the
# left part of the masked spectrogram and the start of the right part of the
# masked spectrogram (0 disables the shift).
# Each frame corresponds to roughly 256/22050 = .0116 seconds of the original audio.
# If the buffer is large enough that it overlaps with the previously detected
# phoneme, the midpoint of the silence before the first word to change is used instead.
mask_loc_buffer = 10

# Relative path of the .csv with the information for inference (columns as
# demoed on the FluentSpeech github); only read when use_MFA is True.
test_file_path = 'inference/example2.csv'

# Where to put the .lab file with the sentence translated to CMU if performing MFA inference.
# It is unclear whether the code still works if this is changed, since when using
# MFA the .wav file for inference also gets copied to 'inference/audio'.
test_wav_directory = 'inference/audio'

# Where to store the MFA textgrid if you are using MFA.
output_directory = 'inference/audio/mfa_out'

if use_MFA:
    # With MFA the inference items come from the .csv via data_preprocess.
    dataset_info = data_preprocess(
        test_file_path,
        test_wav_directory,
        mfa_dictionary_path,
        mfa_acoustic_model_path,
        output_directory,
        align=False,
    )
else:
    # Without MFA the output of data_preprocess can simply be built by hand.
    # This lives in the else branch so it no longer overwrites the
    # data_preprocess result when use_MFA is True.
    dataset_info = [{
        # only used for naming the output file
        'item_name': 'trump',
        # a transcription of the original text
        'text': 'and several new measures to protect American security and prosperity.',
        # location of the .wav file to perform inference on
        'wav_fn_orig': 'inference/audio_backup/trump.wav',
        # the region to edit (counting the words that will be changed, starting from 1)
        'region': '[10,10]',
        # must still be set to some value even when MFA is not used
        'mfa_textgrid': '',
        # the full text to edit to
        'edited_text': 'and several new measures to protect American security and integrity.',
        # word positions (starting from 1) in the full edited text of the region to be inferred
        'edited_region': '[10,10]',
    }]

# Performing inference.
infer_class_obj.example_run(dataset_info, use_MFA, use_librosa, save_wav_bool, disp_wav, mask_loc_buffer)

print(f'Total Time: {time.time()-total_time_start}')

Silent inferred phonemes predicted, changing word regions!
Mask loc buffer set to 10 frames, but there are only 8 frames of silence before the first edited word. Using silence midpoint instead.
Ground Truth audio:


Inferred audio:


Total Time: 4.8936073780059814


Here is how this sounded before the changes we made:

In [5]:
# Same request as the previous cell, but with fix_silent_phonemes turned off.
infer_class_obj.example_run(
    dataset_info,
    use_MFA,
    use_librosa,
    save_wav_bool,
    disp_wav,
    mask_loc_buffer,
    fix_silent_phonemes=False,
)

Mask loc buffer set to 10 frames, but there are only 2 frames of silence before the first edited word. Using silence midpoint instead.
Ground Truth audio:


Inferred audio:


[[array([ 0.0013918 ,  0.00468023,  0.00100364, ..., -0.0006648 ,
         -0.00016394,  0.00020675], dtype=float32),
  array([ 0.0013918 ,  0.00468023,  0.00100364, ..., -0.0006648 ,
         -0.00016394,  0.00020675], dtype=float32)]]

From the following example, we see that the mask is not applied to the spectrogram in an ideal way. This is because the masking is done by checking when mel2word changes.

Thus, in the example below, we can see that the 'left part' of the masked spectrogram ends right before the next utterance after the third word (i.e. the 'L' in libri) is detected to begin, at about 0.79 seconds. However, there is silence detected from 0.61 to 0.79 seconds, so it seems it would make more sense to end the 'left part' at the midpoint of this silent interval in order to account for possible error in the phoneme alignment.

Similarly, the 'right part' of the masked spectrogram begins right before the next utterance after the sixth word (i.e. the 'R' in recording) is detected to begin, at about 1.45 seconds. However, there is silence detected from 1.38 to 1.45 seconds to work with.

Thus the masked spectrogram is always placed as snugly as possible to the right, which is why in inference we often hear the beginning of the next utterance from the original text overlapping with the beginning of the edited text.

By setting mask_loc_buffer=10 we overcome this problem:

In [6]:
total_time_start = time.time()

# Alignment backend: use_MFA=True aligns with MFA; otherwise whisperX is used
# for phoneme alignment, which is much faster.
use_MFA = False
# Spectrogram backend: True uses the original librosa method, False our torchaudio one.
use_librosa = False
# Whether or not to save the file.
save_wav_bool = False
# Whether or not to display the inferred audio.
disp_wav = True

# How many mel frames to the left to shift the end of the left part of the
# masked spectrogram and the start of the right part of the masked spectrogram.
# Each frame corresponds to roughly 256/22050 = .0116 seconds of the original audio.
# If the buffer is large enough that it overlaps with the previously detected
# phoneme, the midpoint of the silence before the first word to change is used instead.
mask_loc_buffer = 10

# One way to perform inference is to create a .csv with columns as demoed on the
# FluentSpeech github and pass it to data_preprocess. When MFA is not used, the
# output of data_preprocess can also be built by hand, as done below.

# Relative path of the .csv with the information for inference.
test_file_path = 'inference/example2.csv'

# Where to put the .lab file with the sentence translated to CMU if performing MFA inference.
# It is unclear whether the code still works if this is changed, since when using
# MFA the .wav file for inference also gets copied to 'inference/audio'.
test_wav_directory = 'inference/audio'

# Where to store the MFA textgrid if you are using MFA.
output_directory = 'inference/audio/mfa_out'

dataset_info = [{
    # only used for naming the output file
    'item_name': '1',
    # a transcription of the original text
    'text': 'this is a libri vox recording',
    # location of the .wav file to perform inference on
    'wav_fn_orig': 'inference/audio_backup/1.wav',
    # the region to edit (counting the words that will be changed, starting from 1)
    'region': '[4,5]',
    # must still be set to some value even when MFA is not used
    'mfa_textgrid': '',
    # the full text to edit to
    'edited_text': 'this is a silly baby recording',
    # word positions (starting from 1) in the full edited text of the region to be inferred
    'edited_region': '[4,5]',
}]

# Performing inference.
infer_class_obj.example_run(
    dataset_info,
    use_MFA,
    use_librosa,
    save_wav_bool,
    disp_wav,
    mask_loc_buffer,
)

print(f'Total Time: {time.time()-total_time_start}')

Mask loc buffer set to 10 frames, but there are only 16 frames of silence before the first edited word. Using silence midpoint instead.
Mask loc buffer set to 10 frames, but there are only 2 frames of silence after the last edited word. Using silence midpoint instead.
Ground Truth audio:


Inferred audio:


Total Time: 0.4528827667236328


Here is how this sounded before our changes:

In [7]:
# Same request as the previous cell, but with a mask location buffer of 0 frames.
infer_class_obj.example_run(
    dataset_info,
    use_MFA,
    use_librosa,
    save_wav_bool,
    disp_wav,
    0,
)

Ground Truth audio:


Inferred audio:


[[array([-0.00303069, -0.00298999, -0.00332781, ...,  0.00148537,
          0.00185763,  0.00149519], dtype=float32),
  array([-0.00303069, -0.00298999, -0.00332781, ...,  0.00148537,
          0.00185763,  0.00149519], dtype=float32)]]

# Some examples with the fine-tuned model

In [8]:
# Second hparams set with a single-speaker id enabled, used by the speaker-id variants below.
hparams2 = set_hparams(exp_name='spec_denoiser')
hparams2['use_spk_id'] = True
hparams2['num_spk'] = 1

# Naive fine-tuning checkpoint.
infer_class_obj_naive = SpecDenoiserInfer(
    hparams2,
    binary_data_directory,
    orig_ckpt_path,
    whisperX_model_directory,
    fine_tuned='naive',
    fine_tune_ckpt_path=naive_ckpt_path,
)

# Uses all adaspeech fine-tuned params.
infer_class_obj_ada = SpecDenoiserInfer(
    hparams2,
    binary_data_directory,
    orig_ckpt_path,
    whisperX_model_directory,
    fine_tuned='ada',
    fine_tune_ckpt_path=ada_ckpt_path,
)

# Uses only the spk embedding, speaker id, and encoder parameters from the ada fine-tuning.
infer_class_obj_ada_encoder_spk_emb = SpecDenoiserInfer(
    hparams2,
    binary_data_directory,
    orig_ckpt_path,
    whisperX_model_directory,
    fine_tuned='ada',
    fine_tune_ckpt_path=ada_ckpt_path,
    ada_weights='se',
)

# Uses only the spk embedding and encoder parameters from the ada fine-tuning
# (built from `hparams`, i.e. without the speaker-id settings).
infer_class_obj_ada_encoder_spk_emb_no_spkid = SpecDenoiserInfer(
    hparams,
    binary_data_directory,
    orig_ckpt_path,
    whisperX_model_directory,
    fine_tuned='ada',
    fine_tune_ckpt_path=ada_ckpt_path,
    ada_weights='se',
)

# Only pitch and duration from the ada fine-tuning
# (dur = l for length because d is for decoder).
infer_class_obj_ada_encoder_pitch_dur = SpecDenoiserInfer(
    hparams,
    binary_data_directory,
    orig_ckpt_path,
    whisperX_model_directory,
    fine_tuned='ada',
    fine_tune_ckpt_path=ada_ckpt_path,
    ada_weights='pl',
)


loading fine tuned parameters:
dict_keys(['fs.spk_id_proj.weight'])
LOAD DIFFUSION MODEL TIME: 0.6652452945709229
| load 'model_gen' from 'pretrained/hifigan_hifitts\model_ckpt_steps_2168000.ckpt'.
Build Vocoder Time 1.0937118530273438
Vocoder Device cuda
Loaded the voice encoder model on cuda in 0.05 seconds.
WHISPERX LOAD TIME = 3.9574830532073975
loading fine tuned parameters:
dict_keys(['fs.encoder.res_blocks.0.blocks.0.0.weight', 'fs.encoder.res_blocks.0.blocks.0.0.bias', 'fs.encoder.res_blocks.0.blocks.1.0.weight', 'fs.encoder.res_blocks.0.blocks.1.0.bias', 'fs.encoder.res_blocks.1.blocks.0.0.weight', 'fs.encoder.res_blocks.1.blocks.0.0.bias', 'fs.encoder.res_blocks.1.blocks.1.0.weight', 'fs.encoder.res_blocks.1.blocks.1.0.bias', 'fs.encoder.res_blocks.2.blocks.0.0.weight', 'fs.encoder.res_blocks.2.blocks.0.0.bias', 'fs.encoder.res_blocks.2.blocks.1.0.weight', 'fs.encoder.res_blocks.2.blocks.1.0.bias', 'fs.encoder.res_blocks.3.blocks.0.0.weight', 'fs.encoder.res_blocks.3.blocks.0

In [15]:
total_time_start = time.time()

# Alignment backend: use_MFA=True aligns with MFA; otherwise whisperX is used
# for phoneme alignment, which is much faster.
use_MFA = False
# Spectrogram backend: True uses the original librosa method, False our torchaudio one.
use_librosa = False
# Whether or not to save the file.
save_wav_bool = False
# Whether or not to display the inferred audio.
disp_wav = True

# How many mel frames to the left to shift the end of the left part of the
# masked spectrogram and the start of the right part of the masked spectrogram.
# Each frame corresponds to roughly 256/22050 = .0116 seconds of the original audio.
# If the buffer is large enough that it overlaps with the previously detected
# phoneme, the midpoint of the silence before the first word to change is used instead.
mask_loc_buffer = 5

# When MFA is not used, the output of data_preprocess can be built by hand, as done below.

# NOTE: despite its name, this variable holds the path of the .wav file to edit
# in this cell (it is used as 'wav_fn_orig' below), not a directory.
test_wav_directory = 'inference/audio_backup/bg3narrator_clip0010.wav'

# Where to store the MFA textgrid if you are using MFA (unused here: use_MFA is False).
output_directory = ''

dataset_info = [{
    # only used for naming the output file
    # NOTE(review): left empty here; confirm a name is set before enabling save_wav_bool
    'item_name': '',
    # a transcription of the original text
    'text': 'a single word pervades your consciousness',
    # location of the .wav file to perform inference on
    'wav_fn_orig': test_wav_directory,
    # the region to edit (counting the words that will be changed, starting from 1)
    'region': '[2,4]',
    # must still be set to some value even when MFA is not used
    'mfa_textgrid': '',
    # the full text to edit to
    'edited_text': 'a little tweety bird enters your consciousness',
    # word positions (starting from 1) in the full edited text of the region to be inferred
    'edited_region': '[2,5]',
}]

# Performing inference with the original (not fine-tuned) model.
print('Original Model:')
infer_class_obj.example_run(
    dataset_info,
    use_MFA,
    use_librosa,
    save_wav_bool,
    disp_wav,
    mask_loc_buffer,
    speaker_id='bg3narrator',
)

print(f'Total Time: {time.time()-total_time_start}')

Original Model:
Silent inferred phonemes predicted, changing word regions!
Mask loc buffer set to 5 frames, but there are only 2 frames of silence before the first edited word. Using silence midpoint instead.
Mask loc buffer set to 5 frames, but there are only 2 frames of silence after the last edited word. Using silence midpoint instead.
Ground Truth audio:


Inferred audio:


Total Time: 52.74877953529358


In [16]:
# Same edit request, run through the naively fine-tuned model.
print('Naive Fine Tuned Model:')
infer_class_obj_naive.example_run(
    dataset_info,
    use_MFA,
    use_librosa,
    save_wav_bool,
    disp_wav,
    mask_loc_buffer,
    speaker_id='bg3narrator',
)

Naive Fine Tuned Model:
Silent inferred phonemes predicted, changing word regions!
Mask loc buffer set to 5 frames, but there are only 2 frames of silence before the first edited word. Using silence midpoint instead.
Mask loc buffer set to 5 frames, but there are only 2 frames of silence after the last edited word. Using silence midpoint instead.
Ground Truth audio:


Inferred audio:


[[array([ 0.00089875,  0.00191659, -0.00076168, ..., -0.00182851,
         -0.0040878 ,  0.00438392], dtype=float32),
  array([-0.0059326 , -0.0055943 , -0.00562791, ..., -0.00182851,
         -0.0040878 ,  0.00438392], dtype=float32)]]

In [17]:
# Same edit request, run through the model with all ada fine-tuned parameters.
print('Ada Fine Tuned Model:')
infer_class_obj_ada.example_run(
    dataset_info,
    use_MFA,
    use_librosa,
    save_wav_bool,
    disp_wav,
    mask_loc_buffer,
    speaker_id='bg3narrator',
)

Ada Fine Tuned Model:
Mask loc buffer set to 5 frames, but there are only 2 frames of silence after the last edited word. Using silence midpoint instead.
Ground Truth audio:


Inferred audio:


[[array([ 0.00089875,  0.00191659, -0.00076168, ..., -0.00182851,
         -0.0040878 ,  0.00438392], dtype=float32),
  array([ 0.00090223,  0.00191623, -0.00076769, ..., -0.00182851,
         -0.0040878 ,  0.00438392], dtype=float32)]]

In [18]:
# Same edit request, using the object built with ada_weights='se' and the speaker-id hparams.
print('Ada Fine Tuned Model (Only Encoder, Speaker Embeding,Speaker id):')
infer_class_obj_ada_encoder_spk_emb.example_run(
    dataset_info,
    use_MFA,
    use_librosa,
    save_wav_bool,
    disp_wav,
    mask_loc_buffer,
    speaker_id='bg3narrator',
)

Ada Fine Tuned Model (Only Encoder, Speaker Embeding,Speaker id):
Silent inferred phonemes predicted, changing word regions!
Mask loc buffer set to 5 frames, but there are only 2 frames of silence before the first edited word. Using silence midpoint instead.
Mask loc buffer set to 5 frames, but there are only 2 frames of silence after the last edited word. Using silence midpoint instead.


In [None]:
# Same edit request, using the ada_weights='se' object built without speaker-id
# hparams; no speaker_id is passed here.
print('Ada Fine Tuned Model (Only Encoder, Speaker Embeding):')
infer_class_obj_ada_encoder_spk_emb_no_spkid.example_run(
    dataset_info,
    use_MFA,
    use_librosa,
    save_wav_bool,
    disp_wav,
    mask_loc_buffer,
)

Ada Fine Tuned Model (Only Encoder, Speaker Embeding):
Ground Truth audio:


Inferred audio:


[[array([-0.00373007, -0.00367301, -0.00381834, ...,  0.00484199,
          0.00146057, -0.00490872], dtype=float32),
  array([-0.00240366, -0.00264107, -0.00348265, ...,  0.00484199,
          0.00146057, -0.00490872], dtype=float32)]]

In [None]:
# Same edit request, using the object built with ada_weights='pl'
# (pitch and duration predictors); no speaker_id is passed here.
print('Ada Fine Tuned Model (Only pitch and duration predictors)')
infer_class_obj_ada_encoder_pitch_dur.example_run(
    dataset_info,
    use_MFA,
    use_librosa,
    save_wav_bool,
    disp_wav,
    mask_loc_buffer,
)

Ada Fine Tuned Model (Only pitch and duration predictors)
Ground Truth audio:


Inferred audio:


[[array([-0.00373007, -0.00367301, -0.00381834, ...,  0.00484199,
          0.00146057, -0.00490872], dtype=float32),
  array([-0.00320106, -0.00296728, -0.00404635, ...,  0.00484199,
          0.00146057, -0.00490872], dtype=float32)]]