In [1]:
import time

from inference.tts.spec_denoiser import data_preprocess,SpecDenoiserInfer
from utils.commons.hparams import set_hparams

  torchaudio.set_audio_backend("soundfile")


Code based on FluentSpeech, available at https://github.com/Zain-Jiang/Speech-Editing-Toolkit 

To run: 

Assuming you want to be able to use MFA:
1. Install MFA with anaconda
2. Place MFA dictionary at ...
3. Install eSpeak NG from ...
4. pip install -r requirements.txt (need to update this)
5. Download the models from... and put them in ...
6. Set the binary data directory location and eSpeak NG library location in the next cell

If you only want to use our improved alignment, you can skip step 1. If you only want to use MFA, you can skip...

In [2]:
# Machine-specific locations: edit these to match your own installation.
# Windows paths are written as raw strings so that a backslash is never parsed as the start of an
# escape sequence (the previous non-raw eSpeak path contained the invalid escapes \P, \e and \l).
binary_data_directory=r'C:\Users\bezem\Documents\erdos_deep_learning\Speech-Editing-Toolkit-stable\data\processed\binary\libritts'
# NOTE: despite the variable's name, this is the full path of the eSpeak NG library file itself
Espeak_dll_directory = r'C:\Program Files\eSpeak NG\libespeak-ng.dll'
EspeakWrapper.set_library(Espeak_dll_directory)
whisperX_model_directory=r'C:\Users\bezem\Documents\erdos_deep_learning\whisperX-main\facebook'

#MFA dictionary and acoustic model (only passed to data_preprocess when use_MFA=True)
mfa_dictionary_path = 'data/processed/libritts/mfa_dict.txt'
mfa_acoustic_model_path = 'data/processed/libritts/mfa_model.zip'

#where to save the output .wav files
inference_output_dir='inference/out'

#checkpoint paths: the original model, and the 'ada' / 'naive' fine-tuned checkpoints
orig_ckpt_path='checkpoints/spec_denoiser/model_ckpt_steps_568000.ckpt'
ada_ckpt_path='checkpoints/spec_denoiser/model_ckpt_steps_621250.ckpt'
naive_ckpt_path='checkpoints/spec_denoiser/model_ckpt_steps_585750.ckpt'


In [22]:
# Instantiating SpecDenoiserInfer loads all the models to the device; the resulting object
# is what performs our inference in the cells below.
hparams = set_hparams(exp_name='spec_denoiser')
infer_class_obj = SpecDenoiserInfer(
    hparams,
    binary_data_directory,
    orig_ckpt_path,
    whisperX_model_directory,
)

LOAD DIFFUSION MODEL TIME: 1.2935893535614014
| load 'model_gen' from 'pretrained/hifigan_hifitts\model_ckpt_steps_2168000.ckpt'.
Build Vocoder Time 2.7740280628204346
Vocoder Device cuda
Loaded the voice encoder model on cuda in 0.04 seconds.
WHISPERX LOAD TIME = 6.54592752456665


# Several examples on different speakers without any fine-tuning

In [23]:
total_time_start = time.time()

# --- Inference options ---
# use_MFA=True aligns with MFA; otherwise whisperX is used for phoneme alignment, which is much faster
use_MFA = False
# use_librosa=True uses their librosa method for spectrograms; False uses ours based on torchaudio
use_librosa = False
# whether or not to save the output file
save_wav_bool = False
# whether or not to display the inferred audio
disp_wav = True

# Positive integer: how many mel bins (spectrogram frames) to the left to shift the end of the left
# part of the masked spectrogram and the start of the right part of the masked spectrogram.
# Each mel bin corresponds to roughly 256/22050 = .0116 seconds of the original audio.
# If the buffer is large enough that it overlaps with the previously detected phoneme, the midpoint
# of the silence before the first word to change is used instead.
mask_loc_buffer = 10

# relative path of the .csv with the information for inference (columns as demoed on the FluentSpeech github)
test_file_path = 'inference/example2.csv'

# where to put the .lab file with the sentence translated to CMU if performing MFA inference
# It is unclear to me right now if the code still works with this changed, since when using MFA the
# .wav file for inference also gets copied to 'inference/audio'
test_wav_directory = 'inference/audio'

# where to store the mfa textgrid if you are using mfa
output_directory = 'inference/audio/mfa_out'

if use_MFA:
    # With MFA, the inference items come from the .csv via data_preprocess.
    dataset_info = data_preprocess(test_file_path, test_wav_directory, mfa_dictionary_path, mfa_acoustic_model_path,
                                   output_directory, align=False)
else:
    # Without MFA we can build the output of data_preprocess by hand.
    # (Previously this hand-built entry was assigned unconditionally, which silently discarded
    # the result of data_preprocess whenever use_MFA was True.)
    dataset_info = [{
        'item_name': 'trump',  # this should just be used for naming the output file
        'text': 'and several new measures to protect American security and prosperity.',  # a transcription of the original text
        'wav_fn_orig': 'inference/audio_backup/trump.wav',  # location of the .wav file to perform inference on
        'region': '[10,10]',  # the region to edit (counting the words that will be changed starting from 1)
        'mfa_textgrid': '',  # we still need to set this to some value even if we are not using MFA
        'edited_text': 'and several new measures to protect American security and integrity.',  # the full text to edit to
        'edited_region': '[10,10]',  # word counts in the full edited text of the region which is to be inferred, starting from one
    }]

# performing inference
infer_class_obj.example_run(dataset_info, use_MFA, use_librosa, save_wav_bool, disp_wav, mask_loc_buffer)

print(f'Total Time: {time.time()-total_time_start}')

Silent inferred phonemes predicted, changing word regions!
Mask loc buffer set to 10 frames, but there are only 8 frames of silence before the first edited word. Using silence midpoint instead.
Ground Truth audio:


Inferred audio:


Total Time: 2.4454286098480225


From the following example, we see that the mask applied to the spectrogram is not being placed in the most ideal way. This is because of how the masking is done by checking when mel2word changes.

Thus, in the below, we can see that the 'left part' of the masked spectrogram ends right before the next utterance after the third word (i.e. the 'L' in libri) is detected to begin, at about .79 seconds. However, there is silence detected from 0.61 to .79 seconds, so it seems it would make more sense to end the 'left part' in the middle of these two time intervals in order to account for possible error with phoneme alignment.

Similarly, the 'right part' of the masked spectrogram begins right before the next utterance after the sixth word (i.e. the 'R' in recording) is detected to begin, at about 1.45 seconds. However, there is silence detected from 1.38 to 1.45 seconds to work with.

Thus the masked spectrogram is always placed as snugly as possible to the right, which is why in inference we often hear the beginning of the next utterance from the original text overlapping with the beginning of the edited text. 

Even though the model was probably trained with this method of masking, it would probably help to make a more conservative choice of masking by changing how head_idx and tail_idx are calculated in forward_model.

EDIT: One can now try this out by setting the mask_loc_buffer below.

We now see that something strange is going on with the below example. edited_mel2ph is output from self.model.fs.forward_dur and is supposed to predict the number of bins each phoneme in edited_ph is held for (including silent phonemes and indexed from 1). The "middle part" of the result is then used to construct a new version of edited_mel2ph that is input to the model by concatenating with the original mel2ph on either side. But, as we see below, even in the original edited_mel2ph output, phonemes 9-12 [AH0, |, S, IH1] are predicted not to be uttered at all! So it says "this is a ly baby recording"...

An aside: Why not get the uvs from the alignment model? Apparently using f0s for the uvs results in something quite different

In [6]:
total_time_start = time.time()

# Alignment: use_MFA=True -> MFA; False -> whisperX phoneme alignment, which is much faster.
# Spectrograms: use_librosa=True -> their librosa method; False -> ours based on torchaudio.
use_MFA, use_librosa = False, False
# save_wav_bool: whether or not to save the file; disp_wav: whether or not to display the inferred audio.
save_wav_bool, disp_wav = False, True

# Positive integer: how many mel bins (spectrogram frames) to the left to shift the end of the left
# part of the masked spectrogram and the start of the right part of the masked spectrogram.
# Each mel bin corresponds to roughly 256/22050 = .0116 seconds of the original audio.
# If the buffer is large enough that it overlaps with the previously detected phoneme, the midpoint
# of the silence before the first word to change is used instead.
mask_loc_buffer = 10

# One way to perform inference is to create a .csv with columns as demoed on the FluentSpeech github
# and pass these arguments to data_preprocess. However, if we are not using MFA, we can also just
# build the output of data_preprocess by hand, as done below.

# relative path of the .csv with the information for inference
test_file_path = 'inference/example2.csv'
# where to put the .lab file with the sentence translated to CMU if performing MFA inference
# (unclear whether the code still works with this changed, since when using MFA the .wav file for
# inference also gets copied to 'inference/audio')
test_wav_directory = 'inference/audio'
# where to store the mfa textgrid if you are using mfa
output_directory = 'inference/audio/mfa_out'

dataset_info = [{
    'item_name': '1',  # this should just be used for naming the output file
    'text': 'this is a libri vox recording',  # a transcription of the original text
    'wav_fn_orig': 'inference/audio_backup/1.wav',  # location of the .wav file to perform inference on
    'region': '[1,4]',  # the region to edit (counting the words that will be changed starting from 1)
    'mfa_textgrid': '',  # we still need to set this to some value even if we are not using MFA
    'edited_text': 'that is a silly vox recording',  # the full text to edit to
    'edited_region': '[1,4]',  # word counts in the full edited text of the region to be inferred, starting from one
}]

# performing inference
infer_class_obj.example_run(dataset_info, use_MFA, use_librosa, save_wav_bool, disp_wav, mask_loc_buffer)

print(f'Total Time: {time.time()-total_time_start}')

Mask loc buffer set to 10 frames, but there are only 2 frames of silence before the first edited word. Using silence midpoint instead.
Mask loc buffer set to 10 frames, but there are only 2 frames of silence after the last edited word. Using silence midpoint instead.
Ground Truth audio:


Inferred audio:


Total Time: 0.39792537689208984


In [7]:
total_time_start = time.time()

# Alignment: use_MFA=True -> MFA; False -> whisperX phoneme alignment, which is much faster.
# Spectrograms: use_librosa=True -> their librosa method; False -> ours based on torchaudio.
use_MFA, use_librosa = False, False
# save_wav_bool: whether or not to save the file; disp_wav: whether or not to display the inferred audio.
save_wav_bool, disp_wav = False, True

# Positive integer: how many mel bins (spectrogram frames) to the left to shift the end of the left
# part of the masked spectrogram and the start of the right part of the masked spectrogram.
# Each mel bin corresponds to roughly 256/22050 = .0116 seconds of the original audio.
# If the buffer is large enough that it overlaps with the previously detected phoneme, the midpoint
# of the silence before the first word to change is used instead.
mask_loc_buffer = 5

# One way to perform inference is to create a .csv with columns as demoed on the FluentSpeech github
# and pass these arguments to data_preprocess. However, if we are not using MFA, we can also just
# build the output of data_preprocess by hand, as done below.

# relative path of the .csv with the information for inference
test_file_path = 'inference/example2.csv'
# where to put the .lab file with the sentence translated to CMU if performing MFA inference
# (unclear whether the code still works with this changed, since when using MFA the .wav file for
# inference also gets copied to 'inference/audio')
test_wav_directory = 'inference/audio'
# where to store the mfa textgrid if you are using mfa
output_directory = 'inference/audio/mfa_out'

# Here the edit changes the word count: words 3-5 of the original become words 3-6 of the edited text.
dataset_info = [{
    'item_name': '1',  # this should just be used for naming the output file
    'text': 'this is a libri vox recording',  # a transcription of the original text
    'wav_fn_orig': 'inference/audio_backup/1.wav',  # location of the .wav file to perform inference on
    'region': '[3,5]',  # the region to edit (counting the words that will be changed starting from 1)
    'mfa_textgrid': '',  # we still need to set this to some value even if we are not using MFA
    'edited_text': 'this is a stinky little guy recording',  # the full text to edit to
    'edited_region': '[3,6]',  # word counts in the full edited text of the region to be inferred, starting from one
}]

# performing inference
infer_class_obj.example_run(dataset_info, use_MFA, use_librosa, save_wav_bool, disp_wav, mask_loc_buffer)

print(f'Total Time: {time.time()-total_time_start}')

Mask loc buffer set to 5 frames, but there are only 2 frames of silence before the first edited word. Using silence midpoint instead.
Mask loc buffer set to 5 frames, but there are only 2 frames of silence after the last edited word. Using silence midpoint instead.
Ground Truth audio:


Inferred audio:


Total Time: 0.5584983825683594


In [8]:
# NOTE(review): this cell uses exactly the same settings and inputs as the preceding cell
# (same audio, regions, edited text and mask_loc_buffer) -- presumably kept to compare a second run.
total_time_start = time.time()

# Alignment: use_MFA=True -> MFA; False -> whisperX phoneme alignment, which is much faster.
# Spectrograms: use_librosa=True -> their librosa method; False -> ours based on torchaudio.
use_MFA, use_librosa = False, False
# save_wav_bool: whether or not to save the file; disp_wav: whether or not to display the inferred audio.
save_wav_bool, disp_wav = False, True

# Positive integer: how many mel bins (spectrogram frames) to the left to shift the end of the left
# part of the masked spectrogram and the start of the right part of the masked spectrogram.
# Each mel bin corresponds to roughly 256/22050 = .0116 seconds of the original audio.
# If the buffer is large enough that it overlaps with the previously detected phoneme, the midpoint
# of the silence before the first word to change is used instead.
mask_loc_buffer = 5

# One way to perform inference is to create a .csv with columns as demoed on the FluentSpeech github
# and pass these arguments to data_preprocess. However, if we are not using MFA, we can also just
# build the output of data_preprocess by hand, as done below.

# relative path of the .csv with the information for inference
test_file_path = 'inference/example2.csv'
# where to put the .lab file with the sentence translated to CMU if performing MFA inference
# (unclear whether the code still works with this changed, since when using MFA the .wav file for
# inference also gets copied to 'inference/audio')
test_wav_directory = 'inference/audio'
# where to store the mfa textgrid if you are using mfa
output_directory = 'inference/audio/mfa_out'

dataset_info = [{
    'item_name': '1',  # this should just be used for naming the output file
    'text': 'this is a libri vox recording',  # a transcription of the original text
    'wav_fn_orig': 'inference/audio_backup/1.wav',  # location of the .wav file to perform inference on
    'region': '[3,5]',  # the region to edit (counting the words that will be changed starting from 1)
    'mfa_textgrid': '',  # we still need to set this to some value even if we are not using MFA
    'edited_text': 'this is a stinky little guy recording',  # the full text to edit to
    'edited_region': '[3,6]',  # word counts in the full edited text of the region to be inferred, starting from one
}]

# performing inference
infer_class_obj.example_run(dataset_info, use_MFA, use_librosa, save_wav_bool, disp_wav, mask_loc_buffer)

print(f'Total Time: {time.time()-total_time_start}')

Mask loc buffer set to 5 frames, but there are only 2 frames of silence before the first edited word. Using silence midpoint instead.
Mask loc buffer set to 5 frames, but there are only 2 frames of silence after the last edited word. Using silence midpoint instead.
Ground Truth audio:


Inferred audio:


Total Time: 0.6013844013214111


In [9]:
total_time_start = time.time()

# Alignment: use_MFA=True -> MFA; False -> whisperX phoneme alignment, which is much faster.
# Spectrograms: use_librosa=True -> their librosa method; False -> ours based on torchaudio.
use_MFA, use_librosa = False, False
# save_wav_bool: whether or not to save the file; disp_wav: whether or not to display the inferred audio.
save_wav_bool, disp_wav = False, True

# Positive integer: how many mel bins (spectrogram frames) to the left to shift the end of the left
# part of the masked spectrogram and the start of the right part of the masked spectrogram.
# Each mel bin corresponds to roughly 256/22050 = .0116 seconds of the original audio.
# If the buffer is large enough that it overlaps with the previously detected phoneme, the midpoint
# of the silence before the first word to change is used instead.
mask_loc_buffer = 5

# One way to perform inference is to create a .csv with columns as demoed on the FluentSpeech github
# and pass these arguments to data_preprocess. However, if we are not using MFA, we can also just
# build the output of data_preprocess by hand, as done below.

# relative path of the .csv with the information for inference
test_file_path = 'inference/example2.csv'
# where to put the .lab file with the sentence translated to CMU if performing MFA inference
# (unclear whether the code still works with this changed, since when using MFA the .wav file for
# inference also gets copied to 'inference/audio')
test_wav_directory = 'inference/audio'
# where to store the mfa textgrid if you are using mfa
output_directory = 'inference/audio/mfa_out'

# A single hand-built inference item; the keys mirror the ones used in the other examples.
dataset_info = [dict(
    item_name="",
    text=" get out of the sun",
    wav_fn_orig="c:\\Users\\bezem\\Documents\\erdos_deep_learning\\Speech-Editing-Toolkit-stable\\saved_audio/flagged_0.wav",
    region="[4,5]",
    mfa_textgrid="",
    edited_text="get out of the jelly!",
    edited_region="[4,5]",
)]

# performing inference
infer_class_obj.example_run(dataset_info, use_MFA, use_librosa, save_wav_bool, disp_wav, mask_loc_buffer)

print(f'Total Time: {time.time()-total_time_start}')

Ground Truth audio:


Inferred audio:


Total Time: 0.490680456161499


In [10]:
total_time_start = time.time()

# Alignment: use_MFA=True -> MFA; False -> whisperX phoneme alignment, which is much faster.
# Spectrograms: use_librosa=True -> their librosa method; False -> ours based on torchaudio.
use_MFA, use_librosa = False, False
# save_wav_bool: whether or not to save the file; disp_wav: whether or not to display the inferred audio.
save_wav_bool, disp_wav = False, True

# Positive integer: how many mel bins (spectrogram frames) to the left to shift the end of the left
# part of the masked spectrogram and the start of the right part of the masked spectrogram.
# Each mel bin corresponds to roughly 256/22050 = .0116 seconds of the original audio.
# If the buffer is large enough that it overlaps with the previously detected phoneme, the midpoint
# of the silence before the first word to change is used instead.
mask_loc_buffer = 5

# One way to perform inference is to create a .csv with columns as demoed on the FluentSpeech github
# and pass these arguments to data_preprocess. However, if we are not using MFA, we can also just
# build the output of data_preprocess by hand, as done below.

# NOTE: despite its name, test_wav_directory holds the path of the single .wav file to edit here
# (it is used as 'wav_fn_orig' below), not a directory.
test_wav_directory = 'C:\\Users\\bezem\\Documents\\erdos_deep_learning\\Voice Cloning and editing\\Voice Cloning and editing\\AUDIO Narrator BG3\\BG3 Narrator Lines - Cleaned by hand\\clips_by_silence_with_trailing_silence_removed_inner_silences_trimmed\\clip0018.wav'

# where to store the mfa textgrid if you are using mfa
output_directory = 'inference/audio/mfa_out'

# In this example edited_text is identical to text: words 2-5 are re-inferred with unchanged wording.
dataset_info = [{
    'item_name': '',  # this should just be used for naming the output file
    'text': 'every breath is thick with blood he pauses',  # a transcription of the original text
    'wav_fn_orig': test_wav_directory,  # location of the .wav file to perform inference on
    'region': '[2,5]',  # the region to edit (counting the words that will be changed starting from 1)
    'mfa_textgrid': '',  # we still need to set this to some value even if we are not using MFA
    'edited_text': 'every breath is thick with blood he pauses',  # the full text to edit to
    'edited_region': '[2,5]',  # word counts in the full edited text of the region to be inferred, starting from one
}]

# performing inference
infer_class_obj.example_run(dataset_info, use_MFA, use_librosa, save_wav_bool, disp_wav, mask_loc_buffer)

print(f'Total Time: {time.time()-total_time_start}')

Ground Truth audio:


Inferred audio:


Total Time: 0.53855299949646


In [11]:
# NOTE(review): this cell uses exactly the same settings and inputs as the preceding cell
# -- presumably kept to compare a second run on the same clip.
total_time_start = time.time()

# Alignment: use_MFA=True -> MFA; False -> whisperX phoneme alignment, which is much faster.
# Spectrograms: use_librosa=True -> their librosa method; False -> ours based on torchaudio.
use_MFA, use_librosa = False, False
# save_wav_bool: whether or not to save the file; disp_wav: whether or not to display the inferred audio.
save_wav_bool, disp_wav = False, True

# Positive integer: how many mel bins (spectrogram frames) to the left to shift the end of the left
# part of the masked spectrogram and the start of the right part of the masked spectrogram.
# Each mel bin corresponds to roughly 256/22050 = .0116 seconds of the original audio.
# If the buffer is large enough that it overlaps with the previously detected phoneme, the midpoint
# of the silence before the first word to change is used instead.
mask_loc_buffer = 5

# One way to perform inference is to create a .csv with columns as demoed on the FluentSpeech github
# and pass these arguments to data_preprocess. However, if we are not using MFA, we can also just
# build the output of data_preprocess by hand, as done below.

# NOTE: despite its name, test_wav_directory holds the path of the single .wav file to edit here
# (it is used as 'wav_fn_orig' below), not a directory.
test_wav_directory = 'C:\\Users\\bezem\\Documents\\erdos_deep_learning\\Voice Cloning and editing\\Voice Cloning and editing\\AUDIO Narrator BG3\\BG3 Narrator Lines - Cleaned by hand\\clips_by_silence_with_trailing_silence_removed_inner_silences_trimmed\\clip0018.wav'

# where to store the mfa textgrid if you are using mfa
output_directory = 'inference/audio/mfa_out'

# In this example edited_text is identical to text: words 2-5 are re-inferred with unchanged wording.
dataset_info = [{
    'item_name': '',  # this should just be used for naming the output file
    'text': 'every breath is thick with blood he pauses',  # a transcription of the original text
    'wav_fn_orig': test_wav_directory,  # location of the .wav file to perform inference on
    'region': '[2,5]',  # the region to edit (counting the words that will be changed starting from 1)
    'mfa_textgrid': '',  # we still need to set this to some value even if we are not using MFA
    'edited_text': 'every breath is thick with blood he pauses',  # the full text to edit to
    'edited_region': '[2,5]',  # word counts in the full edited text of the region to be inferred, starting from one
}]

# performing inference
infer_class_obj.example_run(dataset_info, use_MFA, use_librosa, save_wav_bool, disp_wav, mask_loc_buffer)

print(f'Total Time: {time.time()-total_time_start}')

Ground Truth audio:


Inferred audio:


Total Time: 0.7519786357879639


In [12]:
total_time_start = time.time()

# Alignment: use_MFA=True -> MFA; False -> whisperX phoneme alignment, which is much faster.
# Spectrograms: use_librosa=True -> their librosa method; False -> ours based on torchaudio.
use_MFA, use_librosa = False, False
# save_wav_bool: whether or not to save the file; disp_wav: whether or not to display the inferred audio.
save_wav_bool, disp_wav = False, True

# Positive integer: how many mel bins (spectrogram frames) to the left to shift the end of the left
# part of the masked spectrogram and the start of the right part of the masked spectrogram.
# Each mel bin corresponds to roughly 256/22050 = .0116 seconds of the original audio.
# If the buffer is large enough that it overlaps with the previously detected phoneme, the midpoint
# of the silence before the first word to change is used instead.
mask_loc_buffer = 5

# One way to perform inference is to create a .csv with columns as demoed on the FluentSpeech github
# and pass these arguments to data_preprocess. However, if we are not using MFA, we can also just
# build the output of data_preprocess by hand, as done below.

# NOTE: despite its name, test_wav_directory holds the path of the single .wav file to edit here
# (it is used as 'wav_fn_orig' below), not a directory.
test_wav_directory = 'C:\\Users\\bezem\\Documents\\erdos_deep_learning\\Speech-Editing-Toolkit-stable-unedited\\Speech-Editing-Toolkit-stable\\data\\processed\\libritts\\wav_processed\\1089_134686_000002_000001.wav'

# where to store the mfa textgrid if you are using mfa (left empty here; MFA is not used)
output_directory = ''

# In this example edited_text is identical to text: words 2-5 are re-inferred with unchanged wording.
dataset_info = [{
    'item_name': '',  # this should just be used for naming the output file
    'text': 'after early nightfall the yellow lamps would light up , here and there , the squalid quarter of the brothels .',  # a transcription of the original text
    'wav_fn_orig': test_wav_directory,  # location of the .wav file to perform inference on
    'region': '[2,5]',  # the region to edit (counting the words that will be changed starting from 1)
    'mfa_textgrid': '',  # we still need to set this to some value even if we are not using MFA
    'edited_text': 'after early nightfall the yellow lamps would light up , here and there , the squalid quarter of the brothels .',  # the full text to edit to
    'edited_region': '[2,5]',  # word counts in the full edited text of the region to be inferred, starting from one
}]

# performing inference
infer_class_obj.example_run(dataset_info, use_MFA, use_librosa, save_wav_bool, disp_wav, mask_loc_buffer)

print(f'Total Time: {time.time()-total_time_start}')

Silent inferred phonemes predicted, changing word regions!
Mask loc buffer set to 5 frames, but there are only 4 frames of silence after the last edited word. Using silence midpoint instead.
Ground Truth audio:


Inferred audio:


Total Time: 0.7330291271209717


# Some examples with the fine-tuned model

In [31]:
# Hyper-parameters for the speaker-id variants: same experiment, with use_spk_id switched on and num_spk set to 1.
hparams2 = set_hparams(exp_name='spec_denoiser')
hparams2['use_spk_id'] = True
hparams2['num_spk'] = 1

# Naive fine-tuning checkpoint.
infer_class_obj_naive = SpecDenoiserInfer(
    hparams2, binary_data_directory, orig_ckpt_path, whisperX_model_directory,
    fine_tuned='naive', fine_tune_ckpt_path=naive_ckpt_path,
)

# Uses all adaspeech fine tuned params.
infer_class_obj_ada = SpecDenoiserInfer(
    hparams2, binary_data_directory, orig_ckpt_path, whisperX_model_directory,
    fine_tuned='ada', fine_tune_ckpt_path=ada_ckpt_path,
)

# Uses only the spk embedding, speaker id, and encoder parameters from the ada fine-tuning.
infer_class_obj_ada_encoder_spk_emb = SpecDenoiserInfer(
    hparams2, binary_data_directory, orig_ckpt_path, whisperX_model_directory,
    fine_tuned='ada', fine_tune_ckpt_path=ada_ckpt_path, ada_weights='se',
)

# Uses only the spk embedding and encoder parameters from the ada fine-tuning
# (built with the original hparams rather than hparams2, i.e. without the speaker-id settings above).
infer_class_obj_ada_encoder_spk_emb_no_spkid = SpecDenoiserInfer(
    hparams, binary_data_directory, orig_ckpt_path, whisperX_model_directory,
    fine_tuned='ada', fine_tune_ckpt_path=ada_ckpt_path, ada_weights='se',
)


loading fine tuned parameters:
dict_keys(['fs.spk_id_proj.weight'])
LOAD DIFFUSION MODEL TIME: 1.4847264289855957
| load 'model_gen' from 'pretrained/hifigan_hifitts\model_ckpt_steps_2168000.ckpt'.
Build Vocoder Time 2.604448080062866
Vocoder Device cuda
Loaded the voice encoder model on cuda in 0.04 seconds.
WHISPERX LOAD TIME = 5.498392343521118
loading fine tuned parameters:
dict_keys(['fs.encoder.res_blocks.0.blocks.0.0.weight', 'fs.encoder.res_blocks.0.blocks.0.0.bias', 'fs.encoder.res_blocks.0.blocks.1.0.weight', 'fs.encoder.res_blocks.0.blocks.1.0.bias', 'fs.encoder.res_blocks.1.blocks.0.0.weight', 'fs.encoder.res_blocks.1.blocks.0.0.bias', 'fs.encoder.res_blocks.1.blocks.1.0.weight', 'fs.encoder.res_blocks.1.blocks.1.0.bias', 'fs.encoder.res_blocks.2.blocks.0.0.weight', 'fs.encoder.res_blocks.2.blocks.0.0.bias', 'fs.encoder.res_blocks.2.blocks.1.0.weight', 'fs.encoder.res_blocks.2.blocks.1.0.bias', 'fs.encoder.res_blocks.3.blocks.0.0.weight', 'fs.encoder.res_blocks.3.blocks.0.0

In [38]:
total_time_start=time.time()

#during inference we can choose if we want to use MFA for alignment by setting use_MFA=True here. Otherwise whisperX is used for phoneme alignment, which is much faster
use_MFA=False   
#we can also choose if we want to use their method of librosa for spectrograms, or ours of torchaudio
use_librosa=False  
#whether or not to save the file
save_wav_bool=False
#whether or not to print the inferred audio 
disp_wav=True

#this should be a positive integer determining how many mel bins to the left to shift the end of the left part of the masked spectogram and the start of the right part of the masked spectrogram
#each mel bin corresponds to roughly 256/22050 = .0116 seconds of the original audio
#if the buffer is large enough that it overlaps with the previously detected phoneme, the midpoint of the silence before the first word to change is used instead
mask_loc_buffer=5


#one way to perform inference is to create a .csv with columns as demoed on the FluentSpeech github and pass these arguments to data_preprocess. However, if we are not using MFA, we can also just build the output of data_proprocess by hand as below.

#where to put the .lab file with the sentence translated to CMU if performing MFA inference
#It is unclear to me right now if the code still works with this changed, since when using MFA the .wav file for inference also gets copied to 'inference/audio'
test_wav_directory = 'C:\\Users\\bezem\\Documents\\erdos_deep_learning\\Speech-Editing-Toolkit-stable-unedited\\Speech-Editing-Toolkit-stable\\data\\processed\\NarratorBG3\\wav_processed\\bg3narrator_clip0010.wav'

#where to store the mfa textgrid if you are using mfa
output_directory = ''

dataset_info=[{}]
dataset_info[0]['item_name'] = ''  #this should just be used for naming the output file
dataset_info[0]['text'] = 'a single word pervades your consciousness' #a transcription of the original text
dataset_info[0]['wav_fn_orig'] = test_wav_directory #location of the .wav file to perform inference on 
dataset_info[0]['region'] = '[2,4]' #the region to edit (counting the words that will be changed starting from 1)
dataset_info[0]['mfa_textgrid'] = '' #we still need to set this to some value even if we are not using MFA


dataset_info[0]['edited_text'] = 'a little tweety bird enters your consciousness' #the full text to edit to
dataset_info[0]['edited_region'] = '[2,5]' #word counts in the full edited text of the region which is to be inferred starting from one 


# Perform inference with infer_class_obj (printed under the header "Original Model:") on the item
# described by dataset_info, passing speaker_id='bg3narrator'.
# use_MFA, use_librosa, save_wav_bool, disp_wav and total_time_start are defined outside this
# excerpt; mask_loc_buffer and dataset_info are set earlier in this cell.
print('Original Model:')
infer_class_obj.example_run(dataset_info,use_MFA,use_librosa,save_wav_bool,disp_wav,mask_loc_buffer,speaker_id='bg3narrator')

# Report the time elapsed since total_time_start (presumably a time.time() stamp taken at the
# start of this cell; verify where it is assigned).
print(f'Total Time: {time.time()-total_time_start}')

Original Model:
Silent inferred phonemes predicted, changing word regions!
Mask loc buffer set to 5 frames, but there are only 2 frames of silence before the first edited word. Using silence midpoint instead.
Mask loc buffer set to 5 frames, but there are only 2 frames of silence after the last edited word. Using silence midpoint instead.
Ground Truth audio:


Inferred audio:


Total Time: 4.950699329376221


In [39]:
# Run the same dataset_info edit through infer_class_obj_naive (constructed outside this excerpt)
# with the same flags and speaker_id as the original-model run, for comparison.
# The call is the last expression in the cell, so its return value is echoed as the cell output
# (the recorded output shows a nested list holding two float32 arrays).
print('Naive Fine Tuned Model:')
infer_class_obj_naive.example_run(dataset_info,use_MFA,use_librosa,save_wav_bool,disp_wav,mask_loc_buffer,speaker_id='bg3narrator')

Naive Fine Tuned Model:
Silent inferred phonemes predicted, changing word regions!
Mask loc buffer set to 5 frames, but there are only 2 frames of silence before the first edited word. Using silence midpoint instead.
Mask loc buffer set to 5 frames, but there are only 2 frames of silence after the last edited word. Using silence midpoint instead.
Ground Truth audio:


Inferred audio:


[[array([ 0.00089875,  0.00191659, -0.00076168, ..., -0.00182851,
         -0.0040878 ,  0.00438392], dtype=float32),
  array([-0.00296991, -0.00293535, -0.0034539 , ..., -0.00182851,
         -0.0040878 ,  0.00438392], dtype=float32)]]

In [40]:
# Run the same dataset_info edit through infer_class_obj_ada (constructed outside this excerpt)
# with speaker_id='bg3narrator'.
# The call is the last expression in the cell, so its return value is echoed as the cell output
# (the recorded output shows a nested list holding two float32 arrays).
print('Ada Fine Tuned Model:')
infer_class_obj_ada.example_run(dataset_info,use_MFA,use_librosa,save_wav_bool,disp_wav,mask_loc_buffer,speaker_id='bg3narrator')

Ada Fine Tuned Model:
Mask loc buffer set to 5 frames, but there are only 2 frames of silence after the last edited word. Using silence midpoint instead.
Ground Truth audio:


Inferred audio:


[[array([ 0.00089875,  0.00191659, -0.00076168, ..., -0.00182851,
         -0.0040878 ,  0.00438392], dtype=float32),
  array([ 0.0009023 ,  0.00191662, -0.00076694, ..., -0.00182851,
         -0.0040878 ,  0.00438392], dtype=float32)]]

In [41]:
# Run the same dataset_info edit through infer_class_obj_ada_encoder_spk_emb (constructed outside
# this excerpt), this time passing speaker_id='bg3narrator'.
# NOTE(review): the header suggests this variant uses only the encoder and speaker-embedding
# fine-tuned weights together with the speaker id; confirm against where the object is built.
# The call is the last expression in the cell, so its return value is echoed as the cell output.
print('Ada Fine Tuned Model (Only Encoder, Speaker Embeding,Speaker id):')
infer_class_obj_ada_encoder_spk_emb.example_run(dataset_info,use_MFA,use_librosa,save_wav_bool,disp_wav,mask_loc_buffer,speaker_id='bg3narrator')

Ada Fine Tuned Model (Only Encoder, Speaker Embeding,Speaker id):
Silent inferred phonemes predicted, changing word regions!
Mask loc buffer set to 5 frames, but there are only 2 frames of silence before the first edited word. Using silence midpoint instead.
Mask loc buffer set to 5 frames, but there are only 2 frames of silence after the last edited word. Using silence midpoint instead.
Ground Truth audio:


Inferred audio:


[[array([ 0.00089875,  0.00191659, -0.00076168, ..., -0.00182851,
         -0.0040878 ,  0.00438392], dtype=float32),
  array([-0.00433962, -0.00296975, -0.00395054, ..., -0.00182851,
         -0.0040878 ,  0.00438392], dtype=float32)]]

In [43]:
# Run the same dataset_info edit through infer_class_obj_ada_encoder_spk_emb_no_spkid (constructed
# outside this excerpt). No speaker_id argument is passed in this call, so example_run falls back
# to whatever default it defines for it.
# The call is the last expression in the cell, so its return value is echoed as the cell output.
print('Ada Fine Tuned Model (Only Encoder, Speaker Embeding):')
infer_class_obj_ada_encoder_spk_emb_no_spkid.example_run(dataset_info,use_MFA,use_librosa,save_wav_bool,disp_wav,mask_loc_buffer)

Ada Fine Tuned Model (Only Encoder, Speaker Embeding):
Silent inferred phonemes predicted, changing word regions!
Mask loc buffer set to 5 frames, but there are only 2 frames of silence before the first edited word. Using silence midpoint instead.
Mask loc buffer set to 5 frames, but there are only 2 frames of silence after the last edited word. Using silence midpoint instead.
Ground Truth audio:


Inferred audio:


[[array([ 0.00089875,  0.00191659, -0.00076168, ..., -0.00182851,
         -0.0040878 ,  0.00438392], dtype=float32),
  array([-0.00115322,  0.00019837, -0.0030721 , ..., -0.00182851,
         -0.0040878 ,  0.00438392], dtype=float32)]]

In [44]:
# Build another SpecDenoiserInfer from the original checkpoint (orig_ckpt_path), with
# fine_tuned='ada', the fine-tune checkpoint at ada_ckpt_path and ada_weights='pd'.
# NOTE(review): 'pd' presumably selects the pitch/duration fine-tuned weights (cf. the variable
# name and the pitch_predictor keys in the recorded output); confirm in SpecDenoiserInfer.
infer_class_obj_ada_encoder_pitch_dur=SpecDenoiserInfer(hparams,binary_data_directory,orig_ckpt_path,whisperX_model_directory,fine_tuned='ada',fine_tune_ckpt_path=ada_ckpt_path,ada_weights='pd')
# Run the same dataset_info edit through the new object. No header line is printed and no
# speaker_id is passed here. As the cell's last expression, the return value is echoed as output.
infer_class_obj_ada_encoder_pitch_dur.example_run(dataset_info,use_MFA,use_librosa,save_wav_bool,disp_wav,mask_loc_buffer)

loading fine tuned parameters:
dict_keys(['fs.decoder.res_blocks.0.blocks.0.0.weight', 'fs.decoder.res_blocks.0.blocks.0.0.bias', 'fs.decoder.res_blocks.0.blocks.1.0.weight', 'fs.decoder.res_blocks.0.blocks.1.0.bias', 'fs.decoder.res_blocks.1.blocks.0.0.weight', 'fs.decoder.res_blocks.1.blocks.0.0.bias', 'fs.decoder.res_blocks.1.blocks.1.0.weight', 'fs.decoder.res_blocks.1.blocks.1.0.bias', 'fs.decoder.res_blocks.2.blocks.0.0.weight', 'fs.decoder.res_blocks.2.blocks.0.0.bias', 'fs.decoder.res_blocks.2.blocks.1.0.weight', 'fs.decoder.res_blocks.2.blocks.1.0.bias', 'fs.decoder.res_blocks.3.blocks.0.0.weight', 'fs.decoder.res_blocks.3.blocks.0.0.bias', 'fs.decoder.res_blocks.3.blocks.1.0.weight', 'fs.decoder.res_blocks.3.blocks.1.0.bias', 'fs.decoder.last_norm.weight', 'fs.decoder.last_norm.bias', 'fs.pitch_predictor.conv.0.2.weight', 'fs.pitch_predictor.conv.0.2.bias', 'fs.pitch_predictor.conv.1.2.weight', 'fs.pitch_predictor.conv.1.2.bias', 'fs.pitch_predictor.conv.2.2.weight', 'fs.pitc

Inferred audio:


[[array([ 0.00089875,  0.00191659, -0.00076168, ..., -0.00182851,
         -0.0040878 ,  0.00438392], dtype=float32),
  array([-0.00566105, -0.00582234, -0.00633081, ..., -0.00182851,
         -0.0040878 ,  0.00438392], dtype=float32)]]