In [1]:
import torch
from modules.speech_editing.spec_denoiser.spec_denoiser import GaussianDiffusion
from utils.commons.ckpt_utils import load_ckpt
from utils.commons.hparams import set_hparams
from data_gen.tts.base_preprocess import BasePreprocessor
from modules.speech_editing.spec_denoiser.diffnet import DiffNet

  torchaudio.set_audio_backend("soundfile")


In [2]:
binary_data_directory='C:\\Users\\bezem\\Documents\\erdos_deep_learning\\Speech-Editing-Toolkit-stable\\data\\processed\\binary\\libritts'
hparams=set_hparams(exp_name='spec_denoiser')
preprocessor=BasePreprocessor()
ph_encoder,word_encoder=preprocessor.load_dict(binary_data_directory)
DIFF_DECODERS = {
    'wavenet': lambda hp: DiffNet(hp['audio_num_mel_bins']),
}
model = GaussianDiffusion(
            phone_encoder=ph_encoder,
            out_dims=hparams['audio_num_mel_bins'], denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
            timesteps=hparams['timesteps'], time_scale=hparams['timescale'],
            loss_type=hparams['diff_loss_type'],
            spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
        )
load_ckpt(model, hparams['work_dir'], 'model')

| load 'model' from 'checkpoints/spec_denoiser\model_ckpt_steps_568000.ckpt'.


In [3]:
param_names=[param[0] for param in model.named_parameters()] #see if we can tell what parameters corresponds to the Layer norms from their names

In [4]:
#make sure none are hidden from param_names https://discuss.pytorch.org/t/model-named-parameters-will-lose-some-layer-modules/14588/3
for k, v in model.state_dict().items():
    if k not in param_names:
        print(k)
#looks like were good!

timesteps
timescale
betas
alphas_cumprod
alphas_cumprod_prev
sqrt_alphas_cumprod
sqrt_one_minus_alphas_cumprod
log_one_minus_alphas_cumprod
sqrt_recip_alphas_cumprod
sqrt_recipm1_alphas_cumprod
posterior_variance
posterior_log_variance_clipped
posterior_mean_coef1
posterior_mean_coef2
spec_min
spec_max


In [5]:
for param in param_names:
    print(param)

denoise_fn.input_projection.weight
denoise_fn.input_projection.bias
denoise_fn.mlp.0.weight
denoise_fn.mlp.0.bias
denoise_fn.mlp.2.weight
denoise_fn.mlp.2.bias
denoise_fn.residual_layers.0.dilated_conv.weight
denoise_fn.residual_layers.0.dilated_conv.bias
denoise_fn.residual_layers.0.diffusion_projection.weight
denoise_fn.residual_layers.0.diffusion_projection.bias
denoise_fn.residual_layers.0.conditioner_projection.weight
denoise_fn.residual_layers.0.conditioner_projection.bias
denoise_fn.residual_layers.0.output_projection.weight
denoise_fn.residual_layers.0.output_projection.bias
denoise_fn.residual_layers.1.dilated_conv.weight
denoise_fn.residual_layers.1.dilated_conv.bias
denoise_fn.residual_layers.1.diffusion_projection.weight
denoise_fn.residual_layers.1.diffusion_projection.bias
denoise_fn.residual_layers.1.conditioner_projection.weight
denoise_fn.residual_layers.1.conditioner_projection.bias
denoise_fn.residual_layers.1.output_projection.weight
denoise_fn.residual_layers.1.out

Considering, e.g. the pitch_predictor, we have:
```
def __init__(self, idim, n_layers=5, n_chans=384, odim=2, kernel_size=5, dropout_rate=0.1):
        super(PitchPredictor, self).__init__()
        self.conv = torch.nn.ModuleList()
        self.kernel_size = kernel_size
        for idx in range(n_layers):
            in_chans = idim if idx == 0 else n_chans
            self.conv += [torch.nn.Sequential(
                torch.nn.Conv1d(in_chans, n_chans, kernel_size, padding=kernel_size // 2),
                torch.nn.ReLU(),
                LayerNorm(n_chans, dim=1),
                torch.nn.Dropout(dropout_rate)
            )]
        self.linear = torch.nn.Linear(n_chans, odim)
```
So it makes sense to surmise that, in the list:

fs.pitch_predictor.conv.0.0.weight

fs.pitch_predictor.conv.0.0.bias

fs.pitch_predictor.conv.0.2.weight

fs.pitch_predictor.conv.0.2.bias

0.0 is indicating the weights and bias for the Conv1d layer and 0.2 for the LayerNorm layer.

To confirm this:

In [6]:
for param in model.named_parameters():
    if param[0]=='fs.pitch_predictor.conv.0.0.weight':
        print(param[1].shape)
    if param[0]=='fs.pitch_predictor.conv.0.0.bias':
        print(param[1].shape)
    if param[0]=='fs.pitch_predictor.conv.0.2.weight':
        print(param[1].shape)
    if param[0]=='fs.pitch_predictor.conv.0.2.bias':
        print(param[1].shape)


torch.Size([256, 256, 5])
torch.Size([256])
torch.Size([256])
torch.Size([256])


From the conv1d documentation:

Attributes: weight (Tensor): the learnable weights of the module of shape
            $(\text{out\_channels},
            \frac{\text{in\_channels}}{\text{groups}}, \text{kernel\_size})$.
            The values of these weights are sampled from
            $\mathcal{U}(-\sqrt{k}, \sqrt{k})$ where
            $k = \frac{groups}{C_\text{in} * \text{kernel\_size}}$

bias (Tensor):   the learnable bias of the module of shape
            (out_channels). If :attr:`bias` is ``True``, then the values of these weights are
            sampled from $\mathcal{U}(-\sqrt{k}, \sqrt{k})$ where
            $k = \frac{groups}{C_\text{in} * \text{kernel\_size}}$

From the LayerNorm documentation: 

Attributes: weight: the learnable weights of the module of shape
            $\text{normalized\_shape}$ when :attr:`elementwise_affine` is set to ``True``.
            The values are initialized to 1.
            
bias:   the learnable bias of the module of shape
                $\text{normalized\_shape}$ when :attr:`elementwise_affine` is set to ``True``.
                The values are initialized to 0.

So, this indeed seems to be their naming convention.

Based on the AdaSpeech paper https://arxiv.org/pdf/2103.00993 (in particular section 2.2) and its implementation https://github.com/tuanh123789/AdaSpeech/tree/main  (see also figure 2 and section 2.3 of https://arxiv.org/pdf/2211.00585) it may be sufficient to fine-tune simply by retraining any LayerNorm parameters. AdaSpeech suggests fine-tuning these as a linear function of the speaker embedding, but if we are just fine-tuning on a single speaker, I don't see why we should make this restriction. We may also want to finetune the final linear output layers? And we should almost certainly fine tune the single linear layer spk_embed_proj which makes the utterance-level speaker projection.

AdaSpeech uses FastSpeech2 as the model backbone. Our model also uses fastspeech2 as the model backbone for acoustic conditioning modeling. So here we share the same Transformer-based archicture. A difference, however, is that AdaSpeech also uses the mel-spectrogram decoder of fastspeech 2, which is composed of stacked feed-forward Transformer layers and thus has additional layer-norms which they fine tune. We instead have a Context-Aware Spectrogram Denoiser which, following DiffWave 2021, uses a non-causal WaveNet, whose mel spectrogram decoder consists of a 1x1 convolution layer and N convolution blocks with residual connections. If we used the Auxiliary Decoder + boundary predictor in addition to the spectrogram denoiser as in DiffSinger2022, we would also have these additional feed-forward Transformer layers and hence layer-norms to fine tune. In addition, we do not have the more fine-grained ''phoneme level acoustic conditioning modeling'' added to the adaspeech architecture, only utterance-level and speaker-level. Nonetheless, it is interesting to see if adaspeech's proposed fine tuning works if we are only fine tuning the acoustic conditioning modeling and not the spectrogram decoder (in adaspeech's ablation study they only consider with/without CLN and dont break it into components). Adding these other elements is an interesting avenue for future work. In particular, using fast speech's spectrogram decoder should only add in 4 more layer norms to fine tune, but changing the architecture of the model would involve training from scratch. We can also add back in FastSpeech2's energy predictor, which was removed in this model for some reason.

It also probably makes sense to compare with the very simple "fine tuning" suggested by the code where one just adds a fixed value to the speaker projection for a given speaker-id. I'm guessing this isn't very effective, which is probably why they don't discuss its implementation anywhere.


In [7]:
for module in model.children():
    print(module)

DiffNet(
  (input_projection): Conv1d(80, 256, kernel_size=(1,), stride=(1,))
  (diffusion_embedding): SinusoidalPosEmb()
  (mlp): Sequential(
    (0): Linear(in_features=256, out_features=1024, bias=True)
    (1): Mish()
    (2): Linear(in_features=1024, out_features=256, bias=True)
  )
  (residual_layers): ModuleList(
    (0-19): 20 x ResidualBlock(
      (dilated_conv): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(1,))
      (diffusion_projection): Linear(in_features=256, out_features=256, bias=True)
      (conditioner_projection): Conv1d(256, 512, kernel_size=(1,), stride=(1,))
      (output_projection): Conv1d(256, 512, kernel_size=(1,), stride=(1,))
    )
  )
  (skip_projection): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
  (output_projection): Conv1d(256, 80, kernel_size=(1,), stride=(1,))
)
FastSpeech(
  (encoder): TextConvEncoder(
    (res_blocks): Sequential(
      (0): ResidualBlock(
        (blocks): ModuleList(
          (0-1): 2 x Sequential(
            

It looks like there are layer norms in the fastspeech encoder and decoder, the DurationPredictor, and the PitchPredictor. We can identify which of the weights correspond to each LayerNorm weight and bias and fine tune them (as well as the linear speaker embeding projection layer). We can probably use the existing training code, although it may be specific to the datasets they use, so we need to figure out how to format our new speaker dataset in the correct way.

It may also be worth learning a bit more about what each of the 4 components which contain layer norms are actually doing,


Encoder: (4 blocks, layer norm on each twice plus final layer norm = 9 layer norms, 18 parameters, size 256 each)
fs.encoder.res_blocks.0.blocks.0.0.weight,
fs.encoder.res_blocks.0.blocks.0.0.bias,
fs.encoder.res_blocks.0.blocks.1.0.weight,
fs.encoder.res_blocks.0.blocks.1.0.bias,
fs.encoder.res_blocks.1.blocks.0.0.weight,
fs.encoder.res_blocks.1.blocks.0.0.bias,
fs.encoder.res_blocks.1.blocks.1.0.weight,
fs.encoder.res_blocks.1.blocks.1.0.bias,
fs.encoder.res_blocks.2.blocks.0.0.weight,
fs.encoder.res_blocks.2.blocks.0.0.bias,
fs.encoder.res_blocks.2.blocks.1.0.weight,
fs.encoder.res_blocks.2.blocks.1.0.bias,
fs.encoder.res_blocks.3.blocks.0.0.weight,
fs.encoder.res_blocks.3.blocks.0.0.bias,
fs.encoder.res_blocks.3.blocks.1.0.weight,
fs.encoder.res_blocks.3.blocks.1.0.bias,
fs.encoder.last_norm.weight,
fs.encoder.last_norm.bias

Decoder: (4 blocks, layer norm on each twice plus final layer norm = 9 layer norms, 18 parameters, size 256 each)
fs.decoder.res_blocks.0.blocks.0.0.weight,
fs.decoder.res_blocks.0.blocks.0.0.bias,
fs.decoder.res_blocks.0.blocks.1.0.weight,
fs.decoder.res_blocks.0.blocks.1.0.bias,
fs.decoder.res_blocks.1.blocks.0.0.weight,
fs.decoder.res_blocks.1.blocks.0.0.bias,
fs.decoder.res_blocks.1.blocks.1.0.weight,
fs.decoder.res_blocks.1.blocks.1.0.bias,
fs.decoder.res_blocks.2.blocks.0.0.weight,
fs.decoder.res_blocks.2.blocks.0.0.bias,
fs.decoder.res_blocks.2.blocks.1.0.weight,
fs.decoder.res_blocks.2.blocks.1.0.bias,
fs.decoder.res_blocks.3.blocks.0.0.weight,
fs.decoder.res_blocks.3.blocks.0.0.bias,
fs.decoder.res_blocks.3.blocks.1.0.weight,
fs.decoder.res_blocks.3.blocks.1.0.bias,
fs.decoder.last_norm.weight,
fs.decoder.last_norm.bias

Duration Predictor:(3 layer norms, 6 parameters size 256 each)
fs.dur_predictor.conv.0.2.weight,
fs.dur_predictor.conv.0.2.bias,
fs.dur_predictor.conv.1.2.weight,
fs.dur_predictor.conv.1.2.bias,
fs.dur_predictor.conv.2.2.weight,
fs.dur_predictor.conv.2.2.bias

Pitch Predictor:(5 layer norms, 10 parameters size 256 each)
fs.pitch_predictor.conv.0.2.weight,
fs.pitch_predictor.conv.0.2.bias,
fs.pitch_predictor.conv.1.2.weight,
fs.pitch_predictor.conv.1.2.bias,
fs.pitch_predictor.conv.2.2.weight,
fs.pitch_predictor.conv.2.2.bias,
fs.pitch_predictor.conv.3.2.weight,
fs.pitch_predictor.conv.3.2.bias,
fs.pitch_predictor.conv.4.2.weight,
fs.pitch_predictor.conv.4.2.bias


Speaker embedding weight and bias (size 256 by 256 and 256 respectively):
fs.spk_embed_proj.weight,
fs.spk_embed_proj.bias

In [8]:
for param in model.named_parameters():
    if param[0]=='fs.spk_embed_proj.weight':
        print(param[1].shape)
    if param[0]=='fs.spk_embed_proj.bias':
        print(param[1].shape)


torch.Size([256, 256])
torch.Size([256])


In [6]:
encoder_layer_norms = [
    "fs.encoder.res_blocks.0.blocks.0.0.weight",
    "fs.encoder.res_blocks.0.blocks.0.0.bias",
    "fs.encoder.res_blocks.0.blocks.1.0.weight",
    "fs.encoder.res_blocks.0.blocks.1.0.bias",
    "fs.encoder.res_blocks.1.blocks.0.0.weight",
    "fs.encoder.res_blocks.1.blocks.0.0.bias",
    "fs.encoder.res_blocks.1.blocks.1.0.weight",
    "fs.encoder.res_blocks.1.blocks.1.0.bias",
    "fs.encoder.res_blocks.2.blocks.0.0.weight",
    "fs.encoder.res_blocks.2.blocks.0.0.bias",
    "fs.encoder.res_blocks.2.blocks.1.0.weight",
    "fs.encoder.res_blocks.2.blocks.1.0.bias",
    "fs.encoder.res_blocks.3.blocks.0.0.weight",
    "fs.encoder.res_blocks.3.blocks.0.0.bias",
    "fs.encoder.res_blocks.3.blocks.1.0.weight",
    "fs.encoder.res_blocks.3.blocks.1.0.bias",
    "fs.encoder.last_norm.weight",
    "fs.encoder.last_norm.bias"
]

decoder_layer_norms = [
    "fs.decoder.res_blocks.0.blocks.0.0.weight",
    "fs.decoder.res_blocks.0.blocks.0.0.bias",
    "fs.decoder.res_blocks.0.blocks.1.0.weight",
    "fs.decoder.res_blocks.0.blocks.1.0.bias",
    "fs.decoder.res_blocks.1.blocks.0.0.weight",
    "fs.decoder.res_blocks.1.blocks.0.0.bias",
    "fs.decoder.res_blocks.1.blocks.1.0.weight",
    "fs.decoder.res_blocks.1.blocks.1.0.bias",
    "fs.decoder.res_blocks.2.blocks.0.0.weight",
    "fs.decoder.res_blocks.2.blocks.0.0.bias",
    "fs.decoder.res_blocks.2.blocks.1.0.weight",
    "fs.decoder.res_blocks.2.blocks.1.0.bias",
    "fs.decoder.res_blocks.3.blocks.0.0.weight",
    "fs.decoder.res_blocks.3.blocks.0.0.bias",
    "fs.decoder.res_blocks.3.blocks.1.0.weight",
    "fs.decoder.res_blocks.3.blocks.1.0.bias",
    "fs.decoder.last_norm.weight",
    "fs.decoder.last_norm.bias"
]

dur_predictor_layer_norms = [
    "fs.dur_predictor.conv.0.2.weight",
    "fs.dur_predictor.conv.0.2.bias",
    "fs.dur_predictor.conv.1.2.weight",
    "fs.dur_predictor.conv.1.2.bias",
    "fs.dur_predictor.conv.2.2.weight",
    "fs.dur_predictor.conv.2.2.bias"
]

pitch_predictor_layer_norms = [
    "fs.pitch_predictor.conv.0.2.weight",
    "fs.pitch_predictor.conv.0.2.bias",
    "fs.pitch_predictor.conv.1.2.weight",
    "fs.pitch_predictor.conv.1.2.bias",
    "fs.pitch_predictor.conv.2.2.weight",
    "fs.pitch_predictor.conv.2.2.bias",
    "fs.pitch_predictor.conv.3.2.weight",
    "fs.pitch_predictor.conv.3.2.bias",
    "fs.pitch_predictor.conv.4.2.weight",
    "fs.pitch_predictor.conv.4.2.bias"
]

speaker_embedding_weights = [
    "fs.spk_embed_proj.weight",
    "fs.spk_embed_proj.bias"
]

all_weights_to_fine_tune_names=encoder_layer_norms+decoder_layer_norms+dur_predictor_layer_norms+pitch_predictor_layer_norms+speaker_embedding_weights

In [10]:
all_weights_to_fine_tune_weights=[]
for param in model.named_parameters():
    if param[0] in all_weights_to_fine_tune_names:
        print(param[0])
        print(param[1].size())
        all_weights_to_fine_tune_weights.append(param[1])
print(len(all_weights_to_fine_tune_weights))
print(18+18+6+10+2)

fs.encoder.res_blocks.0.blocks.0.0.weight
torch.Size([256])
fs.encoder.res_blocks.0.blocks.0.0.bias
torch.Size([256])
fs.encoder.res_blocks.0.blocks.1.0.weight
torch.Size([256])
fs.encoder.res_blocks.0.blocks.1.0.bias
torch.Size([256])
fs.encoder.res_blocks.1.blocks.0.0.weight
torch.Size([256])
fs.encoder.res_blocks.1.blocks.0.0.bias
torch.Size([256])
fs.encoder.res_blocks.1.blocks.1.0.weight
torch.Size([256])
fs.encoder.res_blocks.1.blocks.1.0.bias
torch.Size([256])
fs.encoder.res_blocks.2.blocks.0.0.weight
torch.Size([256])
fs.encoder.res_blocks.2.blocks.0.0.bias
torch.Size([256])
fs.encoder.res_blocks.2.blocks.1.0.weight
torch.Size([256])
fs.encoder.res_blocks.2.blocks.1.0.bias
torch.Size([256])
fs.encoder.res_blocks.3.blocks.0.0.weight
torch.Size([256])
fs.encoder.res_blocks.3.blocks.0.0.bias
torch.Size([256])
fs.encoder.res_blocks.3.blocks.1.0.weight
torch.Size([256])
fs.encoder.res_blocks.3.blocks.1.0.bias
torch.Size([256])
fs.encoder.last_norm.weight
torch.Size([256])
fs.encoder

We need to fine tune $256*53+256*256=79104$ paramaters. There are 31560867 in the model, so we are only tuning about .25% of them. Even knowing we are not tuning the denoiser, which has 15086416 paramaters, we are only tuning .48% of the remaining parameters in the model.

In [11]:
total_num_params=0
for param in model.named_parameters():
    size_this_param=1
    for dim_size in param[1].size():
        size_this_param=size_this_param*dim_size
    total_num_params+=size_this_param
print(total_num_params)

31560867


In [12]:
total_num_diff_params=0
for param in model.named_parameters():
    if param[0].startswith('denoise_fn'):
        size_this_param=1
        for dim_size in param[1].size():
            size_this_param=size_this_param*dim_size
        total_num_diff_params+=size_this_param
print(total_num_diff_params)

15086416


The entire pipeline of the model goes as follows: 

wav, transcript, edited text, and edited word regions as input

wav converted to mel spectrogram 

text and edited text is converted to CMU phonemes via vanilla g2p_en

phoneme alignment on original audio performed with mfa or whisperx - we improved how short silences are handled. input is just wav. produces mel2ph

f0 (fundamental frequency) over time and uv (unvoiced) markers over time on original audio determined via vanilla parselmouth. input is just wav
utterance-level speaker embedding is determined via vanilla resemblyzer. input is just wav

the edited phonemes (alone!) are passed to FastSpeechEncoder, each phoneme is assigned a 256 dimensional vector. Somehow must relate the phonemes to one another? The result is encoder_out

The utterance-level speaker embedding (256) from resemblyzer (alone!) is sent to spk_embed_proj, which is a linear layer producing another 256-dimensional vector. If a speaker_id is used, a single fixed 256 dim vector is added to this. The result is style_embed

dur_inp = style_embed+encoder_out. This and a masked mel2ph are sent to the forward duration predictor. the masking is determined by the edited text and edited word regions

edited_mel2ph, edited_f0, edited_uv, and ref_mels are computed from the output of the duration predictor and from the edited word regions. We improve this computation using the new hyperparameter "mask_loc_buffer". edited_mel2ph is from sticking the predicted mel2ph from the duration predicter in between the two mel2ph ends. edited_f0, edited_uv, and ref_mels are just the original f0, uv, and mel with zeros put in the unknown region which will be edited

edited_f0,edited_uv,style_embed, edited_mel2ph, and encoder_out are used sent to the pitch prediction model, and the result with encoder_out and edited_mel2ph is input into the decoder and then the linear "mel out" layer to determine decoder_inp. So for each mel bin, we have an associated f0,phoneme, and decoder output associated to that phoneme.

decoder_inp is summed with the result of passing the ref_mels (alone) to the MelEncoder to get cond

cond is passed to the spectrogram denoiser to get the inferred spectrogram for the edited region, called output

Output is concatinated with the original mel spectrogram to get the full spectogram

the result is passed to a vocoder to produce the resulting wav.








The losses are: The sample reconstruction loss (mean absolute error between true and inferred spectrogram), the structural similarity index (SSIM) loss arXiv:2202.13066 (another measure of similarity between true and inferred spectrogram), and reconstruction loss for pitch and duration predictor, which are just the l2 lost between the true and inferred phoneme duration and pitch over time. For the former the ground truth is as determined by MFA and for the latter by parselmouth. To train, they randomly mask 80% of the phoenemes in a given utterance and try to reconstruct them. It is unclear if any of the libritts dataset is set aside for validation in the paper, or if the entire thing is used for training. It seems looking at spec_denoiser.yaml that for validation maybe only 30% of phonemes are randomly masked, though, since infer_mask_ratio: 0.30.



From the following, we see that changing the hparam use_spk_id to true and trying to load are existing model, ''fs.spk_id_proj.weight'' is missing from the state dict

In [4]:
hparams_spk_id=set_hparams(exp_name='spec_denoiser_spk_id')
preprocessor_spk_id=BasePreprocessor()
ph_encoder_spk_id,word_encoder_spk_id=preprocessor_spk_id.load_dict(binary_data_directory)
model_spk_id = GaussianDiffusion(
            phone_encoder=ph_encoder_spk_id,
            out_dims=hparams['audio_num_mel_bins'], denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
            timesteps=hparams['timesteps'], time_scale=hparams['timescale'],
            loss_type=hparams['diff_loss_type'],
            spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
        )
load_ckpt(model_spk_id, hparams_spk_id['work_dir'], 'model', force=False, strict=False)
param_names_spk_id=[param[0] for param in model_spk_id.named_parameters()]

| ckpt not found in checkpoints/spec_denoiser_spk_id.


In [18]:
print(len(param_names))
print(len(param_names_spk_id))
new_params=[param for param in param_names_spk_id if param not in param_names]
print(new_params)

323
324
['fs.spk_id_proj.weight']


For fine tuning, we use the BG3Narrator dataset that Francesca put together. It consists of 2874 utterances and one speaker. 574 utterances are set aside for validation. The baseline is the model (with our adjustments) as-is. The validation losses are:

From naive fine tune initial validation results:
{'total_loss': 2.0832, 'l1_coarse': 0.2393, 'ssim_coarse': 0.2545, 'pdur': 0.1128, 'wdur': 0.1465, 'uv': 1.1229, 'f0': 0.2072}

From adafinetune initial validation results:
{'total_loss': 2.3501, 'l1_coarse': 0.3107, 'ssim_coarse': 0.3218, 'pdur': 0.0941, 'wdur': 0.102, 'uv': 1.3279, 'f0': 0.1934}

The ''best'' inferences can be found as events.out.tfevents.1723669676.DESKTOP-CRD2FGJ.21240.0

For fine-tuning, we add the hparam --naive_fine_tune.

This assumes you have a model already trained with use_spk_id: false, so that fs.spk_id_proj.weight is missing from the state dict. It then initializes the paramaters from that model, adds fs.spk_id_proj.weight to the state dict, and only trains those weights (using their same training method). 

modify config.yaml of the original model to use_spk_id: true and num_spk to the number of speakers you are fine-tuning on. Also modify the data directory hparams accordingly, as well as val_check_interval and valid_infer_interval to be how often you want to validate and save the model parameters and max_updates to adjust how long to fine-tune for. Then run 

python tasks/run.py --config checkpoints/spec_denoiser/config.yaml --exp_name spec_denoiser --naive_fine_tune



In [11]:
import numpy as np
ph_lengths = np.load('C:\\Users\\bezem\\Documents\\erdos_deep_learning\\Speech-Editing-Toolkit-stable-unedited\\Speech-Editing-Toolkit-stable\\data\\binary\\NarratorBG3\\train_ph_lengths.npy')
print('Total number of utterances:')
print(len(ph_lengths))
print('Mean number of phonemes per utterance (including silent):')
print(np.mean(ph_lengths))
print('Max number of ph in an utterance:')
print(np.max(ph_lengths))
print('Min number of ph in an utterance:')
print(np.min(ph_lengths))
val_ph_lengths = np.load('C:\\Users\\bezem\\Documents\\erdos_deep_learning\\Speech-Editing-Toolkit-stable-unedited\\Speech-Editing-Toolkit-stable\\data\\binary\\NarratorBG3\\valid_ph_lengths.npy')
print('Total number of utterances:')
print(len(val_ph_lengths))
print('Mean number of phonemes per utterance (including silent):')
print(np.mean(val_ph_lengths))
print('Max number of ph in an utterance:')
print(np.max(val_ph_lengths))
print('Min number of ph in an utterance:')
print(np.min(val_ph_lengths))

Total number of utterances:
2178
Mean number of phonemes per utterance (including silent):
51.16988062442608
Max number of ph in an utterance:
230
Min number of ph in an utterance:
4
Total number of utterances:
542
Mean number of phonemes per utterance (including silent):
54.798892988929886
Max number of ph in an utterance:
210
Min number of ph in an utterance:
4


In [12]:

import torchaudio
import os
audio_files=[os.path.join(dirpath,f) for (dirpath, dirnames, filenames) in os.walk('C:\\Users\\bezem\\Documents\\erdos_deep_learning\\Speech-Editing-Toolkit-stable-unedited\Speech-Editing-Toolkit-stable\\data\\processed\\NarratorBG3\\wav_processed') for f in filenames] 
audio_lengths=[]
for file in audio_files:
    wav,rate=torchaudio.load(file)
    audio_lengths.append(wav.shape[1]/rate)

In [13]:
print('Mean utterance length in seconds:')
print(np.mean(audio_lengths))
print('Max utterance length:')
print(np.max(audio_lengths))
print('Min utterance length:')
print(np.min(audio_lengths))

Mean utterance length in seconds:
4.255508420951308
Max utterance length:
20.57687074829932
Min utterance length:
0.2849886621315193


In [14]:
optimizer=torch.optim.AdamW(
            model_spk_id.parameters(),
            lr=hparams['lr'],
            betas=(hparams['optimizer_adam_beta1'], hparams['optimizer_adam_beta2']),
            weight_decay=hparams['weight_decay'])

In [15]:
for group in optimizer.param_groups:
    for param in group:
        if param[0] =='fs.spk_id_proj.weight':
            print('Found!')
#cannot find param from optimizer groups, names get lost

In [16]:
for param in model_spk_id.named_parameters():
    if param[0] =='fs.spk_id_proj.weight':
            print('Found!')

Found!


In [13]:
#a sanity check to make sure no other weights are being messed with 
model_spk_id_trained = GaussianDiffusion(
            phone_encoder=ph_encoder_spk_id,
            out_dims=hparams['audio_num_mel_bins'], denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
            timesteps=hparams['timesteps'], time_scale=hparams['timescale'],
            loss_type=hparams['diff_loss_type'],
            spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
        )
load_ckpt(model_spk_id_trained, 'C:\\Users\\bezem\\Documents\\erdos_deep_learning\\Speech-Editing-Toolkit-stable-unedited\\Speech-Editing-Toolkit-stable\\checkpoints\\spec_denoiser_naive_fine_tuning', 'model')

| load 'model' from 'C:\Users\bezem\Documents\erdos_deep_learning\Speech-Editing-Toolkit-stable-unedited\Speech-Editing-Toolkit-stable\checkpoints\spec_denoiser_naive_fine_tuning\model_ckpt_steps_572000.ckpt'.


In [15]:
num_problem_params=0
for param in model_spk_id_trained.named_parameters():
    matching_params=[parameter for parameter in model.named_parameters() if param[0]==parameter[0]]
    if len(matching_params)>1:
        print('problem!')
    elif len(matching_params)==1:
        if any(value not in param[1] for value in matching_params[0][1]):
            num_problem_params+=1
            print('Problem!')
            print(param[0])
            print(matching_params[0][0])
            print('Original:')
            print(matching_params[0][1])
            print('Changed:')
            print(param[1])
    else:
        print(param[0])


fs.spk_id_proj.weight


In [5]:
%tensorboard --logdir 

UsageError: Line magic function `%` not found.


We also add the hparam --single_speaker_ada_fine_tune, which makes the same assumptions but also tunes all of the LayerNorm weights and biases identified above, as well as the speaker embedding weights and bias.

modify config.yaml of the original model to use_spk_id: true (if you want to include a speaker-id paramater) and num_spk to the number of speakers you are fine-tuning on. Also modify the data directory hparams accordingly, as well as val_check_interval and valid_infer_interval to be how often you want to validate and save the model parameters and max_updates to adjust how long to fine-tune for. Then run 

python tasks/run.py --config checkpoints/spec_denoiser/config.yaml --exp_name spec_denoiser --single_speaker_ada_fine_tune

Right now for both fine-tuning methods we save the whole model rather than just the changed weights for sanity checks - later can change to save only the changed weights

In [5]:
#a sanity check to make sure no other weights are being messed with 
model_spk_id_ada_trained = GaussianDiffusion(
            phone_encoder=ph_encoder,
            out_dims=hparams['audio_num_mel_bins'], denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
            timesteps=hparams['timesteps'], time_scale=hparams['timescale'],
            loss_type=hparams['diff_loss_type'],
            spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
        )
load_ckpt(model_spk_id_ada_trained, 'C:\\Users\\bezem\\Documents\\erdos_deep_learning\\Speech-Editing-Toolkit-stable-unedited\\Speech-Editing-Toolkit-stable\\checkpoints\\spec_denoiser_ada_fine_tuning', 'model',strict=False)

| load 'model' from 'C:\Users\bezem\Documents\erdos_deep_learning\Speech-Editing-Toolkit-stable-unedited\Speech-Editing-Toolkit-stable\checkpoints\spec_denoiser_ada_fine_tuning\model_ckpt_steps_568250.ckpt'.


In [8]:
num_problem_params=0
for param in model_spk_id_ada_trained.named_parameters():
    matching_params=[parameter for parameter in model.named_parameters() if param[0]==parameter[0]]
    if len(matching_params)>1:
        print('problem!')
    elif len(matching_params)==1:
        if any(value not in param[1] for value in matching_params[0][1]) and (param[0] not in all_weights_to_fine_tune_names):
            num_problem_params+=1
            print('Problem!')
            print(param[0])
            print(matching_params[0][0])
            print('Original:')
            print(matching_params[0][1])
            print('Changed:')
            print(param[1])
    else:
        print(param[0])

In [11]:
for param in model_spk_id_ada_trained.named_parameters():
    print(param[0])


denoise_fn.input_projection.weight
denoise_fn.input_projection.bias
denoise_fn.mlp.0.weight
denoise_fn.mlp.0.bias
denoise_fn.mlp.2.weight
denoise_fn.mlp.2.bias
denoise_fn.residual_layers.0.dilated_conv.weight
denoise_fn.residual_layers.0.dilated_conv.bias
denoise_fn.residual_layers.0.diffusion_projection.weight
denoise_fn.residual_layers.0.diffusion_projection.bias
denoise_fn.residual_layers.0.conditioner_projection.weight
denoise_fn.residual_layers.0.conditioner_projection.bias
denoise_fn.residual_layers.0.output_projection.weight
denoise_fn.residual_layers.0.output_projection.bias
denoise_fn.residual_layers.1.dilated_conv.weight
denoise_fn.residual_layers.1.dilated_conv.bias
denoise_fn.residual_layers.1.diffusion_projection.weight
denoise_fn.residual_layers.1.diffusion_projection.bias
denoise_fn.residual_layers.1.conditioner_projection.weight
denoise_fn.residual_layers.1.conditioner_projection.bias
denoise_fn.residual_layers.1.output_projection.weight
denoise_fn.residual_layers.1.out

In [12]:
#testing how to load in only some of the weights
model_spk_id_ada_trained = GaussianDiffusion(
            phone_encoder=ph_encoder,
            out_dims=hparams['audio_num_mel_bins'], denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
            timesteps=hparams['timesteps'], time_scale=hparams['timescale'],
            loss_type=hparams['diff_loss_type'],
            spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
        )
from utils.commons.ckpt_utils import get_last_checkpoint, get_all_ckpts
checkpoint, name = get_last_checkpoint('C:\\Users\\bezem\\Documents\\erdos_deep_learning\\Speech-Editing-Toolkit-stable-unedited\\Speech-Editing-Toolkit-stable\\checkpoints\\spec_denoiser_ada_fine_tuning', None)
checkpoints, name = get_last_checkpoint('C:\\Users\\bezem\\Documents\\erdos_deep_learning\\Speech-Editing-Toolkit-stable-unedited\\Speech-Editing-Toolkit-stable\\checkpoints\\spec_denoiser_ada_fine_tuning', None)

We also perform an abalation study to see which of these is contributing the most to the low validation loss for the adaspeech style fine tuning. See help for the new  fine_tune_weight_sel on how to recreate this. Possible combinations are:

Without spk_id:

e 

l 

p

s

el

ep

es

lp

ls

ps

elp

els

eps

lps

elps 

with spk_id:
e

l

p

s

el

ep

es

lp

ls

ps

elp

els

eps

lps

elps