In [1]:
import numpy as np, glob, ujson
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Plot defaults applied to every figure created below.
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Automatically reload imported modules before each cell runs, so edits to
# external source files are picked up without restarting the kernel.
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2   

# (marked as "next cell" in the original) reload the autoreload extension
%reload_ext autoreload

In [2]:
import torch
from torch import optim
from wavenet_vocoder.util import is_mulaw_quantize, is_scalar_input
from wavenet_vocoder import WaveNet
from wavenet_vocoder.data_loader import get_data_loader
from wavenet_vocoder.train import train, train_one_step, MeanMetric
# Record the PyTorch version this notebook was executed with.
print(torch.__version__)

1.3.1


In [3]:
from wavenet_vocoder.tfcompat.hparam import HParams

# Hyperparameters for the WaveNet vocoder: audio front-end, model architecture,
# data loading, optimisation and checkpointing settings.
hparams = HParams(
    name="wavenet_vocoder",

    # Input type:
    # 1. raw [-1, 1]
    # 2. mulaw [-1, 1]
    # 3. mulaw-quantize [0, mu]
    # If input_type is raw or mulaw, the network assumes scalar input and a
    # discretized mixture of logistic distributions as output; otherwise
    # one-hot input and softmax output are assumed.
    # **NOTE**: if you change either of the two parameters below, you need to
    # re-run preprocessing before training.
    input_type="mulaw-quantize",
    quantize_channels=256,  # 65536 or 256

    # Audio:
    # time-domain pre/post-processing
    # e.g., preemphasis/inv_preemphasis
    # ref: LPCNet https://arxiv.org/abs/1810.11846
    preprocess="",
    postprocess="",
    # waveform domain scaling
    global_gain_scale=1.0,

    sample_rate=16000,
    # only applies when mu-law is used
    silence_threshold=2,
    num_mels=80,
    fmin=125,
    fmax=7600,
    fft_size=1024,
    # the frame shift can be specified by either hop_size or frame_shift_ms
    hop_size=256,
    frame_shift_ms=None,
    win_length=1024,
    win_length_ms=-1.0,
    window="hann",

    # DC removal
    highpass_cutoff=70.0,

    # Parametric output distribution type for scalar input:
    # 1) Logistic or 2) Normal
    output_distribution="Logistic",
    log_scale_min=-16.0,

    # Model:
    # out_channels should equal `quantize_channels` if mu-law quantization is
    # enabled, otherwise num_mixture * 3 (pi, mean, log_scale);
    # single mixture case: 2
    # e.g. out_channels=10 * 3 for a 10-component mixture (scalar input)
    out_channels=256,
    layers=18,
    stacks=2,
    residual_channels=128,
    gate_channels=256,  # split into 2 groups internally for gated activation
    skip_out_channels=128,
    dropout=0.0,
    kernel_size=3,

    # Local conditioning (set a negative value to disable)
    cin_channels=80,
    cin_pad=0,
    # If True, use transposed convolutions to upsample conditional features,
    # otherwise repeat features to adjust time resolution
    upsample_conditional_features=True,
    upsample_net="ConvInUpsampleNetwork",
    upsample_params={
        "upsample_scales": [4, 4, 4, 4],  # np.prod(upsample_scales) must equal hop_size
    },

    # Global conditioning (set a negative value to disable)
    # currently limited to speaker embedding
    # this should only be enabled for multi-speaker datasets
    gin_channels=-1,  # i.e., speaker embedding dim
    n_speakers=7,  # 7 for CMU ARCTIC

    # Data loader
    pin_memory=True,
    num_workers=1,

    # Loss

    # Training:
    batch_size=8,
    optimizer="Adam",
    optimizer_params={
        "lr": 1e-3,
        "eps": 1e-8,
        "weight_decay": 0.0,
    },

    # see lrschedule.py for the available lr_schedule options
    lr_schedule="step_learning_rate_decay",
    lr_schedule_kwargs={"anneal_rate": 0.5, "anneal_interval": 200000},

    max_train_steps=1000000,
    nepochs=2000,

    clip_thresh=-1,

    # max time steps can be specified either in seconds or in steps;
    # if both are None, full audio samples are used in a batch
    max_time_sec=None,
    max_time_steps=10240,  # 256 * 40

    # Hold moving-averaged parameters and use them for evaluation
    exponential_moving_average=True,
    # averaged = decay * averaged + (1 - decay) * x
    ema_decay=0.9999,

    # Save
    # per-step intervals
    checkpoint_interval=100000,
    train_eval_interval=100000,
    # per-epoch interval
    test_eval_epoch_interval=50,
    save_optimizer_state=True,

    # Eval:
)

In [4]:
def build_model(hparams):
    """Construct a ``WaveNet`` from the fields of ``hparams``.

    Args:
        hparams: Hyperparameter object exposing the attributes read below
            (``input_type``, ``out_channels``, ``quantize_channels``,
            ``cin_channels``, ``upsample_params``, ...).

    Returns:
        The newly constructed ``WaveNet`` instance.

    Raises:
        RuntimeError: If ``input_type`` is mu-law quantized and
            ``out_channels`` differs from ``quantize_channels``.
    """
    # Imported locally so the function does not rely on a `warn` name that the
    # notebook never brings into scope.
    import warnings

    if is_mulaw_quantize(hparams.input_type):
        if hparams.out_channels != hparams.quantize_channels:
            raise RuntimeError(
                "out_channels must equal to quantize_channels if input_type is 'mulaw-quantize'")
    if hparams.upsample_conditional_features and hparams.cin_channels < 0:
        s = "Upsample conv layers were specified while local conditioning disabled. "
        s += "Notice that upsample conv layers will never be used."
        warnings.warn(s)

    # Work on a copy so building a model does not add keys to the caller's
    # `hparams.upsample_params` dict as a side effect.
    upsample_params = dict(hparams.upsample_params)
    upsample_params["cin_channels"] = hparams.cin_channels
    upsample_params["cin_pad"] = hparams.cin_pad
    model = WaveNet(
        out_channels=hparams.out_channels,
        layers=hparams.layers,
        stacks=hparams.stacks,
        residual_channels=hparams.residual_channels,
        gate_channels=hparams.gate_channels,
        skip_out_channels=hparams.skip_out_channels,
        cin_channels=hparams.cin_channels,
        gin_channels=hparams.gin_channels,
        n_speakers=hparams.n_speakers,
        dropout=hparams.dropout,
        kernel_size=hparams.kernel_size,
        cin_pad=hparams.cin_pad,
        upsample_conditional_features=hparams.upsample_conditional_features,
        upsample_params=upsample_params,
        scalar_input=is_scalar_input(hparams.input_type),
        output_distribution=hparams.output_distribution,
    )
    return model

# Instantiate the network from the hyperparameters defined above; the bare
# `model` expression displays its module structure as the cell output.
model = build_model(hparams)
model

WaveNet(
  (first_conv): Conv1d(256, 128, kernel_size=(1,), stride=(1,))
  (conv_layers): ModuleList(
    (0): ResidualConv1dGLU(
      (conv): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(2,))
      (conv1x1c): Conv1d(80, 256, kernel_size=(1,), stride=(1,), bias=False)
      (conv1x1_out): Conv1d(128, 128, kernel_size=(1,), stride=(1,))
      (conv1x1_skip): Conv1d(128, 128, kernel_size=(1,), stride=(1,))
    )
    (1): ResidualConv1dGLU(
      (conv): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(2,))
      (conv1x1c): Conv1d(80, 256, kernel_size=(1,), stride=(1,), bias=False)
      (conv1x1_out): Conv1d(128, 128, kernel_size=(1,), stride=(1,))
      (conv1x1_skip): Conv1d(128, 128, kernel_size=(1,), stride=(1,))
    )
    (2): ResidualConv1dGLU(
      (conv): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(4,))
      (conv1x1c): Conv1d(80, 256, kernel_size=(1,), stride=(1,), bias=False)
      (conv1x1_out): Conv1d(128, 12

In [5]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing when tensors are moved to `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model onto the selected device and rebind `model` to the result.
model = model.to(device)

In [12]:
# Adam with the AMSGrad variant enabled.
# NOTE(review): the learning rate is hard-coded to 1e-4 here; the values in
# hparams.optimizer_params (lr=1e-3) and hparams.lr_schedule are not used by
# this cell — confirm that is intended.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, amsgrad=True)

### Load data

In [7]:
# Gather the raw and the preprocessed .npz files; each list is sorted so the
# file order is deterministic across runs.
audio_files, processed_files = (
    sorted(glob.glob(pattern))
    for pattern in (
        "./data/fma_tiny_16k_15s/raw/*.npz",
        "./data/fma_tiny_16k_15s/processed/*.npz",
    )
)

In [8]:
# Build the training data loader over the preprocessed files with local
# conditioning enabled.
# max_time_steps=12032 equals 47 * 256, i.e. 47 frames at hop_size=256, which
# matches the 47-frame conditioning batch printed in the recorded output.
# NOTE(review): batch_size=1 is passed here while the printed shapes have a
# leading dimension of 100, and hparams.batch_size / hparams.max_time_steps
# are not used — confirm how get_data_loader batches its input.
dataset = get_data_loader(processed_files, batch_size=1, shuffle=True, n_split=1, local_conditioning=True, max_time_steps=12032)

c_batch=torch.Size([100, 80, 47]) x=torch.Size([100, 12032])


In [9]:
# Initial value of the step counter; the training cell passes it to `train`
# and overwrites it with the returned value. Re-running this cell resets the
# counter to zero.
global_step = 0

In [15]:
# Run training and store the returned step count back into `global_step`, so
# re-running this cell passes the previous count in again (presumably to
# continue counting from it — verify against `train`).
# NOTE(review): epoch_seeds has 1000 entries, all equal to 1, while n_epoches
# is 450 — confirm that a constant seed per epoch is intended.
# The recorded progress bar has max=45000, i.e. 450 * 100.
global_step = train(dataset, model, optimizer, 
      device, train_one_step, 
      epoch_seeds=[1] * 1000, global_step=global_step, n_epoches=450, 
      no_steps_per_epoch=len(dataset), 
      ckpt_dir="./ckpts",
      log_freq=1, log_metrics=[MeanMetric(x) for x in ['loss', 'accuracy']])

HBox(children=(FloatProgress(value=0.0, description='training', max=45000.0, style=ProgressStyle(description_w…

In [11]:
# Display the current step counter.
# NOTE(review): the execution counts show this cell (In [11]) last ran before
# the most recent training run (In [15]), so the recorded value may be stale.
global_step

30000