In [1]:
import math
import yaml
import librosa
import torch
import torch.nn as nn
from os import path
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from effortless_config import Config
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from core import harmonic_synth, amp_to_impulse_response, fft_convolve
from core import mlp, gru, scale_function, remove_above_nyquist, upsample, get_scheduler, multiscale_fft, safe_log, mean_std_loudness

In [2]:
from models import DDSP_signal_only, DDSP_with_features
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

class args(Config):
    CONFIG = "config_violin.yaml"

args.parse_args("")
with open(args.CONFIG, "r") as config:
    config = yaml.safe_load(config)

ddsp_model = DDSP_with_features(**config["model"]).to(device)
print(ddsp_model)

Using device: cuda:0
DDSP_with_features(
  (latent_z): Latent_Z(
    (z_vector): Torch_MFCC_Extractor(
      (mfcc): MFCC(
        (amplitude_to_DB): AmplitudeToDB()
        (MelSpectrogram): MelSpectrogram(
          (spectrogram): Spectrogram()
          (mel_scale): MelScale()
        )
      )
    )
    (norm_layer): InstanceNorm1d(30, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
    (gru): GRU(30, 512, batch_first=True)
    (dense_z): Linear(in_features=512, out_features=16, bias=True)
  )
  (decoder): Decoder_with_Z(
    (in_mlps): ModuleList(
      (0): Sequential(
        (0): Linear(in_features=1, out_features=512, bias=True)
        (1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (2): ReLU()
        (3): Linear(in_features=512, out_features=512, bias=True)
        (4): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (5): ReLU()
        (6): Linear(in_features=512, out_features=512, bias=True)
        (7): LayerNorm((512,), 

In [3]:
from datasets.dataset_all import Dataset, get_files
from effortless_config import Config
import yaml
import torch

#get_files("config_violin.yaml")

class args(Config):
    CONFIG = "config_violin.yaml"

args.parse_args("")
with open(args.CONFIG, "r") as config:
    config = yaml.safe_load(config)

out_dir = config["preprocess"]["out_dir"]

dataset = Dataset(out_dir)
batch_size = config["hyperparams"]["batch_size"]
dataloader = torch.utils.data.DataLoader(dataset,
                                        batch_size,
                                        True,
                                        drop_last=True,
                                        )

print("Size of dataset:", len(dataset), "\nSize of sig batch:", next(iter(dataloader))['signals'].size(), "\nSize of sig batch:", next(iter(dataloader))['pitches'].size(), "\nSize of sig batch:", next(iter(dataloader))['loudness'].size())

Size of dataset: 216 
Size of sig batch: torch.Size([16, 64000]) 
Size of sig batch: torch.Size([16, 400]) 
Size of sig batch: torch.Size([16, 400])


In [4]:
class args(Config):
    CONFIG = "config.yaml"
    NAME = "debug"
    ROOT = "runs_violin2"
    STEPS = 500000
    START_LR = 1e-3
    STOP_LR = 1e-4
    DECAY_OVER = 400000

mean_loudness, std_loudness = mean_std_loudness(dataloader)
config["data"]["mean_loudness"] = mean_loudness
config["data"]["std_loudness"] = std_loudness

writer = SummaryWriter(path.join(args.ROOT, args.NAME), flush_secs=20)

with open(path.join(args.ROOT, args.NAME, "config.yaml"), "w") as out_config:
    yaml.safe_dump(config, out_config)

opt = torch.optim.Adam(ddsp_model.parameters(), lr=args.START_LR)

schedule = get_scheduler(
    len(dataloader),
    args.START_LR,
    args.STOP_LR,
    args.DECAY_OVER,
)

best_loss = float("inf")
mean_loss = 0
n_element = 0
step = 0
epochs = int(np.ceil(args.STEPS / len(dataloader)))

In [5]:
import soundfile as sf

losses = []

for e in tqdm(range(epochs)):
    total_loss = 0
    for batch in dataloader:
        s = batch['signals'].to(device)
        p = batch['pitches'].unsqueeze(-1).to(device)
        l = batch['loudness'].unsqueeze(-1).to(device)

        l = (l - mean_loudness) / std_loudness

        y = ddsp_model(s, p, l).squeeze(-1)

        ori_stft = multiscale_fft(
            s,
            config["train"]["scales"],
            config["train"]["overlap"],
        )
        rec_stft = multiscale_fft(
            y,
            config["train"]["scales"],
            config["train"]["overlap"],
        )

        loss = 0
        for s_x, s_y in zip(ori_stft, rec_stft):
            lin_loss = (s_x - s_y).abs().mean()
            log_loss = (safe_log(s_x) - safe_log(s_y)).abs().mean()
            loss = loss + lin_loss + log_loss

        opt.zero_grad()
        loss.backward()
        opt.step()

        writer.add_scalar("loss", loss.item(), step)

        step += 1

        n_element += 1
        mean_loss += (loss.item() - mean_loss) / n_element
        total_loss += loss.item()

    if not e % 10:
        writer.add_scalar("lr", schedule(e), e)
        writer.add_scalar("reverb_decay", ddsp_model.reverb.decay.item(), e)
        writer.add_scalar("reverb_wet", ddsp_model.reverb.wet.item(), e)
        # scheduler.step()
        if mean_loss < best_loss:
            best_loss = mean_loss
            torch.save(
                ddsp_model.state_dict(),
                path.join(args.ROOT, args.NAME, "state.pth"),
            )

        mean_loss = 0
        n_element = 0

        audio = torch.cat([s, y], -1).reshape(-1).detach().cpu().numpy()

        sf.write(
            path.join(args.ROOT, args.NAME, f"eval_{e:06d}.wav"),
            audio,
            config["preprocess"]["sample_rate"],
        )
    
    total_loss /= len(dataloader)
    losses.append(total_loss)


  8%|▊         | 3083/38462 [56:46<10:51:30,  1.10s/it]


KeyboardInterrupt: 

In [6]:
from datasets.dataset_all import Dataset, get_files
import torch
import yaml
from effortless_config import Config

get_files("config_test.yaml")

class args(Config):
    CONFIG = "config_test.yaml"

args.parse_args("")
with open(args.CONFIG, "r") as config:
    config = yaml.safe_load(config)

out_dir = config["preprocess"]["out_dir"]

dataset = Dataset(out_dir)
batch_size = config["hyperparams"]["batch_size"]
dataloader = torch.utils.data.DataLoader(dataset,
                                        batch_size,
                                        True,
                                        drop_last=True,
                                        )

print("Size of dataset:", len(dataset), "\nSize of sig batch:", next(iter(dataloader))['signals'].size(), "\nSize of sig batch:", next(iter(dataloader))['pitches'].size(), "\nSize of sig batch:", next(iter(dataloader))['loudness'].size())

  x, sr = li.load(f, sample_rate)




  f = li.fft_frequencies(sample_rate, n_fft)
  + 2 * np.log10(f_sq)
Synth\synth.wav: 100%|██████████| 1/1 [00:02<00:00,  2.75s/it]

Size of dataset: 2 
Size of sig batch: torch.Size([1, 64000]) 
Size of sig batch: torch.Size([1, 400]) 
Size of sig batch: torch.Size([1, 400])





In [7]:
from models import DDSP_with_features
import soundfile as sf

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

class args(Config):
    CONFIG = "config_test.yaml"

args.parse_args("")
with open(args.CONFIG, "r") as config:
    config = yaml.safe_load(config)

ddsp_model = DDSP_with_features(**config["model"]).to(device)

ddsp_model.load_state_dict(torch.load("runs_violin2/debug/state.pth"))

class args(Config):
    CONFIG = "runs_violin/debug/config.yaml"

args.parse_args("")
with open(args.CONFIG, "r") as config:
    config = yaml.safe_load(config)

mean_loudness = config["data"]["mean_loudness"]
std_loudness = config["data"]["std_loudness"]

print(mean_loudness, std_loudness)

batch = next(iter(dataloader))
s = batch['signals'].to(device)
p = batch['pitches'].unsqueeze(-1).to(device)
l = batch['loudness'].unsqueeze(-1).to(device)

l = (l - mean_loudness) / std_loudness

y = ddsp_model(s, p, l).squeeze(-1)

print(y)

y = torch.cat([s, y], -1).reshape(-1).detach().cpu().numpy()

sf.write("synth_to_violin.wav", y, 16000)

Using device: cuda:0
-4.517404079437256 3.236548900604248
tensor([[0.0463, 0.0799, 0.0790,  ..., 0.0120, 0.0096, 0.0078]],
       device='cuda:0', grad_fn=<SqueezeBackward1>)
