In [1]:
import argparse
import json
import os
import numpy as np
import IPython.display as ipd
from tqdm import tqdm
from scipy.io.wavfile import write
from noise import generate_pink_noise, generate_blue_noise

import torch
use_gpu = torch.cuda.is_available()

import librosa
from librosa.core import load
from librosa.filters import mel as librosa_mel_fn
# Mel filterbank used by the mel-spectrogram code in this notebook
# (22050 Hz sample rate, 1024-point FFT, 80 bands, 0-8000 Hz).
# Keyword arguments are mandatory in librosa >= 0.10 and are also accepted
# by older releases, so this form works on both.
mel_basis = librosa_mel_fn(sr=22050, n_fft=1024, n_mels=80, fmin=0, fmax=8000)

import params
from model import DiffVC

import sys
sys.path.append('hifi-gan/')
from env import AttrDict
from models import Generator as HiFiGAN

sys.path.append('speaker_encoder/')
from encoder import inference as spk_encoder
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Utterance read by the step-by-step STFT / mel-spectrogram cells further down.
# The same path is assigned to `src_path` again in the inference section.
src_path = 'example/6415_111615_000012_000005.wav'
import matplotlib.pyplot as plt

In [5]:
from utils import save_plot

In [3]:
import params
# Copy the hyperparameters out of the `params` module into notebook-level
# names, grouped as in the original cell. These names are later passed to
# DiffVC(...) positionally, so they must not be rebound by other cells.
n_mels, sampling_rate, n_fft, hop_size = (
    params.n_mels, params.sampling_rate, params.n_fft, params.hop_size,
)

(channels, filters, layers, kernel,
 dropout, heads, window_size, enc_dim) = (
    params.channels, params.filters, params.layers, params.kernel,
    params.dropout, params.heads, params.window_size, params.enc_dim,
)

dec_dim, spk_dim, use_ref_t, beta_min, beta_max = (
    params.dec_dim, params.spk_dim, params.use_ref_t,
    params.beta_min, params.beta_max,
)

# `params.seed` is exposed under the local name `random_seed`.
random_seed, test_size = params.seed, params.test_size


In [4]:
# Build the DiffVC model from the hyperparameters unpacked above.
model = DiffVC(n_mels, channels, filters, heads, layers, kernel,
               dropout, window_size, enc_dim, spk_dim, use_ref_t,
               dec_dim, beta_min, beta_max)
# Only move to the GPU when one is available, matching the `use_gpu` guard
# used by the inference cells; an unconditional .cuda() fails on CPU-only hosts.
if use_gpu:
    model = model.cuda()
# Load encoder weights from logs_enc/enc.pt.
model.load_encoder(os.path.join('logs_enc', 'enc.pt'))

In [26]:
# 80 evenly spaced values descending from 1 to 0.1, then stacked as 3 identical rows.
lin = np.linspace(start=1, stop=0.1, num=80)
lin_tile = np.tile(lin, reps=(3, 1))

In [29]:
# Transposed view of lin_tile: (3, 80) -> (80, 3).
lin_tile_T = np.transpose(lin_tile)

In [33]:
# Repeat the (80, 3) array 6 times along a new leading axis -> (6, 80, 3).
lin_tile_T_batch = np.tile(lin_tile_T, reps=(6, 1, 1))

In [30]:
# Sanity check: show the shape of the transposed tile.
print(np.shape(lin_tile_T))

(80, 3)


In [34]:
# Sanity check: show the shape of the batched tile.
print(np.shape(lin_tile_T_batch))

(6, 80, 3)


In [5]:
from torch.utils.data import DataLoader

import params
from data import VCTKDecDataset, VCDecBatchCollate
# Dataset rooted at 'data' (VCTK data) and the collate function that batches it.
train_set = VCTKDecDataset('data')
collate_fn = VCDecBatchCollate()

# Batches of 4 built by 4 worker processes; an incomplete final batch is dropped.
train_loader = DataLoader(
    dataset=train_set,
    batch_size=4,
    collate_fn=collate_fn,
    num_workers=4,
    drop_last=True,
)

Total number of validation wavs is 4100.
Total number of training wavs is 39620.
Total number of training speakers is 99.


In [7]:
# Fetch one training batch, run generate_blue_noise on its mel tensor and
# save the first mel-spectrogram of the batch as an image.
batch = next(iter(train_loader))  # a single batch is all this cell needs
mel, mel_ref = batch['mel1'], batch['mel2']
# Move to the GPU only when one is available (same guard as the inference cells).
if use_gpu:
    mel, mel_ref = mel.cuda(), mel_ref.cuda()
# Second argument is 0.1 -- presumably a noise scale; confirm in noise.py.
noise = generate_blue_noise(mel, 0.1)
save_plot(mel[0].cpu(), './mel_spectrogram.png')

  0%|          | 0/1 [00:00<?, ?it/s]


In [8]:
# Fetch one training batch, save its first mel-spectrogram and run the
# model's loss computation on it once.
batch = next(iter(train_loader))  # a single batch is all this cell needs
mel, mel_ref = batch['mel1'], batch['mel2']
c, mel_lengths = batch['c'], batch['mel_lengths']
# Move to the GPU only when one is available (same guard as the inference cells).
if use_gpu:
    mel, mel_ref = mel.cuda(), mel_ref.cuda()
    c, mel_lengths = c.cuda(), mel_lengths.cuda()
save_plot(mel[0].cpu(), './mel_spectrogram_2.png')
# Keep the return value so it can be inspected instead of being discarded.
loss = model.compute_loss(mel, mel_lengths, mel_ref, c)

  0%|          | 0/1 [00:00<?, ?it/s]


In [4]:
# Plot the output of generate_blue_noise for the first item of one training
# batch and save the figure to ./blue_noise_scale.png.
plt.figure(figsize=(12, 3))
batch = next(iter(train_loader))  # a single batch is all this cell needs
mel, mel_ref = batch['mel1'], batch['mel2']
# Move to the GPU only when one is available (same guard as the inference cells).
if use_gpu:
    mel, mel_ref = mel.cuda(), mel_ref.cuda()
# Second argument is 0.1 -- presumably a noise scale; confirm in noise.py.
noise = generate_blue_noise(mel, 0.1)
plt.imshow(noise[0].cpu())
plt.colorbar()
plt.savefig('./blue_noise_scale.png')

  0%|          | 0/1 [00:01<?, ?it/s]


In [3]:
# Step-by-step version of the first half of get_mel, run on `src_path`.
wav, _ = load(src_path, sr=22050)
# Drop the tail so the length is a whole multiple of the 256-sample hop.
wav = wav[:len(wav) - len(wav) % 256]
# Reflect-pad 384 samples on each side before the un-centred STFT.
wav = np.pad(wav, (384, 384), mode='reflect')
stft = librosa.core.stft(
    wav,
    n_fft=1024,
    hop_length=256,
    win_length=1024,
    window='hann',
    center=False,
)
# Magnitude with a 1e-9 offset under the square root.
stftm = np.sqrt(stft.real ** 2 + stft.imag ** 2 + (1e-9))

In [10]:
# Render the STFT magnitude and write the current figure to disk.
plt.imshow(X=stftm)
plt.savefig(fname='./stftm.png')

In [18]:
# Plot a tiny 2x2 matrix [[1, 2], [3, 4]] as a check that imshow/savefig work.
plt.imshow(X=np.arange(1, 5).reshape(2, 2))
plt.savefig(fname='./check.png')

In [14]:

# Render the mel filterbank matrix and write the current figure to disk.
plt.imshow(X=mel_basis)
plt.savefig(fname='./melbasis2.png')

In [4]:
# Project the STFT magnitudes through the mel filterbank.
mel_spectrogram = mel_basis @ stftm

In [12]:
# Render the (linear-amplitude) mel-spectrogram and write the current figure to disk.
plt.imshow(X=mel_spectrogram)
plt.savefig(fname='./melspectorogram2.png')

In [5]:
# Natural log of the mel-spectrogram, with values floored at 1e-5 first.
log_mel_spectrogram = np.log(np.maximum(mel_spectrogram, 1e-5))

In [6]:
# Sanity check: show the shape of the log-mel-spectrogram.
print(np.shape(log_mel_spectrogram))

(80, 794)


In [2]:
def get_mel(wav_path):
    """Return the natural-log mel-spectrogram of the audio file at ``wav_path``.

    The file is loaded at 22050 Hz, truncated to a whole number of 256-sample
    hops, reflect-padded by 384 samples on each side and analysed with a
    1024-point Hann STFT (hop 256, ``center=False``). The magnitudes are
    projected through the module-level ``mel_basis`` and floored at 1e-5
    before the log is taken.
    """
    samples, _ = load(wav_path, sr=22050)
    n_usable = (samples.shape[0] // 256) * 256
    samples = np.pad(samples[:n_usable], 384, mode='reflect')
    spectrum = librosa.core.stft(
        samples,
        n_fft=1024,
        hop_length=256,
        win_length=1024,
        window='hann',
        center=False,
    )
    # Magnitude with a 1e-9 offset under the square root.
    magnitude = np.sqrt(spectrum.real ** 2 + spectrum.imag ** 2 + (1e-9))
    mel = np.matmul(mel_basis, magnitude)
    return np.log(np.clip(mel, a_min=1e-5, a_max=None))

def get_embed(wav_path):
    """Return the speaker embedding of the utterance at ``wav_path``.

    The file is passed through ``spk_encoder.preprocess_wav`` and the result
    is fed to ``spk_encoder.embed_utterance``.
    """
    return spk_encoder.embed_utterance(spk_encoder.preprocess_wav(wav_path))

def noise_median_smoothing(x, w=5):
    """Cap every entry of a 1-D array at the median of its neighbourhood.

    For each index ``i`` the median of the ``2 * w + 1`` values centred on
    ``x[i]`` is computed (the array is edge-padded so border entries have a
    full window) and the result is ``min(x[i], median)``. Upward outliers are
    therefore pulled down to the local median while all other values are left
    untouched, so the output never exceeds the input.

    Args:
        x: 1-D numpy array. It is not modified.
        w: Half-width of the median window (``w=0`` returns a copy of ``x``).

    Returns:
        A new array with the same shape and dtype as ``x``.
    """
    padded = np.pad(x, w, "edge")
    y = np.copy(x)
    for i in range(y.shape[0]):
        # padded[i + w] is x[i]; the slice below is the window centred on it.
        med = np.median(padded[i:i + 2 * w + 1])
        # Compare against the centre element itself. Indexing i + w + 1 would
        # use the right-hand neighbour and shift the result by one bin.
        y[i] = min(padded[i + w], med)
    return y

def mel_spectral_subtraction(mel_synth, mel_source, spectral_floor=0.02, silence_window=5, smoothing_window=5):
    """Denoise ``mel_synth`` by subtracting a noise estimate taken from a quiet stretch.

    The quietest run of ``silence_window`` consecutive frames is located in
    ``mel_source`` using ``sum(exp(2 * mel))`` as the energy measure. The
    per-band minimum of ``exp(2 * mel_synth)`` over the same frames is taken
    as the noise estimate, optionally passed through
    ``noise_median_smoothing``, and subtracted from every frame of
    ``mel_synth``. Each band is floored at ``spectral_floor`` times its noise
    estimate before converting back with ``log(sqrt(.))``.

    Args:
        mel_synth: 2-D array indexed ``[band, frame]``; it must have at least
            as many frames as ``mel_source``. It is not modified.
        mel_source: 2-D array indexed ``[band, frame]`` used only to locate
            the quietest window.
        spectral_floor: Fraction of the noise estimate kept as a lower bound.
        silence_window: Number of consecutive frames in the search window.
        smoothing_window: Half-width handed to ``noise_median_smoothing``;
            ``None`` skips the smoothing step.

    Returns:
        A copy of ``mel_synth`` whose first ``mel_source.shape[-1]`` frames
        have been denoised.
    """
    mel_len = mel_source.shape[-1]
    # Start from +inf so the first window is always accepted; a finite sentinel
    # would leave i_min at 0 whenever every window is louder than the sentinel.
    energy_min = np.inf
    i_min = 0
    # "+ 1" so the window ending on the last frame is evaluated as well.
    for i in range(max(mel_len - silence_window, 0) + 1):
        energy_cur = np.sum(np.exp(2.0 * mel_source[:, i:i+silence_window]))
        if energy_cur < energy_min:
            i_min = i
            energy_min = energy_cur
    # Per-band minimum over the selected window of the synthesized mel.
    estimated_noise_energy = np.min(np.exp(2.0 * mel_synth[:, i_min:i_min+silence_window]), axis=-1)
    if smoothing_window is not None:
        estimated_noise_energy = noise_median_smoothing(estimated_noise_energy, smoothing_window)
    mel_denoised = np.copy(mel_synth)
    for i in range(mel_len):
        signal_subtract_noise = np.exp(2.0 * mel_synth[:, i]) - estimated_noise_energy
        # Never go below a fraction of the noise estimate, which also keeps the log finite.
        estimated_signal_energy = np.maximum(signal_subtract_noise, spectral_floor * estimated_noise_energy)
        mel_denoised[:, i] = np.log(np.sqrt(estimated_signal_energy))
    return mel_denoised

In [3]:
# loading voice conversion model
#vc_path = 'checkpts/vc/vc_libritts_wodyn.pt' # path to voice conversion model
vc_path = 'logs/logs_dec_blue_noise1/vc_50.pt'

generator = DiffVC(params.n_mels, params.channels, params.filters, params.heads, 
                   params.layers, params.kernel, params.dropout, params.window_size, 
                   params.enc_dim, params.spk_dim, params.use_ref_t, params.dec_dim, 
                   params.beta_min, params.beta_max)
# Always deserialize onto the CPU: without map_location, torch.load restores
# tensors to the device they were saved from, which fails if that device
# (e.g. a second GPU) does not exist here. load_state_dict copies the values
# into the model's own parameters, so the model can be moved afterwards.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
generator.load_state_dict(torch.load(vc_path, map_location='cpu'))
if use_gpu:
    generator = generator.cuda()
generator.eval()

print(f'Number of parameters: {generator.nparams}')

Number of parameters: 126259128


In [4]:
# loading HiFi-GAN vocoder
hfg_path = 'checkpts/vocoder/' # HiFi-GAN path

with open(hfg_path + 'config.json') as f:
    h = AttrDict(json.load(f))

hifigan_universal = HiFiGAN(h)
# Deserialize onto the CPU first so the load does not depend on the device
# the checkpoint was saved from; the weights live under the 'generator' key.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
hifigan_universal.load_state_dict(torch.load(hfg_path + 'generator', map_location='cpu')['generator'])
if use_gpu:
    hifigan_universal = hifigan_universal.cuda()

_ = hifigan_universal.eval()
hifigan_universal.remove_weight_norm()

Removing weight norm...


In [5]:
# loading speaker encoder
enc_model_fpath = Path('checkpts/spk_encoder/pretrained.pt') # speaker encoder path
# Same use_gpu switch as the other model-loading cells, folded into one call.
spk_encoder.load_model(enc_model_fpath, device="cuda" if use_gpu else "cpu")

Loaded encoder "pretrained.pt" trained to step 1564501


In [6]:
# loading source and reference wavs, calculating mel-spectrograms and speaker embeddings
src_path = 'example/6415_111615_000012_000005.wav' # path to source utterance
tgt_path = 'example/8534_216567_000015_000010.wav' # path to reference utterance

# Log-mel tensors with a leading batch axis, plus their frame counts.
mel_source = torch.from_numpy(get_mel(src_path)).float().unsqueeze(0)
mel_source_lengths = torch.LongTensor([mel_source.shape[-1]])

mel_target = torch.from_numpy(get_mel(tgt_path)).float().unsqueeze(0)
mel_target_lengths = torch.LongTensor([mel_target.shape[-1]])

# Speaker embedding of the reference utterance, also with a batch axis.
embed_target = torch.from_numpy(get_embed(tgt_path)).float().unsqueeze(0)

# Move everything to the GPU in one place when one is available.
if use_gpu:
    mel_source, mel_source_lengths = mel_source.cuda(), mel_source_lengths.cuda()
    mel_target, mel_target_lengths = mel_target.cuda(), mel_target_lengths.cuda()
    embed_target = embed_target.cuda()

In [9]:
# performing voice conversion
mel_encoded, mel_ = generator.forward(mel_source, mel_source_lengths, mel_target, mel_target_lengths, embed_target, 
                                      n_timesteps=30, mode='ml')
mel_synth_np = mel_.cpu().detach().squeeze().numpy()
# Use the actual source mel to locate the silent window; this variable was
# previously built from `mel_`, which made both arguments identical.
# Assumes `mel_` has the same number of frames as `mel_source` -- TODO confirm.
mel_source_np = mel_source.cpu().detach().squeeze().numpy()
mel = torch.from_numpy(mel_spectral_subtraction(mel_synth_np, mel_source_np, smoothing_window=1)).float().unsqueeze(0)
if use_gpu:
    mel = mel.cuda()

In [12]:
# source utterance (vocoded)
with torch.no_grad():
    audio = hifigan_universal.forward(mel_source)
    # Bring to CPU, drop singleton axes and limit samples to [-1, 1].
    audio = torch.clamp(audio.cpu().squeeze(), -1, 1)
ipd.display(ipd.Audio(data=audio, rate=22050))

In [13]:
# reference utterance (vocoded)
with torch.no_grad():
    audio = hifigan_universal.forward(mel_target)
    # Bring to CPU, drop singleton axes and limit samples to [-1, 1].
    audio = torch.clamp(audio.cpu().squeeze(), -1, 1)
ipd.display(ipd.Audio(data=audio, rate=22050))

In [17]:
# converted speech
with torch.no_grad():
    audio = hifigan_universal.forward(mel)
    # Bring to CPU, drop singleton axes and limit samples to [-1, 1].
    audio = torch.clamp(audio.cpu().squeeze(), -1, 1)
ipd.display(ipd.Audio(data=audio, rate=22050))

In [12]:
# Vocode the converted mel and keep the Audio widget in `source`.
with torch.no_grad():
    audio = torch.clamp(hifigan_universal.forward(mel).cpu().squeeze(), -1, 1)
source = ipd.Audio(data=audio, rate=22050)
ipd.display(source)
# pink noise

In [10]:
# Vocode the converted mel and keep the Audio widget in `source`.
with torch.no_grad():
    audio = torch.clamp(hifigan_universal.forward(mel).cpu().squeeze(), -1, 1)
source = ipd.Audio(data=audio, rate=22050)
ipd.display(source)
# blue noise

In [20]:
import wave

# `audio` is a float tensor clamped to [-1, 1] by the vocoding cells. Its raw
# bytes are 4-byte floats, so they must be converted to 16-bit PCM before
# being written with a 2-byte sample width; otherwise the file is garbage.
pcm16 = np.round(audio.numpy() * 32767.0).astype('<i2')  # little-endian int16, as WAV expects
byte = pcm16.tobytes()

# Settings for the WAV file
sample_width = 2  # 2 for 16-bit, 1 for 8-bit
# Named n_channels so the model hyperparameter `channels` taken from `params`
# is not overwritten by this cell.
n_channels = 1  # 1 for mono, 2 for stereo
frame_rate = 22050  # sample rate
num_frames = len(byte) // (sample_width * n_channels)  # number of frames

# Write the WAV file
with wave.open('output.wav', 'w') as wav_file:
    wav_file.setnchannels(n_channels)
    wav_file.setsampwidth(sample_width)
    wav_file.setframerate(frame_rate)
    wav_file.setnframes(num_frames)
    wav_file.writeframes(byte)

In [21]:
# Sanity check: show the number of samples in the vocoded waveform.
print(audio.size())

torch.Size([203264])


In [12]:
# Dump the bytes held by the IPython Audio object `source` to disk.
Path('./output_blue.wav').write_bytes(source.data)