In [1]:
import matplotlib
matplotlib.use('TkAgg')
%matplotlib inline

import matplotlib.pyplot as plt
import IPython.display as ipd

import sys
sys.path.append('./hifi_gan/')

from hifi_gan.env import AttrDict
from hifi_gan.meldataset import MAX_WAV_VALUE
from hifi_gan.models import Generator

from itertools import cycle
import numpy as np
import scipy as sp
from scipy.io.wavfile import write
import pandas as pd
import librosa
import torch
import os
import json
import random
import soundfile as sf

from hparams import create_hparams
from model import Tacotron2, load_model
from layers import TacotronSTFT
from data_utils import TextMelLoader, TextMelCollate
from text import cmudict, text_to_sequence
from mellotron_utils import get_data_from_musicxml

In [2]:
def panner(signal, angle):
    angle = np.radians(angle)
    left = np.sqrt(2)/2.0 * (np.cos(angle) - np.sin(angle)) * signal
    right = np.sqrt(2)/2.0 * (np.cos(angle) + np.sin(angle)) * signal
    return np.dstack((left, right))[0]

In [5]:
def plot_mel_f0_alignment(mel_source, mel_outputs_postnet, f0s, alignments, figsize=(16, 16)):
    fig, axes = plt.subplots(4, 1, figsize=figsize)
    axes = axes.flatten()
    axes[0].imshow(mel_source, aspect='auto', origin='lower', interpolation='none')
    axes[1].imshow(mel_outputs_postnet, aspect='auto', origin='lower', interpolation='none')
    axes[2].scatter(range(len(f0s)), f0s, alpha=0.5, color='red', marker='.', s=1)
    axes[2].set_xlim(0, len(f0s))
    axes[3].imshow(alignments, aspect='auto', origin='lower', interpolation='none')
    axes[0].set_title("Source Mel")
    axes[1].set_title("Predicted Mel")
    axes[2].set_title("Source pitch contour")
    axes[3].set_title("Source rhythm")
    plt.tight_layout()
    plt.show()

In [6]:
def load_mel(path):
    audio, sampling_rate = librosa.core.load(path, sr=hparams.sampling_rate)
    audio = torch.from_numpy(audio)
    if sampling_rate != hparams.sampling_rate:
        raise ValueError("{} SR doesn't match target {} SR".format(
            sampling_rate, stft.sampling_rate))
    audio_norm = audio.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = melspec.cuda()
    return melspec

In [7]:
hparams = create_hparams()

In [8]:
stft = TacotronSTFT(hparams.filter_length, hparams.hop_length, hparams.win_length,
                    hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
                    hparams.mel_fmax)

  fft_window = pad_center(fft_window, filter_length)
  mel_basis = librosa_mel_fn(


## Load Models

In [9]:
device = torch.device('cuda')
config_file = os.path.join("/run/media/viblab/Markov2/Pras/Thesis/TryMyOwn/hifi-gan/cp_hifigan", 'config.json')
with open(config_file) as f:
    data = f.read()

json_config = json.loads(data)
h = AttrDict(json_config)

generator = Generator(h).to(device)

state_dict_g = torch.load("/run/media/viblab/Markov2/Pras/Thesis/TryMyOwn/hifi-gan/cp_hifigan/g_00110000", map_location="cpu")
generator.load_state_dict(state_dict_g['generator'])

generator.eval()
generator.remove_weight_norm()

Removing weight norm...


In [10]:
checkpoint_path = "outdir/checkpoint_15000"
mellotron = load_model(hparams).cuda().eval()
mellotron.load_state_dict(torch.load(checkpoint_path)['state_dict'])

<All keys matched successfully>

## Setup dataloaders

In [12]:
audio_paths = 'filelists/newcombine_test_filelist.txt'
dataloader = TextMelLoader(audio_paths, hparams)
datacollate = TextMelCollate(1)

## Load data

In [91]:
file_idx = random.randrange(1, len(dataloader.audiopaths_and_text))
audio_path, lid, text = dataloader.audiopaths_and_text[file_idx]

# get audio path, encoded text, pitch contour and mel for gst
text_encoded = torch.LongTensor(text_to_sequence(text, hparams.text_cleaners))[None, :].cuda()    
pitch_contour = dataloader[file_idx][3][None].cuda()
mel = load_mel(audio_path)
print(audio_path, text)

# load source data to obtain rhythm using tacotron 2 as a forced aligner
x, y = mellotron.parse_batch(datacollate([dataloader[file_idx]]))

/run/media/viblab/Markov2/Pras/Thesis/Database/ESD/wavs/0013/Sad/0013_001189.wav Cuckoos is downheaded and crying.


## Define Speakers Set

In [None]:
# speaker_ids = TextMelLoader("filelists/libritts_train_clean_100_audiopath_text_sid_shorterthan10s_atleast5min_train_filelist.txt", hparams).speaker_ids
# speakers = pd.read_csv('filelists/libritts_speakerinfo.txt', engine='python',header=None, comment=';', sep=' *\| *', 
#                        names=['ID', 'SEX', 'SUBSET', 'MINUTES', 'NAME'])
# speakers['MELLOTRON_ID'] = speakers['ID'].apply(lambda x: speaker_ids[x] if x in speaker_ids else -1)
# female_speakers = cycle(
#     speakers.query("SEX == 'F' and MINUTES > 20 and MELLOTRON_ID >= 0")['MELLOTRON_ID'].sample(frac=1).tolist())
# male_speakers = cycle(
#     speakers.query("SEX == 'M' and MINUTES > 20 and MELLOTRON_ID >= 0")['MELLOTRON_ID'].sample(frac=1).tolist())

# Style Transfer (Rhythm and Pitch Contour)

In [94]:
import shutil

shutil.copy(audio_path, "sample_sound/original.wav")
ipd.Audio(audio_path, rate=hparams.sampling_rate)

In [95]:
with torch.no_grad():
    # get rhythm (alignment map) using tacotron 2
    mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = mellotron.forward(x)
    rhythm = rhythm.permute(1, 0, 2)

with torch.no_grad():
    x_voc = mel_outputs_postnet.cpu().detach().numpy()
    x_voc = torch.FloatTensor(x_voc).to(device)
    y_g_hat = generator(x_voc)
    audio = y_g_hat.squeeze()
    audio = audio * MAX_WAV_VALUE
    audio = audio.cpu().numpy().astype('int16')

sf.write("sample_sound/generated.wav", audio, hparams.sampling_rate)
ipd.Audio(audio, rate=hparams.sampling_rate)

In [102]:
rhythm_tgt = rhythm

In [103]:
mel_tgt = mel

In [104]:
pitch_tgt = pitch_contour

In [105]:
speaker_id = dataloader[file_idx][2][None].cuda()
lid = dataloader[file_idx][4].cuda()

with torch.no_grad():
    mel_outputs, mel_outputs_postnet, gate_outputs, _ = mellotron.inference(
        (text_encoded, mel_tgt, speaker_id, pitch_tgt, lid))

plot_mel_f0_alignment(x[2].data.cpu().numpy()[0],
                      mel_outputs_postnet.data.cpu().numpy()[0],
                      pitch_contour.data.cpu().numpy()[0, 0],
                      rhythm.data.cpu().numpy()[:, 0].T)

  plt.show()


In [106]:
with torch.no_grad():
    x_voc = mel_outputs_postnet.cpu().detach().numpy()
    x_voc = torch.FloatTensor(x_voc).to(device)
    y_g_hat = generator(x_voc)
    audio = y_g_hat.squeeze()
    audio = audio * MAX_WAV_VALUE
    audio = audio.cpu().numpy().astype('int16')

sf.write("sample_sound/transfer.wav", audio, hparams.sampling_rate)
ipd.Audio(audio, rate=hparams.sampling_rate)

In [80]:
speaker_id = dataloader[file_idx][2][None].cuda()
lid = dataloader[file_idx][4].cuda()

with torch.no_grad():
    mel_outputs, mel_outputs_postnet, gate_outputs, _ = mellotron.inference_noattention(
        (text_encoded, mel_tgt, speaker_id, pitch_tgt, lid, rhythm))

plot_mel_f0_alignment(x[2].data.cpu().numpy()[0],
                      mel_outputs_postnet.data.cpu().numpy()[0],
                      pitch_contour.data.cpu().numpy()[0, 0],
                      rhythm.data.cpu().numpy()[:, 0].T)

RuntimeError: Expected size for first two dimensions of batch2 tensor to be: [1, 81] but got: [1, 82].

In [55]:
with torch.no_grad():
    x_voc = mel_outputs_postnet.cpu().detach().numpy()
    x_voc = torch.FloatTensor(x_voc).to(device)
    y_g_hat = generator(x_voc)
    audio = y_g_hat.squeeze()
    audio = audio * MAX_WAV_VALUE
    audio = audio.cpu().numpy().astype('int16')

sf.write("sample_sound/transfer2.wav", audio, hparams.sampling_rate)
ipd.Audio(audio, rate=hparams.sampling_rate)

# Singing Voice from Music Score

In [19]:
data = get_data_from_musicxml('data/haendel_hallelujah.musicxml', 132, convert_stress=True)
panning = {'Soprano': [-60, -30], 'Alto': [-40, -10], 'Tenor': [30, 60], 'Bass': [10, 40]}

In [None]:
n_speakers_per_part = 4
frequency_scaling = 0.4
n_seconds = 90
audio_stereo = np.zeros((hparams.sampling_rate*n_seconds, 2), dtype=np.float32)
for i, (part, v) in enumerate(data.items()):
    rhythm = data[part]['rhythm'].cuda()
    pitch_contour = data[part]['pitch_contour'].cuda()
    text_encoded = data[part]['text_encoded'].cuda()
    
    for k in range(n_speakers_per_part):
        pan = np.random.randint(panning[part][0], panning[part][1])
        if any(x in part.lower() for x in ('soprano', 'alto', 'female')):
            speaker_id = torch.LongTensor([next(female_speakers)]).cuda()
        else:
            speaker_id = torch.LongTensor([next(male_speakers)]).cuda()
        print("{} MellotronID {} pan {}".format(part, speaker_id.item(), pan))

        with torch.no_grad():
            mel_outputs, mel_outputs_postnet, gate_outputs, alignments_transfer = mellotron.inference_noattention(
                (text_encoded, mel, speaker_id, pitch_contour*frequency_scaling, rhythm))

            audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[0, 0]
            audio = audio.cpu().numpy()
            audio = panner(audio, pan)
            audio_stereo[:audio.shape[0]] += audio            
            write("{} {}.wav".format(part, speaker_id.item()), hparams.sampling_rate, audio)

In [None]:
audio_stereo = audio_stereo / np.max(np.abs(audio_stereo))
write("audio_stereo.wav", hparams.sampling_rate, audio_stereo)
ipd.Audio([audio_stereo[:,0], audio_stereo[:,1]], rate=hparams.sampling_rate)