## Importing all the dependencies:

In [None]:
import torch
import soundfile as sf
import pickle
from synthesis import build_model
from synthesis import wavegen
import librosa
import os
import numpy as np
from math import ceil
from model_vc import Generator
from model_bl import D_VECTOR
from collections import OrderedDict
import soundfile as sf
from scipy import signal
from scipy.signal import get_window
from librosa.filters import mel
from numpy.random import RandomState

## Defining the helper functions used in other parts of the notebook:

In [None]:
def butter_highpass(cutoff, fs, order=5):
    """Design a digital Butterworth high-pass filter.

    Args:
        cutoff: Cut-off frequency, in the same units as ``fs``.
        fs: Sampling rate of the signal that will be filtered.
        order: Filter order handed to ``scipy.signal.butter``.

    Returns:
        The ``(b, a)`` numerator and denominator coefficient arrays.
    """
    nyquist = 0.5 * fs
    # The critical frequency is passed as a fraction of the Nyquist rate.
    numerator, denominator = signal.butter(
        order, cutoff / nyquist, btype='high', analog=False
    )
    return numerator, denominator
    
    
def pySTFT(x, fft_length=1024, hop_length=256):
    """Return the magnitude short-time Fourier transform of ``x``.

    The signal is reflect-padded by ``fft_length // 2`` samples on both
    sides, cut into overlapping frames of ``fft_length`` samples spaced
    ``hop_length`` samples apart, multiplied by a Hann window and passed
    through a real FFT.

    Args:
        x: Input signal; framing is done along its last axis.
        fft_length: Frame (and FFT) length in samples.
        hop_length: Step between the starts of consecutive frames.

    Returns:
        The absolute value of the transposed FFT result. For a 1-D input
        this has shape ``(fft_length // 2 + 1, n_frames)``.
    """
    half_window = int(fft_length // 2)
    padded = np.pad(x, half_window, mode='reflect')

    overlap = fft_length - hop_length
    n_frames = (padded.shape[-1] - overlap) // hop_length
    frame_shape = padded.shape[:-1] + (n_frames, fft_length)
    frame_strides = padded.strides[:-1] + (
        hop_length * padded.strides[-1],
        padded.strides[-1],
    )
    # Overlapping frames are exposed as a strided view of the padded signal
    # instead of being copied one by one.
    frames = np.lib.stride_tricks.as_strided(
        padded, shape=frame_shape, strides=frame_strides
    )

    window = get_window('hann', fft_length, fftbins=True)
    spectrum = np.fft.rfft(window * frames, n=fft_length)

    return np.abs(spectrum.T)
    
    
# Mel filterbank (80 bands, 90-7600 Hz, 16 kHz audio, 1024-point FFT), transposed
# so that gmel() can right-multiply a magnitude spectrogram by it.
mel_basis = mel(sr=16000, n_fft=1024, fmin=90, fmax=7600, n_mels=80).T
# Amplitude floor equal to 10 ** (-100 / 20); gmel() applies it before the log.
min_level = np.exp(-100 / 20 * np.log(10))
# 5th-order high-pass filter coefficients (30 Hz cut-off at a 16 kHz sampling rate).
b, a = butter_highpass(30, 16000, order=5)



def gmel(fname):
    """Compute the normalised log-mel spectrogram of an audio file.

    The file is read, down-mixed to mono if it has several channels,
    resampled to 16 kHz, high-pass filtered with the module-level ``b, a``
    coefficients, slightly attenuated and dithered with low-level random
    noise, and converted to a mel spectrogram via ``pySTFT`` and
    ``mel_basis``. The result is expressed in dB, shifted by -16 dB and
    mapped from the [-100, 0] dB range onto [0, 1].

    Because of the random dither, two calls on the same file do not return
    bit-identical arrays.

    Args:
        fname: Path of the audio file (anything ``soundfile.read`` accepts).

    Returns:
        A ``float32`` array of shape ``(n_frames, 80)`` with values in [0, 1].
    """
    x, fs = sf.read(fname)
    if x.ndim > 1:
        # soundfile returns (frames, channels) for multi-channel files; the
        # resampling and filtering below work along the last axis, so the
        # channels are averaged into a single mono signal first.
        x = x.mean(axis=1)
    x = librosa.resample(x, orig_sr=fs, target_sr=16000)
    y = signal.filtfilt(b, a, x)
    # Attenuate slightly and add tiny uniform noise in [-0.5e-6, 0.5e-6).
    wav = y * 0.96 + (np.random.rand(y.shape[0]) - 0.5) * 1e-06
    D = pySTFT(wav).T
    D_mel = np.dot(D, mel_basis)
    # Floor at min_level so the logarithm is never taken of zero.
    D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - 16
    S = np.clip((D_db + 100) / 100, 0, 1)
    return S.astype(np.float32)


def get_mean_embedding(model, embedding_dir):
    """Average the speaker embeddings of every ``.wav`` file in a directory.

    Files are processed in sorted name order; each one is converted to a mel
    spectrogram with ``gmel`` and embedded with ``get_speaker_embedding``.

    Args:
        model: Speaker encoder passed through to ``get_speaker_embedding``.
        embedding_dir: Directory containing the speaker's ``.wav`` files.

    Returns:
        The element-wise mean of the per-file embeddings.

    Raises:
        ValueError: If ``embedding_dir`` contains no ``.wav`` file. Without
            this check ``np.mean`` of an empty list would silently yield NaN.
    """
    wav_names = sorted(
        name for name in os.listdir(embedding_dir) if name.endswith('.wav')
    )
    if not wav_names:
        raise ValueError(f'No .wav files found in {embedding_dir!r}')

    speaker_embeddings = [
        get_speaker_embedding(model, gmel(os.path.join(embedding_dir, name)))
        for name in wav_names
    ]
    return np.mean(speaker_embeddings, axis=0)


def get_speaker_embedding(model, mel_spectrogram):
    """Run the speaker encoder on one mel spectrogram.

    A leading batch axis is added to the 2-D spectrogram, the tensor is moved
    to the device the model's parameters live on, and the model output is
    squeezed and returned as a NumPy array.

    Args:
        model: Callable speaker encoder (normally a ``torch.nn.Module``).
        mel_spectrogram: 2-D NumPy array holding one utterance.

    Returns:
        The squeezed model output as a NumPy array on the CPU.
    """
    try:
        # Follow the model's device instead of assuming a GPU is present.
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        # Callables without parameters: keep the previous CUDA behaviour.
        device = torch.device('cuda')

    mel_tensor = torch.from_numpy(mel_spectrogram[np.newaxis, :, :]).to(device)
    # Inference only, so no autograd graph needs to be recorded.
    with torch.no_grad():
        speaker_embed = model(mel_tensor)
    return speaker_embed.squeeze().cpu().numpy()

# Generating style-transferred utterances as in the original implementation

In [None]:
"""

The metadata.pkl file contains a list of elements, each element containing information about an utterance.
Each element of metadata is a list with 3 elements: the speaker identity, the speaker embedding and the mel spectrogram
of the utterance.

"""

In [141]:
# Show the details of the first utterance stored in the metadata file.
# The file is opened in a context manager so the handle is closed again.
with open('metadata.pkl', 'rb') as metadata_file:
    first_utterance = pickle.load(metadata_file)[0]
first_utterance

['p225',
 array([-2.92885415e-02,  1.67739280e-02,  6.42375797e-02,  5.12384847e-02,
         8.77934247e-02, -4.05867286e-02,  1.08849108e-02,  2.06673276e-02,
        -8.39690343e-02,  1.48199843e-02,  3.82344425e-02,  1.12243919e-02,
         4.62971367e-02,  1.66661311e-02, -5.78785129e-02,  3.60288732e-02,
         1.92339886e-02,  9.17971320e-03,  2.74707917e-02, -5.48039749e-02,
         2.50798557e-02,  4.66737375e-02, -6.14981353e-03,  3.88026945e-02,
         5.68139665e-02, -7.33052716e-02,  2.30920967e-02, -1.04292825e-01,
        -2.61898227e-02,  3.02257240e-02, -3.02889403e-02,  3.63447554e-02,
        -4.97230627e-02,  1.57715172e-01, -3.93295921e-02,  5.51161245e-02,
         4.64604087e-02, -4.59927395e-02, -4.82378080e-02, -3.81431282e-02,
         3.91379185e-02,  4.59317304e-02, -1.55072343e-02, -1.67513415e-02,
        -5.09507731e-02, -5.08496165e-02, -2.25679334e-02,  2.63878461e-02,
        -1.26611767e-02, -2.49883570e-02,  3.02621610e-02, -2.48487398e-04,
   

In [137]:
dfq = 32 # Downsampling-upsampling factor (32 is the paper default)

In [11]:
def pad_seq(x, base=32):
    """Zero-pad ``x`` along axis 0 up to the next multiple of ``base``.

    Returns the padded array and the number of rows that were appended.
    """
    len_out = int(base * ceil(float(x.shape[0]) / base))
    len_pad = len_out - x.shape[0]
    assert len_pad >= 0
    return np.pad(x, ((0, len_pad), (0, 0)), 'constant'), len_pad


# Conversion model, restored from the AutoVC checkpoint.
device = 'cuda'
G = Generator(32, 256, 512, dfq).eval().to(device)

g_checkpoint = torch.load('autovc.ckpt', map_location=device)
G.load_state_dict(g_checkpoint['model'])

# Same relative path as the other cells of this notebook.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open('metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

# One converted spectrogram per (source utterance, target speaker) pair,
# stored as ('<source>x<target>', spectrogram).
spect_vc = []

for sbmt_i in metadata:

    # Each metadata entry is [speaker id, speaker embedding, mel spectrogram].
    x_org, len_pad = pad_seq(sbmt_i[2], dfq)
    uttr_org = torch.from_numpy(x_org[np.newaxis, :, :]).to(device)
    emb_org = torch.from_numpy(sbmt_i[1][np.newaxis, :]).to(device)

    for sbmt_j in metadata:

        emb_trg = torch.from_numpy(sbmt_j[1][np.newaxis, :]).to(device)

        with torch.no_grad():
            # Only the second of the three generator outputs is used.
            _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_trg)

        # Drop the frames that pad_seq appended to the source utterance.
        if len_pad == 0:
            uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
        else:
            uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()

        spect_vc.append(('{}x{}'.format(sbmt_i[0], sbmt_j[0]), uttr_trg))


# Waveform synthesis model used to turn spectrograms back into audio.
device = torch.device("cuda")
model = build_model().to(device)
checkpoint = torch.load("checkpoint_step001000000_ema.pth", map_location=device)
model.load_state_dict(checkpoint["state_dict"])

# Synthesise every converted spectrogram and write it as a 16 kHz wav file.
for name, c in spect_vc:
    print(name)
    waveform = wavegen(model, c=c)
    sf.write(name+'.wav', waveform, samplerate=16000)
    

# Generating conversions with different downsampling-upsampling factors
### Paper default = 32, which gives the best quality for conversions

In [32]:
"""dfq is the downsampling-upsampling factor"""


def pad_seq(x, base=32):
    """Zero-pad ``x`` along axis 0 up to the next multiple of ``base``.

    Returns the padded array and the number of rows that were appended.
    """
    len_out = int(base * ceil(float(x.shape[0]) / base))
    len_pad = len_out - x.shape[0]
    assert len_pad >= 0
    return np.pad(x, ((0, len_pad), (0, 0)), 'constant'), len_pad


device = torch.device("cuda")

# Everything below is independent of the factor being tried, so it is loaded
# once instead of once per loop iteration.
g_checkpoint = torch.load('autovc.ckpt', map_location=device)

# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open('metadata.pkl', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

model = build_model().to(device)
checkpoint = torch.load("checkpoint_step001000000_ema.pth", map_location=device)
model.load_state_dict(checkpoint["state_dict"])

# NOTE: dfq and G are notebook globals that stf() also reads. After this loop
# they correspond to the last factor tried (64), not the paper default of 32.
for dfq in [1, 2, 4, 8, 16, 32, 64]:
    G = Generator(32, 256, 512, dfq).eval().to(device)
    G.load_state_dict(g_checkpoint['model'])

    # ('<source>x<target>', spectrogram) for every pair, at this factor.
    spect_vc = []

    for sbmt_i in metadata:

        # Each metadata entry is [speaker id, speaker embedding, mel spectrogram].
        x_org, len_pad = pad_seq(sbmt_i[2], dfq)
        uttr_org = torch.from_numpy(x_org[np.newaxis, :, :]).to(device)
        emb_org = torch.from_numpy(sbmt_i[1][np.newaxis, :]).to(device)

        for sbmt_j in metadata:

            emb_trg = torch.from_numpy(sbmt_j[1][np.newaxis, :]).to(device)

            with torch.no_grad():
                # Only the second of the three generator outputs is used.
                _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_trg)

            # Drop the frames that pad_seq appended to the source utterance.
            if len_pad == 0:
                uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
            else:
                uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()

            spect_vc.append(('{}x{}'.format(sbmt_i[0], sbmt_j[0]), uttr_trg))

    for spect in [spect_vc[1]]:   # Change [spect_vc[1]] to spect_vc to do it for all utterances in metadata
        name = spect[0]
        c = spect[1]
        print(name)
        waveform = wavegen(model, c=c)
        # The factor is appended to the file name so runs do not overwrite each other.
        sf.write(name+'_{}.wav'.format(dfq), waveform, samplerate=16000)


p225xp228


100%|██████████| 23040/23040 [02:27<00:00, 155.91it/s]


p225xp228


100%|██████████| 23040/23040 [02:28<00:00, 154.92it/s]


p225xp228


100%|██████████| 23040/23040 [02:28<00:00, 154.88it/s]


p225xp228


100%|██████████| 23040/23040 [02:29<00:00, 154.58it/s]


p225xp228


100%|██████████| 23040/23040 [02:29<00:00, 153.94it/s]


p225xp228


100%|██████████| 23040/23040 [02:29<00:00, 154.22it/s]


p225xp228


100%|██████████| 23040/23040 [02:29<00:00, 153.63it/s]


# Function for direct style transfer between two audio files

In [113]:
"""Function for direct style transfer.

The first argument source_uttr is the location of source wav file.
The second argument target_sample is a wave file location containing a sample of target voice
Gives the transformed speech output as a time series array.
You can use the IPython.display.Audio() method to play it.

"""



def stf(source_uttr, target_sample):
    """Convert a source utterance into the voice of a target speaker.

    Relies on objects created in other cells of this notebook: the speaker
    encoder ``C``, the conversion model ``G``, the waveform synthesis model
    ``model``, the factor ``dfq`` that ``G`` was built with, ``device`` and
    the helper ``pad_seq``. Those cells must have been run first.

    Args:
        source_uttr: Path of the wav file whose speech content is kept.
        target_sample: Path of a wav file containing a sample of the target
            voice.

    Returns:
        The converted speech as returned by ``wavegen``; the examples in this
        notebook play it back at 16 kHz with ``IPython.display.Audio``.
    """
    mel_s_uttr = gmel(source_uttr)
    mel_t_smpl = gmel(target_sample)

    # Same speaker-embedding helper that the metadata generation uses.
    emb_org = get_speaker_embedding(C, mel_s_uttr)
    emb_trg = get_speaker_embedding(C, mel_t_smpl)

    # The source length is padded up to a multiple of dfq before conversion.
    x_org, len_pad = pad_seq(mel_s_uttr, dfq)

    uttr_org = torch.from_numpy(x_org[np.newaxis, :, :]).to(device)
    emb_org = torch.from_numpy(emb_org[np.newaxis, :]).to(device)
    emb_trg = torch.from_numpy(emb_trg[np.newaxis, :]).to(device)

    with torch.no_grad():
        # Only the second of the three generator outputs is used.
        _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_trg)

    # Drop the frames that pad_seq appended.
    if len_pad == 0:
        uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
    else:
        uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()

    return wavegen(model, c=uttr_trg)


In [None]:
# waveform = wavegen(model, c=c)   
    # sf.write(name+'.wav', waveform, samplerate=16000)

#### Example:

In [70]:
import IPython

In [100]:
IPython.display.Audio("audio/p225_001.wav")

In [101]:
IPython.display.Audio("audio/p228_001.wav")

In [114]:
IPython.display.Audio(stf("audio/p225_001.wav","audio/p228_001.wav"),rate=16000)

100%|██████████| 27136/27136 [02:56<00:00, 153.81it/s]


In [115]:
IPython.display.Audio(stf("audio/p360_001.wav","audio/p364_001.wav"),rate=16000)

100%|██████████| 31488/31488 [03:26<00:00, 152.71it/s]


In [120]:
IPython.display.Audio(stf("wavs/p225/p225_016.wav","wavs/p228/p228_016.wav"),rate=16000)

100%|██████████| 90368/90368 [09:40<00:00, 155.66it/s]


In [104]:
IPython.display.Audio("wavs/p225/p225_016.wav")

In [119]:
IPython.display.Audio("audio/p225_001.wav")

In [144]:
IPython.display.Audio("transformedvoices1/p225xp270.wav")

## Generate metadata as in original format used in AutoVC github repository

In [135]:
"""
Generate speaker embeddings, mel-spectrograms, and metadata as in the original AutoVC repository.

For generating metadata:

'emb_source directory' should contain subdirectories with their names containing speaker identity and the files within each 
subdirectory should be .wav files, and the mean of embeddings from all the files will be taken as speaker embedding in the metadata file.

'to_be_converted' directory should contain the subdirectories with each subdirectory named as speaker identity as before. Each subdirectory
should contain the .wav files that are to be style transferred.
"""

# import os
# import pickle
# from model_bl import D_VECTOR
# from collections import OrderedDict
# import numpy as np
# import torch
# import soundfile as sf
# import librosa
# from scipy import signal

# Speaker encoder used to turn mel spectrograms into speaker embeddings.
C = D_VECTOR(dim_input=80, dim_cell=768, dim_emb=256).eval().cuda()
c_checkpoint = torch.load('3000000-BL.ckpt')

# The first 7 characters of every checkpoint key are dropped before loading
# (presumably a 'module.' prefix left by a wrapped model - TODO confirm).
new_state_dict = OrderedDict(
    (key[7:], val) for key, val in c_checkpoint['model_b'].items()
)
C.load_state_dict(new_state_dict)


# Directory containing speaker subdirectories
speakers_dir = './to_be_converted'
embeddings_dir = './emb_source'

# One [speaker id, speaker embedding, mel spectrogram] entry per utterance.
speakers = []
for speaker_dir in sorted(os.listdir(speakers_dir)):
    speaker_path = os.path.join(speakers_dir, speaker_dir)
    if not os.path.isdir(speaker_path):
        continue
    print(f'Processing speaker: {speaker_dir}')

    # The mean embedding depends only on the speaker, so it is computed once
    # per speaker (on the first utterance found) rather than per utterance.
    speaker_embed = None
    for wav_file in sorted(os.listdir(speaker_path)):
        if not wav_file.endswith('.wav'):
            continue
        mel_spectrogram = gmel(os.path.join(speaker_path, wav_file))
        if speaker_embed is None:
            speaker_embed = get_mean_embedding(
                C, os.path.join(embeddings_dir, speaker_dir)
            )
        speakers.append([speaker_dir, speaker_embed, mel_spectrogram])

with open(os.path.join(speakers_dir, 'metadata_new.pkl'), 'wb') as handle:
    pickle.dump(speakers, handle)

Processing speaker: p225
Processing speaker: p226
Processing speaker: p227
Processing speaker: p228
