In [4]:
from tqdm import tqdm
import torch
from IPython.display import Audio
from IPython.utils import io
from TTS.encoder import inference as encoder
from TTS.sc_wave_rnn import inference as sc_vocoder
from pathlib import Path
import numpy as np
import librosa
import glob
import umap
import copy

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline 

import pandas as pd

import subprocess
import base64
import pickle
from subprocess import Popen, PIPE

import os
import soundfile

ModuleNotFoundError: No module named 'encoder'

### Load models

In [2]:
# Checkpoint files, given relative to the notebook's working directory.
encoder_weights = Path("./encoder/saved_models/pretrained.pt")
sc_vocoder_weights = Path("./sc_wave_rnn/saved_models/pretrained.pt")

# 80-channel speaker encoder
encoder.load_model(encoder_weights)

# Speaker-conditional WaveRNN vocoder
sc_vocoder.load_model(sc_vocoder_weights)

Loaded encoder "pretrained.pt" trained to step 1500001
Building SC Wave-RNN
Trainable Parameters: 4.612M
Loading model weights at sc_wave_rnn/saved_models/pretrained.pt


### Functions

In [50]:
# calculate embedding

def get_embedings(speaker_rec_paths):
    """
        Calculate speaker-encoder embeddings for a list of speaker recordings.

        Every recording is loaded with ``encoder.preprocess_wav`` and embedded with
        ``encoder.embed_utterance(..., return_partials=True)``.

        Input: speaker_rec_paths - list of paths to the speaker's recordings (must not be empty).
        Returns: tuple (mean_embedding, embeddings, partial_embeddings) where
                 mean_embedding - mean over the partial embeddings of ALL recordings,
                                  scaled to unit L2 norm,
                 embeddings - one utterance embedding per recording, in input order,
                 partial_embeddings - flat list with the partial embeddings of every recording.
        Raises: ValueError - if no partial embedding was produced (e.g. empty input list).
    """
    embeddings = []
    partial_embeddings = []

    for rec_path in tqdm(speaker_rec_paths, desc="Embeddings calculations"):
        encoder_wav = encoder.preprocess_wav(Path(rec_path))
        embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
        embeddings.append(embed)
        partial_embeddings.extend(partial_embeds)

    # Without this check an empty input produced a NaN mean and then an opaque
    # "Improper number of dimensions to norm" ValueError from numpy.
    if not partial_embeddings:
        raise ValueError("No partial embeddings were computed: speaker_rec_paths is empty "
                         "or the recordings produced no partial utterances")

    # The speaker vector is averaged over partial embeddings (not over the
    # per-recording embeddings) and then re-normalised to unit length.
    mean_embedding = np.mean(partial_embeddings, axis=0)
    mean_embedding = mean_embedding / np.linalg.norm(mean_embedding, 2)

    # Ask PyTorch to release the CUDA memory it is caching.
    torch.cuda.empty_cache()

    return mean_embedding, embeddings, partial_embeddings

In [51]:
def generate_mels_for_text(text_fragments, embedding, output_directory, synth_batch_size = 64):
    """
        Generate MEL spectrograms for a list of sentences and save each of them
        to a numpy file.

        The synthesis itself is delegated to the external script ``calculate_mels.py``,
        which is started once per batch of ``synth_batch_size`` fragments. The batch
        and the embedding are handed over as base64-encoded pickles on the command
        line, and the script's result is read back from the pickle file
        ``generated_mels.txt`` in the current working directory.

        NOTE(review): nothing in this function inserts pauses after punctuation;
        if that happens at all it happens inside ``calculate_mels.py`` - confirm there.
        NOTE(review): a whole batch travels through argv, so a very large
        ``synth_batch_size`` may run into the OS limit on argument length.

        Input: text_fragments - list of sentences,
               embedding - embedding vector for speaker,
               output_directory - path to directory where spectrograms will be saved
                                  as ``0.npy``, ``1.npy``, ... in generation order,
               synth_batch_size - number of fragments sent to the script per call.
        Returns: specs - list of MEL spectrograms, one per fragment returned by the script.
        Raises: ValueError - if synth_batch_size is smaller than 1,
                RuntimeError - if ``calculate_mels.py`` exits with a non-zero status.
    """
    if synth_batch_size < 1:
        raise ValueError("synth_batch_size must be a positive integer, got {!r}".format(synth_batch_size))

    specs = []
    num_fragments = len(text_fragments)

    # Ceiling division. The previous "// + 1" formula scheduled an extra, empty
    # batch whenever num_fragments was an exact multiple (>= 2x) of the batch size.
    mels_generation_iterations = -(-num_fragments // synth_batch_size)

    encoded_embd = base64.b64encode(pickle.dumps(embedding))

    for i in tqdm(range(mels_generation_iterations), desc="Mels generation"):
        batch_start = i * synth_batch_size
        fragments_batch = text_fragments[batch_start:batch_start + synth_batch_size]

        encoded_fragments_batch = base64.b64encode(pickle.dumps(fragments_batch))
        process = subprocess.run(['python','calculate_mels.py', '--t',
                                  encoded_fragments_batch, '--e', encoded_embd],
                                  stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        # A failed run must not fall through to the read below: the file left by
        # the previous batch would be loaded again and saved under new indices.
        if process.returncode != 0:
            raise RuntimeError("calculate_mels.py failed with exit code {}: {}".format(
                process.returncode, process.stderr.decode(errors="replace")))

        # The file is written by our own subprocess; pickle.load executes arbitrary
        # code, so never point this at a file from an untrusted source.
        with open ('generated_mels.txt', 'rb') as fp:
            specs_batch = pickle.load(fp)

        for sample in specs_batch:
            # Running index over all batches; np.save appends the ".npy" suffix.
            filename = os.path.join(output_directory, str(len(specs)))
            np.save(filename, sample)
            specs.append(sample)

    return specs

In [52]:
def generate_audio_from_mels(specs, embed):
    """
        Run the Speaker Conditional Vocoder on every MEL spectrogram in a list.
        Input: specs - list of MEL spectrograms
               embed - embedding vector for speaker
        Returns: list with one generated waveform per spectrogram, in input order.
    """
    def _vocode(position):
        # One spectrogram per vocoder call (batch size == 1).
        waveform = sc_vocoder.infer_waveform(specs[position], embed)
        # Ask PyTorch to release the CUDA memory it is caching after each call.
        torch.cuda.empty_cache()
        return waveform

    return [_vocode(position)
            for position in tqdm(range(len(specs)), desc="Audio generation")]

In [61]:
def generate_audio_from_melfile(input_directory, output_directory, embed, sample_rate=16000):
    """
        Generate audio for the MEL spectrograms stored in input_directory.

        Spectrogram files are expected to be named ``<integer>.npy``. For each of
        them a ``<integer>.wav`` file is written to output_directory. Spectrograms
        whose wav file already exists are skipped, so the function can be re-run
        after an interruption.

        Every waveform gets ``sample_rate`` zero samples appended and is then
        scaled so that its peak amplitude is 0.97; an all-zero waveform is
        written unscaled.

        NOTE(review): only the existence of a wav file is checked, so a file left
        truncated by an interrupted run is still treated as done.

        Input: input_directory - path to directory where spectrograms were saved,
               output_directory - path to directory where synthesized audios will be saved,
               embed - embedding vector for speaker,
               sample_rate - rate written to the wav files, also the number of padding samples.
        Raises: ValueError - if a ``.npy`` / ``.wav`` file name does not start with an integer.
    """
    def _file_id(path):
        return int(os.path.basename(path).split('.')[0])

    # A set keeps the "already generated?" lookup O(1); with tens of thousands
    # of files the former list lookup made this filtering quadratic.
    done_ids = {_file_id(path) for path in glob.glob(os.path.join(output_directory, "*.wav"))}

    # glob returns files in arbitrary order; sort by id for a deterministic run.
    pending = sorted(
        (mel_id, mel_path)
        for mel_id, mel_path in ((_file_id(path), path)
                                 for path in glob.glob(os.path.join(input_directory, "*.npy")))
        if mel_id not in done_ids
    )

    for mel_id, mel_path in tqdm(pending, desc="Audio generation"):
        mel_sample = np.load(mel_path)
        generated_wav = sc_vocoder.infer_waveform(mel_sample, embed)
        generated_wav = np.pad(generated_wav, (0, sample_rate), mode="constant")

        # Peak-normalise, but leave silence alone: 0 / 0 would fill the file with NaN.
        peak = np.abs(generated_wav).max()
        if peak > 0:
            generated_wav = generated_wav / peak * 0.97

        soundfile.write(os.path.join(output_directory, str(mel_id) + '.wav'),
                        generated_wav, samplerate=sample_rate)

### Generate embedding for LJSpeech speaker

In [54]:
data_type = 'unseen'
speaker_n = 1

embedings = []  # kept from the original cell; no visible cell reads it

# glob returns matches in arbitrary order, so sort: the previewed recording
# and the order passed to the encoder are then the same on every run.
speaker_rec_paths = sorted(glob.glob("speakers_samples/"+data_type+"/"+str(speaker_n)+"/*.wav"))
if not speaker_rec_paths:
    raise FileNotFoundError("No .wav recordings found in speakers_samples/"
                            + data_type + "/" + str(speaker_n) + "/")

# Listen to the first recording as a sanity check.
wav = encoder.preprocess_wav(speaker_rec_paths[0])
display(Audio(wav, rate=encoder.sampling_rate))

In [55]:
mean_embedding, embeddings, partial_embeddings = get_embedings(speaker_rec_paths)

# Mean of all partial embeddings, scaled to unit L2 norm.
partial_mean = np.mean(partial_embeddings, axis=0)
mean_partial_embed = partial_mean / np.linalg.norm(partial_mean, 2)
print(len(partial_embeddings))

Embeddings calculations: 100%|██████████| 20/20 [00:02<00:00,  6.84it/s]

138





### Generate MELs

In [56]:
# Read the sentence table and pull its `sentence` column out as a plain list.
df_mtsamples = pd.read_csv('normalized_mtsamples.csv', index_col=False)
sentences = df_mtsamples["sentence"].tolist()
print('Number of sentences:', len(sentences))

Number of sentences: 94128


In [57]:
# Show the first ten entries of `sentences` as a quick look at the text.
sentences[:10]

['she has also had a hysterectomy,',
 "this is actually rather severe when adjusted for the patient's low albumin.",
 'reason for referral: evaluation for right liters four selective nerve root block.',
 'blood urea nitrogen and creatinine are within normal limits.',
 'due to this finding, it was evident that the fetal state would not support labor in order to accomplish a vaginal delivery.',
 'hemostasis was assured within the mesentery and at the base of the cecum.',
 'it was cleared circumferentially all the way around and noted to have good flow and had good arterial texture.',
 'specimens: prostatic resection chips.',
 'liters thigh thirty nine centimeters.',
 'all sponge, needle, and instrument counts were correct.']

In [58]:
# Output location for the generated MEL files; created together with any
# missing parent directories, and left untouched if it already exists.
mel_directory="/SSD-2T/medical_domain_adaptation_dataset/mels/"
os.makedirs(mel_directory, exist_ok=True)

In [None]:
# Synthesize a MEL spectrogram for every sentence, 64 sentences per batch
specs = generate_mels_for_text(
    text_fragments=sentences,
    embedding=mean_embedding,
    output_directory=mel_directory,
    synth_batch_size=64,
)

Mels generation:  95%|█████████▌| 1403/1471 [6:47:47<25:33, 22.55s/it]  

### Generate audio

In [59]:
# Output location for the synthesized wav files; created together with any
# missing parent directories, and left untouched if it already exists.
wavs_directory="/SSD-2T/medical_domain_adaptation_dataset/wavs/"
os.makedirs(wavs_directory, exist_ok=True)

In [62]:
# Vocode every saved MEL file that does not have a wav yet
generate_audio_from_melfile(mel_directory, wavs_directory, mean_embedding)

Audio generation: 100%|██████████| 65139/65139 [74:07:54<00:00,  4.10s/it]   


In [13]:
# Vocode a single spectrogram (index 3 of `specs`) for a listening check
selected_specs = [specs[3]]
wav_list = generate_audio_from_mels(specs=selected_specs, embed=mean_embedding)

Audio generation: 100%|██████████| 1/1 [00:04<00:00,  4.65s/it]


In [14]:
sample_rate = 16000

# Join the generated pieces, append `sample_rate` zero samples (one second at
# this rate) and scale the result to a peak amplitude of 0.97.
joined_wav = np.concatenate(wav_list, axis = 0)
padded_wav = np.pad(joined_wav, (0, sample_rate), mode="constant")
generated_wav = padded_wav / np.abs(padded_wav).max() * 0.97

display(Audio(generated_wav, rate=sample_rate))