In [4]:
from multiprocessing.pool import Pool 
from synthesizer import audio
from functools import partial
from itertools import chain
# from encoder import inference as encoder
from pathlib import Path
from utils import logmmse
from tqdm import tqdm
import numpy as np
import librosa
import os

In [5]:
def preprocess_aishell(datasets_root, out_dir, n_processes, 
                           skip_existing, hparams, pairs):
    # Gather the input directories
    
    print("\n  Using data from:  " + datasets_root)
    
    # Create the output directories for each output file type
    out_dir.joinpath("mels").mkdir(exist_ok=True)
    out_dir.joinpath("audio").mkdir(exist_ok=True)
    
    # Create a metadata file
    metadata_fpath = out_dir.joinpath("train.txt")
    metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")

    #print(speaker_dirs)
    func = partial(preprocess_speaker, out_dir=out_dir, skip_existing=skip_existing, 
                   hparams=hparams, datasets_root=datasets_root)
    job = Pool(n_processes).imap(func, pairs)
    for speaker_metadata in tqdm(job, "AI-SHELL", len(pairs), unit="speakers"):
        for metadatum in speaker_metadata:
            metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
    metadata_file.close()

    # Verify the contents of the metadata file
    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
        metadata = [line.split("|") for line in metadata_file]
    mel_frames = sum([int(m[4]) for m in metadata])
    timesteps = sum([int(m[3]) for m in metadata])
    sample_rate = hparams.sample_rate
    hours = (timesteps / sample_rate) / 3600
    print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
          (len(metadata), mel_frames, timesteps, hours))
    print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata))
    print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
    print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))

In [1]:
#TODO: update this acoordingly
def get_ssb_audio_relative_path(sid : str) -> str : 
    """
    returns relative path to utterance (sentence) according to given `sid`
    """
    spkid = sid[:7]
    return os.path.join(spkid, f'{sid}.wav')

def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, datasets_root):
    metadata = []
    #print(speaker_dir.glob("*.wav"))
    #wav_paths = list(chain.from_iterable(speaker_dir.glob("*.wav")))
    wav_texts = list( (os.path.join(datasets_root, get_ssb_audio_relative_path(x[0])), x[2]) for x in speaker_dir)
    wavs = []
    texts = []
    wav_paths = []
    for wav_path, text in wav_texts:
        wav = split_on_silences(wav_path, hparams)
        wavs.append(wav)
        texts.append(text)
        wav_paths.append(wav_path)
    assert len(wav_paths) == len(wavs) == len(texts)
    for i, (wav, text) in enumerate(zip(wavs, texts)):
            metadata.append(process_utterance(wav, text, out_dir, wav_paths[i].split('/')[-1], 
                                            skip_existing, hparams))
    return [m for m in metadata if m is not None]



NameError: name 'Path' is not defined

In [7]:
def split_on_silences(wav_fpath, hparams):
    # Load the audio waveform
    wav, _ = librosa.load(wav_fpath, hparams.sample_rate)
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
    
    return wav

In [8]:
def process_utterance(wav, text, out_dir, basename, 
                      skip_existing, hparams):
    ## FOR REFERENCE:
    # For you not to lose your head if you ever wish to change things here or implement your own
    # synthesizer.
    # - Both the audios and the mel spectrograms are saved as numpy arrays
    # - There is no processing done to the audios that will be saved to disk beyond volume  
    #   normalization (in split_on_silences)
    # - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This
    #   is why we re-apply it on the audio on the side of the vocoder.
    # - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved
    #   without extra padding. This means that you won't have an exact relation between the length
    #   of the wav and of the mel spectrogram. See the vocoder data loader.
    
    
    # Skip existing utterances if needed
    mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
    wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
        return None
    
    # Skip utterances that are too short
    if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
        return None
    
    # Compute the mel spectrogram
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    
    # Skip utterances that are too long
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None
    
    # Write the spectrogram, embed and audio to disk
    np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
    np.save(wav_fpath, wav, allow_pickle=False)
    
    # Return a tuple describing this training example
    return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text

# START PROCESSING
## step one : gather metadata description into bathces

In [19]:
batch_size=200  # was 200
pairs = []
with open('datasets/best_of_ssb/metadata.csv') as f:
    subpairs = []
    begin = True
    for line in f.readlines():
        subpairs.append(line.strip().split('|'))
        if len(subpairs) % batch_size == 0 and not begin:
            pairs.append(subpairs)
            subpairs = []
            
        begin = False
    if len(subpairs) > 0:
        pairs.append(subpairs)


len(pairs)

324

In [20]:
pairs[0][0]

['SSB00430444',
 '持起红缨枪追赶对方半公里',
 'CH IY2 Q IY3 HH UH2 NG2 Y IY1 NG1 Q IY1 AE1 NG1 JH UW1 IY1 G AE3 N3 D UW4 IY4 F AE1 NG1 B AE4 N4 G UH1 NG1 L IY3',
 'SSB0043']

## step two : preprocess and write to dataset directory
1. specify directory to raw data as `rootpath`.

the expected structure of this directory:
```
<rootpath>/<speaker-id>/<sentence-id>.wav
```
the `speaker-id` is derived from sentence-ids, you should override `get_ssb_audio_relative_path(sid:str) -> str` accordingly.

2. specify output directory as `outpath`, this path is afterwards used in training

In [21]:
from synthesizer.hparams import hparams
from pathlib import Path

rootpath = '/NASdata/AudioData/AISHELL-ASR-SSB/SPEECHDATA'
outpath  = Path('datasets').joinpath('best_of_ssb')
outpath.mkdir(exist_ok=True, parents=True)


preprocess_aishell(
    rootpath, 
    outpath, 
    20, 
    True, 
    hparams, 
    pairs
)


  Using data from:  /NASdata/AudioData/AISHELL-ASR-SSB/SPEECHDATA


AI-SHELL: 100%|██████████| 324/324 [25:38<00:00,  4.75s/speakers] 


The dataset consists of 63262 utterances, 18227087 mel frames, 3638758031 audio timesteps (63.17 hours).
Max input length (text chars): 329
Max mel frames length: 881
Max audio timesteps length: 176161
