In [5]:
import yaml
import os
from tqdm import tqdm
from text import _clean_text
from scipy.io import wavfile
import librosa
import numpy as np
from scipy.interpolate import interp1d
import tgt
import pyworld as pw
from sklearn.preprocessing import StandardScaler
from audio import *
import random
import json

In [79]:
class Preprocessor:
    """Prepare an LJSpeech-style corpus for acoustic-model training.

    The work is split in two stages:

    * ``preprocess_normalize`` reads the original corpus, cleans each
      transcript and writes a peak-normalized 16-bit wav next to it.
    * ``generate_training_data`` reads those files together with
      pre-computed TextGrid alignments and writes per-phoneme duration,
      pitch and energy arrays, mel spectrograms, global statistics and the
      train/validation file lists.

    All paths and signal-processing parameters come from the YAML file
    passed to the constructor.
    """

    def __init__(self, preprocessor_config_path):
        """Load the YAML configuration and build the STFT/mel front end.

        Args:
            preprocessor_config_path: path to the preprocessing YAML file.
        """
        # NOTE(review): FullLoader can build more than plain data types;
        # prefer yaml.safe_load if the config may come from an untrusted source.
        # The context manager closes the handle (the original leaked it).
        with open(preprocessor_config_path, "r", encoding="utf-8") as config_file:
            self.config = yaml.load(config_file, Loader=yaml.FullLoader)

        preprocessing = self.config["preprocessing"]
        self.in_dir = self.config["path"]["raw_path"]
        self.out_dir = self.config["path"]["preprocessed_path"]
        self.val_size = preprocessing["val_size"]
        self.sampling_rate = preprocessing["audio"]["sampling_rate"]
        self.hop_length = preprocessing["stft"]["hop_length"]
        # Both features are averaged per phoneme; the two labels now agree
        # (energy previously read "phoneme level", without the underscore).
        self.pitch_averaging = "phoneme_level"
        self.energy_averaging = "phoneme_level"

        self.stft = audio.stft.TacotronSTFT(
            filter_length=preprocessing["stft"]["filter_length"],
            hop_length=preprocessing["stft"]["hop_length"],
            win_length=preprocessing["stft"]["win_length"],
            n_mel_channels=preprocessing["mel"]["n_mel_channels"],
            sampling_rate=preprocessing["audio"]["sampling_rate"],
            mel_fmin=preprocessing["mel"]["mel_fmin"],
            mel_fmax=preprocessing["mel"]["mel_fmax"],
        )

    def preprocess_normalize(self):
        """Clean transcripts and peak-normalize the wav files of the corpus.

        Reads ``metadata.csv`` from ``corpus_path`` (pipe-separated; column 0
        is the file name, column 2 the text). For every entry whose wav
        exists, writes ``<name>.txt`` (cleaned text) and ``<name>.wav``
        (int16, configured sampling rate) into ``raw_path``.
        """
        in_dir = self.config["path"]["corpus_path"]
        out_dir = self.config["path"]["raw_path"]
        max_wav_val = self.config["preprocessing"]["audio"]["max_wav_value"]
        sampling_rate = self.config["preprocessing"]["audio"]["sampling_rate"]

        # Create the output directory once instead of checking per line.
        os.makedirs(out_dir, exist_ok=True)
        int16_limits = np.iinfo(np.int16)

        with open(os.path.join(in_dir, "metadata.csv"), encoding="utf-8") as file:
            for line in tqdm(file, ncols=100):
                columns = line.strip().split("|")
                if len(columns) < 3:
                    # Blank or malformed row: nothing to clean, skip it.
                    continue
                file_name, text = columns[0], columns[2]

                audio_file_path = os.path.join(in_dir, "wavs", f"{file_name}.wav")
                if not os.path.exists(audio_file_path):
                    continue

                cleaned_text = _clean_text(text)
                with open(
                    os.path.join(out_dir, f"{file_name}.txt"), "w", encoding="utf-8"
                ) as new_text_file:
                    new_text_file.write(cleaned_text)

                # Load at the rate the file is written with; without ``sr`` the
                # audio is resampled to librosa's default and the header lies
                # whenever the configured rate differs.
                wav, _ = librosa.load(audio_file_path, sr=sampling_rate)
                peak = np.max(np.abs(wav)) if wav.size else 0.0
                if peak > 0:  # a silent clip would otherwise divide by zero
                    wav = wav / peak * max_wav_val
                # A positive peak scaled to 32768 does not fit in int16 and
                # would wrap to -32768 on the cast, so clip first.
                wav = np.clip(wav, int16_limits.min, int16_limits.max)

                wavfile.write(
                    os.path.join(out_dir, f"{file_name}.wav"),
                    sampling_rate,
                    wav.astype(np.int16),
                )

    def generate_training_data(self):
        """Extract duration, pitch, energy and mel features for every utterance.

        Utterances without a TextGrid (or without usable voiced frames) are
        skipped. After all files are processed the pitch/energy arrays are
        standardized in place on disk, ``stats.json`` is written, and the
        shuffled metadata lines are split into ``val.txt`` (first
        ``val_size`` entries) and ``train.txt`` (the rest).

        Returns:
            The shuffled list of ``"name|{phones}|raw text"`` strings.

        Raises:
            RuntimeError: if no utterance produced pitch and energy values.
        """
        for feature in ("duration", "pitch", "energy", "mel"):
            os.makedirs(os.path.join(self.out_dir, feature), exist_ok=True)

        out = []
        n_frames = 0
        pitch_scaler = StandardScaler()
        energy_scaler = StandardScaler()

        print("Generating Data for training")

        wav_files = [
            name for name in os.listdir(self.in_dir) if name.endswith(".wav")
        ]
        assert len(wav_files) != 0, "no .wav files found in {}".format(self.in_dir)

        for wav_name in tqdm(wav_files, ncols=100):
            # splitext keeps dots inside the basename ("a.b.wav" -> "a.b").
            file_name = os.path.splitext(wav_name)[0]

            result = self._process_utterance(file_name)
            if result is None:
                # Previously the bookkeeping below ran even for skipped files,
                # re-using (or lacking) the values of the previous utterance.
                continue
            info, pitch, energy, n = result

            out.append(info)
            if len(pitch) > 0:
                pitch_scaler.partial_fit(pitch.reshape((-1, 1)))
            if len(energy) > 0:
                energy_scaler.partial_fit(energy.reshape((-1, 1)))
            n_frames += n

        if not hasattr(pitch_scaler, "mean_") or not hasattr(energy_scaler, "mean_"):
            raise RuntimeError(
                "no utterance yielded pitch/energy values; check that TextGrid "
                "files exist under {}".format(os.path.join(self.out_dir, "TextGrid"))
            )

        # Global statistics used to standardize the saved arrays.
        pitch_mean = pitch_scaler.mean_[0]
        pitch_std = pitch_scaler.scale_[0]
        energy_mean = energy_scaler.mean_[0]
        energy_std = energy_scaler.scale_[0]

        # Rewrites every array in the directory in standardized form and
        # reports the extrema of the standardized values.
        pitch_min, pitch_max = self._normalize(
            os.path.join(self.out_dir, "pitch"), pitch_mean, pitch_std
        )
        energy_min, energy_max = self._normalize(
            os.path.join(self.out_dir, "energy"), energy_mean, energy_std
        )

        # Persist min, max, mean and std of pitch and energy for later use.
        with open(os.path.join(self.out_dir, "stats.json"), "w") as f:
            stats = {
                "pitch": [
                    float(pitch_min),
                    float(pitch_max),
                    float(pitch_mean),
                    float(pitch_std),
                ],
                "energy": [
                    float(energy_min),
                    float(energy_max),
                    float(energy_mean),
                    float(energy_std),
                ],
            }
            f.write(json.dumps(stats))

        print(
            "Total time: {} hours".format(
                n_frames * self.hop_length / self.sampling_rate / 3600
            )
        )

        # Shuffle, then take the first ``val_size`` entries for validation.
        random.shuffle(out)

        with open(os.path.join(self.out_dir, "train.txt"), "w", encoding="utf-8") as f:
            for m in out[self.val_size :]:
                f.write(m + "\n")
        with open(os.path.join(self.out_dir, "val.txt"), "w", encoding="utf-8") as f:
            for m in out[: self.val_size]:
                f.write(m + "\n")

        return out

    def _process_utterance(self, file_name):
        """Compute and save all features of one utterance.

        Args:
            file_name: basename (no extension) shared by the wav, txt and
                TextGrid files.

        Returns:
            ``(info, pitch, energy, n_frames)`` where ``pitch``/``energy``
            are the outlier-filtered per-phoneme values used for the global
            statistics, or ``None`` when the utterance has no TextGrid, no
            non-silent span, or fewer than two voiced frames.
        """
        text_grid_path = os.path.join(
            self.out_dir, "TextGrid", f"{file_name}.TextGrid"
        )
        if not os.path.exists(text_grid_path):
            return None
        wav_path = os.path.join(self.in_dir, "{}.wav".format(file_name))
        text_path = os.path.join(self.in_dir, "{}.txt".format(file_name))

        text_grid = self._read_text_grid(text_grid_path)
        phones, durations, start, end = self.get_duration(
            text_grid.get_tier_by_name("phones")
        )
        if start >= end:
            # Only silence was aligned; there is no audio to analyse.
            return None
        text = "{" + " ".join(phones) + "}"
        total_frames = sum(durations)

        # Read the wav at the configured rate and trim leading/trailing silence.
        wav, _ = librosa.load(wav_path, sr=self.sampling_rate)
        wav = wav[
            int(self.sampling_rate * start) : int(self.sampling_rate * end)
        ].astype(np.float32)

        with open(text_path, encoding="utf-8") as text_file:
            raw_text = text_file.readline().strip("\n")

        # Fundamental frequency (our pitch): DIO estimate refined by StoneMask,
        # one value per hop, truncated to the aligned frame count.
        wav64 = wav.astype(np.float64)
        pitch, t = pw.dio(
            wav64,
            self.sampling_rate,
            frame_period=self.hop_length / self.sampling_rate * 1000,
        )
        pitch = pw.stonemask(wav64, pitch, t, self.sampling_rate)
        pitch = pitch[:total_frames]

        nonzero_ids = np.where(pitch != 0)[0]
        if len(nonzero_ids) <= 1:
            # Interpolation needs at least two voiced frames.
            return None

        # Mel spectrogram and per-frame energy, truncated the same way.
        mel, energy = audio.tools.get_mel_from_wav(wav, self.stft)
        mel = mel[:, :total_frames]
        energy = energy[:total_frames]

        # Fill unvoiced (zero) pitch frames by linear interpolation; frames
        # outside the voiced range take the nearest voiced value.
        inter_pol = interp1d(
            nonzero_ids,
            pitch[nonzero_ids],
            fill_value=(pitch[nonzero_ids[0]], pitch[nonzero_ids[-1]]),
            bounds_error=False,
        )
        pitch = inter_pol(np.arange(0, len(pitch)))

        # One pitch and one energy value per phone.
        pitch = self._average_by_duration(pitch, durations)
        energy = self._average_by_duration(energy, durations)

        # Save only after every step succeeded so the feature folders stay
        # consistent with the metadata lists.
        np.save(
            os.path.join(self.out_dir, "duration", "{}-duration.npy".format(file_name)),
            durations,
        )
        np.save(
            os.path.join(self.out_dir, "pitch", "{}-pitch.npy".format(file_name)),
            pitch,
        )
        np.save(
            os.path.join(self.out_dir, "energy", "{}-energy.npy".format(file_name)),
            energy,
        )
        np.save(
            os.path.join(self.out_dir, "mel", "{}-mel.npy".format(file_name)),
            mel.T,
        )

        info = "|".join([file_name, text, raw_text])
        return (
            info,
            self._remove_outlier(pitch),
            self._remove_outlier(energy),
            mel.shape[1],
        )

    @staticmethod
    def _average_by_duration(values, durations):
        """Average frame-level ``values`` over consecutive phone spans.

        Args:
            values: 1-D frame-level sequence.
            durations: number of frames of each phone, in order.

        Returns:
            Array with one entry per phone: the mean of its frames, or 0 for
            phones with no frames (zero duration or span beyond ``values``).
        """
        values = np.asarray(values)
        dtype = values.dtype if np.issubdtype(values.dtype, np.floating) else np.float64
        # Write into a fresh array: averaging in place overwrote frames that a
        # later phone still had to read whenever zero-length phones occurred.
        averaged = np.zeros(len(durations), dtype=dtype)
        pos = 0
        for i, d in enumerate(durations):
            if d > 0:
                segment = values[pos : pos + d]
                if segment.size > 0:
                    averaged[i] = np.mean(segment)
            pos += d
        return averaged

    def get_duration(self, text_grid):
        """Convert a phone tier into phone labels and frame counts.

        Leading and trailing silences are dropped; silences between spoken
        phones are kept.

        Args:
            text_grid: tier whose ``_objects`` are intervals exposing
                ``start_time``, ``end_time`` and ``text``.

        Returns:
            ``(phones, durations, start_time, end_time)`` where ``durations``
            are frame counts at ``sampling_rate / hop_length`` frames per
            second and the times bound the non-silent span in seconds.
        """
        silent_phones = ("sil", "sp", "spn")

        phones = []
        durations = []
        start_time = 0
        end_time = 0
        end_idx = 0

        for interval in text_grid._objects:
            s, e, p = interval.start_time, interval.end_time, interval.text
            is_silent = p in silent_phones

            if not phones:
                if is_silent:
                    # Still inside the leading silence.
                    continue
                start_time = s

            phones.append(p)
            if not is_silent:
                # Last spoken phone seen so far marks the end of the utterance.
                end_time = e
                end_idx = len(phones)

            durations.append(
                int(
                    np.round(e * self.sampling_rate / self.hop_length)
                    - np.round(s * self.sampling_rate / self.hop_length)
                )
            )

        # Drop any silence after the last spoken phone.
        phones = phones[:end_idx]
        durations = durations[:end_idx]

        return phones, durations, start_time, end_time

    def _remove_outlier(self, values):
        """Return the entries strictly inside the 1.5 * IQR fences.

        Note that a constant input has zero IQR, so every entry is rejected
        and an empty array is returned.
        """
        values = np.array(values)
        p25 = np.percentile(values, 25)
        p75 = np.percentile(values, 75)
        lower = p25 - 1.5 * (p75 - p25)
        upper = p75 + 1.5 * (p75 - p25)
        normal_indices = np.logical_and(values > lower, values < upper)

        return values[normal_indices]

    def _normalize(self, in_dir, mean, std):
        """Standardize every ``.npy`` array in ``in_dir`` in place.

        Each array is replaced on disk by ``(array - mean) / std``.

        NOTE(review): calling this twice on the same files standardizes them
        twice; arrays left over from an earlier run are affected.

        Returns:
            ``(min_value, max_value)`` over all standardized values. When no
            values are found the float64 extremes are returned unchanged.
        """
        max_value = np.finfo(np.float64).min
        min_value = np.finfo(np.float64).max
        for filename in os.listdir(in_dir):
            if not filename.endswith(".npy"):
                # e.g. ".ipynb_checkpoints" created by Jupyter.
                continue
            path = os.path.join(in_dir, filename)
            values = (np.load(path) - mean) / std
            np.save(path, values)

            if values.size == 0:
                continue
            max_value = max(max_value, np.max(values))
            min_value = min(min_value, np.min(values))

        return min_value, max_value

    def _read_text_grid(self, text_grid_name):
        """Parse the TextGrid file at ``text_grid_name`` with ``tgt``."""
        return tgt.io.read_textgrid(text_grid_name)
    
    

In [80]:
# Build the preprocessor from the YAML config; the trailing bare expression
# displays the parsed configuration dictionary as the cell output.
p = Preprocessor("./preprocess.yaml")
p.config

{'dataset': 'LJSpeech',
 'path': {'corpus_path': './LJSpeech',
  'lexicon_path': './lexicon/lexicon.txt',
  'raw_path': './normalized_audio',
  'preprocessed_path': './processed_data'},
 'preprocessing': {'val_size': 512,
  'text': {'text_cleaners': ['english_cleaners'], 'language': 'en'},
  'audio': {'sampling_rate': 22050, 'max_wav_value': 32768.0},
  'stft': {'filter_length': 1024, 'hop_length': 256, 'win_length': 1024},
  'mel': {'n_mel_channels': 80, 'mel_fmin': 0, 'mel_fmax': 8000},
  'pitch': {'feature': 'phoneme_level', 'normalization': True},
  'energy': {'feature': 'phoneme_level', 'normalization': True}}}

In [76]:
# Stage 1: clean the transcripts and write normalized wav files to raw_path.
# NOTE(review): this cell's execution count (76) predates the class cell (79),
# so the recorded output may come from an earlier class definition - re-run
# top to bottom to confirm.
p.preprocess_normalize()

13100it [08:31, 25.59it/s]


In [81]:
# Stage 2: extract duration/pitch/energy/mel features and write stats.json,
# train.txt and val.txt. The method returns the full metadata list, which the
# notebook echoes below as the cell output.
p.generate_training_data()

Generating Data for training


  0%|                                                                       | 0/100 [00:00<?, ?it/s]

100%|█████████████████████████████████████████████████████████████| 100/100 [00:14<00:00,  6.80it/s]

Total time: 0.1798095238095238 hours





["LJ001-0066|{IH1 N F AE1 K T sp G AH1 N TH ER0 spn EH1 S F ER1 S T T AY1 P sp AE1 F T ER0 W ER0 D Z Y UW1 Z D B AY1 spn sp IH1 Z R IH0 M AA1 R K AH0 B L IY0 L AY1 K DH IY0 T AY1 P sp AH1 V DH AH0 B IH0 F AO1 R M EH1 N SH AH0 N D S UW0 B IY1 AH0 K OW0 B UH1 K S}|in fact gunther zeiner's first type (afterwards used by schussler) is remarkably like the type of the before-mentioned subiaco books.",
 'LJ001-0027|{AH0 S P EH1 SH AH0 L IY0 EH1 Z R IH0 G AA1 R D Z DH AH1 L OW1 ER0 K EY1 S L EH1 T ER0 Z sp AE1 N D sp T AY1 P V EH1 R IY0 S IH1 M AH0 L ER0 W AH1 Z Y UW1 Z D D Y UH1 R IH0 NG DH AH0 N EH1 K S T F IH1 F T IY1 N AO1 R T W EH1 N T IY0 Y IH1 R Z sp N AA1 T OW1 N L IY0 B AY1 SH OW1 F ER0}|especially as regards the lower-case letters; and type very similar was used during the next fifteen or twenty years not only by schoeffer,',
 'LJ001-0144|{DH AH1 W ER1 D Z M EY1 B IY1 S EH1 T M AH1 CH K L OW1 S ER0 T AH0 G EH1 DH ER0 sp W IH0 DH AW1 T L AO1 S AH0 V K L EH1 R N AH0 S}|the words may be