In [3]:
import os
import glob
import numpy as np
from scipy import ndimage
from IPython.display import Audio
from scipy.io import wavfile
import matplotlib.pyplot as plt
import librosa

In [16]:
# Root of the EMA-MAE raw data. The default is the original author's location;
# set the EMA_MAE_ROOT environment variable to run the notebook elsewhere
# without editing this cell.
Data_folder = os.environ.get(
    'EMA_MAE_ROOT',
    '/home/beiming/Projects/SI_ATS/Database/RAW_DATA/EMA_MAE',
)
group_name = 'ENF'
speaker_id = '07'

# Speaker directories are named <speaker_id><group_name>, e.g. '07ENF'.
speaker_folder = speaker_id + group_name

# NOTE: `data_folder` (this speaker's directory) and `Data_folder` (corpus
# root) differ only by capitalisation; both names are kept for later cells.
data_folder = os.path.join(Data_folder, group_name, speaker_folder)
WAV_path = os.path.join(data_folder, 'EMA Data')
EMA_path = os.path.join(Data_folder, 'Extracted_EMA', group_name, speaker_folder)

In [19]:
# Every .wav recording and every extracted .MV file for the chosen speaker.
# The two lists are sorted independently; later cells pair them by position,
# so entry k of one list is used together with entry k of the other.
WAV_path_list = sorted(glob.glob(os.path.join(WAV_path, '*.wav')))
EMA_path_list = sorted(glob.glob(os.path.join(EMA_path, '*.MV')))

In [40]:
# Positions into the sorted WAV/EMA file lists that make up each data split.
train_index = list(range(10))
valid_index = [10]
test_index = [11]

# Nominal width of each acoustic stream.
mgc_dim = 60
lf0_dim = 1
vuv_dim = 1
# NOTE(review): the shape check at the end of this notebook prints a `bap`
# stream that is 2 columns wide, which disagrees with this value -- confirm
# which one downstream code should rely on.
bap_dim = 1

# Analysis frame shift handed to pyworld.dio as `frame_period`.
frame_period = 5
# NOTE(review): hop_length, fftlen and alpha are not read by any cell shown in
# this part of the notebook (collect_features derives alpha from the sample
# rate itself) -- confirm they are still needed.
hop_length = 80
fftlen = 1024
alpha = 0.41

# Mel-cepstral order passed to pysptk.sp2mc; the resulting mgc stream printed
# later is `order + 1` = mgc_dim columns wide.
order = mgc_dim - 1

# Windows handed to nnmnkwii's apply_delta_windows: the identity window
# followed by first- and second-difference coefficient sets.
windows = [
    (0, 0, np.array([1.0])),
    (1, 1, np.array([-0.5, 0.0, 0.5])),
    (1, 1, np.array([1.0, -2.0, 1.0])),
]


In [35]:
import pyworld
import pysptk
import nnmnkwii

import numpy as np
from nnmnkwii.preprocessing.f0 import interp1d
from nnmnkwii.util import apply_delta_windows

def collect_features(wav_path, frame_period, order, sr=22050, delta_windows=None):
    """Extract WORLD vocoder features from one audio file.

    Parameters
    ----------
    wav_path : str
        Path of the audio file to analyse.
    frame_period : float
        Frame shift passed to ``pyworld.dio``.
    order : int
        Mel-cepstral order passed to ``pysptk.sp2mc``.
    sr : int or None, optional
        Sample rate requested from ``librosa.load``. The default of 22050 is
        librosa's own default, so existing calls behave as before; pass
        ``None`` to analyse the file at its native rate without resampling.
    delta_windows : list, optional
        Windows for ``apply_delta_windows``. When omitted, the notebook-level
        ``windows`` list is used, as before.

    Returns
    -------
    features : numpy.ndarray
        ``[mgc, lf0, vuv, bap]`` stacked column-wise, one row per frame.
    delta_features : numpy.ndarray
        ``[mgc, lf0, bap]`` after ``apply_delta_windows``, with the ``vuv``
        flag inserted (without deltas) between lf0 and bap.
    """
    if delta_windows is None:
        # Resolved at call time so the notebook's config cell stays in charge.
        delta_windows = windows

    x, fs = librosa.load(wav_path, sr=sr)
    x = x.astype(np.float64)  # pyworld is fed double-precision samples

    # WORLD analysis: coarse F0 (dio), refined F0 (stonemask), spectral
    # envelope (cheaptrick) and aperiodicity (d4c).
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spectrogram, order=order,
                       alpha=pysptk.util.mcepalpha(fs))

    f0 = f0[:, None]  # column vector so it can be hstack-ed with the rest
    voiced = np.nonzero(f0)
    lf0 = f0.copy()
    lf0[voiced] = np.log(f0[voiced])
    # Voicing flag taken from f0 itself: testing log(f0) != 0 would wrongly
    # mark a frame whose f0 is exactly 1 Hz as unvoiced.
    vuv = (f0 != 0).astype(np.float32)
    # Interpolate log-F0 over the zero-valued frames for a continuous contour.
    lf0 = interp1d(lf0, kind="slinear")

    mgc_delta = apply_delta_windows(mgc, delta_windows)
    lf0_delta = apply_delta_windows(lf0, delta_windows)
    bap_delta = apply_delta_windows(bap, delta_windows)

    features = np.hstack((mgc, lf0, vuv, bap))
    delta_features = np.hstack((mgc_delta, lf0_delta, vuv, bap_delta))

    return features, delta_features

In [36]:
# Training set: delta-augmented EMA and acoustic features, both keyed by the
# position (0, 1, 2, ...) of the recording within train_index.
Train_MV = {}
Train_WAV = {}

for index, i in enumerate(train_index):
    # Articulatory data for recording i, loaded as a numeric table.
    MV = np.loadtxt(EMA_path_list[i])

    WAV, WAV_delta = collect_features(WAV_path_list[i], frame_period, order)

    # Resample every EMA column to the acoustic frame count so both streams
    # have one row per frame.
    scale_ratio = WAV.shape[0] / MV.shape[0]

    MV_align = np.empty([WAV.shape[0], MV.shape[1]])
    for j in range(MV.shape[1]):
        MV_align[:, j] = ndimage.zoom(MV[:, j], scale_ratio)

    MV_delta = apply_delta_windows(MV_align, windows)

    Train_MV[index] = MV_delta
    Train_WAV[index] = WAV_delta



In [37]:
# Sanity check: shape of the first training EMA feature matrix after the
# delta windows have been applied.
print(Train_MV[0].shape)

(7762, 63)


In [38]:
# Sanity check: shape of the first training acoustic feature matrix; its row
# count should match the EMA matrix printed in the previous cell.
print(Train_WAV[0].shape)

(7762, 190)


In [42]:
# Validation set: same processing as the training cell, keyed by the position
# of the recording within valid_index.
Valid_MV = {}
Valid_WAV = {}

for index, i in enumerate(valid_index):
    # Articulatory data for recording i, loaded as a numeric table.
    MV = np.loadtxt(EMA_path_list[i])

    WAV, WAV_delta = collect_features(WAV_path_list[i], frame_period, order)

    # Show the static and delta-augmented acoustic feature shapes.
    print(WAV.shape)
    print(WAV_delta.shape)

    # Resample every EMA column to the acoustic frame count so both streams
    # have one row per frame.
    scale_ratio = WAV.shape[0] / MV.shape[0]

    MV_align = np.empty([WAV.shape[0], MV.shape[1]])
    for j in range(MV.shape[1]):
        MV_align[:, j] = ndimage.zoom(MV[:, j], scale_ratio)

    MV_delta = apply_delta_windows(MV_align, windows)

    Valid_MV[index] = MV_delta
    Valid_WAV[index] = WAV_delta

(12481, 64)
(12481, 190)




In [43]:
def collect_features(wav_path, frame_period, order, sr=22050):
    """Extract the individual WORLD feature streams from one audio file.

    NOTE: this redefines ``collect_features`` and replaces the earlier
    two-value version in the kernel. Cells that unpack
    ``features, delta_features`` will fail if they are re-run after this cell.

    Parameters
    ----------
    wav_path : str
        Path of the audio file to analyse.
    frame_period : float
        Frame shift passed to ``pyworld.dio``.
    order : int
        Mel-cepstral order passed to ``pysptk.sp2mc``.
    sr : int or None, optional
        Sample rate requested from ``librosa.load``. The default of 22050 is
        librosa's own default, so existing calls behave as before; pass
        ``None`` to analyse the file at its native rate without resampling.

    Returns
    -------
    mgc, bap, lf0, vuv : numpy.ndarray
        Mel-cepstrum, coded aperiodicity, interpolated log-F0 and the
        voiced/unvoiced flag, each with one row per frame.
    """
    x, fs = librosa.load(wav_path, sr=sr)
    x = x.astype(np.float64)  # pyworld is fed double-precision samples

    # WORLD analysis: coarse F0 (dio), refined F0 (stonemask), spectral
    # envelope (cheaptrick) and aperiodicity (d4c).
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spectrogram, order=order,
                       alpha=pysptk.util.mcepalpha(fs))

    f0 = f0[:, None]  # column vector, one row per frame
    voiced = np.nonzero(f0)
    lf0 = f0.copy()
    lf0[voiced] = np.log(f0[voiced])
    # Voicing flag taken from f0 itself: testing log(f0) != 0 would wrongly
    # mark a frame whose f0 is exactly 1 Hz as unvoiced.
    vuv = (f0 != 0).astype(np.float32)
    # Interpolate log-F0 over the zero-valued frames for a continuous contour.
    lf0 = interp1d(lf0, kind="slinear")

    # The delta windows and stacked matrices the previous version of this
    # function also built were never returned, so they are no longer computed.
    return mgc, bap, lf0, vuv

In [44]:
# Inspect the separate streams for the last validation recording. The index is
# spelled out rather than read from the loop variable `i` left behind by the
# validation cell, so this cell does not depend on which loop ran last.
mgc, bap, lf0, vuv = collect_features(WAV_path_list[valid_index[-1]], frame_period, order)

In [45]:
# Print the shape of each stream in turn: mgc, bap, lf0, vuv.
for stream in (mgc, bap, lf0, vuv):
    print(stream.shape)

(12481, 60)
(12481, 2)
(12481, 1)
(12481, 1)
