In [39]:
import os
import glob
import numpy as np
from scipy.io import wavfile
from sklearn.model_selection import train_test_split
import pyworld
import pysptk
import librosa
import librosa.display
import IPython
from IPython.display import Audio
import matplotlib.pyplot as plt

from nnmnkwii.preprocessing.f0 import interp1d
from nnmnkwii.util import apply_delta_windows

from scipy import ndimage

In [22]:
data_folder = '/home/beiming/Desktop/Parsed_data'
group_name = 'ENF' # ENF or ENM
group_folder = os.path.join(data_folder, group_name)
subject_list = os.listdir(group_folder) 
print(subject_list)

['09ENF', '40ENF', '37ENF', '36ENF', '05ENF', '17ENF', '07ENF', '18ENF', '21ENF', '28ENF']


In [23]:
sub_name = '09ENF'

data_sub_folder = os.path.join(group_folder, sub_name)

In [24]:
WAV_path_list = os.path.join(data_sub_folder, '*' + '.wav')
WAV_path_list = glob.glob(WAV_path_list)
WAV_path_list.sort()

EMA_path_list = os.path.join(data_sub_folder, '*' + '.ema')
EMA_path_list = glob.glob(EMA_path_list)
EMA_path_list.sort()

In [25]:
mgc_dim = 180
lf0_dim = 3
vuv_dim = 1
bap_dim = 3

ema_dim = 21

acoustic_dim = mgc_dim + lf0_dim + vuv_dim + bap_dim

fs = 16000
frame_period = 5
hop_length = 80
fftlen = 1024
alpha = 0.41

mgc_start_idx = 0
lf0_start_idx = 180
vuv_start_idx = 183
bap_start_idx = 184

windows = [
    (0, 0, np.array([1.0])),
    (1, 1, np.array([-0.5, 0.0, 0.5])),
    (1, 1, np.array([1.0, -2.0, 1.0])),
]

In [29]:
I, fs = librosa.load(WAV_path_list[0], sr = 16000)

#fs, I = wavfile.read(WAV_path_list[0])

print(fs)

16000


In [30]:
import pyworld
import pysptk
import nnmnkwii

In [34]:
mgc_dim = 40
lf0_dim = 1
vuv_dim = 1
bap_dim = 1

fs = 16000

frame_period = 5
hop_length = 80
fftlen = 1024
alpha = 0.41

order = 39
frame_period = 5
windows = [
    (0, 0, np.array([1.0])),
    (1, 1, np.array([-0.5, 0.0, 0.5])),
    (1, 1, np.array([1.0, -2.0, 1.0])),
]

file_num = len(WAV_path_list)

train_index = range(file_num - 20)
valid_index = range(file_num - 20, file_num - 10)
test_index = range(file_num - 10, file_num)

In [35]:
def collect_features(wav_path, fs, frame_period, order):
  
    x, sr = librosa.load(wav_path, sr = fs)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spectrogram, order=order,
                       alpha=pysptk.util.mcepalpha(fs))
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    vuv = (lf0 != 0).astype(np.float32)
    lf0 = interp1d(lf0, kind="slinear")
    
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    vuv = (lf0 != 0).astype(np.float32)
    lf0 = interp1d(lf0, kind="slinear")

    lf0 = lf0.reshape(lf0.shape[0],1)
    vuv = vuv.reshape(vuv.shape[0],1)

    mgc_delta = apply_delta_windows(mgc, windows)
    lf0_delta = apply_delta_windows(lf0, windows)
    bap_delta = apply_delta_windows(bap, windows)
    
    features = np.hstack((mgc, lf0, vuv, bap))

    delta_features = np.hstack((mgc_delta, lf0_delta, vuv, bap_delta))
    
    return features, delta_features

In [40]:
Valid_MV = {}
Valid_WAV = {}

index = 0

for i in valid_index:
  
  MV = np.loadtxt(EMA_path_list[i])
  
  WAV, WAV_delta = collect_features(WAV_path_list[i], fs,  frame_period, order)

  scale_ratio = WAV.shape[0] / MV.shape[0]


  MV_align = np.empty([WAV.shape[0], MV.shape[1]])

  for j in range(MV.shape[1]):

    MV_align[:,j] = ndimage.zoom(MV[:,j], scale_ratio)
    
  MV_delta = apply_delta_windows(MV_align, windows)
  
  
  Valid_MV[index] = MV_delta
  Valid_WAV[index] = WAV_delta
  
  index = index + 1  



In [41]:
Test_MV = {}
Test_WAV = {}

index = 0

for i in test_index:
  
  MV = np.loadtxt(EMA_path_list[i])
  
  WAV, WAV_delta = collect_features(WAV_path_list[i], fs,  frame_period, order)

  scale_ratio = WAV.shape[0] / MV.shape[0]


  MV_align = np.empty([WAV.shape[0], MV.shape[1]])

  for j in range(MV.shape[1]):

    MV_align[:,j] = ndimage.zoom(MV[:,j], scale_ratio)
    
  MV_delta = apply_delta_windows(MV_align, windows)
  
  
  Test_MV[index] = MV_delta
  Test_WAV[index] = WAV_delta
  
  index = index + 1  

In [42]:
Train_MV = {}
Train_WAV = {}

index = 0

for i in train_index:
  
  MV = np.loadtxt(EMA_path_list[i])
  
  WAV, WAV_delta = collect_features(WAV_path_list[i], fs,  frame_period, order)

  scale_ratio = WAV.shape[0] / MV.shape[0]


  MV_align = np.empty([WAV.shape[0], MV.shape[1]])

  for j in range(MV.shape[1]):

    MV_align[:,j] = ndimage.zoom(MV[:,j], scale_ratio)
    
  MV_delta = apply_delta_windows(MV_align, windows)
  
  
  Train_MV[index] = MV_delta
  Train_WAV[index] = WAV_delta
  
  index = index + 1 

In [43]:
Train_MV_block = np.concatenate([Train_MV[x] for x in Train_MV], 0).astype(np.float32)
Train_WAV_block = np.concatenate([Train_WAV[x] for x in Train_WAV], 0)

Valid_MV_block = np.concatenate([Valid_MV[x] for x in Valid_MV], 0).astype(np.float32)
Valid_WAV_block = np.concatenate([Valid_WAV[x] for x in Valid_WAV], 0)

Test_MV_block = np.concatenate([Test_MV[x] for x in Test_MV], 0).astype(np.float32)
Test_WAV_block = np.concatenate([Test_WAV[x] for x in Test_WAV], 0)

print(Train_MV_block.shape)
print(Train_WAV_block.shape)

print(Valid_MV_block.shape)
print(Valid_WAV_block.shape)

print(Test_MV_block.shape)
print(Test_WAV_block.shape)

print(Train_MV_block)
print(Train_WAV_block)

(125649, 63)
(125649, 127)
(9192, 63)
(9192, 127)
(6813, 63)
(6813, 127)
[[-3.3395802e+01  1.6143700e+01  7.9419999e+00 ...  4.9904375e+00
   6.8988562e+00  1.6429406e-01]
 [-3.3483120e+01  1.5874749e+01  7.8733754e+00 ...  2.6739890e-02
   5.0346792e-02 -1.2042025e-01]
 [-3.3419582e+01  1.5773069e+01  7.8436131e+00 ...  1.7501785e-01
   1.6871114e-01  1.6737479e-01]
 ...
 [-3.6329300e+01  1.3279700e+01  8.4714003e+00 ... -7.9999998e-02
   8.3800003e-02 -1.0800000e-02]
 [-3.6477600e+01  1.3283700e+01  8.4361000e+00 ... -9.3999997e-02
  -5.1000002e-03  2.8200001e-02]
 [-3.6623501e+01  1.3301900e+01  8.4468002e+00 ...  5.8501000e+00
   7.4232998e+00  1.7875000e+00]]
[[-7.62988842e+00  1.14834979e+00  9.78014884e-01 ... -8.68569749e-12
  -4.34284875e-12  8.68569749e-12]
 [-7.43905389e+00  9.52134296e-01  7.73554656e-01 ... -8.68569749e-12
   0.00000000e+00  0.00000000e+00]
 [-7.49806685e+00  8.37806475e-01  4.51986261e-01 ... -8.68569749e-12
   0.00000000e+00  0.00000000e+00]
 ...
 [-6.38

In [58]:
import torch
from torch import nn
from torch.autograd import Variable
from tqdm import tnrange, tqdm
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim

from torch.utils import data as data_utils

In [45]:
class MyNet(torch.nn.Module):
    """Very simple deep neural networks.
    """
    def __init__(self, D_in, H, D_out, num_layers=2):
        super(MyNet, self).__init__()
        self.first_linear = nn.Linear(D_in, H)
        self.hidden_layers = nn.ModuleList(
            [nn.Linear(H, H) for _ in range(num_layers)])
        self.last_linear = nn.Linear(H, D_out)
        self.relu = nn.Tanh()

    def forward(self, x):
        h = self.relu(self.first_linear(x))
        for hl in self.hidden_layers:
            h = self.relu(hl(h))
        return self.last_linear(h)

In [80]:
num_hidden_layers = 3
hidden_size = 256
batch_size = 1024
# We use PyTorch's multiprocess iterator. Note that large n_workers causes
# dataset copies across proccess.
n_workers = 4
pin_memory = True
nepoch = 25
lr = 0.001
weight_decay = 1e-6
use_cuda = torch.cuda.is_available()
print(use_cuda)

True


In [77]:
Train_tensor_MV = torch.from_numpy(Train_MV_block)
Train_tensor_MV = Train_tensor_MV.float()

Train_tensor_WAV = torch.from_numpy(Train_WAV_block)
Train_tensor_WAV = Train_tensor_WAV.float()

Valid_tensor_MV = torch.from_numpy(Valid_MV_block)
Valid_tensor_MV = Valid_tensor_MV.float()

Valid_tensor_WAV = torch.from_numpy(Valid_WAV_block)
Valid_tensor_WAV = Valid_tensor_WAV.float()

Train_loader = data_utils.TensorDataset(Train_tensor_MV, Train_tensor_WAV)
Train_loader_dataset = data_utils.DataLoader(Train_loader, batch_size=batch_size, num_workers=n_workers, pin_memory=pin_memory,shuffle = False)

Valid_loader = data_utils.TensorDataset(Valid_tensor_MV, Valid_tensor_WAV)
Valid_loader_dataset = data_utils.DataLoader(Valid_loader, batch_size=batch_size, num_workers=n_workers, pin_memory=pin_memory,shuffle = False)

In [81]:
model = MyNet(Train_MV_block.shape[1], hidden_size, Train_WAV_block.shape[1], num_hidden_layers)
print("Model", model)

Model MyNet(
  (first_linear): Linear(in_features=63, out_features=256, bias=True)
  (hidden_layers): ModuleList(
    (0): Linear(in_features=256, out_features=256, bias=True)
    (1): Linear(in_features=256, out_features=256, bias=True)
    (2): Linear(in_features=256, out_features=256, bias=True)
  )
  (last_linear): Linear(in_features=256, out_features=127, bias=True)
  (relu): Tanh()
)


In [82]:
print(torch.cuda.is_available())

True


In [83]:
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

criterion = nn.MSELoss()
model.train()
print("Start frame-wise training...")
loss_history = {"Train": [], "Valid": []}

dataset_loaders = {"Train": Train_loader_dataset, "Valid": Valid_loader_dataset}
    
for epoch in tnrange(nepoch):
    ## training
    model.train()
    running_loss = 0.0
    for x, y in Train_loader:
       # if use_cuda:
        x, y = x.cuda(), y.cuda()
        x, y = Variable(x), Variable(y)
        optimizer.zero_grad()
        y_hat = model(x)
        loss = criterion(y_hat, y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    loss_history["Train"].append(running_loss / len(dataset_loaders["Train"]))
    print("Training loss:", running_loss)

    ## validating
    model.eval()
    running_loss = 0.0
    for x, y in Valid_loader:
 #       if use_cuda:
        x, y = x.cuda(), y.cuda()
        x, y = Variable(x), Variable(y)
        optimizer.zero_grad()
        y_hat = model(x)
        loss = criterion(y_hat, y) 
        running_loss += loss.item()
    loss_history["Valid"].append(running_loss / len(dataset_loaders["Valid"]))
    
    print("Validation loss:", running_loss)

Start frame-wise training...


  # This is added back by InteractiveShellApp.init_path()


HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))

Training loss: nan
Validation loss: nan


KeyboardInterrupt: 

In [54]:
plot(loss_history["Train"], linewidth=2, label="Train loss")
plot(loss_history["Valid"], linewidth=2, label="Test loss")
legend(prop={"size": 16})