In [1]:
"""
Generate speaker embeddings and metadata for training
"""
import os
import pickle
import numpy as np
import torch
from factory.MetaDV import MetaDV

In [2]:
# Load the serialized speaker encoder. Further down it is called on a batch of
# mel-spectrograms and element [1] of its output is used as the speaker embedding.
# NOTE(review): presumably the `MetaDV` import above is required so the pickled
# object can be reconstructed here — confirm before removing it.
# NOTE(review): torch.load unpickles arbitrary Python objects; only load
# checkpoints that come from a trusted source.
C = torch.load("../model/static/metadv_tw.pt")

In [3]:
# Number of utterances used per speaker. Every speaker directory must contain
# at least this many files, otherwise the metadata loop below fails its assert.
num_uttrs = 100
# Minimum length along axis 0 of a mel-spectrogram fed to the encoder;
# shorter inputs are zero-padded up to this length.
len_crop = 176

In [4]:
# Directory containing mel-spectrograms, laid out as one sub-directory per speaker
rootDir = './spmel'
# os.walk ignores listing errors by default, so a missing directory would only
# surface as a bare StopIteration from next(); fail with an explicit error instead.
if not os.path.isdir(rootDir):
    raise FileNotFoundError('Mel-spectrogram directory not found: %s' % rootDir)
# Only the top level is needed: the root path and the names of the speaker folders.
dirName, subdirList, _ = next(os.walk(rootDir))
print('Found directory: %s' % dirName)

Found directory: ./spmel


In [5]:
def pad_along_axis(array: np.ndarray, target_length: int, axis: int = 0) -> np.ndarray:
    """Zero-pad ``array`` at the end of ``axis`` until it has ``target_length`` entries.

    Args:
        array: Input array of any rank.
        target_length: Desired size along ``axis``.
        axis: Axis to extend (negative indices are accepted).

    Returns:
        A new, padded array when ``array`` is shorter than ``target_length``
        along ``axis``; otherwise the very same ``array`` object, untouched
        (it is never truncated).
    """
    missing = target_length - array.shape[axis]
    if missing > 0:
        # No padding on any axis except the requested one, where zeros are
        # appended after the existing data.
        widths = [(0, 0) for _ in range(array.ndim)]
        widths[axis] = (0, missing)
        # np.pad defaults to constant mode with a fill value of 0.
        return np.pad(array, widths)
    return array

In [7]:
# Build one metadata record per speaker, shaped as
#   [speaker_name, mean_speaker_embedding, relative_mel_path, relative_mel_path, ...]
speakers = []
for speaker in sorted(subdirList):
    print('Processing speaker: %s' % speaker)
    _, _, fileList = next(os.walk(os.path.join(dirName, speaker)))
    # os.walk lists files in an arbitrary, filesystem-dependent order. Sort
    # before truncating so the same num_uttrs files are picked on every run.
    fileList = sorted(fileList)[:num_uttrs]
    assert len(fileList) >= num_uttrs, (
        'speaker %s has %d utterances, need at least %d'
        % (speaker, len(fileList), num_uttrs))
    # make speaker embedding
    # Every retained file contributes to the mean, so no random sampling is
    # needed: drawing num_uttrs indices without replacement from exactly
    # num_uttrs files only permuted them and added unseeded randomness.
    embs = []
    for fileName in fileList:
        mel = np.load(os.path.join(dirName, speaker, fileName))
        # pad if the current one is too short (no-op when already long enough)
        mel = pad_along_axis(mel, len_crop)
        # Add a leading batch axis; the encoder is assumed to live on the GPU.
        melsp = torch.from_numpy(mel[np.newaxis, :, :]).cuda()
        emb = C(melsp)[1]
        embs.append(emb.detach().squeeze().cpu().numpy())

    # Speaker-level embedding is the mean over the per-utterance embeddings.
    utterances = [speaker, np.mean(embs, axis=0)]
    # create file list (already sorted), stored relative to the mel root directory
    utterances.extend(os.path.join(speaker, fileName) for fileName in fileList)
    speakers.append(utterances)

Processing speaker: p1
Processing speaker: p2
Processing speaker: 劉大偉
Processing speaker: 劉安婷
Processing speaker: 吳淡如
Processing speaker: 呂世浩
Processing speaker: 姜莫莉
Processing speaker: 廖敏惠
Processing speaker: 徐裴翊
Processing speaker: 曾博恩
Processing speaker: 林懷民
Processing speaker: 林昶佐
Processing speaker: 柯文哲
Processing speaker: 淨空法師
Processing speaker: 莊淑芬
Processing speaker: 葉丙成
Processing speaker: 蔣勳
Processing speaker: 視網膜
Processing speaker: 賴佩霞
Processing speaker: 郭婞淳
Processing speaker: 鍾瑩瑩
Processing speaker: 陳崇文
Processing speaker: 陳文茜
Processing speaker: 馮翊綱


In [8]:
# Serialize the per-speaker records (name, mean embedding, utterance paths)
# to `train.pkl` inside the mel-spectrogram root directory.
with open(os.path.join(rootDir, 'train.pkl'), 'wb') as handle:
    pickle.dump(speakers, handle)