In [1]:
from paths import DATA_DIR, LJLISTS_DIR, LOGS_DIR, FILELISTS_DIR
from pathlib import Path
import numpy as np
import re
import torch

from text import cmudict
from utils import parse_filelist, intersperse

from text.converters import ipa_to_ternary, text_to_ipa, traits_list
from panphon import FeatureTable

# Shared panphon feature table; later cells call ft.validate_word() on single phones.
ft = FeatureTable()

# CMU pronunciation dictionary, passed to text_to_ipa() as `dictionary` further down.
cmudict_path = "resources/cmu_dictionary"
dictionary_cmu = cmudict.CMUDict(cmudict_path)
# NOTE(review): the text_to_ipa() calls in this notebook pass the literal
# ["english_cleaners_v2"] rather than reading this variable.
cleaner_names = ["english_cleaners_v2"]

## MNGU0

In [76]:
# Phone-label folder of MNGU0 speaker s1.
mngu0_dir = DATA_DIR.joinpath('MNGU0', "src_data", "s1", "phone_labels")

# sorted() already returns a list, so the listings are ordered by path.
utt_files = sorted(mngu0_dir.glob('*.utt'))
lab_files = sorted(mngu0_dir.glob('*.lab'))

In [77]:
# Use the first entry of each sorted listing as the worked example.
# NOTE(review): assumes utt_files[0] and lab_files[0] describe the same utterance — confirm.
utt_file = utt_files[0]  # Example utt file
lab_file = lab_files[0]  # Example lab file

In [78]:
from utils_dataset.mngu0 import mngu02ipa, get_mngu0_sentence, get_mngu0_phnm3

# Orthographic sentence and aligned phone records of the example utterance.
sentence = get_mngu0_sentence(utt_file)
ipa_phnm3 = get_mngu0_phnm3(lab_file)

# The third field of every record is the IPA symbol; chain all of them into
# one '%'-separated "word" and embed it in a single call.
ipawords_list = ['%'.join(elem[2] for elem in ipa_phnm3)]
ternary_emb_phnm = torch.FloatTensor(
    ipa_to_ternary(ipawords_list, merge_diphtongues=True)
).T  # transposed: the recorded output is torch.Size([25, 35]) for 35 phones

print(f"Sentence: {sentence}")
print(f"IPA Phones: {ipa_phnm3}")
print(f"IPA Phones: {len(ipa_phnm3)}")
print(f"Ternary Phones: {ternary_emb_phnm.shape}")

Sentence: If you want to regulate noise, regulate noise.
IPA Phones: [(0.   , 0.488, '.') (0.488, 0.554, 'ɪ') (0.554, 0.634, 'f')
 (0.634, 0.676, 'j') (0.676, 0.708, 'uː') (0.708, 0.78 , 'w')
 (0.78 , 0.842, 'ɒ') (0.842, 0.896, 'n') (0.896, 0.914, 't')
 (0.914, 0.974, 't') (0.974, 1.004, 'uː') (1.004, 1.104, 'ɹ')
 (1.104, 1.148, 'ɛ') (1.148, 1.218, 'ɡ') (1.218, 1.238, 'j')
 (1.238, 1.282, 'uː') (1.282, 1.35 , 'l') (1.35 , 1.458, 'ɛɪ')
 (1.458, 1.518, 't') (1.518, 1.586, 'n') (1.586, 1.88 , 'ɔɪ')
 (1.88 , 2.028, 'z') (2.028, 2.174, '.') (2.174, 2.23 , 'ɹ')
 (2.23 , 2.25 , 'ɛ') (2.25 , 2.324, 'ɡ') (2.324, 2.356, 'j')
 (2.356, 2.394, 'uː') (2.394, 2.462, 'l') (2.462, 2.58 , 'ɛɪ')
 (2.58 , 2.62 , 't') (2.62 , 2.712, 'n') (2.712, 2.976, 'ɔɪ')
 (2.976, 3.142, 'z') (3.142, 3.612, '.')]
IPA Phones: 35
Ternary Phones: torch.Size([25, 35])


In [79]:
# Text-side counterpart of the aligned phones: embed the sentence through the
# CMU dictionary instead of the label file.
text = sentence
add_blank = False  # when True, a " " token is interspersed between the IPA entries
merge_diphtongues = True  # forwarded to ipa_to_ternary

ipawords_list = text_to_ipa(
    text,
    dictionary=dictionary_cmu,
    cleaner_names=["english_cleaners_v2"],
    remove_punctuation=False,
)
ipawords_list = intersperse(ipawords_list, " ") if add_blank else ipawords_list

# Transposed to (n_ipa_feats, seq_len).
ternary_emb = torch.FloatTensor(
    ipa_to_ternary(ipawords_list, merge_diphtongues=merge_diphtongues)
).T
ternary_emb.shape

torch.Size([25, 34])

In [52]:
# Sanity check of the MNGU0 -> IPA table: every mapped symbol is expected to
# embed to exactly one feature row; anything else is reported.
for k, v in mngu02ipa.items():
    emb = ipa_to_ternary([v], merge_diphtongues=True)
    if emb.shape[0] == 1:
        continue
    print(f"emb shape: {emb.shape}")
    print(f"Invalid IPA: {v} for MNGU0 phone {k}")

## Mocha

In [85]:
# Mocha-TIMIT root with its transcription and phoneme label files, sorted by path.
mocha_dir = DATA_DIR.joinpath('mocha_timit')
mocha_trans_files = sorted(mocha_dir.glob('*.trans'))
mocha_phnm_files = sorted(mocha_dir.glob('*.phnm'))

# First file of each listing serves as the example.
trans_file = mocha_trans_files[0]
phnm_file = mocha_phnm_files[0]

from utils_dataset.mocha import get_mocha_sentence, get_mocha_phnm3

sentence = get_mocha_sentence(trans_file)
ipa_phnm3 = get_mocha_phnm3(phnm_file)
print(f"Sentence: {sentence}")
print(f"Phonemes: {ipa_phnm3}")

Sentence: This was easy for us.
Phonemes: [(0.  , 0.87   , '.') (0.87, 0.91   , 'd') (0.91, 1.     , 'ɪ')
 (1.  , 1.15   , 's') (1.15, 1.22   , 'w') (1.22, 1.26   , 'ə')
 (1.26, 1.37   , 'z') (1.37, 1.5    , 'iː') (1.5 , 1.6    , 'z')
 (1.6 , 1.67   , 'i') (1.67, 1.83   , 'f') (1.83, 1.92   , 'ə')
 (1.92, 2.09   , 'ə') (2.09, 2.34   , 's') (2.34, 2.96875, '.')]


In [86]:
# Join the IPA symbol (third field) of every aligned record into one
# '%'-separated entry, embed it, and transpose the result.
ipawords_list = ['%'.join(elem[2] for elem in ipa_phnm3)]
ternary_emb_phnm = torch.FloatTensor(
    ipa_to_ternary(ipawords_list, merge_diphtongues=True)
).T

print(f"Sentence: {sentence}")
print(f"IPA Phones: {ipa_phnm3}")
print(f"IPA Phones: {len(ipa_phnm3)}")
print(f"Ternary Phones: {ternary_emb_phnm.shape}")

Sentence: This was easy for us.
IPA Phones: [(0.  , 0.87   , '.') (0.87, 0.91   , 'd') (0.91, 1.     , 'ɪ')
 (1.  , 1.15   , 's') (1.15, 1.22   , 'w') (1.22, 1.26   , 'ə')
 (1.26, 1.37   , 'z') (1.37, 1.5    , 'iː') (1.5 , 1.6    , 'z')
 (1.6 , 1.67   , 'i') (1.67, 1.83   , 'f') (1.83, 1.92   , 'ə')
 (1.92, 2.09   , 'ə') (2.09, 2.34   , 's') (2.34, 2.96875, '.')]
IPA Phones: 15
Ternary Phones: torch.Size([25, 15])


In [87]:
# Embed the Mocha sentence from text, for comparison with the aligned phones.
text = sentence
add_blank = False  # when True, a " " token is interspersed between the IPA entries
merge_diphtongues = True  # forwarded to ipa_to_ternary

ipawords_list = text_to_ipa(
    text,
    dictionary=dictionary_cmu,
    cleaner_names=["english_cleaners_v2"],
    remove_punctuation=False,
)
ipawords_list = intersperse(ipawords_list, " ") if add_blank else ipawords_list

# Transposed to (n_ipa_feats, seq_len).
ternary_emb = torch.FloatTensor(
    ipa_to_ternary(ipawords_list, merge_diphtongues=merge_diphtongues)
).T
ternary_emb.shape

torch.Size([25, 15])

In [88]:
# Gather every phone label stored in the pre-extracted Mocha phnm3 arrays and
# report the ones panphon rejects ("." labels are exempt from validation).
# The ipa_to_ternary-based variant of this check lives in the following cell.
phonems = set()
mocha_phnm_files = sorted(mocha_dir.glob('*arttts/*/phnm3/*_phnm3.npy'))
for phnm_file in mocha_phnm_files:
    phnm3 = np.load(phnm_file)
    for s, e, phone in phnm3:
        phonems.add(phone)
        if phone == "." or ft.validate_word(phone):
            continue
        print(f"Invalid IPA: {phone} in file {phnm_file.name}")

In [89]:
# Embedding check for every phone label collected from the Mocha phnm3 files:
# each label is expected to map to exactly one ternary feature row.
for v in phonems:
    emb = ipa_to_ternary([v], merge_diphtongues=True)
    if emb.shape[0] != 1:
        print(f"emb shape: {emb.shape}")
        # This loop iterates a set, so there is no source-phone key to report.
        # The former message interpolated `k`, a variable this loop never binds
        # (NameError on a fresh kernel), and named the wrong dataset.
        print(f"Invalid IPA: {v} in Mocha phoneme set")

## MSPKA

In [2]:
from utils_dataset.mspka import get_mspka_sentence, get_mspka_phnm3, mspka2ipa

# MSPKA root, with sorted listings of its *.lab files and of the entries
# named 'list_sentences'.
mspka_dir = DATA_DIR / 'MSPKA_EMA_ita'
mspka_lab_files = sorted(list(mspka_dir.glob('*.lab')))
mspka_sent_files = sorted(list(mspka_dir.glob('list_sentences')))

lab_file = mspka_lab_files[0]  # Example lab file
sent_file = mspka_sent_files[0]  # Example sentence-list file

# NOTE(review): other cells call get_mspka_sentence(lab_file)[0]; here it is
# given sent_file and the result is not indexed — confirm which usage is intended.
sentence = get_mspka_sentence(sent_file)
ipa_phnm3 = get_mspka_phnm3(lab_file)

#with open(sent_file, 'r', encoding='utf-8') as f:
#    sentence = f.read().strip()
#print(f"Sentence from file: {sentence}")

#mspka_phnms = set()
#
#for lab_file in mspka_lab_files:
#    sent, phnm3 = get_mspka_sentence_phnm3(lab_file)
#    for s, e, phone in phnm3:
#        mspka_phnms.add(phone)
#mspka_phnms

In [3]:
# panphon validation of every aligned MSPKA phone; "." labels are exempt.
for lab_file in mspka_lab_files:
    sentence = get_mspka_sentence(lab_file)[0]
    phnm3 = get_mspka_phnm3(lab_file)
    for s, e, phone in phnm3:
        if phone == "." or ft.validate_word(phone):
            continue
        print(f"Invalid IPA: {phone} in file {lab_file.name}")

In [6]:
# Join the IPA symbol (third field) of every aligned record into one
# '%'-separated entry, embed it, and transpose the result.
ipawords_list = ['%'.join(elem[2] for elem in ipa_phnm3)]
ternary_emb_phnm = torch.FloatTensor(
    ipa_to_ternary(ipawords_list, merge_diphtongues=True)
).T

print(f"Sentence: {sentence}")
print(f"IPA Phones: {ipa_phnm3}")
print(f"IPA Phones: {len(ipa_phnm3)}")
print(f"Ternary Phones: {ternary_emb_phnm.shape}")

Sentence: ('erano concesse dal comune che poi furono tolte', [])
IPA Phones: [(0.  , 0.35, '.') (0.35, 0.54, 'ɛ') (0.54, 0.59, 'ɾ') (0.59, 0.65, 'a')
 (0.65, 0.7 , 'n') (0.7 , 0.79, 'o') (0.79, 0.85, 'k') (0.85, 0.89, 'o')
 (0.89, 0.98, 'n') (0.98, 1.07, 't͡ʃ') (1.07, 1.15, 'ɛ')
 (1.15, 1.28, 'sː') (1.28, 1.32, 'e') (1.32, 1.37, 'd') (1.37, 1.42, 'a')
 (1.42, 1.48, 'l') (1.48, 1.56, 'k') (1.56, 1.62, 'o') (1.62, 1.75, 'm')
 (1.75, 1.89, 'u') (1.89, 1.94, 'n') (1.94, 2.06, 'e') (2.06, 2.4 , 'k')
 (2.4 , 2.51, 'e') (2.51, 2.58, 'p') (2.58, 2.76, 'ɔ') (2.76, 2.84, 'i')
 (2.84, 2.97, 'f') (2.97, 3.05, 'u') (3.05, 3.11, 'ɾ') (3.11, 3.19, 'o')
 (3.19, 3.22, 'n') (3.22, 3.32, 'o') (3.32, 3.4 , 't') (3.4 , 3.56, 'ɔ')
 (3.56, 3.63, 'l') (3.63, 3.74, 't') (3.74, 3.83, 'e') (3.83, 4.46, '.')]
IPA Phones: 39
Ternary Phones: torch.Size([25, 39])


In [8]:
# Sanity check of the MSPKA -> IPA table: every mapped symbol is expected to
# embed to exactly one ternary feature row; offenders are printed.
# The loop only reports. It no longer rebinds the notebook-level `sentence`
# (the former stray assignment stored the un-indexed return value of
# get_mspka_sentence, whereas other cells index it with [0]).
for k, v in mspka2ipa.items():
    emb = ipa_to_ternary([v], merge_diphtongues=True)
    if emb.shape[0] != 1:
        print(f"emb shape: {emb.shape}")
        print(f"Invalid IPA: {v} for MSPKA phone {k}")


emb shape: (2, 25)
Invalid IPA: nf for MPSKA phone nf


In [11]:
# Keep the sentence and phone array of every MSPKA file in which at least one
# non-"." phone does not embed to a single feature row.
sentences, phnm3s = [], []
for lab_file in mspka_lab_files:
    sentence = get_mspka_sentence(lab_file)[0]
    phnm3 = get_mspka_phnm3(lab_file)
    has_invalid = False
    for s, e, phone in phnm3:
        emb = ipa_to_ternary([phone], merge_diphtongues=True)
        if phone == "." or emb.shape[0] == 1:
            continue
        print(f"Invalid IPA: {phone} in file {lab_file.name}")
        print(sentence)
        has_invalid = True
    if has_invalid:
        sentences.append(sentence)
        phnm3s.append(phnm3)

## PB2007

In [12]:
# PB2007 root and its *.phone label files, sorted by path.
pb_dir = DATA_DIR.joinpath('pb2007')
pb_phone_files = sorted(pb_dir.glob('*.phone'))

from utils_dataset.pb2007 import pb20072ipa, get_pb2007_phnm3

In [15]:
def get_pb2007_phnm3_ori(phone_file: str) -> np.ndarray:
    """Read a PB2007 ``.phone`` label file, keeping the original phone labels.

    Every useful line carries three whitespace-separated fields,
    ``<start_frame> <end_frame> <phone>``. Both frame values are divided by
    100 and stored as ``start`` / ``end``; the phone label is kept verbatim
    (no IPA mapping is applied here). Lines that do not split into exactly
    three fields, including blank lines, are skipped.

    Fields may be separated by any run of spaces or tabs. The earlier
    single-space split silently dropped such lines.

    Args:
        phone_file: Path of the label file (anything ``open`` accepts).

    Returns:
        1-D structured array with fields ``start`` (f4), ``end`` (f4) and
        ``phone`` (U10, so labels longer than 10 characters are truncated).
        An input without usable lines yields an empty array of that dtype.

    Raises:
        ValueError: If a frame field of a three-field line is not numeric.
    """
    records = []
    with open(phone_file, "r", encoding="utf-8") as f:
        # Stream the file instead of materialising every line first.
        for line in f:
            fields = line.split()  # no argument: splits on any whitespace run
            if len(fields) != 3:
                continue
            s_frame, e_frame, phone = fields
            records.append((float(s_frame) / 100, float(e_frame) / 100, phone))
    return np.array(
        records, dtype=[("start", "f4"), ("end", "f4"), ("phone", "U10")]
    )

In [41]:
# Collect the PB2007 files (and their raw phone arrays) whose labels contain
# the phone "e~".
phnm3s, phone_files = [], []
for phone_file in pb_phone_files:
    phnm3 = get_pb2007_phnm3_ori(phone_file)
    if any(phone == "e~" for _, _, phone in phnm3):
        phnm3s.append(phnm3)
        phone_files.append(phone_file)

In [14]:
# Worked example on the last PB2007 label file.
phone_file = pb_phone_files[-1]
ipa_phnm3 = get_pb2007_phnm3(phone_file)
print(f"Phonemes: {ipa_phnm3}")

# Join the IPA symbol (third field) of every record into one '%'-separated
# entry, embed it, and transpose the result.
ipawords_list = ['%'.join(elem[2] for elem in ipa_phnm3)]
ternary_emb_phnm = torch.FloatTensor(
    ipa_to_ternary(ipawords_list, merge_diphtongues=True)
).T

print(f"IPA Phones: {ipa_phnm3}")
print(f"IPA Phones: {len(ipa_phnm3)}")
print(f"Ternary Phones: {ternary_emb_phnm.shape}")

Phonemes: [(0.  , 0.15, '.') (0.15, 0.21, 'l') (0.21, 0.27, 'e') (0.27, 0.37, 'p')
 (0.37, 0.43, 'a') (0.43, 0.47, 'ʁ') (0.47, 0.53, 'a') (0.53, 0.58, 'm')
 (0.58, 0.67, 'ɛ') (0.67, 0.72, 't') (0.72, 0.8 , 'ʁ') (0.8 , 0.85, 'a')
 (0.85, 0.9 , 'ʁ') (0.9 , 1.  , 't') (1.  , 1.06, 'i') (1.06, 1.14, 'k')
 (1.14, 1.19, 'y') (1.19, 1.23, 'l') (1.23, 1.29, 'a') (1.29, 1.42, 't')
 (1.42, 1.48, 'w') (1.48, 1.6 , 'a') (1.6 , 1.65, 'ʁ') (1.65, 1.77, 'k')
 (1.77, 1.82, 'i') (1.82, 1.92, 'p') (1.92, 1.98, 'i') (1.98, 2.05, 'l')
 (2.05, 2.14, 'ɔ') (2.14, 2.24, 't') (2.24, 2.31, 'ɑ̃') (2.31, 2.44, 's')
 (2.44, 2.5 , 'm') (2.5 , 2.57, 'ɔ') (2.57, 2.66, 'm') (2.66, 2.83, 'ɑ̃')
 (2.83, 2.94, 'm') (2.94, 3.  , 'a') (3.  , 3.06, 'm') (3.06, 3.13, 'a')
 (3.13, 3.28, 'ʃ') (3.28, 3.38, 'w') (3.38, 3.56, 'a') (3.56, 3.7 , 'ʁ')
 (3.7 , 3.78, '.') (3.78, 3.85, 'm') (3.85, 3.95, 'a') (3.95, 4.04, 'l')
 (4.04, 4.2 , 'ɑ̃') (4.2 , 4.32, 'ɡ') (4.32, 4.42, 'e') (4.42, 4.5 , 'm')
 (4.5 , 4.56, 'ɛ') (4.56, 4.64, 'l') (

In [None]:
# Sanity check of the PB2007 -> IPA table: every mapped symbol is expected to
# embed to exactly one ternary feature row; offenders are printed.
# Changes: removed a stray character after the last print() that made the cell
# a SyntaxError, and the message now names PB2007 rather than another dataset.
for k, v in pb20072ipa.items():
    emb = ipa_to_ternary([v], merge_diphtongues=True)
    if emb.shape[0] != 1:
        print(f"emb shape: {emb.shape}")
        print(f"Invalid IPA: {v} for PB2007 phone {k}")

## LJSpeech

In [18]:
from configs import params_v1

# From here on the diphthong-merging flag is read from the v1 config,
# rebinding the name set by earlier cells.
merge_diphtongues = params_v1.merge_diphtongues
# Same path and dictionary construction as in the first cell (both names are rebound).
cmudict_path = "resources/cmu_dictionary"
dictionary_cmu = cmudict.CMUDict(cmudict_path)

def get_phonemes(
        text: str, add_blank: bool = False,  # False to stay uniform with the aligned datasets, which use no blanks
    ) -> list:
    """Convert text to a flat list of phoneme strings framed by "." markers.

    The text goes through ``text_to_ipa`` with the module-level
    ``dictionary_cmu`` and punctuation removed. The resulting entries are
    joined with '%', wrapped in a leading and a trailing ".", and split on
    '%' again, so any '%' already inside an entry also acts as a separator.

    The earlier annotation declared a ``torch.IntTensor`` of shape
    (n_ipa_feats, seq_len); the function has always returned a list of strings.

    Args:
        text: Input sentence.
        add_blank: When True, a " " entry is interspersed between the IPA
            entries before joining.

    Returns:
        List of str: ".", the phoneme tokens, then ".". If ``text_to_ipa``
        yields no entries the result is [".", "", "."].
    """
    ipawords_list = text_to_ipa(
        text,
        dictionary=dictionary_cmu,
        cleaner_names=["english_cleaners_v2"],
        remove_punctuation=True,  # punctuation removed to stay uniform with the aligned datasets
    )
    if add_blank:
        ipawords_list = intersperse(ipawords_list, " ")
    phnm_string = '%'.join(ipawords_list)
    phnm_string = ".%" + phnm_string + "%."
    return phnm_string.split("%")

In [None]:
#from paths import FILELISTS_DIR
#
#filelist_fp = FILELISTS_DIR / "ljspeech" / "valid_v0.txt"
#lines = parse_filelist(filelist_fp)
#save_dir = DATA_DIR / "LJSpeech-1.1" / "phnm3"
#for fp, text in lines:
#    filestem = Path(fp).stem
#    ipawords_list = get_phonemes(text, add_blank=False)
#    phnm3_name = filestem + "_phnm3.npy"
#    phones = []
#    for phone in ipawords_list:
#        start_time = float("nan")
#        end_time = float("nan")
#        phones.append((start_time, end_time, phone))
#    phones = np.array(phones, dtype=[("start", "f4"), ("end", "f4"), ("phone", "U10")])
#    phnm3_fp = save_dir / phnm3_name
#    np.save(phnm3_fp, phones)

## Dataset, dataloader

In [43]:
from data_phnm import PhnmArticDataset, PhnmArticBatchCollate
from torch.utils.data import DataLoader
from paths import DATA_DIR

from configs import params_v1

batch_size = 1

# Settings shared by the training and validation datasets.
dataset_kwargs = dict(
    data_root_dir=DATA_DIR,
    load_coder=False,
    shuffle=False,
    merge_diphtongues=True,
)
train_dataset = PhnmArticDataset(params_v1.train_filelist_path, **dataset_kwargs)
valid_dataset = PhnmArticDataset(params_v1.valid_filelist_path, **dataset_kwargs)

# Keep only the first 10 entries of each split for a quick smoke test.
train_dataset.filepaths_list = train_dataset.filepaths_list[:10]
valid_dataset.filepaths_list = valid_dataset.filepaths_list[:10]
print("train_size", len(train_dataset), "valid_size", len(valid_dataset))

batch_collate = PhnmArticBatchCollate()

# Both loaders share the same batching options and differ only in the dataset.
loader_kwargs = dict(
    batch_size=batch_size,
    collate_fn=batch_collate,
    drop_last=True,
    shuffle=False,
)
train_loader = DataLoader(dataset=train_dataset, **loader_kwargs)
valid_loader = DataLoader(dataset=valid_dataset, **loader_kwargs)

train_size 10 valid_size 10


In [44]:
# Pull a single collated batch from the training loader and display it.
truc = next(iter(train_loader))
truc

{'x': tensor([[[ 0.,  1., -1.,  ..., -1., -1.,  0.],
          [ 0.,  1., -1.,  ...,  1., -1.,  0.],
          [ 0., -1.,  1.,  ...,  1.,  1.,  0.],
          ...,
          [ 0.,  0.,  0.,  ...,  0.,  0.,  0.],
          [ 0.,  0.,  0.,  ...,  0.,  0.,  0.],
          [ 1.,  0.,  0.,  ...,  0.,  0.,  1.]]]),
 'x_lengths': tensor([120]),
 'y': tensor([[[-1.4232, -1.1987, -1.1778,  ..., -0.6045, -0.4897, -0.5133],
          [-1.6910, -1.7874, -1.8012,  ..., -0.5524, -0.4178, -0.3054],
          [-1.9739, -2.0031, -2.0324,  ..., -0.9943, -0.7880, -0.4183],
          ...,
          [ 0.0038,  0.9296,  1.5182,  ...,  0.0295,  0.0187,  0.0164],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 1.2512,  1.2418,  1.4047,  ...,  2.1459,  2.1791,  2.2276]]]),
 'y_lengths': tensor([492])}

In [None]:
import torch
from model import GradTTS
from configs import params_v0

# NOTE(review): the datasets in this notebook are built from params_v1 while
# the model is configured from params_v0 — confirm this pairing is intended.

# A speaker-embedding size is only passed when there is more than one speaker.
spk_emb_dim = None if params_v0.n_spks == 1 else params_v0.spk_emb_dim

model = GradTTS(
    params_v0.n_ipa_feats,
    params_v0.n_spks,
    spk_emb_dim,
    params_v0.n_enc_channels,
    params_v0.filter_channels,
    params_v0.filter_channels_dp,
    params_v0.n_heads,
    params_v0.n_enc_layers,
    params_v0.enc_kernel,
    params_v0.enc_dropout,
    params_v0.window_size,
    params_v0.n_feats,
    params_v0.dec_dim,
    params_v0.beta_min,
    params_v0.beta_max,
    params_v0.pe_scale,
)

# Load the checkpoint weights onto the CPU.
version = 'v0_es_ema_200'
grad_filename = 'grad_10.pt'
ckpt_path = LOGS_DIR / version / grad_filename
ckpt_state_dict = torch.load(ckpt_path, map_location=torch.device('cpu'))
model.load_state_dict(ckpt_state_dict)

<All keys matched successfully>

In [48]:
# Run the model once, on the first validation batch only, without gradients.
model.eval()
with torch.no_grad():
    for i, item in enumerate(valid_loader):
        x = item["x"].float()
        # The length is taken from the last dimension of x, not from item["x_lengths"].
        x_lengths = torch.LongTensor([x.size(-1)])
        y_enc, y_dec, attn = model(x, x_lengths, n_timesteps=50)
        break

## Archived filelist-generation snippets (kept just in case)

In [121]:
#mngu0_dir = LJLISTS_DIR / "../mngu0"
#mocha_dir = LJLISTS_DIR / "../mocha"
#mspka_dir = LJLISTS_DIR / "../mspka"
#pb2007_dir = LJLISTS_DIR / "../pb2007"
#
#flist_dir = pb2007_dir
#
#with open(flist_dir / "file_list.txt", "r") as f:
#    lines = [line.strip() for line in f.readlines()]
#
#data_prefix = "DUMMY/"
#data_prefix += "pb2007" # or "mocha_timit" or "MSPKA_EMA_ita" or "pb2007"
#data_prefix += "/arttts"
#
#new_lines = []
#for line in lines:
#    _, spk, phnm3, filename = line.split("/")
#    spk_prefix = data_prefix + "/" + spk
#    phnm3_prefix = spk_prefix + "/phnm3"
#    phnm3_fp = phnm3_prefix +"/" + filename
#    wav_fp = spk_prefix + "/wavs" + "/" + filename.replace("_phnm3.npy", ".wav")
#    new_line = f"{wav_fp}|{phnm3_fp}"
#    new_lines.append(new_line)
#new_lines = sorted(new_lines)
#
#with open(flist_dir / "total_v1.txt", "w") as f:
#    for line in new_lines:
#        f.write(line + "\n")
#print(f"Total lines: {len(new_lines)}")

In [None]:
#import numpy as np
#
#dataset = "pb2007"
#for speaker in ["spk1"]:
#    with open(f"resources/filelists/{dataset}/total_v1.txt", "r") as f:
#        total_v1 = f.read().splitlines()
#    total_v1 = np.array(total_v1)
#    speakers = np.array([e.split("/")[3] for e in total_v1])
#    speaker_v1 = total_v1[np.where(np.array(speakers) == speaker)[0]]
#
#    with open(f"resources/filelists/{dataset}/{speaker}_v1.txt", "w") as file:
#        file.writelines([e + '\n' for e in speaker_v1])