In [19]:
import torch
import sys
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Tokenizer for the local "bert-prosody" checkpoint, loaded once when the cell runs.
tokenizer = AutoTokenizer.from_pretrained("./models/bert-prosody")

# Cache of loaded models keyed by device; get_bert_feature fills it on first use.
models = {}


def get_bert_feature(text, word2ph, device=None):
    """Extract BERT hidden states for ``text`` and expand them to phone level.

    The text is tokenized and run through the masked-LM model cached for
    ``device`` (loaded on first use). The third-from-last hidden-state layer
    of the first batch item is taken as the per-token feature, and each token
    row ``i`` is repeated ``word2ph[i]`` times.

    Args:
        text: Input string passed to the tokenizer.
        word2ph: Sequence of non-negative ints, one repeat count per token
            row. Must have ``len(text) + 2`` entries (the two extra entries
            are presumably for the tokenizer's special tokens — confirm).
        device: Device to run on. ``"cpu"`` is redirected to ``"mps"`` on
            macOS when MPS is available. When falsy, ``"cuda"`` is used if
            available, otherwise ``"cpu"``.

    Returns:
        A CPU tensor holding the transposed phone-level features, i.e. of
        shape ``(hidden_dim, sum(word2ph))`` when the model returns the usual
        ``(batch, seq, hidden)`` hidden states.

    Raises:
        AssertionError: If ``len(word2ph) != len(text) + 2``.
    """
    # Check the alignment first so a bad input fails before the (expensive)
    # model load and forward pass.
    assert len(word2ph) == len(text) + 2, (
        f"word2ph has {len(word2ph)} entries, expected len(text) + 2 = "
        f"{len(text) + 2}"
    )

    if (
        sys.platform == "darwin"
        and torch.backends.mps.is_available()
        and device == "cpu"
    ):
        device = "mps"
    if not device:
        # Prefer CUDA, but do not crash on hosts without a GPU.
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load the model lazily, once per device.
    if device not in models:
        models[device] = AutoModelForMaskedLM.from_pretrained(
            "./models/bert-prosody"
        ).to(device)

    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt")
        for key in inputs:
            inputs[key] = inputs[key].to(device)
        outputs = models[device](**inputs, output_hidden_states=True)
        # Third-from-last layer, first (only) batch item.
        token_features = outputs["hidden_states"][-3][0].cpu()

    # Repeat each token's feature vector by its phone count and stack the rows.
    phone_level_feature = torch.cat(
        [token_features[i].repeat(count, 1) for i, count in enumerate(word2ph)],
        dim=0,
    )

    return phone_level_feature.T


if __name__ == "__main__":
    # Demo of the token-to-phone expansion on random data.
    # Stand-in for model output: 38 token vectors, each 768-dimensional.
    word_level_feature = torch.rand(38, 768)
    # Number of phone frames each token expands to (one entry per token).
    word2phone = [
        1, 2, 1, 2, 2, 1, 2, 2, 1, 2,
        2, 1, 2, 2, 2, 2, 2, 1, 1, 2,
        2, 1, 2, 2, 2, 2, 1, 2, 2, 2,
        2, 2, 1, 2, 2, 2, 2, 1,
    ]

    # Total number of phone-level frames after expansion.
    total_frames = sum(word2phone)
    print(word_level_feature.shape)
    print(word2phone)
    phone_level_feature = []
    for i in range(len(word2phone)):
        # Repeat token i's feature vector word2phone[i] times.
        repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
        print(word_level_feature[i].shape, word2phone[i], repeat_feature.shape)

        phone_level_feature.append(repeat_feature)

    phone_level_feature = torch.cat(phone_level_feature, dim=0)
    # Every token contributes exactly word2phone[i] rows.
    assert phone_level_feature.shape == (total_frames, 768)
    print(phone_level_feature.T.shape)  # torch.Size([768, 65])

torch.Size([38, 768])
[1, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1]
torch.Size([768]) 1 torch.Size([1, 768])
torch.Size([768]) 2 torch.Size([2, 768])
torch.Size([768]) 1 torch.Size([1, 768])
torch.Size([768]) 2 torch.Size([2, 768])
torch.Size([768]) 2 torch.Size([2, 768])
torch.Size([768]) 1 torch.Size([1, 768])
torch.Size([768]) 2 torch.Size([2, 768])
torch.Size([768]) 2 torch.Size([2, 768])
torch.Size([768]) 1 torch.Size([1, 768])
torch.Size([768]) 2 torch.Size([2, 768])
torch.Size([768]) 2 torch.Size([2, 768])
torch.Size([768]) 1 torch.Size([1, 768])
torch.Size([768]) 2 torch.Size([2, 768])
torch.Size([768]) 2 torch.Size([2, 768])
torch.Size([768]) 2 torch.Size([2, 768])
torch.Size([768]) 2 torch.Size([2, 768])
torch.Size([768]) 2 torch.Size([2, 768])
torch.Size([768]) 1 torch.Size([1, 768])
torch.Size([768]) 1 torch.Size([1, 768])
torch.Size([768]) 2 torch.Size([2, 768])
torch.Size([768]) 2 torch.Size([2, 768])
to

In [1]:
import torch
import numpy as np
# Load one saved feature array (presumably BERT features, going by the file
# name) and wrap it as a float32 tensor.
bert_filename = "dataset/aishell3_16k/SSB00110010.bert.npy"
bert_array = np.load(bert_filename)
bertpt = torch.FloatTensor(bert_array)

In [2]:
# Inspect the values of the loaded feature tensor.
print(bertpt)

tensor([[-0.4332, -0.4008,  0.2471,  ..., -0.3400,  0.5843,  0.3035],
        [-0.2315, -0.8457, -0.4866,  ..., -0.3133,  0.0420,  0.2558],
        [ 0.5430, -1.2423,  0.5775,  ..., -0.9140, -0.1976, -0.2772],
        ...,
        [-0.5502, -0.6223,  0.3056,  ...,  0.4754, -0.0448,  0.4808],
        [ 0.4328, -0.3808,  0.9168,  ..., -1.1270,  0.1758,  0.1739],
        [-0.5201,  0.2979, -0.6583,  ..., -0.8205,  0.1313, -0.9362]])


In [3]:

# Show the shape of the transposed feature tensor.
print(bertpt.T.shape)

torch.Size([14, 256])


In [4]:

from prosody import TTSProsody
# Build the prosody helper from the local checkpoint directory; the second
# argument is presumably the device to run on — confirm against prosody.py.
prosody = TTSProsody("models/bert-prosody/", "cuda")

In [13]:
# Shape of the embeddings returned for a two-character string.
prosody.get_char_embeds("你好").shape

torch.Size([2, 256])

In [17]:
# Expand bertpt with a 14-entry list of repeat counts (they sum to 18), then
# print the result.
emb = prosody.expand_for_phone(bertpt,[2,2,2,2,1,1,1,1,1,1,1,1,1,1])
print(emb)

[[-0.4332184  -0.40082648  0.2470586  ... -0.34002692  0.58425456
   0.30349484]
 [-0.4332184  -0.40082648  0.2470586  ... -0.34002692  0.58425456
   0.30349484]
 [-0.23147522 -0.8456656  -0.48662975 ... -0.31329533  0.0419562
   0.25580218]
 ...
 [-0.55024457 -0.6222654   0.30557343 ...  0.47536862 -0.04482635
   0.4808462 ]
 [ 0.43279293 -0.38081172  0.91682386 ... -1.126965    0.17581624
   0.1738908 ]
 [-0.5200584   0.29791054 -0.6582804  ... -0.8204557   0.1312837
  -0.93621165]]


In [18]:
# Shape of the expanded features.
print(emb.shape)

(18, 256)


In [1]:

from pypinyin import lazy_pinyin, BOPOMOFO
from text.cleaners import chinese_cleaners2
import text

from bert_extract import VITS_BERT

# Build the BERT extractor from the local checkpoint directory; the second
# argument is presumably the device to run on — confirm against bert_extract.py.
vits_bert = VITS_BERT("models/bert-prosody/", "cuda")

In [2]:
# Sample sentence used by the following cells.
# NOTE(review): the name `input` shadows Python's builtin input(); renaming it
# requires updating every later cell that reads it.
input = "比方说，“华尔街日报”就很好。..."


In [3]:


# Run the sentence through the cleaner, then convert the cleaned string to a sequence.
bopomofos = chinese_cleaners2(input)
s = text.cleaned_text_to_sequence(bopomofos)

# Print the cleaned string with its length, then the sequence length, to compare them.
print(bopomofos,len(bopomofos))
print(len(s))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.636 seconds.
Prefix dict has been built successfully.


_ㄅㄧˇㄈㄤˉㄕㄨㄛˉ，“ㄏㄨㄚˊㄦˇㄐㄧㄝˉㄖˋㄅㄠˋ”ㄐㄧㄡˋㄏㄣˇㄏㄠˇ。..._ 44
44


In [None]:
# Shape of the features the extractor returns for the sample sentence.
print(vits_bert.chinese_to_bert(input).shape)

In [None]:
for filelist in [""]:
    print("START:", filelist)
    filepaths_and_text = load_filepaths_and_text(filelist)
    for i in range(len(filepaths_and_text)):
      original_text = filepaths_and_text[i][args.text_index]