In [1]:
import torch

In [2]:
import sentencepiece as spm

# Create a SentencePiece processor and load the BPE model file from the working directory.
# load() is the last expression of the cell, so its return value is what the cell displays.
sp = spm.SentencePieceProcessor()
sp.load("./sentencepiece.bpe.model")

True

In [3]:
# Display the vocabulary size reported by the loaded tokenizer.
sp.vocab_size()

250000

In [4]:
# Encode a sample sentence into token ids.
ids = sp.encode("Hello World")
print(ids)

# Decode (convert the ids back to text)
text = sp.decode(ids)
print(text)

[35377, 6660]
Hello World


In [6]:
# Embedding table with one 256-dim row per tokenizer piece. Sizing it from the
# tokenizer keeps it in sync with the loaded model instead of repeating 250000.
emb = torch.nn.Embedding(sp.vocab_size(), 256)
# Re-initialise the weights uniformly between -0.1 and 0.1 through nn.init, which
# does the in-place fill under no_grad instead of reaching through `.data`.
torch.nn.init.uniform_(emb.weight, -0.1, 0.1)
# Look up one embedding row for every token id produced by the encode cell.
embedding = emb(torch.tensor(ids))

In [7]:
# Display the shape of the looked-up embeddings (one 256-dim row per entry of `ids`).
embedding.shape

torch.Size([2, 256])

# 原来的

In [82]:
import sys
import os

current_dir = os.getcwd()
print("当前目录:", current_dir)
# The parent of the working directory is put on sys.path so that the
# GPT_SoVITS package below can be imported from the notebook.
parent_dir = os.path.dirname(current_dir)
# Guard the append so re-running this cell does not pile up duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
print(parent_dir)
from GPT_SoVITS.AR.models.t2s_model import Text2SemanticDecoder

当前目录: /workspaces/GPT-SoVITS2/playground
/workspaces/GPT-SoVITS2


In [83]:
# Instantiate the decoder from an inline configuration; every hyper-parameter is
# nested under the "model" key of the config mapping.
decoder = Text2SemanticDecoder(
    dict(
        model=dict(
            n_layer=24,
            hidden_dim=768,
            head=16,
            num_codebook=8,
            p_dropout=0.0,
            vocab_size=4097,
            pad_val=4096,
            embedding_dim=768,
            dropout=0.0,
            EOS=4096,
        )
    )
)

In [94]:
import torch
import torch.nn as nn

# Toy inputs: a batch of two text token sequences (x) and two speech token
# sequences (y), together with the number of valid tokens in each sequence.
x = torch.tensor(
    [
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        [1, 3, 3, 5, 5, 3, 4, 1, 67, 2],
    ]
)
y = torch.tensor(
    [
        [11, 12, 13, 14, 15, 16, 17, 18, 19],
        [11, 12, 13, 14, 15, 3, 61, 3, 325],
    ]
)
x_len, y_len = torch.tensor([10, 10]), torch.tensor([9, 8])

# Assumed sizes for this experiment: text vocabulary, speech vocabulary and
# embedding width.
text_vocab_size = speech_vocab_size = 250000
embedding_dim = 768

# Separate lookup tables for text and speech tokens (text table is created first).
text_embedding = nn.Embedding(
    num_embeddings=text_vocab_size, embedding_dim=embedding_dim
)
speech_embedding = nn.Embedding(
    num_embeddings=speech_vocab_size, embedding_dim=embedding_dim
)


# Helper that embeds a padded token batch and zeroes out the padded positions.
def compute_embedding(tokens, lengths, embedding_layer):
    """Embed a padded batch of token ids and zero the rows of padded positions.

    Args:
        tokens: Integer tensor of token ids with shape (batch_size, max_len).
        lengths: Integer tensor of shape (batch_size,) giving the number of
            valid (non-padding) tokens at the start of each row.
        embedding_layer: Callable mapping a token-id tensor to embeddings,
            e.g. an ``nn.Embedding``.

    Returns:
        Tensor of shape (batch_size, max_len, embedding_dim) in which every
        position at or beyond a row's length has been multiplied by zero.
    """
    batch_size, max_len = tokens.shape
    # Build the position indices on the tokens' device (and bring the lengths
    # there too) so the comparison also works when the batch is not on the CPU.
    positions = torch.arange(max_len, device=tokens.device).expand(batch_size, max_len)
    mask = positions < lengths.to(tokens.device).unsqueeze(1)
    # Broadcast the (batch, len) mask over the embedding dimension.
    embedded = embedding_layer(tokens) * mask.unsqueeze(-1).float()
    return embedded


# Embed the text (x) and speech (y) batches; padded positions come back as zeros.
x_embedded = compute_embedding(x, x_len, text_embedding)
y_embedded = compute_embedding(y, y_len, speech_embedding)

# Per-sample length of the concatenated text + speech sequence.
xy_len = x_len + y_len

# Concatenate each sample's valid text tokens directly followed by its valid
# speech tokens. The buffer only has to hold the longest *combined* sequence:
# adding the two per-modality maxima would over-allocate padding whenever the
# longest text and the longest speech belong to different samples.
batch_size = x.shape[0]
max_len = int(xy_len.max())
xy_embedded = torch.zeros(
    (batch_size, max_len, embedding_dim),
    dtype=x_embedded.dtype,
    device=x_embedded.device,
)

for i in range(batch_size):
    # Plain ints make the slice bounds explicit instead of relying on 0-dim tensors.
    n_text, n_speech = int(x_len[i]), int(y_len[i])
    xy_embedded[i, :n_text] = x_embedded[i, :n_text]
    xy_embedded[i, n_text : n_text + n_speech] = y_embedded[i, :n_speech]

# Validity masks (True = real token) for the text and speech batches.
x_mask = torch.arange(x.shape[1]).expand(batch_size, -1) < x_len.unsqueeze(1)
y_mask = torch.arange(y.shape[1]).expand(batch_size, -1) < y_len.unsqueeze(1)
# Validity mask of the concatenated sequence: the first xy_len[i] positions of row i.
xy_mask = torch.arange(max_len).expand(batch_size, -1) < xy_len.unsqueeze(1)

print("xy_embedded shape:", xy_embedded.shape)
print("x_mask shape:", x_mask.shape)
print("y_mask shape:", y_mask.shape)
print("xy_mask shape:", xy_mask.shape)
print("xy_len:", xy_len)

# Print the combined mask of the second sample (batch idx = 1).
print("\nxy_mask for batch idx 1:")
print(xy_mask[1])

# Print the per-modality masks.
print("\nx_mask:")
print(x_mask)
print("\ny_mask:")
print(y_mask)

xy_embedded shape: torch.Size([2, 19, 768])
x_mask shape: torch.Size([2, 10])
y_mask shape: torch.Size([2, 9])
xy_mask shape: torch.Size([2, 19])
xy_len: tensor([19, 18])

xy_mask for batch idx 1:
tensor([ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True, False])

x_mask:
tensor([[True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True]])

y_mask:
tensor([[ True,  True,  True,  True,  True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True,  True,  True,  True, False]])


In [103]:
# Display the text token batch.
x

tensor([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10],
        [ 1,  3,  3,  5,  5,  3,  4,  1, 67,  2]])

In [91]:
from transformers.models import qwen2
from transformers import AutoConfig

# Start from the published Qwen2-0.5B configuration ...
config = AutoConfig.from_pretrained("Qwen/Qwen2-0.5B")
# ... and override the width, head count and depth for this experiment.
config.num_attention_heads = 12
config.hidden_size = 768
config.max_window_layers = 16
config.num_hidden_layers = 16
# Build the model from the modified config, then move it to the GPU.
model = qwen2.Qwen2Model(config=config)
model = model.to("cuda")

In [92]:
# Move the inputs once, to whichever device the model lives on, instead of
# copying the same embeddings to a hard-coded "cuda" for each call.
device = model.device
inputs_embeds = xy_embedded.to(device)
# res1: forward pass with the padding mask; res2: same inputs without any mask.
res1 = model(inputs_embeds=inputs_embeds, attention_mask=xy_mask.to(device))
res2 = model(inputs_embeds=inputs_embeds)

In [93]:
# Element-wise exact comparison of the masked run's hidden states for batch item 0
# against the unmasked run's hidden states for batch item 1.
# NOTE(review): the two sides index different batch items (0 vs 1) — confirm this is
# intended. If the goal is to check the effect of attention_mask, comparing the same
# batch index (and using torch.allclose rather than ==) is probably what was meant.
res1.last_hidden_state[0] == res2.last_hidden_state[1]

tensor([[ True,  True,  True,  ...,  True,  True,  True],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]], device='cuda:0')

In [79]:
# Earlier target construction driven by an integer mask: positions where
# y_mask_int is 1 are first zeroed in y and then set to 4096 in targets.
y = torch.tensor(
    [
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    ]
)
y_mask_int = torch.tensor(
    [
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
    ]
)
y = y.to(torch.int64).masked_fill(y_mask_int.bool(), 0)
# Append one extra column: 0 for the tokens, 1 for the mask, so the new last
# column also ends up as 4096 after the weighted sum.
targets = torch.nn.functional.pad(y, (0, 1), value=0)
targets = targets + torch.nn.functional.pad(y_mask_int, (0, 1), value=1) * 4096

In [80]:
# Shift by one position: y keeps every column but the last, targets every column but the first.
y, targets = targets[:, :-1], targets[:, 1:]

In [81]:
# Display y after the shift.
y

tensor([[   1,    2,    3,    4,    5,    6,    7,    8,    9,   10],
        [   1,    2,    3,    4,    5,    6,    7,    8,    9, 4096]])

In [113]:
EOS = 4096
y = torch.tensor(
    [
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    ]
)
y_mask = torch.tensor([[True] * 10, [True] * 9 + [False]])
# Wherever the mask is False, replace the token with EOS.
y = y.masked_fill(~y_mask, EOS)
# Append one EOS column at the end of every row.
targets = torch.nn.functional.pad(y, (0, 1), value=EOS)
# Shift by one: y drops the last column, targets drops the first.
y = targets[:, :-1]
targets = targets[:, 1:]

In [114]:
# Display y after the EOS fill and the shift.
y

tensor([[   1,    2,    3,    4,    5,    6,    7,    8,    9,   10],
        [   1,    2,    3,    4,    5,    6,    7,    8,    9, 4096]])

In [115]:
# Display the shifted targets.
targets

tensor([[   2,    3,    4,    5,    6,    7,    8,    9,   10, 4096],
        [   2,    3,    4,    5,    6,    7,    8,    9, 4096, 4096]])

In [108]:
# Load the configuration object for the "Qwen/Qwen2-72b" model id.
qwen_config = AutoConfig.from_pretrained("Qwen/Qwen2-72b")

In [116]:
from torch.nn.utils.rnn import pad_sequence
IGNORE_ID = -1
text_token_len = torch.tensor([10, 10])
speech_token_len = torch.tensor([9, 5])
speech_token = torch.tensor(
    [
        [1, 2, 3, 4, 5, 6, 7, 8, 9],
        [1, 2, 3, 4, 5, 6, 7, 8, 9],
    ]
)
# Per sample: (2 + text length) ignored positions, then the valid speech tokens,
# then a closing 4096. Rows are right-padded with IGNORE_ID to a common length.
lm_target = pad_sequence(
    [
        torch.cat(
            [
                torch.full((2 + n_text,), IGNORE_ID, dtype=torch.int64),
                row[:n_speech],
                torch.tensor([4096]),
            ]
        )
        for row, n_text, n_speech in zip(
            speech_token, text_token_len.tolist(), speech_token_len.tolist()
        )
    ],
    batch_first=True,
    padding_value=IGNORE_ID,
)

In [117]:
# Display the padded language-model targets.
lm_target

tensor([[  -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
            1,    2,    3,    4,    5,    6,    7,    8,    9, 4096],
        [  -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
            1,    2,    3,    4,    5, 4096,   -1,   -1,   -1,   -1]])