In [125]:
import ffmpeg
import numpy as np

In [126]:
import torchaudio
import librosa
import torch
max_val = 0.8
target_sr = 16000
def load_wav(wav, target_sr):
    """Load an audio file as a mono waveform at ``target_sr`` Hz.

    Args:
        wav: Path (or file-like object) accepted by ``torchaudio.load``.
        target_sr: Desired sample rate in Hz.

    Returns:
        Tensor of shape ``(1, num_samples)``: the channel average of the file,
        resampled to ``target_sr`` when the file's native rate differs.

    Raises:
        AssertionError: If the native rate is lower than ``target_sr``
            (up-sampling is deliberately refused).
    """
    waveform, native_sr = torchaudio.load(wav)
    # Down-mix to mono while keeping the channel axis.
    mono = waveform.mean(dim=0, keepdim=True)
    if native_sr == target_sr:
        return mono
    assert native_sr > target_sr, 'wav sample rate {} must be greater than {}'.format(native_sr, target_sr)
    resampler = torchaudio.transforms.Resample(orig_freq=native_sr, new_freq=target_sr)
    return resampler(mono)
def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    """Trim edge silence, cap the peak level and append trailing silence.

    Reads the module-level ``max_val`` (peak ceiling) and ``target_sr``
    (used to size the 0.2 s zero tail) at call time.

    Args:
        speech: Waveform of shape ``(1, num_samples)``.
        top_db: Threshold in dB below the reference passed to ``librosa.effects.trim``.
        hop_length: Hop size, in samples, used by the trimmer.
        win_length: Analysis frame length, in samples, used by the trimmer.

    Returns:
        The trimmed waveform, rescaled so its peak does not exceed ``max_val``,
        followed by ``int(target_sr * 0.2)`` zero samples.
    """
    trimmed, _ = librosa.effects.trim(
        speech,
        top_db=top_db,
        frame_length=win_length,
        hop_length=hop_length,
    )
    peak = trimmed.abs().max()
    if peak > max_val:
        # Scale down so that the loudest sample sits exactly at max_val.
        trimmed = trimmed / peak * max_val
    tail = torch.zeros(1, int(target_sr * 0.2))
    return torch.concat([trimmed, tail], dim=1)

In [127]:
import torch
speech_16k = postprocess(load_wav("./1.mp3", 16000))


In [128]:
# 计算语音时长
audio_duration = speech_16k.shape[1] / target_sr

print(f"语音时长: {audio_duration:.4f} 秒")


语音时长: 6.8318 秒


In [129]:
import whisper

# Whisper log-mel spectrogram (128 mel bins) of the reference speech loaded above.
feat = whisper.log_mel_spectrogram(speech_16k, n_mels=128)

In [130]:
import onnxruntime
# Build the ONNX Runtime session for the speech tokenizer with every graph
# optimisation level enabled.
option = onnxruntime.SessionOptions()
option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
# One thread for intra-operator parallelism.
option.intra_op_num_threads = 1
# Only the CUDA execution provider is requested for this session.
speech_tokenizer_session = onnxruntime.InferenceSession(
    "../pretrained_models/speech_tokenizer_v1.onnx", sess_options=option, providers=["CUDAExecutionProvider"]
)

[0;93m2024-07-25 16:37:40.965221772 [W:onnxruntime:, transformer_memcpy.cc:74 ApplyImpl] 12 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m
[0;93m2024-07-25 16:37:40.966285049 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2024-07-25 16:37:40.966290949 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


In [131]:
print(speech_tokenizer_session.get_inputs()[0].name)
print(speech_tokenizer_session.get_inputs()[1].name)

feats
feats_length


In [132]:
feat.detach().cpu().numpy()

array([[[-0.45053625, -0.05974329, -0.25894833, ..., -0.67959416,
         -0.67959416, -0.67959416],
        [-0.3529718 ,  0.03782111, -0.16138399, ..., -0.67959416,
         -0.67959416, -0.67959416],
        [-0.37882137,  0.08174139,  0.09368938, ..., -0.67959416,
         -0.67959416, -0.67959416],
        ...,
        [-0.29448664,  0.06283569,  0.16168153, ..., -0.67959416,
         -0.67959416, -0.67959416],
        [-0.38084733, -0.13342845, -0.03013408, ..., -0.67959416,
         -0.67959416, -0.67959416],
        [-0.420272  , -0.16023862, -0.10880888, ..., -0.67959416,
         -0.67959416, -0.67959416]]], dtype=float32)

In [133]:
feat_length = np.array([feat.shape[2]], dtype=np.int32)

In [134]:
print(f"feat_length: {feat_length}")

feat_length: [683]


In [135]:
# Feed the log-mel features and their frame count to the tokenizer, keyed by the
# names of the session's first two inputs, and flatten the first output into a
# plain Python list of token ids.
feats_input, feats_length_input = speech_tokenizer_session.get_inputs()[:2]
tokenizer_feed = {
    feats_input.name: feat.detach().cpu().numpy(),
    feats_length_input.name: np.array([feat.shape[2]], dtype=np.int32),
}
speech_token = speech_tokenizer_session.run(None, tokenizer_feed)[0].flatten().tolist()

In [136]:
# Build the int64 tensor directly instead of going through the legacy float32
# ``torch.Tensor`` constructor and casting afterwards.
speech_token = torch.tensor(speech_token, dtype=torch.long)

In [137]:
import sys
import os

current_dir = os.getcwd()
print("当前目录:", current_dir)
# 获取当前文件的父目录
parent_dir = os.path.dirname(os.getcwd())
sys.path.append(parent_dir)
print(parent_dir)
from GPT_SoVITS.module.models import SynthesizerTrn
import time

params = {
    "spec_channels": 2048 // 2 + 1,
    "segment_size": 20480 // 640,
    "inter_channels": 256,
    "hidden_channels": 256,
    "filter_channels": 1024,
    "n_heads": 4,
    "n_layers": 6,
    "kernel_size": 3,
    "p_dropout": 0.1,
    "resblock": "1",
    "resblock_kernel_sizes": [3, 7, 11],
    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    "upsample_rates": [10, 8, 2, 2, 2],
    "upsample_initial_channel": 512,
    "upsample_kernel_sizes": [16, 16, 8, 2, 2],
}
net_g = SynthesizerTrn(**params)

当前目录: /workspaces/GPT-SoVITS2/playground
/workspaces/GPT-SoVITS2




In [138]:
import torch
# Load the generator checkpoint onto the CPU. ``net_g`` is never moved to another
# device in this notebook, and without ``map_location`` a checkpoint saved from a
# GPU cannot be deserialised on a machine without CUDA.
# NOTE: ``torch.load`` unpickles the file, so only open checkpoints from a trusted source.
checkpoint = torch.load('./G_139548.pth', map_location='cpu')

# Keep only the model weights stored under the 'model' key.
model_state_dict = checkpoint['model']

# Copy the weights into the freshly constructed network.
net_g.load_state_dict(model_state_dict)

# Switch to evaluation mode (the returned module is displayed as the cell output).
net_g.eval()

SynthesizerTrn(
  (embedding): Embedding(4097, 256)
  (rotary_emb): RotaryEmbedding()
  (enc_p): TextEncoder(
    (encoder_ssl): Encoder(
      (drop): Dropout(p=0.1, inplace=False)
      (attn_layers): ModuleList(
        (0-5): 6 x MultiHeadAttention(
          (conv_q): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
          (conv_k): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
          (conv_v): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
          (conv_o): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
          (drop): Dropout(p=0.1, inplace=False)
        )
      )
      (norm_layers_1): ModuleList(
        (0-5): 6 x LayerNorm()
      )
      (ffn_layers): ModuleList(
        (0-5): 6 x FFN(
          (conv_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,))
          (conv_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,))
          (drop): Dropout(p=0.1, inplace=False)
        )
      )
      (norm_layers_2): ModuleList(
        (0-5): 6 x LayerNorm()
      )


In [154]:
from scipy.io import wavfile
def load_audio(audio, target_sr):
    """Load an audio file as a mono waveform at ``target_sr`` Hz via torchaudio.

    Note: a later cell defines another ``load_audio`` (ffmpeg based) that replaces
    this one once it has been executed.

    Args:
        audio: Path (or file-like object) accepted by ``torchaudio.load``.
        target_sr: Desired sample rate in Hz.

    Returns:
        Tensor of shape ``(1, num_samples)``: the channel average of the file,
        resampled when the native rate differs from ``target_sr``.
    """
    waveform, native_sr = torchaudio.load(audio)
    mono = waveform.mean(dim=0, keepdim=True)
    if native_sr == target_sr:
        return mono
    resampler = torchaudio.transforms.Resample(
        orig_freq=native_sr, new_freq=target_sr
    )
    return resampler(mono)


def postprocess(speech, target_sr, top_db=60, hop_length=220, win_length=440):
    """Trim edge silence, cap the peak at 0.8 and append 0.2 s of zeros.

    Args:
        speech: Waveform of shape ``(1, num_samples)``.
        target_sr: Sample rate in Hz, used to size the zero tail.
        top_db: Threshold in dB below the reference passed to ``librosa.effects.trim``.
        hop_length: Hop size, in samples, used by the trimmer.
        win_length: Analysis frame length, in samples, used by the trimmer.

    Returns:
        The trimmed and peak-limited waveform followed by
        ``int(target_sr * 0.2)`` zero samples.
    """
    max_val = 0.8
    trimmed, _ = librosa.effects.trim(
        speech, top_db=top_db, frame_length=win_length, hop_length=hop_length
    )
    peak = trimmed.abs().max()
    if peak > max_val:
        # Rescale so the loudest sample sits exactly at the ceiling.
        trimmed = trimmed / peak * max_val
    tail = torch.zeros(1, int(target_sr * 0.2))
    return torch.concat([trimmed, tail], dim=1)
# Convert the reference clip to a trimmed, peak-limited mono wav at 16 kHz and
# store it next to the source file as 16-bit PCM.
target_sr = 16000
audio = load_audio("1.mp3", target_sr)
audio = postprocess(audio, target_sr)
audio_np = audio.squeeze().numpy()
new_wav_path = "1.mp3" + ".wav"
# postprocess() caps the peak at 0.8, so scaling by 32768 stays inside the int16 range.
# NOTE(review): this file is later re-read at 32 kHz for the spectrogram, i.e. it is
# up-sampled from 16 kHz - confirm that this matches how the training data was prepared.
wavfile.write(new_wav_path, target_sr, (audio_np * 32768).astype(np.int16))

In [140]:
import ffmpeg
# STFT settings for the reference spectrogram.
target_sr = 32000
n_fft = 2048
hop_size = 640
# NOTE(review): SimpleHparams in this notebook uses win_length = 2048 for the
# dataset spectrograms, whereas this cell uses 640 - confirm which one the
# checkpoint was trained with.
win_size = 640
# Cache of Hann windows keyed by "<win_size>_<dtype>_<device>".
hann_window = {}


def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
    """Compute a linear magnitude spectrogram with ``torch.stft``.

    Args:
        y: Waveform batch of shape ``(batch, num_samples)``. A message is printed
            when samples fall outside ``[-1, 1]``.
        n_fft: FFT size; the output has ``n_fft // 2 + 1`` frequency bins.
        sampling_rate: Unused; kept so the signature stays compatible with callers.
        hop_size: Hop between successive frames, in samples.
        win_size: Length of the Hann window, in samples.
        center: Forwarded to ``torch.stft``.

    Returns:
        Tensor of shape ``(batch, n_fft // 2 + 1, frames)`` holding
        ``sqrt(re**2 + im**2 + 1e-6)`` for every STFT bin.
    """
    if torch.min(y) < -1.0:
        print("min value is ", torch.min(y))
    if torch.max(y) > 1.0:
        print("max value is ", torch.max(y))

    # Build each window once per (size, dtype, device) combination.
    cache_key = str(win_size) + "_" + str(y.dtype) + "_" + str(y.device)
    if cache_key not in hann_window:
        hann_window[cache_key] = torch.hann_window(win_size).to(
            dtype=y.dtype, device=y.device
        )

    # Reflect-pad both ends by (n_fft - hop_size) / 2 samples before framing.
    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode="reflect")
    y = y.squeeze(1)

    # ``return_complex=False`` is deprecated in torch.stft; request the complex
    # result and view it as (real, imag) pairs, which gives the same numbers.
    spec = torch.stft(
        y,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=hann_window[cache_key],
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=True,
    )
    spec = torch.view_as_real(spec)

    # Magnitude, with a small epsilon inside the square root.
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
    return spec

def load_audio(file, sr):
    """Decode an audio file to mono float32 samples at ``sr`` Hz with the ffmpeg CLI.

    Args:
        file: Path to the audio file. Surrounding spaces, double quotes and
            newlines (typical of copy-pasted paths) are stripped first.
        sr: Output sample rate in Hz.

    Returns:
        1-D ``numpy.float32`` array with the decoded samples.

    Raises:
        RuntimeError: If cleaning the path or running ffmpeg fails for any reason.
    """
    try:
        # Same approach as https://github.com/openai/whisper/blob/main/whisper/audio.py#L26:
        # a subprocess decodes the audio while down-mixing and resampling as necessary.
        # Requires the ffmpeg CLI and the `ffmpeg-python` package to be installed.
        cleaned_path = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        stream = ffmpeg.input(cleaned_path, threads=0)
        stream = stream.output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
        pcm_bytes, _ = stream.run(
            cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True
        )
    except Exception as e:
        raise RuntimeError(f"Failed to load audio: {e}")

    samples = np.frombuffer(pcm_bytes, np.float32)
    return samples.flatten()
def get_audio(filename):
    """Load ``filename`` and compute its linear spectrogram.

    Uses the module-level ``target_sr``, ``n_fft``, ``hop_size`` and ``win_size``.

    Args:
        filename: Path of the audio file to decode.

    Returns:
        ``(spec, audio)`` where ``audio`` is the waveform with a leading batch axis
        and ``spec`` is its spectrogram with that batch axis squeezed away.
    """
    samples = load_audio(filename, target_sr)
    audio = torch.FloatTensor(samples).unsqueeze(0)
    batched_spec = spectrogram_torch(
        audio, n_fft, target_sr, hop_size, win_size, center=False
    )
    return torch.squeeze(batched_spec, 0), audio

# Spectrogram (y) and waveform of the reference wav written by the earlier cell.
y, audio = get_audio("./1.mp3.wav")

# Number of spectrogram frames, wrapped in a one-element tensor.
y_lengths = torch.tensor([y.shape[1]])

In [141]:
import sentencepiece as spm

text = "用这令旗，不仅调兵遣将能胜人一筹，投降也能先人一步……"
sp = spm.SentencePieceProcessor()
sp.load("../pretrained_models/sentencepiece.bpe.model")


def tokenize_text(text):
    """Encode ``text`` with the module-level SentencePiece model ``sp``.

    Every piece id is shifted by +1 and the sequence is wrapped with a leading 0
    and a trailing 2 (presumably BOS/EOS markers - confirm against the training code).

    Returns:
        List of integer token ids.
    """
    return [0] + [x + 1 for x in sp.encode(text)] + [2]


# Build the int64 tensor directly rather than via the legacy float32 constructor.
text_token = torch.tensor(tokenize_text(text), dtype=torch.long)
text_token_lengths = torch.tensor([len(text_token)])

In [142]:
# Align the speech tokens and the spectrogram on a common number of frames.
min_length = min(speech_token.shape[-1], y.shape[-1])

print(f"设置的min_length为: {min_length}")

# Truncate both sequences to the shorter length.
speech_token = speech_token[..., :min_length]
y = y[..., :min_length]
# y_lengths was computed before the truncation; refresh it so it never claims
# more frames than the spectrogram still has.
y_lengths = torch.tensor([y.shape[-1]])

print(f"截断后speech_token的形状: {speech_token.shape}")
print(f"截断后y的形状: {y.shape}")



设置的min_length为: 341
截断后speech_token的形状: torch.Size([341])
截断后y的形状: torch.Size([1025, 341])


In [143]:
# Add the batch dimension expected by net_g.infer. The checks on the number of
# dimensions make the cell idempotent: re-running it no longer stacks extra axes,
# which the in-place unsqueeze_ calls did.
if y.dim() == 2:
    y = y.unsqueeze(0)
if speech_token.dim() == 1:
    speech_token = speech_token.unsqueeze(0)
if text_token.dim() == 1:
    text_token = text_token.unsqueeze(0)
text_token

tensor([[     0,      6,   1173,   1842,  12668,  37175,      4,  21523,  17619,
          19752, 153234,   1726,   1580,  27418,    487,    684,  94639,      4,
          11537,  16328,  37894,   2996,    487,  55007,   2551,      2]])

In [151]:
# Synthesis only: no autograd graph is needed, so run under no_grad.
# The text tokens are replaced by zeros and the noise scale is 0 in this experiment.
with torch.no_grad():
    audio, _, _ = net_g.infer(
        speech_token,
        y,
        y_lengths,
        torch.zeros_like(text_token),
        text_token_lengths,
        noise_scale=0
    )

# torchaudio.save expects a 2-D (channels, samples) tensor.
if audio.dim() == 3:
    audio = audio.squeeze(0)  # drop the batch dimension

# Output sample rate in Hz.
sample_rate = 32000

# Output file name.
output_filename = "generated_audio1.wav"

# Write the audio file; detach so the tensor never carries autograd state into the writer.
torchaudio.save(output_filename, audio.detach().cpu(), sample_rate)

In [105]:
import math
import os
import sys
import random
import traceback
import numpy as np
import torch
import sentencepiece as spm
import librosa
current_dir = os.getcwd()
parent_dir = os.path.dirname(os.getcwd())
sys.path.append(parent_dir)
from GPT_SoVITS.module.mel_processing import spectrogram_torch

from tools.my_utils import load_audio

class TextAudioSpeakerLoader(torch.utils.data.Dataset):
    """Dataset yielding ``(speech_token, spec, wav, text_token)`` per utterance.

    Every ``.txt`` file below ``hparams.exp_dir`` is read as a transcript whose
    lines look like ``speaker|wav_name|text``. An utterance is kept only when
    ``<wav_name>.npy``, ``<wav_name>.pt`` and ``<wav_name>.wav`` all exist in the
    transcript's folder and the wav duration can be read.

    Attributes:
        audiopaths_text: ``[audio_path_without_extension, text]`` pairs, shuffled
            with a fixed seed.
        lengths: ``ceil(duration * 50)`` per utterance, in the same order as
            ``audiopaths_text``.
    """

    # Encodings tried, in order, when reading a transcript file.
    TRANSCRIPT_ENCODINGS = ("utf-8", "gbk", "gb2312", "utf-16")
    # Number of frames in the placeholder sample returned when loading fails.
    FALLBACK_FRAMES = 100

    def tokenize_text(self, text):
        """Encode ``text``: piece ids shifted by +1, wrapped with a leading 0 and trailing 2."""
        token = [0] + [x + 1 for x in self.sp.encode(text)] + [2]
        return token

    @classmethod
    def _read_transcript_lines(cls, file_path):
        """Return the lines of ``file_path`` using the first encoding that decodes it, else None."""
        for encoding in cls.TRANSCRIPT_ENCODINGS:
            try:
                with open(file_path, "r", encoding=encoding) as f:
                    return f.readlines()
            except UnicodeError:
                # UnicodeError (the parent of UnicodeDecodeError) also covers the
                # error the utf-16 reader raises for input without a BOM.
                continue
        return None

    @classmethod
    def _scan_transcripts(cls, exp_dir):
        """Collect ``[speaker, wav_name, text, folder_relative_to_exp_dir]`` for every transcript line."""
        todo = []
        for root, dirs, files in os.walk(exp_dir):
            for file in files:
                if not file.endswith(".txt"):
                    continue
                index_folder = os.path.relpath(root, exp_dir)
                file_path = os.path.join(root, file)
                lines = cls._read_transcript_lines(file_path)
                if lines is None:
                    print(f"无法解码文件 {file_path}，跳过此文件")
                    continue
                for line in lines:
                    try:
                        spk_name, wav_name, text = line.split("|")
                        todo.append([spk_name, wav_name, text, index_folder])
                    except Exception:
                        # Malformed line (not exactly three fields): show it and move on.
                        print(line)
        return todo

    def __init__(self, hparams, val=False):
        """Index the dataset found under ``hparams.exp_dir``.

        Args:
            hparams: Object exposing ``exp_dir``, ``max_wav_value``, ``sampling_rate``,
                ``filter_length``, ``hop_length`` and ``win_length``.
            val: Stored on the instance as ``self.val``; not used elsewhere in this class.
        """
        self.sp = spm.SentencePieceProcessor()
        self.sp.load("../pretrained_models/sentencepiece.bpe.model")
        exp_dir = hparams.exp_dir

        # (path/text entry, length) pairs are kept together so that the shuffle
        # below cannot break the correspondence between the two lists.
        samples = []
        for _, wav_name, text, index_folder in self._scan_transcripts(exp_dir):
            audio_path = os.path.join(exp_dir, index_folder, wav_name)
            wav_path = audio_path + ".wav"
            required = (audio_path + ".npy", audio_path + ".pt", wav_path)
            if not all(os.path.exists(path) for path in required):
                continue
            try:
                # NOTE(review): newer librosa releases rename ``filename`` to ``path``;
                # adjust when upgrading librosa.
                duration = librosa.get_duration(filename=wav_path)
            except Exception as e:
                print(f"无法处理文件 {wav_path}：{str(e)}")
                continue
            # 50 matches sampling_rate / hop_length for the 32000 / 640 settings
            # used in this notebook - presumably frames per second; confirm.
            samples.append(([audio_path, text], math.ceil(duration * 50)))

        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.filter_length = hparams.filter_length
        self.hop_length = hparams.hop_length
        self.win_length = hparams.win_length
        self.val = val

        # Seed reference:
        # @misc{picard2023torchmanualseed3407needinfluencerandom,
        #   title={Torch.manual_seed(3407) is all you need: On the influence of random
        #          seeds in deep learning architectures for computer vision},
        #   author={David Picard}, year={2023}, eprint={2109.08203},
        #   archivePrefix={arXiv}, primaryClass={cs.CV},
        #   url={https://arxiv.org/abs/2109.08203},
        # }
        random.seed(3407)  # 3407 is all you need

        # Shuffling the pairs yields the same utterance order as shuffling the
        # paths alone (the permutation depends only on the seed and the list
        # length) while keeping ``lengths`` aligned with ``audiopaths_text``.
        random.shuffle(samples)
        self.audiopaths_text = [entry for entry, _ in samples]
        self.lengths = [length for _, length in samples]
        print("wav_data_len:", len(self.audiopaths_text))

    def get_audio_text_speaker_pair(self, audiopath_text):
        """Load one utterance.

        Args:
            audiopath_text: ``[audio_path_without_extension, text]``.

        Returns:
            ``(speech_token, spec, wav, text_token)``. The speech tokens and the
            spectrogram are cut to the same number of frames. If anything fails
            while loading, the traceback is printed and an all-zero placeholder of
            ``FALLBACK_FRAMES`` frames is returned together with only the final
            text token.
        """
        audiopath, text = audiopath_text
        text_token = self.tokenize_text(text)
        try:
            spec, wav = self.get_audio(audiopath + ".wav")
            speech_token = torch.from_numpy(np.load(audiopath + ".npy"))
            min_length = min(speech_token.shape[-1], spec.shape[-1])
            speech_token = speech_token[..., :min_length]
            spec = spec[..., :min_length]
        except Exception:
            traceback.print_exc()
            frames = self.FALLBACK_FRAMES
            spec = torch.zeros(self.filter_length // 2 + 1, frames)
            wav = torch.zeros(1, frames * self.hop_length)
            # 1-D integer tokens, the layout TextAudioCollate in this notebook
            # indexes with ``size(0)``.
            speech_token = torch.zeros(frames, dtype=torch.long)
            text_token = text_token[-1:]
            print("load error!!!!!!", audiopath)
        return (speech_token, spec, wav, text_token)

    def get_audio(self, filename):
        """Decode ``filename`` and return ``(spec, audio)``.

        ``audio`` is the waveform with a leading batch axis; ``spec`` is its
        spectrogram (computed with the instance's STFT settings) with that batch
        axis squeezed away.
        """
        audio_array = load_audio(filename, self.sampling_rate)
        audio = torch.FloatTensor(audio_array)
        audio = audio.unsqueeze(0)
        spec = spectrogram_torch(
            audio,
            self.filter_length,
            self.sampling_rate,
            self.hop_length,
            self.win_length,
            center=False,
        )
        spec = torch.squeeze(spec, 0)
        return spec, audio

    def get_sid(self, sid):
        """Return ``sid`` converted to a one-element ``LongTensor``."""
        sid = torch.LongTensor([int(sid)])
        return sid

    def __getitem__(self, index):
        # with torch.no_grad():
        return self.get_audio_text_speaker_pair(self.audiopaths_text[index])

    def __len__(self):
        return len(self.audiopaths_text)

In [106]:
class SimpleHparams:
    """Minimal stand-in for the training hyper-parameter object.

    Carries only the attributes that TextAudioSpeakerLoader reads.
    """

    def __init__(self):
        defaults = {
            "exp_dir": "../dataset",     # root folder scanned for transcripts and audio
            "max_wav_value": 32768.0,
            "sampling_rate": 32000,      # Hz
            "filter_length": 2048,       # FFT size
            "hop_length": 640,           # samples between frames
            "win_length": 2048,          # STFT window length
        }
        for name, value in defaults.items():
            setattr(self, name, value)

# Create the hparams instance.
hparams = SimpleHparams()

# Build the dataset (this walks the whole dataset folder).
dataset = TextAudioSpeakerLoader(hparams)

# Print the dataset size.
print(f"数据集大小: {len(dataset)}")

# Inspect one sample.
# NOTE(review): index 1 (the second sample) is read while the guard only checks
# for a non-empty dataset - this raises IndexError when exactly one sample exists.
if len(dataset) > 0:
    sample = dataset[1]
    speech_token, spec, wav, text_token = sample
    
    print(f"语音标记形状: {speech_token.shape}")
    print(f"频谱图形状: {spec.shape}")
    print(f"波形形状: {wav.shape}")
    print(f"文本标记: {text_token}")
    
    # Save the waveform to a file.
    import soundfile as sf
    import numpy as np
    
    # Make sure wav is a float32 numpy array.
    if isinstance(wav, torch.Tensor):
        wav = wav.numpy()
    wav = wav.astype(np.float32)
    
    # Keep the samples inside [-1, 1].
    wav = np.clip(wav, -1, 1)
    
    # Export as a float WAV file.
    try:
        sf.write('output_audio.wav', wav.squeeze(), hparams.sampling_rate, subtype='FLOAT')
        print("音频已导出为 output_audio.wav")
    except Exception as e:
        print(f"导出音频时出错: {str(e)}")
else:
    print("数据集为空")

wav_data_len: 16006
数据集大小: 16006
语音标记形状: torch.Size([324])
频谱图形状: torch.Size([1025, 324])
波形形状: torch.Size([1, 207920])
文本标记: [0, 6, 10330, 216765, 886, 130668, 4, 35922, 274, 8856, 887, 7402, 83855, 4011, 25840, 4, 63211, 184120, 274, 2058, 4011, 2391, 4695, 6042, 4, 1036, 4502, 32, 2]
音频已导出为 output_audio.wav


In [107]:
sample[0].shape

torch.Size([324])

In [108]:
class TextAudioCollate:
    """Zero-pad ``(speech_token, spec, wav, text_token)`` samples into batch tensors.

    Samples are ordered by decreasing spectrogram length. Each sample is expected
    to hold a 1-D speech-token tensor, a ``(bins, frames)`` spectrogram, a
    ``(1, samples)`` waveform and a sequence of text token ids.
    """

    def __init__(self, return_ids=False):
        """
        Args:
            return_ids: When True, ``__call__`` also returns the sort permutation.
        """
        self.return_ids = return_ids

    def __call__(self, batch):
        """Collate ``batch`` (a non-empty list of samples).

        Returns:
            ``(speech_padded, speech_lengths, spec_padded, spec_lengths,
            wav_padded, wav_lengths, text_padded, text_lengths)``, followed by
            ``ids_sorted_decreasing`` when ``return_ids`` is set. Integer tensors
            are int64 and float tensors are float32.
        """
        # Sort by spectrogram length, longest first.
        spec_frames = torch.LongTensor([sample[1].size(1) for sample in batch])
        _, ids_sorted_decreasing = torch.sort(spec_frames, dim=0, descending=True)

        batch_size = len(batch)
        max_speech_len = max(sample[0].size(0) for sample in batch)
        max_spec_len = max(sample[1].size(1) for sample in batch)
        max_wav_len = max(sample[2].size(1) for sample in batch)
        max_text_len = max(len(sample[3]) for sample in batch)
        # Take the number of frequency bins from the data instead of assuming
        # 1025, so spectrograms computed with another FFT size collate as well.
        n_bins = batch[0][1].size(0)

        speech_lengths = torch.zeros(batch_size, dtype=torch.long)
        spec_lengths = torch.zeros(batch_size, dtype=torch.long)
        wav_lengths = torch.zeros(batch_size, dtype=torch.long)
        text_lengths = torch.zeros(batch_size, dtype=torch.long)

        speech_padded = torch.zeros(batch_size, max_speech_len, dtype=torch.long)
        spec_padded = torch.zeros(batch_size, n_bins, max_spec_len, dtype=torch.float32)
        wav_padded = torch.zeros(batch_size, 1, max_wav_len, dtype=torch.float32)
        text_padded = torch.zeros(batch_size, max_text_len, dtype=torch.long)

        for i, source_index in enumerate(ids_sorted_decreasing.tolist()):
            speech, spec, wav, text_ids = batch[source_index]

            speech_padded[i, :speech.size(0)] = speech
            speech_lengths[i] = speech.size(0)

            spec_padded[i, :, :spec.size(1)] = spec
            spec_lengths[i] = spec.size(1)

            wav_padded[i, :, :wav.size(1)] = wav
            wav_lengths[i] = wav.size(1)

            text = torch.LongTensor(text_ids)
            text_padded[i, :text.size(0)] = text
            text_lengths[i] = text.size(0)

        collated = (
            speech_padded, speech_lengths, spec_padded, spec_lengths,
            wav_padded, wav_lengths, text_padded, text_lengths
        )
        if self.return_ids:
            return collated + (ids_sorted_decreasing,)
        return collated

In [111]:
from torch.utils.data import DataLoader
# Create the collate function instance.
collate_fn = TextAudioCollate()

# Create the DataLoader.
batch_size = 4  # small batch so the shapes are easy to inspect
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Fetch a single batch (shuffle=True, so which samples come back differs between runs).
batch = next(iter(dataloader))

# Unpack the batch.
speech_padded, speech_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths = batch

# Print the shape and dtype of every tensor.
print(f"speech_padded shape: {speech_padded.shape}, type: {speech_padded.dtype}")
print(f"speech_lengths shape: {speech_lengths.shape}, type: {speech_lengths.dtype}")
print(f"spec_padded shape: {spec_padded.shape}, type: {spec_padded.dtype}")
print(f"spec_lengths shape: {spec_lengths.shape}, type: {spec_lengths.dtype}")
print(f"wav_padded shape: {wav_padded.shape}, type: {wav_padded.dtype}")
print(f"wav_lengths shape: {wav_lengths.shape}, type: {wav_lengths.dtype}")
print(f"text_padded shape: {text_padded.shape}, type: {text_padded.dtype}")
print(f"text_lengths shape: {text_lengths.shape}, type: {text_lengths.dtype}")

# Check the padding: every sample reports its true length next to the padded shape.
for i in range(batch_size):
    print(f"\nSample {i}:")
    print(f"  Speech length: {speech_lengths[i]}, Padded speech shape: {speech_padded[i].shape}")
    print(f"  Spec length: {spec_lengths[i]}, Padded spec shape: {spec_padded[i].shape}")
    print(f"  Wav length: {wav_lengths[i]}, Padded wav shape: {wav_padded[i].shape}")
    print(f"  Text length: {text_lengths[i]}, Padded text shape: {text_padded[i].shape}")

speech_padded shape: torch.Size([4, 265]), type: torch.int64
speech_lengths shape: torch.Size([4]), type: torch.int64
spec_padded shape: torch.Size([4, 1025, 265]), type: torch.float32
spec_lengths shape: torch.Size([4]), type: torch.int64
wav_padded shape: torch.Size([4, 1, 170080]), type: torch.float32
wav_lengths shape: torch.Size([4]), type: torch.int64
text_padded shape: torch.Size([4, 20]), type: torch.int64
text_lengths shape: torch.Size([4]), type: torch.int64

Sample 0:
  Speech length: 265, Padded speech shape: torch.Size([265])
  Spec length: 265, Padded spec shape: torch.Size([1025, 265])
  Wav length: 170080, Padded wav shape: torch.Size([1, 170080])
  Text length: 20, Padded text shape: torch.Size([20])

Sample 1:
  Speech length: 199, Padded speech shape: torch.Size([265])
  Spec length: 199, Padded spec shape: torch.Size([1025, 265])
  Wav length: 130700, Padded wav shape: torch.Size([1, 170080])
  Text length: 16, Padded text shape: torch.Size([20])

Sample 2:
  Speech

In [112]:
# Synthesise every sample of the collated batch, one at a time. The loop bound
# follows the actual batch size instead of a hard-coded 4, and no autograd graph
# is built during inference.
with torch.no_grad():
    for i in range(speech_padded.size(0)):
        audio, _, _ = net_g.infer(
            speech_padded[i:i+1],
            spec_padded[i:i+1],
            spec_lengths[i:i+1],
            text_padded[i:i+1],
            text_lengths[i:i+1],
        )

        # torchaudio.save expects a 2-D (channels, samples) tensor.
        if audio.dim() == 3:
            audio = audio.squeeze(0)  # drop the batch dimension

        # Output sample rate in Hz.
        sample_rate = 32000

        # Output file name.
        output_filename = f"generated_audio_{i}.wav"

        # Write the audio file.
        torchaudio.save(output_filename, audio.detach().cpu(), sample_rate)

        print(f"音频 {i} 已保存为 {output_filename}")

音频 0 已保存为 generated_audio_0.wav
音频 1 已保存为 generated_audio_1.wav
音频 2 已保存为 generated_audio_2.wav
音频 3 已保存为 generated_audio_3.wav


In [63]:
speech_padded[0:1].shape

torch.Size([1, 272])

In [64]:
spec_padded[0:1].shape

torch.Size([1, 1025, 272])

In [67]:
spec_lengths[0:1].shape

torch.Size([1])

In [66]:
text_padded[0:1].shape

torch.Size([1, 16])

In [60]:
# Experiment that mixes inputs from different samples of the batch:
# speech tokens of sample 0, spectrogram of sample 3 and text of sample 2.
# NOTE(review): spec_lengths[0:1] belongs to sample 0, not to the spectrogram of
# sample 3 that is passed next to it - confirm that this mismatch is intended.
audio, _, _ = net_g.infer(
    speech_padded[0:1],
    spec_padded[3:4],
    spec_lengths[0:1],
    text_padded[2:3],
    text_lengths[2:3],
)

# torchaudio.save expects a 2-D (channels, samples) tensor.
if audio.dim() == 3:
    audio = audio.squeeze(0)  # drop the batch dimension

# Output sample rate in Hz.
sample_rate = 32000

# Output file name.
output_filename = f"generated_audio.wav"

# Write the audio file.
torchaudio.save(output_filename, audio.cpu(), sample_rate)

In [53]:
audio.shape

torch.Size([1, 1, 174080])

In [54]:
# 保存生成的音频
import torchaudio

# torchaudio.save expects a 2-D (channels, samples) tensor.
if audio.dim() == 3:
    audio = audio.squeeze(0)  # drop the batch dimension

# Output sample rate: 32000 Hz, the same value the other save cells in this notebook use.
sample_rate = 32000

# Output file name.
output_filename = "generated_audio.wav"

# Write the audio file.
torchaudio.save(output_filename, audio.cpu(), sample_rate)

print(f"音频已保存为 {output_filename}")


音频已保存为 generated_audio.wav


In [13]:
wav_padded[0:1].shape

torch.Size([1, 1, 159740])