In [1]:
import ffmpeg
import numpy as np

In [34]:
import torchaudio
import librosa
import torch
# Peak-amplitude ceiling: postprocess() rescales audio whose absolute peak exceeds this value.
max_val = 0.8
# Working sample rate in Hz; read by postprocess() for the silence padding and by later cells.
target_sr = 16000
def load_wav(wav, target_sr):
    """Load an audio file as a single-channel waveform at ``target_sr``.

    Args:
        wav: Audio source handed straight to ``torchaudio.load``.
        target_sr: Desired sample rate in Hz.

    Returns:
        The waveform with all channels averaged into one (the channel axis is
        kept), resampled to ``target_sr`` when the file's rate differs.

    Raises:
        AssertionError: If the file's sample rate is lower than ``target_sr``;
            only downsampling is accepted.
    """
    waveform, source_sr = torchaudio.load(wav)
    # Average over the channel axis but keep it, so the result stays 2-D.
    mono = waveform.mean(dim=0, keepdim=True)
    if source_sr == target_sr:
        return mono
    assert source_sr > target_sr, 'wav sample rate {} must be greater than {}'.format(source_sr, target_sr)
    resampler = torchaudio.transforms.Resample(orig_freq=source_sr, new_freq=target_sr)
    return resampler(mono)
def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    """Trim silence, cap the peak level and append a short silent tail.

    Reads the module-level ``max_val`` and ``target_sr``.

    Args:
        speech: Waveform tensor. The final concatenation joins it with a
            ``(1, N)`` block of zeros along dim 1, so a single-row
            ``(1, num_samples)`` layout is required.
        top_db: Forwarded to ``librosa.effects.trim`` as ``top_db``.
        hop_length: Forwarded to ``librosa.effects.trim`` as ``hop_length``.
        win_length: Forwarded to ``librosa.effects.trim`` as ``frame_length``.

    Returns:
        The trimmed waveform, rescaled so its absolute peak does not exceed
        ``max_val``, followed by ``int(target_sr * 0.2)`` zero samples
        (0.2 s at ``target_sr``).
    """
    trimmed, _ = librosa.effects.trim(
        speech,
        top_db=top_db,
        frame_length=win_length,
        hop_length=hop_length,
    )
    # Only attenuate: quieter signals are left untouched.
    peak = trimmed.abs().max()
    if peak > max_val:
        trimmed = trimmed / peak * max_val
    tail_silence = torch.zeros(1, int(target_sr * 0.2))
    return torch.concat([trimmed, tail_silence], dim=1)

In [35]:
import torch
# Load the source clip as mono audio at the working sample rate, then trim,
# level-cap and pad it. The shared `target_sr` constant is used for both
# loading and saving so the resample rate and the rate written to the file
# cannot drift apart.
speech_16k = postprocess(load_wav("./花火.mp3", target_sr))
# Export the processed audio as a WAV file.
torchaudio.save("花火_16k.wav", speech_16k, target_sr)
print("16k音频已保存为 '花火_16k.wav'")


16k音频已保存为 '花火_16k.wav'


In [49]:
# Speech duration in seconds: number of samples along dim 1 divided by the sample rate.
audio_duration = speech_16k.size(1) / target_sr
print(f"语音时长: {audio_duration:.4f} 秒")


语音时长: 7.4325 秒


In [20]:
import torch
# All-zero placeholder waveform with 207920 // 2 = 103960 samples in a single row.
# NOTE(review): presumably half the length of the 207920-sample buffer used further down
# (e.g. the same clip at half the sample rate) — confirm.
feat = torch.zeros(1, 207920//2)

In [21]:
import whisper
# Convert the waveform held in `feat` into a log-Mel spectrogram with 128 Mel bins.
# NOTE: this rebinds `feat`, so re-running the cell would feed the spectrogram back in.
feat = whisper.log_mel_spectrogram(feat, n_mels=128)

In [22]:
import onnxruntime
# Session options: request the ORT_ENABLE_ALL graph-optimisation level and a single intra-op thread.
option = onnxruntime.SessionOptions()
option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
option.intra_op_num_threads = 1
# Load the speech-tokenizer ONNX model; only the CUDA execution provider is requested.
speech_tokenizer_session = onnxruntime.InferenceSession(
    "../pretrained_models/speech_tokenizer_v1.onnx", sess_options=option, providers=["CUDAExecutionProvider"]
)

[0;93m2024-07-24 15:39:00.404377785 [W:onnxruntime:, transformer_memcpy.cc:74 ApplyImpl] 12 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m
[0;93m2024-07-24 15:39:00.405305975 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2024-07-24 15:39:00.405310589 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


In [23]:
# Print the names of the model's first two inputs; these are the keys of the feed dict passed to run().
print(speech_tokenizer_session.get_inputs()[0].name)
print(speech_tokenizer_session.get_inputs()[1].name)

feats
feats_length


In [24]:
# Display the features as a NumPy array (detached from autograd and moved to the CPU first).
feat.detach().cpu().numpy()

array([[[-1.5, -1.5, -1.5, ..., -1.5, -1.5, -1.5],
        [-1.5, -1.5, -1.5, ..., -1.5, -1.5, -1.5],
        [-1.5, -1.5, -1.5, ..., -1.5, -1.5, -1.5],
        ...,
        [-1.5, -1.5, -1.5, ..., -1.5, -1.5, -1.5],
        [-1.5, -1.5, -1.5, ..., -1.5, -1.5, -1.5],
        [-1.5, -1.5, -1.5, ..., -1.5, -1.5, -1.5]]], dtype=float32)

In [25]:
# Size of feat's third axis, wrapped in a one-element int32 array.
feat_length = np.array([feat.shape[2]], dtype=np.int32)

In [26]:
# Show the computed length as a sanity check.
print(f"feat_length: {feat_length}")

feat_length: [649]


In [27]:
# Run the tokenizer: the first model input receives the features as a NumPy array,
# the second receives the size of their third axis as a one-element int32 array.
# Only the first output is kept, flattened into a plain Python list.
speech_token = (
    speech_tokenizer_session.run(
        None,
        {
            speech_tokenizer_session.get_inputs()[0].name: feat.detach().cpu().numpy(),
            speech_tokenizer_session.get_inputs()[1].name: np.array([feat.shape[2]], dtype=np.int32),
        },
    )[0]
    .flatten()
    .tolist()
)

In [28]:
# Wrap the flat token list in a NumPy array.
token = np.array(speech_token)

In [29]:
# Number of speech tokens produced by the tokenizer run.
len(token)

325

In [30]:
# Rebind `feat` to an all-zero placeholder waveform of 207920 samples in a single row.
# NOTE: this replaces whatever `feat` held before (earlier cells assign a log-Mel spectrogram to it).
feat = torch.zeros(1, 207920)

In [48]:
from torch.nn import functional as F
def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=True):
    """Compute a log-compressed magnitude spectrogram, ``log(1 + |STFT(y)|)``.

    Args:
        y: Waveform tensor. The padding step inserts an axis at position 1 and
            removes it again, so a ``(batch, samples)`` layout is assumed —
            TODO confirm with callers.
        n_fft: FFT size.
        sampling_rate: Not used in the body; kept in the signature for callers.
        hop_size: Hop between successive frames, in samples.
        win_size: Length of the Hann window, in samples.
        center: Forwarded to ``torch.stft``. The signal is always reflect-padded
            here first, so ``center=True`` adds a second round of padding
            inside ``torch.stft``.

    Returns:
        Real tensor holding ``log1p`` of the one-sided STFT magnitudes.
    """
    # Warn (without failing) when the waveform leaves the [-1, 1] range.
    lowest = torch.min(y)
    highest = torch.max(y)
    if lowest < -1.0 or highest > 1.0:
        print(f"警告: 输入信号超出[-1, 1]范围. 最小值: {lowest}, 最大值: {highest}")

    # Hann window created with the same dtype and on the same device as the input.
    hann = torch.hann_window(win_size, dtype=y.dtype, device=y.device)

    # Reflect-pad both ends by half of (n_fft - hop_size).
    half_overlap = (n_fft - hop_size) // 2
    padded = F.pad(y.unsqueeze(1), (half_overlap, half_overlap), mode='reflect')
    padded = padded.squeeze(1)

    # Short-time Fourier transform; complex output, non-negative frequencies only.
    complex_spec = torch.stft(
        padded,
        n_fft=n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=hann,
        center=center,
        normalized=False,
        onesided=True,
        return_complex=True,
    )

    # log1p keeps the result finite (exactly 0) where the magnitude is 0.
    return torch.log1p(complex_spec.abs())
# Spectrogram of `feat` with a 2048-point FFT and window and a 640-sample hop;
# 32000 is passed as the sampling rate and torch.stft's own centring is disabled.
spec = spectrogram_torch(
    feat,
    n_fft=2048,
    sampling_rate=32000,
    hop_size=640,
    win_size=2048,
    center=False,
)
# Drop the leading axis when it has size 1, then report the resulting shape.
spec = spec.squeeze(0)
print(spec.shape)

torch.Size([1025, 325])


In [50]:
import torch
from rotary_embedding_torch import RotaryEmbedding

In [51]:
# Rotary-embedding helper constructed with dim=768.
rotary_emb = RotaryEmbedding(dim = 768)

In [64]:
# Apply the rotary embedding to an all-ones tensor of shape (1, 99999, 768) and print the output shape.
# NOTE(review): axes are presumably (batch, sequence, feature) — confirm against the library docs.
q = torch.ones(1, 99999, 768)
q = rotary_emb.rotate_queries_or_keys(q)
print(q.shape)

torch.Size([1, 99999, 768])


In [72]:
# Inspect the rotated vector at index 1 along the second axis of the first entry.
q[0][1]

tensor([-0.3012,  1.3818, -0.2683,  1.3885, -0.2361,  1.3944, -0.2046,  1.3993,
        -0.1737,  1.4035, -0.1434,  1.4069, -0.1138,  1.4096, -0.0849,  1.4117,
        -0.0566,  1.4131, -0.0289,  1.4139, -0.0019,  1.4142,  0.0245,  1.4140,
         0.0502,  1.4133,  0.0753,  1.4122,  0.0998,  1.4107,  0.1237,  1.4088,
         0.1470,  1.4066,  0.1697,  1.4040,  0.1918,  1.4012,  0.2133,  1.3980,
         0.2343,  1.3947,  0.2547,  1.3911,  0.2746,  1.3873,  0.2940,  1.3833,
         0.3128,  1.3792,  0.3312,  1.3749,  0.3491,  1.3705,  0.3664,  1.3659,
         0.3833,  1.3613,  0.3998,  1.3565,  0.4158,  1.3517,  0.4314,  1.3468,
         0.4465,  1.3419,  0.4613,  1.3369,  0.4756,  1.3318,  0.4895,  1.3268,
         0.5031,  1.3217,  0.5163,  1.3166,  0.5291,  1.3115,  0.5416,  1.3064,
         0.5537,  1.3013,  0.5655,  1.2962,  0.5769,  1.2912,  0.5881,  1.2861,
         0.5989,  1.2811,  0.6095,  1.2761,  0.6198,  1.2712,  0.6297,  1.2663,
         0.6394,  1.2614,  0.6489,  1.25

In [75]:
# Same experiment with a much shorter second axis (99 instead of 99999), then show the vector at index 1.
# NOTE(review): presumably checks that the rotation at a given index does not depend on the total length — confirm.
k = torch.ones(1, 99, 768)
k = rotary_emb.rotate_queries_or_keys(k)
print(k.shape)
k[0][1]

torch.Size([1, 99, 768])


tensor([-0.3012,  1.3818, -0.2683,  1.3885, -0.2361,  1.3944, -0.2046,  1.3993,
        -0.1737,  1.4035, -0.1434,  1.4069, -0.1138,  1.4096, -0.0849,  1.4117,
        -0.0566,  1.4131, -0.0289,  1.4139, -0.0019,  1.4142,  0.0245,  1.4140,
         0.0502,  1.4133,  0.0753,  1.4122,  0.0998,  1.4107,  0.1237,  1.4088,
         0.1470,  1.4066,  0.1697,  1.4040,  0.1918,  1.4012,  0.2133,  1.3980,
         0.2343,  1.3947,  0.2547,  1.3911,  0.2746,  1.3873,  0.2940,  1.3833,
         0.3128,  1.3792,  0.3312,  1.3749,  0.3491,  1.3705,  0.3664,  1.3659,
         0.3833,  1.3613,  0.3998,  1.3565,  0.4158,  1.3517,  0.4314,  1.3468,
         0.4465,  1.3419,  0.4613,  1.3369,  0.4756,  1.3318,  0.4895,  1.3268,
         0.5031,  1.3217,  0.5163,  1.3166,  0.5291,  1.3115,  0.5416,  1.3064,
         0.5537,  1.3013,  0.5655,  1.2962,  0.5769,  1.2912,  0.5881,  1.2861,
         0.5989,  1.2811,  0.6095,  1.2761,  0.6198,  1.2712,  0.6297,  1.2663,
         0.6394,  1.2614,  0.6489,  1.25