In [1]:
import torch
import torchaudio

In [2]:
from safetensors.torch import load_file
# Read every tensor stored in the voice-encoder checkpoint into host memory.
loaded_tensors = load_file("weights/ve.safetensors", device="cpu")

In [3]:
# Inspect the container type returned by load_file (displayed as the cell output).
type(loaded_tensors)

dict

In [4]:
# List each checkpoint entry next to its tensor shape so the layer sizes
# needed to rebuild the model can be read off.
for name, tensor in loaded_tensors.items():
    print(name, tensor.shape)

lstm.bias_hh_l0 torch.Size([1024])
lstm.bias_hh_l1 torch.Size([1024])
lstm.bias_hh_l2 torch.Size([1024])
lstm.bias_ih_l0 torch.Size([1024])
lstm.bias_ih_l1 torch.Size([1024])
lstm.bias_ih_l2 torch.Size([1024])
lstm.weight_hh_l0 torch.Size([1024, 256])
lstm.weight_hh_l1 torch.Size([1024, 256])
lstm.weight_hh_l2 torch.Size([1024, 256])
lstm.weight_ih_l0 torch.Size([1024, 40])
lstm.weight_ih_l1 torch.Size([1024, 256])
lstm.weight_ih_l2 torch.Size([1024, 256])
proj.bias torch.Size([256])
proj.weight torch.Size([256, 256])
similarity_bias torch.Size([1])
similarity_weight torch.Size([1])


In [5]:
# Spot-check the shape of a single checkpoint entry.
loaded_tensors["lstm.bias_hh_l0"].size()

torch.Size([1024])

In [6]:
class VEModel(torch.nn.Module):
    """Voice encoder: stacked LSTM -> linear projection -> unit-length embedding.

    The attribute names (``lstm``, ``proj``, ``similarity_weight``,
    ``similarity_bias``) are chosen to match the keys of the checkpoint
    state dict so that ``load_state_dict`` can map every entry.

    Args:
        input_size: Number of features per input frame.
        hidden_size: Width of the LSTM hidden state; also the size of the
            returned embedding.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size=40, hidden_size=256, num_layers=3):
        super().__init__()
        self.lstm = torch.nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.proj = torch.nn.Linear(hidden_size, hidden_size)

        # Not used by forward(); registered so the checkpoint's
        # `similarity_weight` / `similarity_bias` entries have a destination.
        self.similarity_weight = torch.nn.Parameter(torch.ones(1))
        self.similarity_bias = torch.nn.Parameter(torch.zeros(1))

    def forward(self, x):
        """Embed a batch of frame sequences.

        Args:
            x: Tensor laid out as (batch, frames, input_size); the LSTM is
                built with ``batch_first=True``.

        Returns:
            Tensor of shape (batch, hidden_size) whose rows are L2-normalised.
            A row whose projection is exactly zero is returned as zeros
            rather than NaN.
        """
        # hn stacks the final hidden state of every layer; keep the top layer.
        _, (hn, _) = self.lstm(x)
        out = self.proj(hn[-1])

        # normalize() divides by max(norm, eps), so a zero-norm projection
        # cannot trigger a 0/0 division the way a plain `out / norm` would.
        return torch.nn.functional.normalize(out, p=2, dim=-1)

In [7]:
# Build the encoder with its hyper-parameters spelled out; these are the
# same values as the constructor defaults.
ve_model = VEModel(input_size=40, hidden_size=256, num_layers=3)

In [8]:
# Print the module tree to check which layers were instantiated and their sizes.
print(ve_model)

VEModel(
  (lstm): LSTM(40, 256, num_layers=3, batch_first=True)
  (proj): Linear(in_features=256, out_features=256, bias=True)
)


In [9]:
# strict=True makes a missing or unexpected checkpoint key raise immediately.
# With strict=False a mismatched file would be accepted silently and leave
# the affected layers at their random initialisation.
ve_model.load_state_dict(loaded_tensors, strict=True)

<All keys matched successfully>

In [10]:
import librosa
import numpy as np

def preprocess_audio(
    audio_path,
    n_mels=40,
    sample_rate=16000,
    n_fft=1024,
    hop_length=256,
    win_length=1024,
    top_db=20,
    log_floor=1e-5,
):
    """Load an audio file and convert it to a batched log-mel tensor.

    Args:
        audio_path: Path of the audio file to read.
        n_mels: Number of mel bands; becomes the last dimension of the result.
        sample_rate: Rate the audio is resampled to on load.
        n_fft: FFT size passed to the mel spectrogram.
        hop_length: Hop between successive frames, in samples.
        win_length: Analysis window length, in samples.
        top_db: Threshold (dB below peak) used to trim leading/trailing silence.
        log_floor: Lower clamp applied before the logarithm so that empty
            bins do not produce ``-inf``.

    Returns:
        ``torch.float32`` tensor of shape (1, frames, n_mels).

    Raises:
        ValueError: If no samples remain after silence trimming.
    """
    # NOTE(review): the STFT/mel settings should match whatever front-end the
    # checkpoint was trained with - confirm against the model's original config.
    wav, sr = librosa.load(audio_path, sr=sample_rate)

    wav, _ = librosa.effects.trim(wav, top_db=top_db)
    if wav.size == 0:
        raise ValueError(
            f"No audio samples left in {audio_path!r} after silence trimming"
        )

    mel = librosa.feature.melspectrogram(
        y=wav,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        n_mels=n_mels,
    )

    # Clamp, take log10, then transpose (n_mels, frames) -> (frames, n_mels).
    log_mel = np.log10(np.maximum(mel, log_floor)).T

    # torch.tensor with an explicit dtype replaces the legacy FloatTensor
    # constructor; unsqueeze adds the leading batch dimension.
    return torch.tensor(log_mel, dtype=torch.float32).unsqueeze(0)

In [12]:
input_tensor = preprocess_audio("sample/sample.wav")

# Inference only: put the module in eval mode and skip autograd bookkeeping.
ve_model.eval()
with torch.no_grad():
    embedding = ve_model(input_tensor)

# VEModel.forward already returns L2-normalised rows, so the result is not
# normalised a second time here. Check the result instead of assuming it.
assert embedding.shape == (1, 256), f"unexpected embedding shape {tuple(embedding.shape)}"
assert torch.isfinite(embedding).all(), "embedding contains NaN or inf"

print(f"Speaker DNA (256-dim vector): {embedding}")

Speaker DNA (256-dim vector): tensor([[-0.0920, -0.0409, -0.0213, -0.0884,  0.0018, -0.0564,  0.0012,  0.0429,
         -0.0365, -0.0421,  0.0191, -0.0889, -0.0038, -0.0878, -0.1047, -0.0648,
         -0.0010, -0.0142, -0.0020,  0.0093, -0.0525, -0.0171,  0.1558, -0.0247,
         -0.0499,  0.0154,  0.0981, -0.0278, -0.0942, -0.1994, -0.0215, -0.0062,
          0.0211, -0.1289, -0.0434,  0.0198,  0.0118, -0.0519, -0.0736, -0.0343,
         -0.2248,  0.0456,  0.0174, -0.0116,  0.0412, -0.0210, -0.0369,  0.0328,
          0.0260, -0.0350, -0.0192, -0.0675, -0.1081, -0.0028, -0.0546, -0.0173,
         -0.0017, -0.1323, -0.0889, -0.0315,  0.0080, -0.0858, -0.0128,  0.0197,
         -0.0681,  0.0647, -0.1134,  0.0411, -0.0110, -0.0457, -0.0068, -0.0290,
         -0.0817,  0.0423,  0.1000,  0.0280,  0.0079,  0.0535,  0.0723, -0.0541,
         -0.0459, -0.0193, -0.0040, -0.0851, -0.0476, -0.1243, -0.0868, -0.0052,
         -0.0846,  0.0426,  0.0500, -0.0058, -0.0617, -0.0701, -0.0170, -0.0426

In [13]:
# NOTE(review): this message is printed unconditionally; nothing in this cell
# checks the embedding, so it only shows that the earlier cells ran.
print("Speaker embedding extraction model works correctly!")

Speaker embedding extraction model works correctly!
