In [1]:
from Amphion.models.codec.ns3_codec import FACodecEncoder, FACodecDecoder
import torch

fa_encoder = FACodecEncoder(
    ngf=32,
    up_ratios=[2, 4, 5, 5],
    out_channels=256,
)

fa_decoder = FACodecDecoder(
    in_channels=256,
    upsample_initial_channel=1024,
    ngf=32,
    up_ratios=[5, 5, 4, 2],
    vq_num_q_c=2,
    vq_num_q_p=1,
    vq_num_q_r=3,
    vq_dim=256,
    codebook_dim=8,
    codebook_size_prosody=10,
    codebook_size_content=10,
    codebook_size_residual=10,
    use_gr_x_timbre=True,
    use_gr_residual_f0=True,
    use_gr_residual_phone=True,
)

encoder_ckpt = "pre_trained/ns3_facodec_encoder.bin"
decoder_ckpt = "pre_trained/ns3_facodec_decoder.bin"

fa_encoder.load_state_dict(torch.load(encoder_ckpt))
fa_decoder.load_state_dict(torch.load(decoder_ckpt))

fa_encoder.eval()
fa_decoder.eval()

FACodecDecoder(
  (quantizer): ModuleList(
    (0): ResidualVQ(
      (layers): ModuleList(
        (0): FactorizedVectorQuantize(
          (in_proj): Linear(in_features=256, out_features=8, bias=True)
          (out_proj): Linear(in_features=8, out_features=256, bias=True)
          (_codebook): Embedding(1024, 8)
        )
      )
    )
    (1): ResidualVQ(
      (layers): ModuleList(
        (0-1): 2 x FactorizedVectorQuantize(
          (in_proj): Linear(in_features=256, out_features=8, bias=True)
          (out_proj): Linear(in_features=8, out_features=256, bias=True)
          (_codebook): Embedding(1024, 8)
        )
      )
    )
    (2): ResidualVQ(
      (layers): ModuleList(
        (0-2): 3 x FactorizedVectorQuantize(
          (in_proj): Linear(in_features=256, out_features=8, bias=True)
          (out_proj): Linear(in_features=8, out_features=256, bias=True)
          (_codebook): Embedding(1024, 8)
        )
      )
    )
  )
  (model): Sequential(
    (0): Conv1d(256, 

In [2]:
import librosa
import soundfile as sf

In [4]:
test_wav_path = "recordings//afrikaans1.mp3"
test_wav = librosa.load(test_wav_path, sr=16000)[0]
test_wav = torch.from_numpy(test_wav).float()
test_wav = test_wav.unsqueeze(0).unsqueeze(0)

with torch.no_grad():

    # encode
    enc_out = fa_encoder(test_wav)
    print(enc_out.shape)

    # quantize
    vq_post_emb, vq_id, _, quantized, spk_embs = fa_decoder(enc_out, eval_vq=False, vq=True)
    
    # latent after quantization
    print(vq_post_emb.shape)
    
    # codes
    print("vq id shape:", vq_id.shape)
    
    # get prosody code
    prosody_code = vq_id[:1]
    print("prosody code shape:", prosody_code.shape)
    
    # get content code
    cotent_code = vq_id[1:3]
    print("content code shape:", cotent_code.shape)
    
    # get residual code (acoustic detail codes)
    residual_code = vq_id[3:]
    print("residual code shape:", residual_code.shape)
    
    # speaker embedding
    print("speaker embedding shape:", spk_embs.shape)

    # decode (recommand)
    recon_wav = fa_decoder.inference(vq_post_emb, spk_embs)
    print(recon_wav.shape)
    sf.write("recon.wav", recon_wav[0][0].cpu().numpy(), 16000)

torch.Size([1, 256, 1662])
torch.Size([1, 256, 1662])
vq id shape: torch.Size([6, 1, 1662])
prosody code shape: torch.Size([1, 1, 1662])
content code shape: torch.Size([2, 1, 1662])
residual code shape: torch.Size([3, 1, 1662])
speaker embedding shape: torch.Size([1, 256])
torch.Size([1, 1, 332400])


In [5]:
wav_a = "recordings/japanese12.mp3"
wav_a = librosa.load(wav_a, sr=16000)[0]
wav_a = torch.from_numpy(wav_a).float()
wav_a = wav_a.unsqueeze(0).unsqueeze(0)

wav_b = "recordings/afrikaans1.mp3"
wav_b = librosa.load(wav_b, sr=16000)[0]
wav_b = torch.from_numpy(wav_b).float()
wav_b = wav_b.unsqueeze(0).unsqueeze(0)

wav_c = "recordings/albanian3.mp3"
wav_c = librosa.load(wav_c, sr=16000)[0]
wav_c = torch.from_numpy(wav_c).float()
wav_c = wav_c.unsqueeze(0).unsqueeze(0)

In [6]:
from Amphion.models.codec.ns3_codec import FACodecRedecoder

fa_redecoder = FACodecRedecoder()

redecoder_ckpt = "pre_trained/ns3_facodec_redecoder.bin"

fa_redecoder.load_state_dict(torch.load(redecoder_ckpt))

with torch.no_grad():
    enc_out_a = fa_encoder(wav_a)
    enc_out_b = fa_encoder(wav_b)
    enc_out_c = fa_encoder(wav_c)

    vq_post_emb_a, vq_id_a, _, quantized_a, spk_embs_a = fa_decoder(enc_out_a, eval_vq=False, vq=True)
    vq_post_emb_b, vq_id_b, _, quantized_b, spk_embs_b = fa_decoder(enc_out_b, eval_vq=False, vq=True)
    vq_post_emb_c, vq_id_c, _, quantized_c, spk_embs_c = fa_decoder(enc_out_c, eval_vq=False, vq=True)

    # convert speaker
    import numpy as np
    print("a", np.linalg.norm(spk_embs_a))
    print("b", np.linalg.norm(spk_embs_b))
    print("c", np.linalg.norm(spk_embs_c))
    print("========")
    print("a-b", np.linalg.norm(0.5*(spk_embs_a+spk_embs_b)))
    print("b-c", np.linalg.norm(0.5*(spk_embs_b+spk_embs_c)))
    print("a-c", np.linalg.norm(0.5*(spk_embs_a+spk_embs_c)))
    vq_post_emb_a_to_b = fa_redecoder.vq2emb(vq_id_a, 0.5*(spk_embs_a+spk_embs_b), use_residual=False)
    recon_wav_a_to_b = fa_redecoder.inference(vq_post_emb_a_to_b, 0.5*(spk_embs_a+spk_embs_b))
    sf.write("recon_mix_0.wav", recon_wav_a_to_b[0][0].cpu().numpy(), 16000)

    vq_post_emb_a_to_b = fa_redecoder.vq2emb(vq_id_a, 0.5*(spk_embs_c+spk_embs_b), use_residual=False)
    recon_wav_a_to_b = fa_redecoder.inference(vq_post_emb_a_to_b, 0.5*(spk_embs_c+spk_embs_b))
    sf.write("recon_mix_1.wav", recon_wav_a_to_b[0][0].cpu().numpy(), 16000)

    vq_post_emb_a_to_b = fa_redecoder.vq2emb(vq_id_a, 0.5*(spk_embs_a+spk_embs_c), use_residual=False)
    recon_wav_a_to_b = fa_redecoder.inference(vq_post_emb_a_to_b, 0.5*(spk_embs_a+spk_embs_c))
    sf.write("recon_mix_2.wav", recon_wav_a_to_b[0][0].cpu().numpy(), 16000)

a 4.6099305
b 4.61502
c 4.6312647
a-b tensor([[-2.9740e-02, -4.1889e-02,  4.6665e-02,  1.1185e-02, -3.3905e-03,
          2.6135e-02,  8.3394e-03, -1.2767e-02,  1.5883e-02,  4.4698e-02,
         -8.2097e-02, -1.1117e-01, -2.2878e-02,  1.3188e-02, -5.4757e-02,
         -7.1202e-02, -5.1012e-02,  1.2391e-02, -5.9583e-02, -6.2296e-02,
          7.0550e-02, -2.9883e-02,  8.5506e-03,  9.0103e-02, -1.7014e-01,
         -5.2118e-02,  2.8445e-02,  5.9354e-02,  8.4130e-02, -5.7961e-02,
          4.1223e-02,  2.5807e-02,  1.6551e-02,  1.1494e-02,  1.4469e-02,
         -7.8633e-02, -2.1798e-02, -3.2687e-02,  6.2329e-02, -2.2057e-02,
         -7.3358e-02,  1.0057e-01,  2.4521e-02,  1.4519e-01, -8.5477e-03,
          1.2722e-01,  3.7495e-02,  2.2706e-02,  3.6226e-02,  5.4672e-02,
         -4.0516e-02, -4.8729e-02,  1.9640e-02,  2.7480e-01, -6.1631e-02,
         -6.0146e-03,  8.0231e-02, -5.4857e-02, -9.1438e-03,  7.1098e-02,
         -8.9541e-02, -2.7011e-03,  1.7345e-02,  6.7650e-02,  4.9921e-02,


In [7]:
vq_post_emb_a.shape

torch.Size([1, 256, 2303])

In [8]:
vq_id_a.shape

torch.Size([6, 1, 2303])

In [9]:
quantized_a[0].shape

torch.Size([1, 256, 2303])

In [10]:
len(quantized_a)

3

In [11]:
spk_embs_a.shape

torch.Size([1, 256])

In [12]:
import torch
import torch.nn as nn

# Assuming a vocabulary size of 10000 and an embedding dimension of 300
embedding_layer = nn.Embedding(num_embeddings=10000, embedding_dim=300)

# Example input: indices for the words in a sentence
input_ids = torch.tensor([1, 2, 4, 5, 987], dtype=torch.long)

# Lookup the embeddings for these indices
embeddings = embedding_layer(input_ids)

print(embeddings)

tensor([[ 0.3080,  0.9916,  1.0956,  ...,  0.9795,  1.3352, -0.9671],
        [-0.5183, -0.9238, -1.2977,  ..., -0.3542, -0.5446, -0.2042],
        [ 1.8300,  0.6884, -0.8165,  ...,  1.6946,  2.4258, -0.3433],
        [-1.3175, -1.5950,  0.0354,  ...,  0.6821,  0.2656,  1.8985],
        [-0.1071,  1.4058, -0.8028,  ...,  1.2904, -0.5537, -0.9080]],
       grad_fn=<EmbeddingBackward0>)
