In [4]:
from Amphion.models.codec.ns3_codec import FACodecEncoder, FACodecDecoder
import torch

# Build the FACodec encoder/decoder pair. load_state_dict() below is strict by
# default, so these hyper-parameters must produce exactly the parameter shapes
# stored in the checkpoints.
fa_encoder = FACodecEncoder(
    ngf=32,
    up_ratios=[2, 4, 5, 5],  # the decoder below uses the same ratios in reverse order
    out_channels=256,
)

fa_decoder = FACodecDecoder(
    in_channels=256,
    upsample_initial_channel=1024,
    ngf=32,
    up_ratios=[5, 5, 4, 2],
    # Number of quantizer layers per ResidualVQ group (content / prosody / residual).
    vq_num_q_c=2,
    vq_num_q_p=1,
    vq_num_q_r=3,
    vq_dim=256,
    codebook_dim=8,
    # NOTE(review): the printed module shows Embedding(1024, 8) codebooks, so a
    # size of 10 is presumably a power-of-two exponent (2**10) - confirm.
    codebook_size_prosody=10,
    codebook_size_content=10,
    codebook_size_residual=10,
    use_gr_x_timbre=True,
    use_gr_residual_f0=True,
    use_gr_residual_phone=True,
)

encoder_ckpt = "pre_trained/ns3_facodec_encoder.bin"
decoder_ckpt = "pre_trained/ns3_facodec_decoder.bin"

# map_location="cpu" remaps tensors that were saved from a CUDA device, so the
# checkpoints also load on machines without a GPU. The models themselves stay
# on CPU in this notebook, so nothing else changes.
# NOTE(security): torch.load unpickles the file - only load checkpoints from a
# trusted source.
fa_encoder.load_state_dict(torch.load(encoder_ckpt, map_location="cpu"))
fa_decoder.load_state_dict(torch.load(decoder_ckpt, map_location="cpu"))

# Switch both modules to inference mode. The last expression returns the
# decoder module, which the notebook displays as the cell output.
fa_encoder.eval()
fa_decoder.eval()

FACodecDecoder(
  (quantizer): ModuleList(
    (0): ResidualVQ(
      (layers): ModuleList(
        (0): FactorizedVectorQuantize(
          (in_proj): Linear(in_features=256, out_features=8, bias=True)
          (out_proj): Linear(in_features=8, out_features=256, bias=True)
          (_codebook): Embedding(1024, 8)
        )
      )
    )
    (1): ResidualVQ(
      (layers): ModuleList(
        (0-1): 2 x FactorizedVectorQuantize(
          (in_proj): Linear(in_features=256, out_features=8, bias=True)
          (out_proj): Linear(in_features=8, out_features=256, bias=True)
          (_codebook): Embedding(1024, 8)
        )
      )
    )
    (2): ResidualVQ(
      (layers): ModuleList(
        (0-2): 3 x FactorizedVectorQuantize(
          (in_proj): Linear(in_features=256, out_features=8, bias=True)
          (out_proj): Linear(in_features=8, out_features=256, bias=True)
          (_codebook): Embedding(1024, 8)
        )
      )
    )
  )
  (model): Sequential(
    (0): Conv1d(256, 

In [5]:
import librosa
import soundfile as sf

In [10]:
test_wav_path = "models/resources/afrikaans1.mp3"
# librosa.load returns a (samples, sample_rate) pair; only the samples are kept.
test_wav = torch.from_numpy(librosa.load(test_wav_path, sr=16000)[0]).float()
# Prepend two singleton dimensions before handing the clip to the encoder.
test_wav = test_wav[None, None]

with torch.no_grad():

    # encoder output
    enc_out = fa_encoder(test_wav)
    print(enc_out.shape)

    # quantization path of the decoder; the third return value is not used here
    vq_post_emb, vq_id, _, quantized, spk_embs = fa_decoder(enc_out, eval_vq=False, vq=True)

    # Split the stacked code ids along dim 0:
    #   row 0     -> prosody code
    #   rows 1-2  -> content code (variable name kept as spelled in the notebook)
    #   rows 3+   -> residual code (acoustic detail codes)
    prosody_code = vq_id[0:1]
    cotent_code = vq_id[1:3]
    residual_code = vq_id[3:]

    # latent after quantization
    print(vq_post_emb.shape)

    # code ids and their per-attribute slices
    print("vq id shape:", vq_id.shape)
    print("prosody code shape:", prosody_code.shape)
    print("content code shape:", cotent_code.shape)
    print("residual code shape:", residual_code.shape)

    # speaker embedding
    print("speaker embedding shape:", spk_embs.shape)

    # decode (recommended): rebuild the waveform from the quantized latent and
    # the speaker embedding, then write the first item/channel as 16 kHz audio
    recon_wav = fa_decoder.inference(vq_post_emb, spk_embs)
    print(recon_wav.shape)
    sf.write("recon.wav", recon_wav[0, 0].cpu().numpy(), 16000)

torch.Size([1, 256, 1662])
torch.Size([1, 256, 1662])
vq id shape: torch.Size([6, 1, 1662])
prosody code shape: torch.Size([1, 1, 1662])
content code shape: torch.Size([2, 1, 1662])
residual code shape: torch.Size([3, 1, 1662])
speaker embedding shape: torch.Size([1, 256])
torch.Size([1, 1, 332400])


In [11]:
def _load_wav_16k(path):
    """Load the audio file at ``path`` and return it as a float tensor.

    The file is read with ``librosa.load(path, sr=16000)``; the sample array
    (first element of the returned pair) is wrapped in a torch tensor, cast
    to float, and given two leading singleton dimensions.
    """
    samples = librosa.load(path, sr=16000)[0]
    return torch.from_numpy(samples).float().unsqueeze(0).unsqueeze(0)


# Three clips used by the later cells.
wav_a = _load_wav_16k("models/resources/japanese12.mp3")
wav_b = _load_wav_16k("models/resources/a1m5.wav")
wav_c = _load_wav_16k("models/resources/albanian3.mp3")

In [12]:
import numpy as np

from Amphion.models.codec.ns3_codec import FACodecRedecoder

fa_redecoder = FACodecRedecoder()

redecoder_ckpt = "pre_trained/ns3_facodec_redecoder.bin"

# map_location="cpu" lets a checkpoint saved from a CUDA device load on a
# CPU-only machine (the model is kept on CPU in this notebook).
# NOTE(security): torch.load unpickles the file - only load trusted checkpoints.
fa_redecoder.load_state_dict(torch.load(redecoder_ckpt, map_location="cpu"))
# Inference only: put the module in eval mode, like the encoder and decoder.
fa_redecoder.eval()

with torch.no_grad():
    # Encode the three clips.
    enc_out_a = fa_encoder(wav_a)
    enc_out_b = fa_encoder(wav_b)
    enc_out_c = fa_encoder(wav_c)

    # Quantize each encoding; the last return value is the speaker embedding.
    vq_post_emb_a, vq_id_a, _, quantized_a, spk_embs_a = fa_decoder(enc_out_a, eval_vq=False, vq=True)
    vq_post_emb_b, vq_id_b, _, quantized_b, spk_embs_b = fa_decoder(enc_out_b, eval_vq=False, vq=True)
    vq_post_emb_c, vq_id_c, _, quantized_c, spk_embs_c = fa_decoder(enc_out_c, eval_vq=False, vq=True)

    # Print the norm (np.linalg.norm default) of each speaker embedding.
    # The explicit .cpu().numpy() avoids relying on the implicit tensor -> ndarray
    # conversion, which only works for CPU tensors.
    print(np.linalg.norm(spk_embs_a.cpu().numpy()))
    print(np.linalg.norm(spk_embs_b.cpu().numpy()))
    print(np.linalg.norm(spk_embs_c.cpu().numpy()))

    # convert speaker (currently disabled)
    # NOTE(review): the disabled call passes spk_embs_a to vq2emb but spk_embs_b
    # to inference - confirm which speaker embedding vq2emb should receive
    # before re-enabling.
    # vq_post_emb_a_to_b = fa_redecoder.vq2emb(vq_id_a, spk_embs_a, use_residual=False)
    # recon_wav_a_to_b = fa_redecoder.inference(vq_post_emb_a_to_b, spk_embs_b)

    # sf.write("recon_a_to_b.wav", recon_wav_a_to_b[0][0].cpu().numpy(), 16000)

4.605275
4.6856565
4.633703


In [None]:
# Raw string: the original literal contained the invalid escape "\P", which
# newer Python versions warn about. The resulting path is unchanged.
# NOTE(review): machine-specific absolute path; presumably this is the
# recon.wav written by the earlier decode cell - confirm and prefer a
# relative path.
wav_d = r"E:\Programming Projects\recon.wav"
wav_d = librosa.load(wav_d, sr=16000)[0]
wav_d = torch.from_numpy(wav_d).float()
wav_d = wav_d.unsqueeze(0).unsqueeze(0)
with torch.no_grad():
    enc_out_d = fa_encoder(wav_d)

    vq_post_emb_d, vq_id_d, _, quantized_d, spk_embs_d = fa_decoder(enc_out_d, eval_vq=False, vq=True)
    # Norm of this clip's speaker embedding. `np` is the numpy alias imported by
    # an earlier cell. The explicit conversion avoids relying on the implicit
    # tensor -> ndarray path.
    print(np.linalg.norm(spk_embs_d.cpu().numpy()))

4.6276045


In [None]:
# Dimensions of the first value returned by fa_decoder(enc_out_a, eval_vq=False, vq=True).
vq_post_emb_a.size()

torch.Size([1, 256, 2303])

In [None]:
# Dimensions of the code ids (second value returned by fa_decoder for clip a).
vq_id_a.size()

torch.Size([6, 1, 2303])

In [None]:
# Dimensions of the first entry of `quantized_a` (fourth value returned by fa_decoder for clip a).
quantized_a[0].size()

torch.Size([1, 256, 2303])

In [None]:
# Number of entries in `quantized_a` (fourth value returned by fa_decoder for clip a).
len(quantized_a)

3

In [None]:
# Dimensions of the speaker embedding (last value returned by fa_decoder for clip a).
spk_embs_a.size()

torch.Size([1, 256])

In [None]:
import torch
import torch.nn as nn

# Minimal nn.Embedding lookup demo.
# nn.Embedding initialises its weight table randomly; seeding the global torch
# RNG immediately beforehand makes the printed values reproducible on re-run.
torch.manual_seed(0)

# Assuming a vocabulary size of 10000 and an embedding dimension of 300
embedding_layer = nn.Embedding(num_embeddings=10000, embedding_dim=300)

# Example input: indices for the words in a sentence
input_ids = torch.tensor([1, 2, 4, 5, 987], dtype=torch.long)

# Lookup the embeddings for these indices: one row of the table per index,
# giving a (len(input_ids), embedding_dim) tensor.
embeddings = embedding_layer(input_ids)

print(embeddings)

tensor([[ 1.8703, -0.3206, -0.7072,  ...,  0.5520, -1.4492,  0.2872],
        [ 1.4557,  0.3330, -0.1581,  ..., -1.5206, -0.7678,  1.2498],
        [ 0.3099,  0.1577,  0.3483,  ..., -0.3485,  0.6334, -0.4492],
        [ 0.3130, -0.0395, -0.8988,  ...,  1.5833,  0.1798, -0.1109],
        [ 1.1139, -0.7131, -0.7881,  ...,  0.9772,  0.0831,  0.6960]],
       grad_fn=<EmbeddingBackward0>)
