In [13]:
import torch

from architectures.gru_seq2seq_bidirectional_enc import Builder as AudioVAEBuilder
from architectures.gru_seq2seq_bidirectional_enc import Wrapper as AudioVAEWrapper
from architectures.linear_attributes_autoencoder import Builder as AttrVAEBuilder
from readers import AudioReader

# Model Definition

In [14]:
# Attribute VAE. Judging by the module repr shown for the next cell, the first
# positional argument (128) is the latent width and the list gives the
# successive layer widths (32 -> 128 -> 512) -- TODO confirm against
# the Builder.build signature.
attr_vae = AttrVAEBuilder().build(128, [32, 128, 512])

In [15]:
# Echo the model as the cell's last expression so its layer structure is
# rendered as the cell output.
attr_vae

ImageVAE(
  (encoder): LinearEncoder(
    (layers): Sequential(
      (0): Sequential(
        (0): Linear(in_features=32, out_features=128, bias=True)
        (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
      (1): Sequential(
        (0): Linear(in_features=128, out_features=512, bias=True)
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
    )
    (mu_proj): Linear(in_features=512, out_features=128, bias=True)
    (logvar_proj): Linear(in_features=512, out_features=128, bias=True)
  )
  (decoder): LinearDecoder(
    (latent_proj): Linear(in_features=128, out_features=512, bias=True)
    (decoder_blocks): Sequential(
      (0): Sequential(
        (0): Linear(in_features=512, out_features=128, bias=True)
        (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplac

In [31]:
# Sequence VAE for audio. context_length (944) and embedding_dim (2050) equal
# the last two dimensions of the (1, 944, 2050) tensor printed by the
# forward-pass check further down; latent_dim matches the 128-wide projections
# in the printed module repr.
builder = AudioVAEBuilder()
audio_model = builder.build(
    latent_dim=128,
    num_layers=1,
    context_length=944,
    embedding_dim=2050,
)

In [32]:
# Wrap the audio VAE in the project's Wrapper class.
# NOTE(review): the wrapper is created before the model is moved to the GPU;
# presumably it holds a reference to the same module object, so the later
# device move is visible through it -- confirm against Wrapper.__init__.
audio_wrapper = AudioVAEWrapper(audio_model)

# Data Preparation

In [33]:
from utils.audio import (
    concat_FT,
    reverse_FT
)

In [34]:
# Short-time Fourier transform settings handed to AudioReader.
# window_shift is exactly half of window_size. The 1025-wide last dimension of
# the tensors the reader yields equals window_size // 2 + 1, which is consistent
# with a one-sided spectrum -- TODO confirm in readers.AudioReader.
# 'fs' is presumably the sample rate in Hz and 'type' the window function name;
# verify against the reader.
fourier_params = dict(
    fs=16000,
    window_size=2048,
    window_shift=1024,
    type="hamming",
)

In [21]:
# Build the audio dataset from the Fourier settings defined above.
# NOTE(review): this cell's execution counter (21) is lower than the counter of
# the cell that defines fourier_params (34), so the dataset held in the kernel
# may have been built from an earlier version of that dict. Re-run after a
# kernel restart to confirm the notebook works top to bottom.
dataset = AudioReader(fourier_params)

In [35]:
# Peek at the first item of the dataset to check the tensor layout.
# next(iter(...)) takes exactly one element and raises StopIteration if the
# dataset is empty, whereas a for/break loop would silently do nothing and
# leave `audio` unbound (or bound to a stale value from an earlier run).
audio = next(iter(dataset))
print(audio.shape)

torch.Size([1, 944, 1025])


In [36]:
# Move the model to the GPU when one is available and stay on the CPU
# otherwise, so the notebook still runs on machines without CUDA.
# The call is the cell's last expression, so the module repr is still shown
# as the cell output.
audio_model.to("cuda" if torch.cuda.is_available() else "cpu")

ImageVAE(
  (encoder): BidirectionalEncoder(
    (encoder): GRU(2050, 2050, batch_first=True, bidirectional=True)
    (mu_proj): Sequential(
      (0): Linear(in_features=4100, out_features=128, bias=True)
    )
    (sigma_proj): Sequential(
      (0): Linear(in_features=4100, out_features=128, bias=True)
    )
  )
  (decoder): AutoregressiveDecoder(
    (proj_h): Linear(in_features=128, out_features=2050, bias=True)
    (decoder): GRU(2050, 2050, batch_first=True)
  )
)

In [37]:
# Smoke-test a single forward pass and compare input and output shapes.
# Only the first dataset item is used; next(iter(...)) raises StopIteration on
# an empty dataset instead of silently skipping the check.
audio = next(iter(dataset))

# Put the input on whatever device the model's parameters live on, so the
# check cannot fail with a device mismatch.
X = concat_FT(audio).to(next(audio_model.parameters()).device)

# This is a shape check only: disable autograd so no graph is recorded across
# the recurrent steps and no activations are kept for a backward pass.
with torch.no_grad():
    output = audio_model(X)

# The model returns an indexable result; its first element is compared to the
# input shape.
print(X.shape, output[0].shape)

torch.Size([1, 944, 2050]) torch.Size([1, 944, 2050])
