In [1]:
from architectures.linear_attributes_autoencoder import Builder as AttrVAEBuilder

from architectures.gru_seq2seq_bidirectional_enc import Builder as AudioVAEBuilder
from architectures.gru_seq2seq_bidirectional_enc import Wrapper as AudioVAEWrapper

from readers import AudioReader

In [2]:
import torch
import torch.nn as nn

# Model Definition

In [3]:
# Attribute VAE. Going by the module printed in the next cell, 128 is the latent
# width and the list gives the encoder layer widths (32 -> 128 -> 512).
attr_vae = AttrVAEBuilder().build(128, [32, 128, 512])

In [4]:
# Echo the module tree to check the layer sizes produced by the builder.
attr_vae

ImageVAE(
  (encoder): LinearEncoder(
    (layers): Sequential(
      (0): Sequential(
        (0): Linear(in_features=32, out_features=128, bias=True)
        (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
      (1): Sequential(
        (0): Linear(in_features=128, out_features=512, bias=True)
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
    )
    (mu_proj): Linear(in_features=512, out_features=128, bias=True)
    (logvar_proj): Linear(in_features=512, out_features=128, bias=True)
  )
  (decoder): LinearDecoder(
    (latent_proj): Linear(in_features=128, out_features=512, bias=True)
    (decoder_blocks): Sequential(
      (0): Sequential(
        (0): Linear(in_features=512, out_features=128, bias=True)
        (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplac

In [5]:
# Audio VAE sized for the tensors fed to it later in this notebook:
# 2050 features per frame and 944 frames per clip.
builder = AudioVAEBuilder()
audio_model = builder.build(embedding_dim=2050, latent_dim=128, context_length=944, num_layers=1)

In [6]:
# Wrap the audio VAE; the wrapper is the object called in the forward-pass check
# and in the training loop.
audio_wrapper = AudioVAEWrapper(audio_model)

# Data preparation

In [41]:
from utils.audio import (
    concat_FT,
    reverse_FT
)

In [8]:
# STFT settings shared by the dataset reader and by the inverse transform used for playback.
fourier_params = dict(
    fs=16000,
    window_size=2048,
    window_shift=1024,
    type="hamming",
)

In [9]:
# Dataset reader configured with the STFT settings above; iterating it yields tensors.
dataset = AudioReader(fourier_params)

In [10]:
# Peek at the first item only, to inspect the shape the reader yields.
for audio in dataset:
    print(audio.shape)
    break

torch.Size([1, 944, 1025])


In [11]:
# Move the audio model to the GPU before any forward pass; the bare call also
# echoes the module tree as cell output.
audio_model.cuda()

ImageVAE(
  (encoder): BidirectionalEncoder(
    (encoder): GRU(2050, 2050, batch_first=True, bidirectional=True)
    (mu_proj): Sequential(
      (0): Linear(in_features=4100, out_features=128, bias=True)
    )
    (sigma_proj): Sequential(
      (0): Linear(in_features=4100, out_features=128, bias=True)
    )
  )
  (decoder): AutoregressiveDecoder(
    (proj_h): Linear(in_features=128, out_features=2050, bias=True)
    (decoder): GRU(2050, 2050, batch_first=True)
  )
)

In [12]:
# Forward-pass smoke test on the first item: input and reconstruction shapes
# should match. Run under no_grad because nothing here is back-propagated, so
# there is no reason to keep an autograd graph for the whole sequence.
with torch.no_grad():
    for audio in dataset:
        X = concat_FT(audio).cuda()
        output = audio_wrapper(X)
        print(X.shape, output[0].shape)
        break

torch.Size([1, 944, 2050]) torch.Size([1, 944, 2050])


# Training

In [21]:
# Reconstruction objective and optimiser for the wrapped audio VAE.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(
    audio_wrapper.parameters(),
    lr=1e-3,
    betas=(0.5, 0.999),
    weight_decay=1e-5,
)

In [22]:
# Number of passes over the dataset made by the training loop.
epochs = 10

In [23]:
# Put the model in training mode ahead of the optimisation loop.
audio_model.train()

ImageVAE(
  (encoder): BidirectionalEncoder(
    (encoder): GRU(2050, 2050, batch_first=True, bidirectional=True)
    (mu_proj): Sequential(
      (0): Linear(in_features=4100, out_features=128, bias=True)
    )
    (sigma_proj): Sequential(
      (0): Linear(in_features=4100, out_features=128, bias=True)
    )
  )
  (decoder): AutoregressiveDecoder(
    (proj_h): Linear(in_features=128, out_features=2050, bias=True)
    (decoder): GRU(2050, 2050, batch_first=True)
  )
)

In [26]:
# Train for `epochs` passes over the dataset, printing the mean per-item MSE
# after each pass.
for epoch in range(epochs):
    # Running sum of per-item losses; reset here at the start of every epoch,
    # so no second reset is needed at the bottom of the loop.
    total_loss = 0
    for audio in dataset:
        optimizer.zero_grad()
        audio = concat_FT(audio).cuda()

        # Only the first of the wrapper's three outputs feeds the loss.
        # NOTE(review): the two discarded outputs look like the latent
        # statistics; with no KL term this trains a plain autoencoder --
        # confirm that is intended.
        audio_pred, _, _ = audio_wrapper(audio)
        loss = criterion(audio_pred, audio)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    print(f'Epoch: {epoch}, Loss: {total_loss/len(dataset)}')

Epoch: 0, Loss: 2.0337943602726228e-05


KeyboardInterrupt: 

# Eval

In [54]:
# Switch to evaluation mode before reconstructing audio for playback.
audio_model.eval()

ImageVAE(
  (encoder): BidirectionalEncoder(
    (encoder): GRU(2050, 2050, batch_first=True, bidirectional=True)
    (mu_proj): Sequential(
      (0): Linear(in_features=4100, out_features=128, bias=True)
    )
    (sigma_proj): Sequential(
      (0): Linear(in_features=4100, out_features=128, bias=True)
    )
  )
  (decoder): AutoregressiveDecoder(
    (proj_h): Linear(in_features=128, out_features=2050, bias=True)
    (decoder): GRU(2050, 2050, batch_first=True)
  )
)

In [48]:
import torch
import numpy as np
from scipy import signal

def get_waveform_from_spectrogram_tensor(X: torch.Tensor, stft_params: dict) -> np.ndarray:
    """Invert a spectrogram tensor back to a time-domain waveform.

    Args:
        X: Spectrogram tensor. A leading axis of size 1 is squeezed away and
            the two remaining axes are swapped before inversion, so an input
            laid out as (1, frames, bins) reaches SciPy as (bins, frames).
            The tensor may live on any device and may require grad.
        stft_params: STFT settings with the keys ``'fs'`` (sample rate),
            ``'window_size'`` (samples per segment), ``'window_shift'``
            (hop between segments) and ``'type'`` (window name).

    Returns:
        The waveform array produced by ``scipy.signal.istft``.
    """
    # Detach and move to host memory first: Tensor.numpy() refuses tensors that
    # require grad or that live on an accelerator.
    spectrogram = X.detach().cpu().squeeze(0).permute(1, 0).numpy()

    window_size = stft_params['window_size']
    _, waveform = signal.istft(spectrogram,
                               fs=stft_params['fs'],
                               nperseg=window_size,
                               # SciPy wants the overlap, not the hop.
                               noverlap=window_size - stft_params['window_shift'],
                               window=stft_params['type'])

    return waveform

In [49]:
import sounddevice as sd
import numpy as np
import torch

def play_audio(waveform, sample_rate=16000, duration=5):
    """Play the first ``duration`` seconds of a waveform and block until done.

    Args:
        waveform: Audio samples as a ``torch.Tensor``, NumPy array or plain
            sequence. Singleton dimensions are squeezed away when the input
            has more than one axis.
        sample_rate: Playback rate in Hz, passed to ``sounddevice.play``.
        duration: Maximum number of seconds to play; may be fractional.
    """
    if isinstance(waveform, torch.Tensor):
        waveform = waveform.detach().cpu().numpy()

    # Accept lists/tuples as well as arrays so `.ndim` is always available.
    waveform = np.asarray(waveform)
    if waveform.ndim > 1:
        # atleast_1d keeps an all-singleton input sliceable after the squeeze.
        waveform = np.atleast_1d(np.squeeze(waveform))

    # Slice bounds must be integers; a fractional duration would otherwise
    # raise TypeError on the slice below.
    max_samples = int(sample_rate * duration)
    waveform = waveform[:max_samples].astype(np.float32)

    sd.play(waveform, samplerate=sample_rate)
    sd.wait()

In [56]:
# Take the first dataset item; `wave` is consumed by the playback cells that follow.
for wave in dataset:
    break

In [63]:
# Invert the original spectrogram and play it as the listening reference.
audio = get_waveform_from_spectrogram_tensor(wave.cpu(), fourier_params)
# Play at the STFT sample rate explicitly instead of relying on play_audio's
# default happening to match it.
play_audio(audio, sample_rate=fourier_params['fs'])

In [64]:
# Reconstruct the same clip with the model and play it for comparison.
# Inference only, so skip autograd bookkeeping for the forward pass.
# NOTE(review): training calls `audio_wrapper(...)` while this calls
# `audio_model(...)` directly -- confirm both take the same input.
with torch.no_grad():
    wave_recon = audio_model(concat_FT(wave).cuda())[0]

# Keep the intermediate spectrogram under its own name so `audio_recon` only
# ever holds the waveform.
spec_recon = reverse_FT(wave_recon)
audio_recon = get_waveform_from_spectrogram_tensor(spec_recon.detach().cpu(), fourier_params)
play_audio(audio_recon, sample_rate=fourier_params['fs'])