In [8]:
import os
import numpy as np
import sys
import librosa
import warnings
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3" # set vis gpus 
import torch
import pytorch_lightning as pl
import wwv.config  as cfg 
from wwv.util import CallbackCollection 
from wwv.data import AudioDataModule
device = "cuda" if torch.cuda.is_available() else "cpu"

cfg_fitting = cfg.Fitting(batch_size=64, train_bs=64, val_bs=64)
cfg_feature = cfg.Feature()
cfg_signal = cfg.Signal()
cfg_model = cfg.CNNAE()


In [9]:

data_path = cfg.DataPath("/home/akinwilson/Code/HTS", cfg_model.model_name, cfg_model.model_dir)
data_module = AudioDataModule(data_path.root_data_dir, cfg_model=cfg_model, cfg_feature=cfg_feature, cfg_fitting=cfg_fitting)
                              
train_loader =  data_module.train_dataloader()
val_loader =  data_module.val_dataloader()
test_loader =  data_module.test_dataloader()

# x = next(iter(train_loader))['x']
# x.shape

## Discriminative Undercomplete Autoencoder

In [14]:
import torch
import torch.nn as nn 
import torch.nn.functional as F



class CNNAE(nn.Module):
    def __init__(self, n_input=1, n_output=1024, stride=16, n_channel=32):
        super().__init__()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.n_channel = n_channel
        # encoder layers 
        self.e_conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride)
        self.e_bn1 = nn.BatchNorm1d(n_channel)
        self.e_pool1 = nn.MaxPool1d(4, return_indices=True)
        self.e_conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3)
        self.e_bn2 = nn.BatchNorm1d(n_channel)
        self.e_pool2 = nn.MaxPool1d(4, return_indices=True)
        self.e_conv3 = nn.Conv1d(n_channel, 2 * n_channel, kernel_size=3)
        # 
        self.e_bn3 = nn.BatchNorm1d(2 * n_channel)
        self.e_pool3 = nn.MaxPool1d(4, return_indices=True)
        self.e_conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, kernel_size=3)
#######################################################################################
        self.e_bn4 = nn.BatchNorm1d(2 * n_channel)
        self.e_pool4 = nn.MaxPool1d(2, return_indices=True)
        self.e_fc4 = nn.Linear(2 * n_channel * 28, n_output)
#######################################################################################
        # decoder layers 
        self.d_fc4 = nn.Linear(n_output, 2 * n_channel * 28)
        self.d_pool4 = nn.MaxUnpool1d(2)
        self.d_bn4 = nn.BatchNorm1d(2 * n_channel)
#######################################################################################
        self.d_conv4 = nn.ConvTranspose1d(2 * n_channel, 2 * n_channel, kernel_size=3)
        self.d_pool3 = nn.MaxUnpool1d(4)
        self.d_bn3 = nn.BatchNorm1d(2 * n_channel)
#######################################################################################
        self.d_conv3 = nn.ConvTranspose1d(2 * n_channel, n_channel, kernel_size=3)
        self.d_pool2 = nn.MaxUnpool1d(4)
        self.d_bn2 = nn.BatchNorm1d(n_channel)

        self.d_conv2 = nn.ConvTranspose1d(n_channel, n_channel, kernel_size=3)
        self.d_pool1 = nn.MaxUnpool1d(4)
        self.d_bn1 = nn.BatchNorm1d(n_channel)

        self.d_conv1 = nn.ConvTranspose1d(n_channel, n_input, kernel_size=80, stride=stride)
    


    def encode(self, x):
        x = self.e_conv1(x)
        x = F.relu(self.e_bn1(x))
        x, idx1 = self.e_pool1(x)
        x = self.e_conv2(x)
        x = F.relu(self.e_bn2(x))
        x, idx2 = self.e_pool2(x)
        x = self.e_conv3(x)
        x = F.relu(self.e_bn3(x))
        x, idx3  = self.e_pool3(x)
        x = self.e_conv4(x)
        x = F.relu(self.e_bn4(x))
        x = x.view(x.shape[0], -1)
        x = self.e_fc4(x)
        return idx1, idx2, idx3, x


    def decode(self, idx1, idx2, idx3, x):
        bs = x.shape[0]
        x = self.d_fc4(x)
        x = x.view(bs, 2 * self.n_channel,  28)
        x = F.relu(self.d_bn4(x))
        x = self.d_conv4(x)


        x = self.d_pool3(x, idx3)
        x = F.relu(self.d_bn3(x))

        x = self.d_conv3(x)

        padding = idx2.shape[2] - x.shape[2] 
        pad = torch.zeros((bs,32, padding),device=self.device)
        
        x = torch.cat([x,pad],dim=2)
        

        x = self.d_pool2(x, idx2)

        x = F.relu(self.d_bn2(x))
        x = self.d_conv2(x)
        

        padding = idx1.shape[2] - x.shape[2] 
        pad = torch.zeros((bs,32, padding), device=self.device)
        x = torch.cat([x,pad],dim=2)

        x = self.d_pool1(x, idx1)
        x = F.relu(self.d_bn1(x))
        x = self.d_conv1(x)
        return x 



    def forward(self, x):
        idx1, idx2, idx3, encoded_x = self.encode(x)
        decoded_x = self.decode(idx1, idx2, idx3,encoded_x)
        return decoded_x

x = torch.randn((1,1,32000), device=device)


model = CNNAE()
model.to(device=device)
x_reconstructed = model(x)
assert x_reconstructed.shape == x.shape,  f"The reconstructed input is of different dimensions to the original input. Original: {x_reconstructed.shape}. Reconstructed: {x.shape}"
x_reconstructed.shape

torch.Size([1, 1, 32000])

In [15]:
import torch.nn.functional as F 
import pytorch_lightning as pl 
from pytorch_lightning import Trainer

class Routine(pl.LightningModule):

    def __init__ (self, model): # , cfg_model):
        super().__init__()
        self.model = model
        self.lr = 1e-4
        # self.cfg_model = cfg_model 


    def forward(self,x):
        x_reconstructed = self.model(x)
        return x_reconstructed 


    def training_step( self, batch, batch_idx):
        x = batch['x']
        x_reconstructed = self(x)
        loss = F.mse_loss(x,x_reconstructed)
        return {"loss": loss }

    def training_epoch_end(self, training_step_outputs):
        results = {"loss": torch.tensor( [ x['loss'].float().mean().item() for x in training_step_outputs]).mean()}
        for (k,v) in results.items():
            self.log(f"train_{k}", v, on_epoch=True, prog_bar=True, logger=True)    



    def validation_step( self, batch, batch_idx):
        x = batch['x']
        x_reconstructed = self(x)
        loss = F.mse_loss(x,x_reconstructed)
        return {"val_loss": loss }

    def validation_epoch_end(self, training_step_outputs):
        results = {"loss": torch.tensor( [ x['val_loss'].float().mean().item() for x in training_step_outputs]).mean()}
        for (k,v) in results.items():
            self.log(f"val_{k}", v, on_epoch=True, prog_bar=True, logger=True)    


    def configure_optimizers(self):

        optimizer = torch.optim.AdamW(
            filter(lambda p: p.requires_grad, self.parameters()),
            lr = self.lr, 
            betas = (0.9, 0.999), eps = 1e-08, weight_decay = 0.05, 
        )
        # scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, threshold=0.0001, threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-08, verbose=False)
        return  {"optimizer": optimizer } # , "lr_scheduler": scheduler, "monitor": "val_loss"} 


routine = Routine(model)
trainer = Trainer(accelerator="gpu",devices=3,strategy='dp',sync_batchnorm = True,max_epochs = 20,num_sanity_val_steps = 2, gradient_clip_val=1.0)
# # PATH  = "/home/akinwilson/Code/pytorch/output/model/ResNet/epoch=18-val_loss=0.15-val_acc=0.95-val_ttr=0.92-val_ftr=0.03.ckpt"                  
# # Trainer executes fitting; training and validating proceducres 
trainer.fit(routine, train_dataloaders=train_loader, val_dataloaders=val_loader)
m = trainer.model.module.module.model
encoded_wav = m.encode_forward(torch.randn((1,1,32000)))

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name  | Type  | Params
--------------------------------
0 | model | CNNAE | 3.7 M 
--------------------------------
3.7 M     Trainable params
0         Non-trainable params
3.7 M     Total params
14.888    Total estimated model params size (MB)


Epoch 0:   2%|▏         | 29/1333 [00:11<08:42,  2.50it/s, loss=0.291, v_num=4]

In [2]:
# condition to correctly extract model in data parallel and single device training 
m = trainer.model.module.module.model if isinstance(trainer, torch.nn.DataParallel) else  trainer.model.module.model

encoded_wav = m.encode_forward(torch.randn((1,1,32000)))



NameError: name 'm' is not defined

In [None]:
path = "/home/akinwilson/Code/HTS/val.csv"
import pandas as pd 
cols= ['annotated_quality', 'annotated_age', 'annotated_voice_type']
df = pd.read_csv(path) # [cols]# .columns
categorical_cols = ['annotated_quality', 'annotated_age', 'annotated_voice_type']

df = df[categorical_cols]
#import pandas as pd
df = pd.get_dummies(df, columns = categorical_cols)

df.head().to_numpy().shape

In [None]:
n_input = 1
n_channel = 32
stride= 2
e_conv1 = nn.Conv1d(n_input, n_channel, kernel_size=160, stride=stride)

## Generative variational autoencoder

In [None]:
class CVAE(nn.Module):
    def __init__(self, x_dim, h_dim1, h_dim2, z_dim, c_dim):
        super().__init__()
        
        # encoder part
        self.fc1 = nn.Linear(x_dim + c_dim, h_dim1)
        self.fc2 = nn.Linear(h_dim1, h_dim2)
        self.fc31 = nn.Linear(h_dim2, z_dim)
        self.fc32 = nn.Linear(h_dim2, z_dim)
        # decoder part
        self.fc4 = nn.Linear(z_dim + c_dim, h_dim2)
        self.fc5 = nn.Linear(h_dim2, h_dim1)
        self.fc6 = nn.Linear(h_dim1, x_dim)
    
    def encoder(self, x, c):
        concat_input = torch.cat([x, c], 1)
        h = F.relu(self.fc1(concat_input))
        h = F.relu(self.fc2(h))
        return self.fc31(h), self.fc32(h)
    
    def sampling(self, mu, log_var):
        std = torch.exp(0.5*log_var)
        eps = torch.randn_like(std)
        return eps.mul(std).add(mu) # return z sample
    
    def decoder(self, z, c):
        concat_input = torch.cat([z, c], 1)
        h = F.relu(self.fc4(concat_input))
        h = F.relu(self.fc5(h))
        return F.sigmoid(self.fc6(h))
    
    def forward(self, x, c):

        
        mu, log_var = self.encoder(x.view(-1, 784), c)
        z = self.sampling(mu, log_var)
        return self.decoder(z, c), mu, log_var


In [None]:
from wwv.routine import Routine

In [None]:
class CVAE(nn.Module):
    def __init__(self, input_size, hidden_size=20):
        super(CVAE, self).__init__()
        input_size_with_label = input_size + labels_length
        hidden_size += labels_length
        
        self.fc1 = nn.Linear(input_size_with_label,512)
        self.fc21 = nn.Linear(512, hidden_size)
        self.fc22 = nn.Linear(512, hidden_size)
        
        self.relu = nn.ReLU()
        
        self.fc3 = nn.Linear(hidden_size, 512)
        self.fc4 = nn.Linear(512, input_size)
    
    def encode(self, x, labels):
        x = x.view(-1, 1*28*28)
        x = torch.cat((x, labels), 1)
        x = self.relu(self.fc1(x))
        return self.fc21(x), self.fc22(x)
        
    def decode(self, z, labels):
        torch.cat((z, labels), 1)
        z = self.relu(self.fc3(z))
        return torch.sigmoid(self.fc4(z))
        
    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 *logvar)
        eps = torch.randn_like(std)
        return eps.mul(std).add_(mu)
        
    def forward(self,x, labels):
        #targets = one_hot(targets,labels_length-1).float().to(DEVICE)
        mu, logvar = self.encode(x, labels)
        z = self.reparameterize(mu, logvar)
        x = self.decode(z, labels)
        return x, mu, logvar

def train_cvae(net, dataloader, test_dataloader, flatten=True, epochs=20):
    validation_losses = []
    optim = torch.optim.Adam(net.parameters())

    log_template = "\nEpoch {ep:03d} val_loss {v_loss:0.4f}"
    with tqdm(desc="epoch", total=epochs) as pbar_outer:  
        for i in range(epochs):
            for batch, labels in dataloader:
                batch = batch.to(DEVICE)
                labels = one_hot(labels,9).to(DEVICE)

                if flatten:
                    batch = batch.view(batch.size(0), 28*28)

                optim.zero_grad()
                x,mu,logvar = net(batch, labels)
                loss = vae_loss_fn(batch, x[:, :784], mu, logvar)
                loss.backward()
                optim.step()
            evaluate(validation_losses, net, test_dataloader, flatten=True)
            pbar_outer.update(1)
            tqdm.write(log_template.format(ep=i+1, v_loss=validation_losses[i]))
    plt.show()
    return validation_losses
cvae = CVAE(28*28).to(DEVICE)
def vae_loss_fn(x, recon_x, mu, logvar):
    BCE = F.binary_cross_entropy(recon_x, x, reduction='sum')
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return BCE + KLD

def evaluate(losses, autoencoder, dataloader, flatten=True):
    model = lambda x, y: autoencoder(x, y)[0]    
    loss_sum = []
    inp, out = [],[]
    loss_fn = nn.MSELoss()
    for inputs, labels in dataloader:
        inputs = inputs.to(DEVICE)
        labels = one_hot(labels,9).to(DEVICE)

        if flatten:
            inputs = inputs.view(inputs.size(0), 28*28)

        outputs = model(inputs, labels)
        loss = loss_fn(inputs, outputs)            
        loss_sum.append(loss)
        inp = inputs
        out = outputs

    with torch.set_grad_enabled(False):
        plot_gallery([inp[0].detach().cpu(),out[0].detach().cpu()],28,28,1,2)    

    losses.append((sum(loss_sum)/len(loss_sum)).item())
def train_cvae(net, dataloader, test_dataloader, flatten=True, epochs=50):
    validation_losses = []
    optim = torch.optim.Adam(net.parameters())

    log_template = "\nEpoch {ep:03d} val_loss {v_loss:0.4f}"
    with tqdm(desc="epoch", total=epochs) as pbar_outer:  
        for i in range(epochs):
            for batch, labels in dataloader:
                batch = batch.to(DEVICE)
                labels = one_hot(labels,9).to(DEVICE)

                if flatten:
                    batch = batch.view(batch.size(0), 28*28)

                optim.zero_grad()
                x,mu,logvar = net(batch, labels)
                loss = vae_loss_fn(batch, x[:, :784], mu, logvar)
                loss.backward()
                optim.step()
            evaluate(validation_losses, net, test_dataloader, flatten=True)
            pbar_outer.update(1)
            tqdm.write(log_template.format(ep=i+1, v_loss=validation_losses[i]))
    plt.show()
    return validation_losses



history = train_cvae(cvae, train_dataset, val_dataset)