In [1]:
# Import packages and dataset

# %matplotlib inline

import os

import IPython
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import requests
import torch
import torchaudio

from codebase import utils as ut
from VocalSetDataset import VocalSetDataset

# Notebook-wide settings: wide default figures and a fixed torch seed.
matplotlib.rcParams['figure.figsize'] = [16.0, 4.8]
torch.random.manual_seed(0)

# Compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Record library versions and the selected device, one per line.
print(torch.__version__, torchaudio.__version__, device, sep='\n')

1.10.0
0.10.0
cpu


## Wav2Vec

In [2]:
class Wav2Vec():
    """Feature extractor built on a pretrained torchaudio pipeline bundle.

    Args:
        bundle_name: Key into ``self.bundles`` ('WAV2VEC2_XLSR53',
            'HUBERT_BASE' or 'HUBERT_LARGE'). An unknown key raises KeyError.
        device: Device the model is moved to and on which waveforms are
            processed in ``extract_features``.
    """

    def __init__(self, bundle_name, device='cpu'):
        super().__init__()

        self.bundles = {'WAV2VEC2_XLSR53': torchaudio.pipelines.WAV2VEC2_XLSR53,
                        'HUBERT_BASE': torchaudio.pipelines.HUBERT_BASE,
                        'HUBERT_LARGE': torchaudio.pipelines.HUBERT_LARGE,
                        }
        self.bundle_name = bundle_name
        self.bundle = self.bundles[bundle_name]
        print("Sample Rate:", self.bundle.sample_rate)
        self.device = device
        self.model = self.load_model()

    def load_model(self):
        """Instantiate the bundle's model, move it to ``self.device`` and return it."""
        # https://pytorch.org/audio/main/pipelines.html <--- for specific model choice, look here!
        model = self.bundle.get_model().to(self.device)
        print(model.__class__)
        return model

    def extract_features(self, filepath):
        """Load an audio file and return the model's extracted features.

        Args:
            filepath: Path passed to ``torchaudio.load``.

        Returns:
            The first element of ``self.model.extract_features(waveform)``
            (``plot_features`` treats it as a sequence with one entry per layer).
        """
        waveform, sample_rate = torchaudio.load(filepath)
        # Use the instance's device (not a notebook-level global) so the
        # waveform always ends up on the same device as the model.
        waveform = waveform.to(self.device)
        if sample_rate != self.bundle.sample_rate:
            # Bring the audio to the sample rate the bundle declares.
            waveform = torchaudio.functional.resample(waveform, sample_rate, self.bundle.sample_rate)
        # extract acoustic features
        with torch.inference_mode():
            features, _ = self.model.extract_features(waveform)
        return features

    def plot_features(self, features):
        """Show one image per entry of ``features``, stacked vertically.

        Args:
            features: Sequence of tensors; the first item along dim 0 of each
                entry is drawn with ``imshow``.
        """
        n_layers = len(features)
        # squeeze=False keeps `axes` two-dimensional, so indexing also works
        # when there is only a single entry to plot.
        fig, axes = plt.subplots(n_layers, 1, figsize=(16, 4.3 * n_layers), squeeze=False)
        for i, feats in enumerate(features):
            ax = axes[i, 0]
            ax.imshow(feats[0].cpu())
            ax.set_title(f"Feature from transformer layer {i+1}")
            ax.set_xlabel("Feature dimension")
            ax.set_ylabel("Frame (time-axis)")
        plt.tight_layout()
        plt.show()

# Build the HuBERT-base extractor on the notebook-wide `device` chosen in the
# setup cell, so the model is placed on that device.
W2V = Wav2Vec('HUBERT_BASE', device=device)

Sample Rate: 16000
<class 'torchaudio.models.wav2vec2.model.Wav2Vec2Model'>


## VGGish

In [3]:
class VGGish():
    """Wrapper around the 'vggish' model from the 'harritaylor/torchvggish' hub repo."""

    def __init__(self):
        super().__init__()
        self.model = torch.hub.load('harritaylor/torchvggish', 'vggish')
        # Divisor applied to the raw model output in extract_features.
        # NOTE(review): presumably the model output lies in [0, 255] — confirm.
        self.feat_max = 255.0

    def extract_features(self, filepath):
        """Run the model on ``filepath`` and return its output divided by ``feat_max``.

        Args:
            filepath: Passed unchanged to the model's ``forward``.

        Returns:
            The scaled model output, detached from autograd.
            NOTE(review): the original comment says this is a 128-dim
            embedding — confirm shape against the hub model.
        """
        self.model.eval()
        # Inference only: without no_grad the result would carry an autograd
        # graph, which wastes memory and blocks conversion to NumPy.
        with torch.no_grad():
            features = self.model.forward(filepath)
        return features / self.feat_max

    def plot_features(self, features):
        """Line-plot an embedding against its dimension index."""
        if torch.is_tensor(features):
            # Matplotlib needs a CPU array without gradient tracking.
            features = features.detach().cpu().numpy()
        plt.plot(features)
        plt.xlabel('Dim')
        plt.title('VGGish Embedding')
        # Lay out after the labels exist so they are accounted for.
        plt.tight_layout()
        plt.show()
        
# NOTE: this rebinds the name `VGGish` from the class to an instance of it;
# code after this line sees the instance, not the class.
VGGish = VGGish()

Using cache found in /Users/camillenoufi/.cache/torch/hub/harritaylor_torchvggish_master


## Compute Embeddings

In [4]:
# Dataset location and experiment settings.
# The data directory can be overridden through the VOCALSET_DATA_DIR
# environment variable; when it is unset the original local path is used.
data_dir = os.environ.get(
    'VOCALSET_DATA_DIR',
    '/Users/camillenoufi/cnoufi (not syncing)/Research/VQM-VAE/data/VocalSet/train/split_1s',
)
meas_file = 'voicelab_results.xlsx'  # measurements file name handed to VocalSetDataset
embedding_model = 'vggish'  # read later to choose the regressor's input size
VS = VocalSetDataset(data_dir, meas_file, device='cpu')

Using cpu as device for dataset.
Applying mean-normalization to all features


In [None]:
# Compute audio embeddings for the dataset with the VGGish extractor (slow),
# then store them as a single array inside the data directory.
embedding_list = VS._create_audio_embeddings(VGGish)  # slow
# `np.np.asarray` raised AttributeError; the function lives directly on `np`.
embeddings = np.asarray(embedding_list)
embeddings_path = os.path.join(data_dir, 'VGGish_embeddings.npy')
np.save(embeddings_path, embeddings, allow_pickle=True)

Creating audio embeddings using given model...


## Downstream Classifier

In [18]:
import torch.nn as nn

class MLP(nn.Module):
    """Fully connected network: two hidden LeakyReLU layers and a linear output.

    Args:
        input_size: Number of features per sample after flattening.
        output_size: Number of values produced per sample.
        h_dim: Width of both hidden layers.
    """

    def __init__(self, input_size, output_size, h_dim=64):
        super().__init__()
        # Built in order so the Sequential indices (0..4) stay the same.
        stack = [
            nn.Linear(input_size, h_dim),
            nn.LeakyReLU(),
            nn.Linear(h_dim, h_dim),
            nn.LeakyReLU(),
            nn.Linear(h_dim, output_size),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Flatten every sample to one dimension and pass it through the stack."""
        batch = x.size(0)
        flat = x.view(batch, -1)
        return self.layers(flat)

In [19]:
# Build the training and validation loaders from the dataset.
train_loader, valid_loader = ut.partition_dataset(
    VS, batch_size=200, validation_split=0.05, shuffle_dataset=True
)
# Bind the validation loader to both `valid_loader` and `validation_loader`
# so cells using either name work.
validation_loader = valid_loader

Using 11905 samples to train and 626 for validation.


In [21]:
# Output width: length of the second element of a dataset item.
# NOTE(review): presumably the voice-quality measurement vector — confirm.
vqm_dim = VS[0][1].shape[0]
# 128 inputs for VGGish embeddings, 49*768 otherwise.
# NOTE(review): 49*768 presumably is frames x feature dim of the other embedding — confirm.
input_size = 128 if embedding_model == 'vggish' else (49 * 768)

# Use the computed input size instead of a hardcoded 128 so switching
# `embedding_model` actually resizes the network.
regressor = MLP(input_size=input_size, output_size=vqm_dim)
print(regressor)

optimizer = torch.optim.Adam(regressor.parameters(), lr=0.001)
# torch.nn has no `MSE`; the mean-squared-error criterion is `MSELoss`.
loss_fn = nn.MSELoss()
mean_train_losses = []  # one mean training loss per epoch
mean_valid_losses = []  # one mean validation loss per epoch
valid_acc_list = []
epochs = 15

MLP(
  (layers): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): LeakyReLU(negative_slope=0.01)
    (4): Linear(in_features=64, out_features=12, bias=True)
  )
)


In [22]:
# Train the regressor for `epochs` epochs, recording the mean training and
# validation loss of each epoch.
for epoch in range(epochs):
    # --- training pass ---
    regressor.train()
    train_losses = []
    # Each batch is a 4-tuple; only the 2nd item (used as the regression
    # target) and the 4th item (used as the model input) are read here.
    for nu, xu, lu, fu in train_loader:
        optimizer.zero_grad()

        outputs = regressor(fu)
        loss = loss_fn(outputs, xu)
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())

    # --- validation pass (no gradient tracking) ---
    regressor.eval()
    valid_losses = []
    with torch.no_grad():
        # `validation_loader` is the name bound by the partition cell.
        for nu, xu, lu, fu in validation_loader:
            outputs = regressor(fu)
            valid_losses.append(loss_fn(outputs, xu).item())

    # Compute each epoch mean once and reuse it for logging.
    epoch_train_loss = np.mean(train_losses)
    epoch_valid_loss = np.mean(valid_losses)
    mean_train_losses.append(epoch_train_loss)
    mean_valid_losses.append(epoch_valid_loss)

    print('epoch : {}, train loss : {:.4f}, valid loss : {:.4f}'
          .format(epoch + 1, epoch_train_loss, epoch_valid_loss))

KeyboardInterrupt: 