In [1]:
import torchaudio
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torchaudio.functional as F
from util.patch_embed import PatchEmbed_org
from model import PositionalEncoding
from models_mae import MaskedAutoencoder
from datasets import load_dataset, Audio
from util import misc
from model.vit import Attention

In [None]:
win_length = int(sample_rate * 0.025)  # 25ms
hop_length = int(sample_rate * 0.01)  # 10ms
transform = MelSpectrogram(
    sample_rate=args.sample_rate,
    win_length=win_length,
    hop_length=hop_length,
    n_fft=win_length,
    n_mels=128,
    window_fn=torch.hamming_window
)

In [4]:
ds = load_dataset("agkphysics/AudioSet", "unbalanced", trust_remote_code=True)["train"].cast_column("audio", Audio(sampling_rate=16000))
loader = DataLoader(ds, batch_size=1)

Loading dataset shards:   0%|          | 0/1739 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]

In [5]:
next(iter(loader))

{'video_id': ['---1_cCGK4M'],
 'audio': {'path': ['audio/unbal_train/---1_cCGK4M.flac'],
  'array': tensor([[ 0.0000,  0.0000,  0.0000,  ..., -0.1237, -0.0929,  0.0805]],
         dtype=torch.float64),
  'sampling_rate': tensor([16000])},
 'labels': [('/m/01g50p',),
  ('/m/0284vy3',),
  ('/m/06d_3',),
  ('/m/07jdr',),
  ('/m/07rwm0c',)],
 'human_labels': [('Railroad car, train wagon',),
  ('Train horn',),
  ('Rail transport',),
  ('Train',),
  ('Clickety-clack',)]}

In [10]:
ds[100]

{'video_id': '--6CkUtkLUI',
 'audio': {'path': 'audio/unbal_train/--6CkUtkLUI.flac',
  'array': array([-0.01472282, -0.0122354 , -0.01414704, ...,  0.05027008,
          0.07025385,  0.08694172]),
  'sampling_rate': 48000},
 'labels': ['/m/09x0r'],
 'human_labels': ['Speech']}

In [3]:
def collate(batch):
    wavs = torch.tensor([])
    for data in batch:
        wav = torch.from_numpy(data['audio']['array']).reshape(1, -1).float()

        # resample to 16000
        wav = F.resample(wav, data['audio']['sampling_rate'], 16000)
        
        # pad small white noise to 160000, which is 10 seconds
        N, L = wav.shape
        if L < 16000*10:
            append_len = 16000*10 - L 
            wav = torch.cat([wav, torch.randn(1, append_len)*0.001], dim=-1)
        elif L > 16000*10:
            wav = wav[:, :16000*10]
            
        wavs = torch.cat([wavs, wav], dim=0)
            
    return wavs.unsqueeze(1)

In [5]:
sampler_train = torch.utils.data.DistributedSampler(
            ds, num_replicas=1, rank=0, shuffle=True
        )

In [6]:
data_loader_train = torch.utils.data.DataLoader(
        ds,
        batch_size=512,
        sampler=sampler_train,
        num_workers=10,
        pin_memory=True,
        drop_last=True,
        collate_fn=collate,
    )

In [7]:
for samples in data_loader_train:
    print(samples)

tensor([[[-2.2006e-02, -1.2006e-02,  4.7385e-02,  ..., -1.3327e-03,
          -2.0157e-04,  2.5619e-04]],

        [[-5.8748e-02, -9.1860e-02, -6.6118e-02,  ...,  1.4414e-03,
           1.0130e-03,  5.5223e-04]],

        [[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -9.2150e-04,
          -1.6165e-05,  2.5540e-04]],

        ...,

        [[ 1.3873e-01,  3.1841e-01,  4.7802e-01,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]],

        [[ 1.6606e-02,  2.8510e-02,  2.7521e-02,  ..., -8.5156e-04,
          -2.2067e-03, -2.4989e-03]],

        [[ 3.2547e-03,  6.7471e-03,  2.8442e-03,  ..., -3.5225e-03,
          -1.6203e-03, -4.9664e-03]]])
tensor([[[ 4.5977e-01,  7.3646e-01,  6.4494e-01,  ..., -7.0729e-01,
          -7.4497e-01, -5.0585e-01]],

        [[ 6.6824e-03, -2.1251e-02, -4.0596e-02,  ..., -2.3610e-02,
          -8.8257e-03,  3.1037e-02]],

        [[-3.0426e-02, -4.8213e-02, -3.9586e-02,  ..., -2.1600e-04,
          -3.9023e-03, -7.3961e-03]],

        ...,

        [

TypeError: an integer is required

In [2]:
a = torch.ones(2, 512, 768)
a.repeat(1, 1, 3).shape

torch.Size([2, 512, 2304])

In [2]:
attention = Attention(768, 12, True)

In [3]:
a = torch.rand(2, 512, 768)
mask = torch.zeros(2, 512, 768).bool()

In [4]:
attention(a, mask)

torch.Size([2, 12, 512, 64])
torch.Size([2, 12, 512, 64])
torch.Size([2, 12, 512, 512])
12


RuntimeError: The size of tensor a (768) must match the size of tensor b (512) at non-singleton dimension 3

In [8]:
model = MaskedAutoencoder(embed_dim=768, do_mask=True).cuda()

In [13]:
model.encoder_pos_embed.position_encoding.weight

Parameter containing:
tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
          0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  8.2843e-01,  ...,  1.0000e+00,
          1.0243e-04,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  9.2799e-01,  ...,  1.0000e+00,
          2.0486e-04,  1.0000e+00],
        ...,
        [-5.2150e-01,  8.5325e-01,  1.4631e-01,  ...,  9.9677e-01,
          7.8379e-02,  9.9692e-01],
        [ 4.3622e-01,  8.9984e-01,  9.0147e-01,  ...,  9.9676e-01,
          7.8481e-02,  9.9692e-01],
        [ 9.9288e-01,  1.1912e-01,  8.6351e-01,  ...,  9.9676e-01,
          7.8583e-02,  9.9691e-01]], device='cuda:0')

In [3]:
def padding(spec_batch, in_chanel, embed_dim, patch_size=16, smallest_length=1024):
    # the default longest length of a spectrogram is 1024
    padded_specs = torch.tensor([])
    embeder = nn.Conv2d(in_chanel, embed_dim, kernel_size=(patch_size, patch_size), stride=(patch_size, patch_size), bias=False)
    embeder.weight.requires_grad = False
    N, C, H, W = spec_batch.shape
    longest = smallest_length

    # find the longest 
    for idx in range(N):
        spec = spec_batch[idx, :, :, :]
        spec_c, spec_h, spec_w = spec.shape
        if spec_w > longest:
            longest = spec_w

    # pad the spectrogram
    for idx in range(N):
        spec = spec_batch[idx, :, :, :]
        spec_c, spec_h, spec_w = spec.shape
        if spec_w < longest:
            pads = torch.zeros(spec_c, spec_h, longest - spec_w)
            padded_spec = torch.cat([spec, pads], dim=-1).unsqueeze(0)
            padded_specs = torch.cat([padded_specs, padded_spec], dim=0)

    # get the padding mask
    padding_masks = embeder(padded_specs).flatten(2).transpose(1, 2)
    padding_masks = torch.where(padding_masks == 0, 1, 0).bool()

    return padded_specs, padding_masks

In [4]:
a = torch.rand(2, 1, 128, 1001)
a, masks = padding(a, 1, 768, 16, 1024)

In [5]:
print(masks.shape)
print(masks[:, :, :512].shape)
print(masks[:, :, :256].shape)

torch.Size([2, 512, 768])
torch.Size([2, 512, 512])
torch.Size([2, 512, 256])


In [6]:
model(a, padding_mask=masks)

torch.Size([2, 512, 256])


(tensor([[1.5013, 1.5960, 1.5284,  ..., 1.4298, 1.4465, 0.0000],
         [1.5653, 1.5508, 1.5583,  ..., 1.5301, 1.4107, 0.0000]],
        grad_fn=<MeanBackward1>),
 tensor([[[-0.1342, -1.7399,  2.5878,  ...,  1.4804,  1.3534,  1.0617],
          [-0.1890, -1.7116,  2.8103,  ...,  1.5090,  1.4721,  1.0208],
          [-0.2631, -1.5977,  3.0126,  ...,  1.5937,  1.0246,  1.2272],
          ...,
          [-0.1900, -2.7658,  2.5324,  ...,  0.8216,  0.5004,  1.2674],
          [-0.0825, -2.7024,  2.5177,  ...,  0.8202,  0.5130,  1.2409],
          [ 0.0000, -0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
 
         [[-0.2108, -1.7987,  2.5642,  ...,  1.4829,  1.3316,  0.8886],
          [-0.1336, -1.7770,  2.9069,  ...,  1.5689,  1.3558,  1.1317],
          [-0.2668, -1.5569,  3.0893,  ...,  1.3738,  1.0580,  1.2533],
          ...,
          [-0.1837, -2.8484,  2.5315,  ...,  0.7940,  0.5916,  1.2866],
          [-0.0241, -2.7930,  2.5626,  ...,  0.8455,  0.5201,  1.2673],
          

In [7]:
def patchify(imgs):
    h = imgs.shape[2] // 16
    w = imgs.shape[3] // 16
    x = imgs.reshape(shape=(imgs.shape[0], 1, h, 16, w, 16))
    x = torch.einsum('nchpwq->nhwpqc', x)
    x = x.reshape(shape=(imgs.shape[0], h * w, 16**2 * 1))
    return x

In [8]:
patches = patchify(torch.rand(2, 1, 128, 1024))

In [9]:
patches.shape

torch.Size([2, 512, 256])