In [1]:
import os
import os.path as osp

import numpy as np
import torch
from torch.utils.data.dataset import Dataset
import torchaudio
import torch.nn as nn


In [2]:
from google.colab import drive 
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
!curl https://zenodo.org/record/6387880/files/foa_dev.zip?download=1 --output /content/gdrive/MyDrive/ProjectData/foa_dev.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2097M  100 2097M    0     0  20.8M      0  0:01:40  0:01:40 --:--:-- 27.9M


In [None]:
!curl https://zenodo.org/record/6387880/files/metadata_dev.zip?download=1 --output /content/gdrive/MyDrive/ProjectData/metadata_dev.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  619k  100  619k    0     0   334k      0  0:00:01  0:00:01 --:--:--  334k


In [None]:
!unzip -q /content/gdrive/MyDrive/ProjectData/foa_dev.zip -d /content/gdrive/MyDrive/ProjectData/data

In [None]:
!unzip -q /content/gdrive/MyDrive/ProjectData/metadata_dev.zip -d /content/gdrive/MyDrive/ProjectData/data


In [3]:


SOUND_EVENT_CLASSES = [
    "Female speech, woman speaking",
    "Male speech, man speaking",
    "Clapping",
    "Telephone",
    "Laughter",
    "Domestic sounds",
    "Walk, footsteps",
    "Door, open or close",
    "Music",
    "Musical instrument",
    "Water tap, faucet",
    "Bell",
    "Knock"
]

In [4]:
config = {
    
    "track" : 3,
    "classes" : len(SOUND_EVENT_CLASSES),
    

}

In [5]:
config

{'classes': 13, 'track': 3}

In [41]:
import os
import os.path as osp

import numpy as np
import torch
import torchaudio
from torch.utils.data.dataset import Dataset
from torch.nn.functional import pad

SOUND_EVENT_CLASSES = [
    "Female speech, woman speaking",
    "Male speech, man speaking",
    "Clapping",
    "Telephone",
    "Laughter",
    "Domestic sounds",
    "Walk, footsteps",
    "Door, open or close",
    "Music",
    "Musical instrument",
    "Water tap, faucet",
    "Bell",
    "Knock"
]


class FOADataset(Dataset):
    """
    Custom PyTorch Dataset for DCASE FOA Datsets
    """

    implemented_model_features = ["seldnet", "rd3net"]

    def __init__(self, data_path, folds=None, train=True, model="seldnet", hop_length=20, context=0):
        """
        Init Function for FOADataset
        :param data_path: String path to root folder containing 'foa_dev' and 'metadata_dev'
        :param folds: List of fold integers to use in this dataset
        :param train: Bool indicating whether to use train dataset of val dataset
        """

        # Assert that requested features are currently implemented
        assert model in FOADataset.implemented_model_features
        feat_size = 250
        hop_length = 20
        self.model = model
        # Calculate Directory Names
        foa_directory_sony = osp.join(data_path, "foa_dev", "dev-train-sony" if train else "dev-test-sony")
        meta_directory_sony = osp.join(data_path, "metadata_dev", "dev-train-sony" if train else "dev-test-sony")
        foa_directory_tau = osp.join(data_path, "foa_dev", "dev-train-tau" if train else "dev-test-tau")
        meta_directory_tau = osp.join(data_path, "metadata_dev", "dev-train-tau" if train else "dev-test-tau")

        all_foa_files = [osp.join(foa_directory_tau, file) for file in os.listdir(foa_directory_tau)]
        all_foa_files.extend([osp.join(foa_directory_sony, file) for file in os.listdir(foa_directory_sony)])
        all_meta_files = [osp.join(meta_directory_tau, file) for file in os.listdir(meta_directory_tau)]
        all_meta_files.extend([osp.join(meta_directory_sony, file) for file in os.listdir(meta_directory_sony)])

        # Parse File Names
        foa_file_data = [self.parse_foa_file_name(file) for file in all_foa_files]
        meta_file_data = [self.parse_foa_file_name(file) for file in all_meta_files]

        # Create Lists of All Valid File Paths in Given Folds
        self.folds = folds
        self.foa_files = [
            file for file, data in zip(all_foa_files, foa_file_data)
            if (folds is None or data["fold"] in folds)
        ]
        self.foa_files.sort()
        self.meta_files = [
            file for file, data in zip(all_meta_files, meta_file_data)
            if (folds is None or data["fold"] in folds)
        ]
        self.meta_files.sort()
        self.foa_files = self.foa_files[:10]
        self.meta_files = self.meta_files[:10]

        assert len(self.foa_files) == len(self.meta_files)

        # Load SELDNet Input Features and ACCDOA Output
        features = []
        multi_accdoas = []
        self.feature_width = 100 // hop_length
        for foa_file, meta_file in zip(self.foa_files, self.meta_files):
            if model == "seldnet":
                feature = self.audio_to_seldnet_features(foa_file, hop_length=hop_length)[:,:, :-1]
                multi_accdoa = self.metadata_to_multi_accdoa(self.load_metadata(meta_file),
                                                         total_frames=feature.shape[2] // (100 // 20))[:,:, :-1]
                feature_chunked = self.chunk_seldnet_feature(feature, feat_size)
                multi_accdoa_chunked = self.chunk_seldnet_multiaccdoa(multi_accdoa, feat_size, hop_length )
                # print(len(feature_chunked))
                # print(len(multi_accdoa_chunked))
                # print(feature.shape)
                # print(multi_accdoa.shape)

                assert(len(feature_chunked) == len(multi_accdoa_chunked))
                features.extend(feature_chunked)
                multi_accdoas.extend(multi_accdoa_chunked)

            else:
                feature = self.audio_to_rd3net_features(foa_file, hop_length=hop_length)
                total_frames = feature.shape[2] // (100 // hop_length)
                feature = feature[:, :, :total_frames * (100 // hop_length)]
                multi_accdoa = self.metadata_to_multi_accdoa(self.load_metadata(meta_file),
                                                             total_frames=total_frames)
                features.append(feature)
                multi_accdoas.append(multi_accdoa)

        if model=="seldnet":
            self.features = np.stack(features)
            self.multi_accdoa = np.stack(multi_accdoas)
        else:
            self.features = pad(torch.concat(features, dim=-1), (context, context))
            self.multi_accdoa = np.concatenate(multi_accdoas, axis=-1)
        self.context = context

    @staticmethod
    def parse_foa_file_name(file):
        """
        Parses filenames of the following format:
        "fold[fold number]_room[room number per fold]_mix[recording number per room per split].wav"
        :param file: filename
        :return: metadata dictionary
        """

        name, extension = osp.splitext(osp.basename(file))
        fold_text, room_text, mix_text = name.split("_")
        fold = int(fold_text.replace("fold", ""))
        room = int(room_text.replace("room", ""))
        mix = int(mix_text.replace("mix", ""))
        return {"fold": fold, "room": room, "mix": mix}

    @staticmethod
    def audio_to_seldnet_features(file, fft_size=1024, hop_length=20, eps=1e-8):
        """
        Generates the SELDNet Input Features
        :param file: Filepath to Audio File to Load
        :param fft_size: Size of FFT calculation to perform
        :param hop_length: Stride of FFT in ms
        :param eps: Division eps to prevent NaN outputs
        :return: torch.Tensor of Shape 7x64xT
        """
        waveform, sample_rate = torchaudio.load(file, normalize=True)

        spec_trans = torchaudio.transforms.Spectrogram(n_fft=fft_size, hop_length=sample_rate // (1000 // hop_length),
                                                       pad=0, power=None)
        mel_trans = torchaudio.transforms.MelScale(n_mels=64, sample_rate=sample_rate, n_stft=fft_size // 2 + 1)

        with torch.no_grad():
            spectrogram = spec_trans(waveform)
            mel_spec = mel_trans(torch.real(torch.pow(spectrogram, 2)))

            intensity = torch.real(torch.conj(spectrogram[0]) * spectrogram[1:])
            intensity = intensity / (torch.pow(torch.abs(spectrogram[0]), 2) +
                                     torch.mean(torch.pow(torch.abs(spectrogram[1:]), 2), dim=0) + eps)
            mel_intensity = mel_trans(intensity)
        return torch.concat((mel_spec, mel_intensity), dim=0)

    @staticmethod
    def audio_to_rd3net_features(file, fft_size=1024, hop_length=20):
        """
        Generates the RD3Net Input Features
        :param file: Filepath to Audio File to Load
        :param fft_size: Size of FFT calculation to perform
        :param hop_length: Stride of FFT in ms
        :return: torch.Tensor of Shape 7x(fft/2+1)xT
        """
        waveform, sample_rate = torchaudio.load(file, normalize=True)

        spec_trans = torchaudio.transforms.Spectrogram(n_fft=fft_size, hop_length=sample_rate // (1000 // hop_length),
                                                       pad=0, power=None)

        with torch.no_grad():
            spectrogram = spec_trans(waveform)

            amplitude = torch.abs(spectrogram)
            ipd = torch.angle(spectrogram[0]) - torch.angle(spectrogram[1:])

        return torch.concat((amplitude, ipd), dim=0)

    @staticmethod
    def load_metadata(file):
        """
        Reads in the CSV Label File of the Format
        '[frame number (int)], [active class index (int)], [source number index (int)], [azimuth (int)], [elevation (int)]'

        :param file: Filepath to CSV File to Load
        :return: List of Metadata Dictionaries
        """
        metadata = []
        with open(file, 'r') as f:
            for line in f.readlines():
                frame_number, active_class, source_number, azimuth, elevation = line.split(",")
                metadata.append({
                    "frame_number": int(frame_number),
                    "active_class": int(active_class),
                    "source_number": int(source_number),
                    "azimuth": int(azimuth),
                    "elevation": int(elevation)
                })
        return metadata

    @staticmethod
    def metadata_to_multi_accdoa(metadata, total_frames, n=3, c=len(SOUND_EVENT_CLASSES)):
        """
        Turns a List of Python Dictionaries with SELD Labels Into A Multi-ACCDOA Truth Vector
        :param metadata: List of Python Dictionaries (from 'load_metadata')
        :param total_frames: Total number of 100ms frames in source audio
        :param n: Maximum number of repetitions
        :param c: Number of classes
        :return: N x 3 x C x Total Frames Numpy Ndarray
        """
        multi_accdoa = np.zeros((n, 3, c, total_frames))
        event_count_per_frame = np.zeros((c, total_frames), dtype=np.int)
        for metadata_i in metadata:
            f, a, s, az, el = (metadata_i["frame_number"], metadata_i["active_class"], metadata_i["source_number"],
                               metadata_i["azimuth"], metadata_i["elevation"])
            f -= 1
            norm_az_el = np.array([np.cos(np.deg2rad(az)), np.sin(np.deg2rad(az)), np.sin(np.deg2rad(el))])
            multi_accdoa[event_count_per_frame[a, f]:, :, a, f] = norm_az_el
            event_count_per_frame[a, f] += 1
        return multi_accdoa

    @staticmethod
    def chunk_seldnet_feature(feature, feat_size=250):
      
      s0,s1,s2 = feature.shape
      # print(feature.shape)
      news2 = int(np.ceil(s2/feat_size)*feat_size)
      # print("padded length  ", news2)
      feature = np.pad(feature, ((0,0), (0,0), (0,news2-s2)))
      # print(feature.shape, "  new feature shape")
      feature = np.reshape(feature, (7,news2,64))
      return np.split(feature, news2/feat_size, axis=1 )
      # return feature

    @staticmethod
    def chunk_seldnet_multiaccdoa(multi_accdoa,feat_size, hop_length):
      split_size = feat_size//(100//hop_length)
      # print(multi_accdoa.shape, "  multi accdoa shape")
      # print(split_size, " split size")
      split_count = multi_accdoa.shape[-1]/split_size
      toPad = int(np.ceil(split_count)*split_size) - multi_accdoa.shape[-1]

      multi_accdoa = np.pad(multi_accdoa, ((0,0), (0,0),(0,0), (0,toPad)))
      # print(multi_accdoa.shape, "  multi accdoa shape")
      split_count = multi_accdoa.shape[-1]/split_size
      # print(split_count)


      return np.split(multi_accdoa, split_count, axis=-1)


    def __len__(self):
        if self.model=="seldnet":
            return self.features.shape[0]
        return self.multi_accdoa.shape[-1]

    def __getitem__(self, item):
        if self.model =="seldnet":
            return torch.from_numpy(self.features[item]), torch.from_numpy(self.multi_accdoa[item])
        return self.features[:, :, item*self.feature_width:(item+1)*self.feature_width+self.context*2], \
               self.multi_accdoa[:, :, :, item]


In [42]:
train_data = FOADataset("/content/gdrive/MyDrive/ProjectData/data")
train_loader = torch.utils.data.DataLoader( train_data , batch_size= 128, shuffle=True, drop_last=True)


print(len(train_data))


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


189


In [None]:
print(train_data.features.shape)

(190, 7, 250, 64)


To import files, on colab I am keeping them in the ProjectData folder might not be needed locally

In [None]:
# import sys  
# sys.path.insert(0, '/content/gdrive/MyDrive/ProjectData')

In [None]:
# from foa_dataset import FOADataset as FOADataset2

In [None]:
# train_data2 = FOADataset2("/content/gdrive/MyDrive/ProjectData/data")
# train_loader = torch.utils.data.DataLoader( train_data , batch_size= 2, shuffle=True)


# print(len(train_data))

ValueError: ignored

In [26]:
# import numpy as np
# import tor


class ConvBlock(nn.Module):

	def __init__(self, max_pool = (5,4), out_filter=64, in_filter=64, kernel_size=3, dropout_rate=0.01):
		super().__init__()
		self.max_pool = max_pool
		self.out_filter = out_filter
		self.conv = nn.Conv2d(in_filter,out_filter, kernel_size=kernel_size, padding=(1,1))
		self.bn = nn.BatchNorm2d(out_filter)
		self.mpool = nn.MaxPool2d(self.max_pool)
		self.dropout = nn.Dropout2d(dropout_rate)

	def forward(self,x):
		x = self.conv(x)
		x = self.bn(x)
		x = nn.ReLU()(x)
		x = self.mpool(x)
		x = self.dropout(x)
		return x



class Network_Seldnet(nn.Module):
	def __init__(self):
		super().__init__()
		# print("Here")
		max_pool_list = [(5,4),(1,4),(1,2)]
		self.conv_list = nn.ModuleList()
		for i,pool in enumerate(max_pool_list):
			# print("adding pool ")
			if i == 0:
				self.conv_list.append(
					ConvBlock(pool, 64,7)
				)
			else:
				self.conv_list.append(
					ConvBlock(pool)
				)
		print(len(self.conv_list))

		conv_out = 64*int(64/(4*4*2))
		self.rnn = nn.GRU(conv_out, 128, num_layers=2, bidirectional=True, batch_first=True, dropout=0.01)
		self.rnn_act = nn.Tanh()

		self.linear = nn.Linear(128,3*config["track"] *config["classes"])
		self.linear1 = nn.Linear(128,128)
		self.act1 = nn.Tanh()
		self.act = nn.Tanh()

	def forward(self, x):
		print("Forward")
		print(len(self.conv_list))
		for i in range(len(self.conv_list)):
			x = self.conv_list[i](x)
		print(" Post conv list")
	
		x = x.transpose(1, 2).contiguous()
		x = x.view(x.shape[0], x.shape[1], -1).contiguous()
		x,_ = self.rnn(x)
		x = self.rnn_act(x)
	
		x = x[:, :, x.shape[-1]//2:] * x[:, :, :x.shape[-1]//2]

		x = self.linear1(x)
		x = self.act1(x)
	
		x = self.linear(x)
		x = self.act(x)
		

		return x






In [43]:
print(len(train_data.features))
print(len(train_data.multi_accdoa))

189
189


In [44]:
# Optional
# Test code for checking shapes and return arguments of the train and val loaders
for data in train_loader:
    x, y = data # if you face an error saying "Cannot unpack", then you are not passing the collate_fn argument
    print(x.shape, y.shape)
    break

torch.Size([128, 7, 250, 64]) torch.Size([128, 3, 3, 12, 50])


In [45]:

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", device)

Device:  cuda


In [46]:
!pip install torchsummaryX # We also install a summary package to check our model's forward before training

Collecting torchsummaryX
  Downloading torchsummaryX-1.3.0-py3-none-any.whl (3.6 kB)
Installing collected packages: torchsummaryX
Successfully installed torchsummaryX-1.3.0


In [47]:
from torchsummaryX import summary


In [48]:
model = Network_Seldnet().to(device)
print(model)
summary(model, x.to(device))

3
Network_Seldnet(
  (conv_list): ModuleList(
    (0): ConvBlock(
      (conv): Conv2d(7, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (mpool): MaxPool2d(kernel_size=(5, 4), stride=(5, 4), padding=0, dilation=1, ceil_mode=False)
      (dropout): Dropout2d(p=0.01, inplace=False)
    )
    (1): ConvBlock(
      (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (mpool): MaxPool2d(kernel_size=(1, 4), stride=(1, 4), padding=0, dilation=1, ceil_mode=False)
      (dropout): Dropout2d(p=0.01, inplace=False)
    )
    (2): ConvBlock(
      (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (mpool): MaxPool2d(kernel_size=(1, 2), stride=(1, 2), padding

  df_sum = df.sum()


Unnamed: 0_level_0,Kernel Shape,Output Shape,Params,Mult-Adds
Layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_conv_list.0.Conv2d_conv,"[7, 64, 3, 3]","[128, 64, 250, 64]",4096.0,64512000.0
1_conv_list.0.BatchNorm2d_bn,[64],"[128, 64, 250, 64]",128.0,64.0
2_conv_list.0.MaxPool2d_mpool,-,"[128, 64, 50, 16]",,
3_conv_list.0.Dropout2d_dropout,-,"[128, 64, 50, 16]",,
4_conv_list.1.Conv2d_conv,"[64, 64, 3, 3]","[128, 64, 50, 16]",36928.0,29491200.0
5_conv_list.1.BatchNorm2d_bn,[64],"[128, 64, 50, 16]",128.0,64.0
6_conv_list.1.MaxPool2d_mpool,-,"[128, 64, 50, 4]",,
7_conv_list.1.Dropout2d_dropout,-,"[128, 64, 50, 4]",,
8_conv_list.2.Conv2d_conv,"[64, 64, 3, 3]","[128, 64, 50, 4]",36928.0,7372800.0
9_conv_list.2.BatchNorm2d_bn,[64],"[128, 64, 50, 4]",128.0,64.0


In [None]:
!git clone https://github.com/sharathadavanne/seld-dcase2022.git

Cloning into 'seld-dcase2022'...
remote: Enumerating objects: 103, done.[K
remote: Counting objects: 100% (103/103), done.[K
remote: Compressing objects: 100% (72/72), done.[K
remote: Total 103 (delta 66), reused 65 (delta 31), pack-reused 0[K
Receiving objects: 100% (103/103), 1.06 MiB | 7.88 MiB/s, done.
Resolving deltas: 100% (66/66), done.


In [None]:
import sys  
sys.path.insert(0, '/content/seld-dcase2022')

In [None]:
!python /content/seld-dcase2022/batch_feature_extraction.py

16: fold4_room24_mix003.wav, (4865, 448)
17: fold4_room24_mix005.wav, (6865, 448)
Traceback (most recent call last):
  File "/content/seld-dcase2022/batch_feature_extraction.py", line 30, in <module>
    sys.exit(main(sys.argv))
  File "/content/seld-dcase2022/batch_feature_extraction.py", line 22, in main
    dev_feat_cls.extract_all_feature()
  File "/content/seld-dcase2022/cls_feature_class.py", line 380, in extract_all_feature
    self.extract_file_feature((file_cnt, wav_path, feat_path))
  File "/content/seld-dcase2022/cls_feature_class.py", line 338, in extract_file_feature
    mel_spect = self._get_mel_spectrogram(spect)
  File "/content/seld-dcase2022/cls_feature_class.py", line 135, in _get_mel_spectrogram
    log_mel_spectra = librosa.power_to_db(mel_spectra)
  File "/usr/local/lib/python3.7/dist-packages/librosa/core/spectrum.py", line 1559, in power_to_db
    log_spec = 10.0 * np.log10(np.maximum(amin, magnitude))
KeyboardInterrupt


In [None]:
feat = np.load('/content/gdrive/MyDrive/ProjectData/data/seld_feat_label/foa_dev/fold4_room23_mix002.npy')

In [None]:
feat.shape

(3035, 448)

In [None]:
feat.shape


(2235, 448)

In [None]:
!python /content/seld-dcase2022/train_seldnet.py

['/content/seld-dcase2022/train_seldnet.py']



-------------------------------------------------------------------------------------------------------
The code expected two optional inputs
	>> python seld.py <task-id> <job-id>
		<task-id> is used to choose the user-defined parameter set from parameter.py
Using default inputs for now
		<job-id> is a unique identifier which is used for output filenames (models, training plots). You can use any number or string for this.
-------------------------------------------------------------------------------------------------------



SET: 1
USING DEFAULT PARAMETERS

	quick_test: True
	finetune_mode: False
	pretrained_model_weights: models/1_1_foa_dev_split6_model.h5
	dataset_dir: /content/gdrive/MyDrive/ProjectData/data
	unique_classes: 13
	feat_label_dir: /content/gdrive/MyDrive/ProjectData/data/seld_feat_label
	model_dir: models/
	dcase_output_dir: results/
	mode: dev
	dataset: foa
	fs: 24000
	hop_len_s: 0.02
	label_hop_len_s: 0.1
	max_audio_l