# Benchmarking Pipeline for DroneAudioset dataset
### We perform: beamforming, spectral gating, MPSeNet-based noise suppression and SSLAM-based audio classification steps

In [24]:
# initialize imports
import os
import torch
import numpy as np
from numpy import matlib
import soundfile as sf
import noisereduce as nr #type: ignore
from speechbrain.processing.features import STFT, ISTFT # type: ignore
from speechbrain.processing.multi_mic import Covariance, Mvdr # type: ignore

# specific to memory profile/computation
from memory_profiler import memory_usage #type: ignore
import time
import psutil
# %load_ext memory_profiler

In [25]:
# initalize param
ROOT_PATH = '/data/BlindDroneData/ComputeResourcesCheck/'
fs = 16000
# chosen setting
volume = '80pc'
room = 'room1'
drone = 'drone1'
speaker_dist = 'speaker-dist-1m'
mic_dist = 'mic-dist-25cm'
throttle = 'throttle-100'
mic = 'mic3_8array-up'
file_list = [f'{mic}-File{idx}.wav' for idx in range(1, 7)]

print('Chosen Setting:')
print(f'Volume: {volume}\nRoom: {room}\nDrone: {drone}\nDrone-Speaker Distance: {speaker_dist}')
print(f'Mic: {mic}\nDrone-Mic Distance: {mic_dist}')
print(f'File List: {file_list}')

Chosen Setting:
Volume: 80pc
Room: room1
Drone: drone1
Drone-Speaker Distance: speaker-dist-1m
Mic: mic3_8array-up
Drone-Mic Distance: mic-dist-25cm
File List: ['mic3_8array-up-File1.wav', 'mic3_8array-up-File2.wav', 'mic3_8array-up-File3.wav', 'mic3_8array-up-File4.wav', 'mic3_8array-up-File5.wav', 'mic3_8array-up-File6.wav']


In [26]:
# Function definitions
# === Microphone array geometry (2D circular) ===
def circular_array_positions(radius, num_mics, reorder_idx_list):
    angles = np.linspace(0, 2 * np.pi, num_mics, endpoint=False)
    mic_positions = torch.zeros((num_mics,3), dtype=torch.float)
    x = radius * np.cos(angles)
    y = radius * np.sin(angles)
    z = np.zeros_like(x)
    for idx, reorder_idx in enumerate(reorder_idx_list):
        mic_positions[idx, :] = torch.FloatTensor([x[reorder_idx], y[reorder_idx], z[reorder_idx]])
    return mic_positions

def cartesian_to_azimuth_elevation(cartesian_coord_list):
    cartesian_coord_list = cartesian_coord_list.detach().cpu().numpy()
    x,y,z = cartesian_coord_list
    # Compute azimuth in radians
    azimuth = np.arctan2(y, x)
    # Compute elevation in radians
    elevation = np.arctan2(z, np.sqrt(x**2 + y**2))
    # Convert radians to degrees
    azimuth_deg = np.degrees(azimuth)
    elevation_deg = np.degrees(elevation)
    return azimuth_deg, elevation_deg

# read single channel audio files
def read_audio_signal(file_path, fs, always_2d=True):
    sig, sig_fs = sf.read(file_path, dtype='float32', always_2d=always_2d)
    assert sig_fs == fs
    return sig

# write audio signals, including multi-channel
def write_audio_signal(file_path, sig, fs):
	sf.write(file=file_path, data=sig, samplerate=fs)

# return the direction of arrival [x,y,z] in meters
def compute_doa_from_location(speaker_str, mic_str, mic_name_str, num_windows,
                              z_drone=1.5, z_src=0.485):
    x_mic = 0; y_mic = 0
    z_drone_to_mic = int(mic_str.split('-')[-1][:-2])/100. # convert to meters
    if 'down' in mic_name_str:
        z_mic = z_drone - z_drone_to_mic
    elif 'up' in mic_name_str:
        z_mic = z_drone + z_drone_to_mic
    else:
        print(f'Incorrect mic type: {mic_name_str}')
    x_src = -int(speaker_str.split('-')[-1][:-1])/1.414
    y_src = x_src
    doa = np.array([x_src-x_mic, y_src-y_mic, z_src-z_mic])
    azim, _ = cartesian_to_azimuth_elevation(torch.tensor(doa, dtype=torch.float32))
    # current data collection fixes the azimuth at 135 degrees
    assert np.abs(azim+135) < 1e-5, "Wrong azimuth value!"
    doas = matlib.repmat(doa, m=num_windows, n=1)
    doas = torch.tensor(doas, dtype=torch.float32)
    doas = doas.unsqueeze(0)
    return doas

def profile_code(code_func, *args, **kwargs):
    start_time = time.time()
    cpu_before = psutil.cpu_percent(interval=None)
    mem_usage = memory_usage((code_func, args, kwargs), max_usage=True)
    cpu_after = psutil.cpu_percent(interval=None)
    end_time = time.time()
    print('='*50)
    print(f"Execution Time: {end_time - start_time:.2f} sec")
    print(f"CPU Usage: {cpu_after - cpu_before:.2f}%")
    print(f"Max Memory Usage: {mem_usage} MB")

## MVDR Beamforming (using SpeechBrain Library)

In [27]:
# initalize parameters for beamforming
N_MICS = 8
# MIC_ANGLE_VECTOR = np.array([270, 225, 0, 135, 315, 180, 45, 90])
MIC_DIAMETER = 0.5 # 0.3 for drone2, 0.5 for drone1
reorder_idx_list = np.array([6, 5, 0, 3, 7, 4, 1, 2])
MIC_GEOMETRY = circular_array_positions(MIC_DIAMETER/2, N_MICS, reorder_idx_list)

In [28]:
MIC_GEOMETRY

tensor([[-4.5924e-17, -2.5000e-01,  0.0000e+00],
        [-1.7678e-01, -1.7678e-01,  0.0000e+00],
        [ 2.5000e-01,  0.0000e+00,  0.0000e+00],
        [-1.7678e-01,  1.7678e-01,  0.0000e+00],
        [ 1.7678e-01, -1.7678e-01,  0.0000e+00],
        [-2.5000e-01,  3.0616e-17,  0.0000e+00],
        [ 1.7678e-01,  1.7678e-01,  0.0000e+00],
        [ 1.5308e-17,  2.5000e-01,  0.0000e+00]])

In [29]:
def perform_beamforming():
    # read each file, perform mvdr beamforming
    print('Performing MVDR Beamforming using SpeechBrain')
    for file_name in file_list:
        print(f'File: {file_name}')
        audio_path = os.path.join(ROOT_PATH, 'preprocessed-audio', 'drone-with-source-recordings', file_name)
        noise_path = os.path.join(ROOT_PATH, 'preprocessed-audio', 'drone-only-recordings', file_name)
        # === Load multichannel audio ===
        audio_sig_orig = read_audio_signal(audio_path, fs)
        audio_sig = torch.tensor(audio_sig_orig, dtype=torch.float32) # convert to tensor
        audio_sig = audio_sig.unsqueeze(0) # dim: [1, time, channels]
        # === Load multichannel noise ===
        noise_sig_orig = read_audio_signal(noise_path, fs)
        # retain only a small sample noise -- taking samples in the middle to model stationary noise
        noise_sig_orig = noise_sig_orig[(30*fs):(40*fs), :] 
        noise_sig = torch.tensor(noise_sig_orig, dtype=torch.float32)
        noise_sig = noise_sig.unsqueeze(0)
        # === initialize modules ===
        stft = STFT(sample_rate=fs, n_fft=2048)
        cov = Covariance()
        istft = ISTFT(sample_rate=fs, n_fft=2048)
        mvdr = Mvdr()
        # === compute STFT and Covariance ===
        Xs = stft(audio_sig)
        Ns = stft(noise_sig)
        NNs = cov(Ns)
        # == match the number of time steps across noise and audio
        audio_time_steps = Xs.shape[1]
        noise_time_steps = NNs.shape[1]
        if noise_time_steps < audio_time_steps:
            num_repeats = (audio_time_steps // noise_time_steps) + 1
            NNs_repeated = NNs.repeat(1, num_repeats, 1, 1, 1)
            NNs_repeated = NNs_repeated[:, :audio_time_steps, :, :, :]
        assert Xs.shape[1] == NNs_repeated.shape[1], "Incompatible time steps!"
        # compute DOA from source location
        doas = compute_doa_from_location(speaker_str=speaker_dist,
                                        mic_str=mic_dist,
                                        mic_name_str=mic,
                                        num_windows=Xs.shape[1])
        # compute MVDR and obtain the beamformed signal
        Ys_mvdr = mvdr(Xs, NNs_repeated, doas, doa_mode=True, mics=MIC_GEOMETRY, fs=fs)
        beamformed_sig = istft(Ys_mvdr)
        bf_folder = os.path.join(ROOT_PATH, 'beamforming')
        os.makedirs(bf_folder, exist_ok=True)
        save_path = os.path.join(bf_folder, f'mvdr-{file_name}')
        write_audio_signal(save_path, beamformed_sig[0,:,0], fs)

profile_code(perform_beamforming)

Performing MVDR Beamforming using SpeechBrain
File: mic3_8array-up-File1.wav
File: mic3_8array-up-File2.wav
File: mic3_8array-up-File3.wav
File: mic3_8array-up-File4.wav
File: mic3_8array-up-File5.wav
File: mic3_8array-up-File6.wav
Execution Time: 85.48 sec
CPU Usage: 12.40%
Max Memory Usage: 19955.25 MB


## Spectral Gating (using NoiseReduce Library)

In [30]:
# initialize parameters for Spectral Gating
aggressiveness = 0.5

In [31]:
# %%memit
def perform_spectral_gating():
    print('Performing Spectral Gating using NoiseReduce')
    for file_name in file_list:
        print(f'File: {file_name}')
        bf_path = os.path.join(ROOT_PATH, 'beamforming', f'mvdr-{file_name}')
        bf_sig = read_audio_signal(bf_path, fs)
        assert bf_sig.shape[1] == 1
        nr_sig = nr.reduce_noise(y=bf_sig[:,0], sr=fs, stationary=False, 
                                    thresh_n_mult_nonstationary=aggressiveness)
        # save audio
        nr_folder = os.path.join(ROOT_PATH, 'spectral-gating')
        os.makedirs(nr_folder, exist_ok=True)
        save_path = os.path.join(nr_folder, f'nr-{file_name}-agg{aggressiveness}.wav')
        write_audio_signal(save_path, nr_sig, fs)
profile_code(perform_spectral_gating)

Performing Spectral Gating using NoiseReduce
File: mic3_8array-up-File1.wav
File: mic3_8array-up-File2.wav
File: mic3_8array-up-File3.wav
File: mic3_8array-up-File4.wav
File: mic3_8array-up-File5.wav
File: mic3_8array-up-File6.wav
Execution Time: 5.03 sec
CPU Usage: -2.00%
Max Memory Usage: 780.51171875 MB


In [32]:
!nvidia-smi

Wed May 21 13:48:38 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        On  | 00000000:1A:00.0 Off |                  N/A |
| 30%   28C    P8              13W / 350W |     58MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 3090        On  | 00000000:68:00.0 Off |  

In [33]:
import platform, psutil, GPUtil

print("=== System ===")
print(f"OS: {platform.system()} {platform.release()}")
print(f"Architecture: {platform.machine()}")
print(f"Python Version: {platform.python_version()}")

print("\n=== CPU ===")
print(f"Processor: {platform.processor()}")
print(f"Physical Cores: {psutil.cpu_count(logical=False)}")
print(f"Logical Cores: {psutil.cpu_count(logical=True)}")
print(f"Frequency: {psutil.cpu_freq().current:.2f} MHz")

print("\n=== RAM ===")
mem = psutil.virtual_memory()
print(f"Total RAM: {mem.total / 1e9:.2f} GB")

print("\n=== GPU ===")
try:
    GPUs = GPUtil.getGPUs()
    for gpu in GPUs:
        print(f"GPU {gpu.id} - {gpu.name}")
        print(f"  Load: {gpu.load * 100:.1f}%")
        print(f"  Memory: {gpu.memoryUsed}/{gpu.memoryTotal} MB")
        print(f"  Temperature: {gpu.temperature} °C")
except Exception as e:
    print("GPUtil not available or no GPU found:", e)

=== System ===
OS: Linux 5.15.0-119-generic
Architecture: x86_64
Python Version: 3.12.2

=== CPU ===
Processor: x86_64
Physical Cores: 18
Logical Cores: 36
Frequency: 2905.96 MHz

=== RAM ===
Total RAM: 134.73 GB

=== GPU ===
GPU 0 - NVIDIA GeForce RTX 3090
  Load: 0.0%
  Memory: 58.0/24576.0 MB
  Temperature: 28.0 °C
GPU 1 - NVIDIA GeForce RTX 3090
  Load: 0.0%
  Memory: 647.0/24576.0 MB
  Temperature: 30.0 °C
