# Benchmarking Pipeline for DroneAudioset dataset
### We perform: beamforming, spectral gating, MPSeNet-based noise suppression and SSLAM-based audio classification steps

In [1]:
# initialize imports
import os
import torch
import numpy
import numpy as np
from numpy import matlib
import soundfile as sf
import noisereduce as nr #type: ignore
from speechbrain.processing.features import STFT, ISTFT # type: ignore
from speechbrain.processing.multi_mic import Covariance, Mvdr # type: ignore

# specific to memory profile/computation
from memory_profiler import memory_usage #type: ignore
import time
import psutil
# %load_ext memory_profiler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# initalize param
ROOT_PATH = '../ComputeResourcesCheck/'
fs = 16000
# chosen setting
volume = '80pc'
room = 'room1'
drone = 'drone1'
speaker_dist = 'speaker-dist-1m'
mic_dist = 'mic-dist-25cm'
throttle = 'throttle-100'
mic = 'mic3_8array-up'
file_list = [f'{mic}-File{idx}.wav' for idx in range(1, 7)]

print('Chosen Setting:')
print(f'Volume: {volume}\nRoom: {room}\nDrone: {drone}\nDrone-Speaker Distance: {speaker_dist}')
print(f'Mic: {mic}\nDrone-Mic Distance: {mic_dist}')
print(f'File List: {file_list}')

Chosen Setting:
Volume: 80pc
Room: room1
Drone: drone1
Drone-Speaker Distance: speaker-dist-1m
Mic: mic3_8array-up
Drone-Mic Distance: mic-dist-25cm
File List: ['mic3_8array-up-File1.wav', 'mic3_8array-up-File2.wav', 'mic3_8array-up-File3.wav', 'mic3_8array-up-File4.wav', 'mic3_8array-up-File5.wav', 'mic3_8array-up-File6.wav']


In [3]:
# Function definitions
# === Microphone array geometry (2D circular) ===
def circular_array_positions(radius, num_mics, reorder_idx_list):
    angles = np.linspace(0, 2 * np.pi, num_mics, endpoint=False)
    mic_positions = torch.zeros((num_mics,3), dtype=torch.float)
    x = radius * np.cos(angles)
    y = radius * np.sin(angles)
    z = np.zeros_like(x)
    for idx, reorder_idx in enumerate(reorder_idx_list):
        mic_positions[idx, :] = torch.FloatTensor([x[reorder_idx], y[reorder_idx], z[reorder_idx]])
    return mic_positions

def cartesian_to_azimuth_elevation(cartesian_coord_list):
    cartesian_coord_list = cartesian_coord_list.detach().cpu().numpy()
    x,y,z = cartesian_coord_list
    # Compute azimuth in radians
    azimuth = np.arctan2(y, x)
    # Compute elevation in radians
    elevation = np.arctan2(z, np.sqrt(x**2 + y**2))
    # Convert radians to degrees
    azimuth_deg = np.degrees(azimuth)
    elevation_deg = np.degrees(elevation)
    return azimuth_deg, elevation_deg

# read single channel audio files
def read_audio_signal(file_path, fs, always_2d=True):
    sig, sig_fs = sf.read(file_path, dtype='float32', always_2d=always_2d)
    assert sig_fs == fs
    return sig

# write audio signals, including multi-channel
def write_audio_signal(file_path, sig, fs):
	sf.write(file=file_path, data=sig, samplerate=fs)

# return the direction of arrival [x,y,z] in meters
def compute_doa_from_location(speaker_str, mic_str, mic_name_str, num_windows,
                              z_drone=1.5, z_src=0.485):
    x_mic = 0; y_mic = 0
    z_drone_to_mic = int(mic_str.split('-')[-1][:-2])/100. # convert to meters
    if 'down' in mic_name_str:
        z_mic = z_drone - z_drone_to_mic
    elif 'up' in mic_name_str:
        z_mic = z_drone + z_drone_to_mic
    else:
        print(f'Incorrect mic type: {mic_name_str}')
    x_src = -int(speaker_str.split('-')[-1][:-1])/1.414
    y_src = x_src
    doa = np.array([x_src-x_mic, y_src-y_mic, z_src-z_mic])
    azim, _ = cartesian_to_azimuth_elevation(torch.tensor(doa, dtype=torch.float32))
    # current data collection fixes the azimuth at 135 degrees
    assert np.abs(azim+135) < 1e-5, "Wrong azimuth value!"
    doas = matlib.repmat(doa, m=num_windows, n=1)
    doas = torch.tensor(doas, dtype=torch.float32)
    doas = doas.unsqueeze(0)
    return doas



In [4]:
## CPU usage profile
from memory_profiler import memory_usage #type: ignore
import time
import psutil

def profile_code(code_func, *args, **kwargs):
    start_time = time.time()
    cpu_before = psutil.cpu_percent(interval=None)
    mem_usage = memory_usage((code_func, args, kwargs), max_usage=True)
    cpu_after = psutil.cpu_percent(interval=None)
    end_time = time.time()
    print('='*50)
    print(f"Execution Time: {end_time - start_time:.2f} sec")
    print(f"CPU Usage: {cpu_after - cpu_before:.2f}%")
    print(f"Max Memory Usage: {mem_usage} MB")

In [5]:
## GPU usage profile
import time
import psutil
import subprocess
import threading
from functools import wraps

gpuid = 0
class ResourceMonitor:
    def __init__(self):
        self.max_cpu_usage = 0
        self.max_ram_usage = 0
        self.max_ram_mb = 0
        self.max_gpu_usage = 0
        self.max_gpu_mem = 0
        self.max_gpu_mem_mb = 0
        self.stop_monitor = False
        self.monitor_thread = None
        self.has_gpu = self._check_gpu_available()
        
    def _check_gpu_available(self):
        try:
            subprocess.check_output(['nvidia-smi'], stderr=subprocess.DEVNULL)
            return True
        except (FileNotFoundError, subprocess.CalledProcessError):
            return False
    
    def _get_gpu_stats(self):
        try:
            
            # Get GPU utilization and memory info
            output = subprocess.check_output([
                'nvidia-smi',
                '--query-gpu=utilization.gpu,memory.used,memory.total',
                '--format=csv,nounits,noheader'
            ]).decode('utf-8').strip()
            
            if output:
                gpu_util, mem_used, mem_total = map(float, output.split('\n')[gpuid].split(','))
                gpu_mem_percent = (mem_used / mem_total) * 100
                # print(gpu_util, gpu_mem_percent)
                return gpu_util, mem_used #gpu_mem_percent
        except Exception:
            pass
        return 0, 0
        
    def start_monitoring(self, interval=0.5):
        """Start monitoring resources in a background thread"""
        self.stop_monitor = False
        
        def monitor():
            while not self.stop_monitor:
                # CPU and RAM monitoring
                cpu_usage = psutil.cpu_percent()
                ram_mb = psutil.virtual_memory().used / (1024 * 1024)
                # ram_usage = psutil.virtual_memory().percent
                
                self.max_cpu_usage = max(self.max_cpu_usage, cpu_usage)
                self.max_ram_usage = max(self.max_ram_usage, ram_mb)
                
                # GPU monitoring
                
                if self.has_gpu:
                    gpu_util, gpu_mem = self._get_gpu_stats()
                    self.max_gpu_usage = max(self.max_gpu_usage, gpu_util)
                    self.max_gpu_mem_mb = max(self.max_gpu_mem_mb, gpu_mem)
                    # self.max_gpu_mem = max(self.max_gpu_mem, gpu_mem)
                
                time.sleep(interval)
        
        self.monitor_thread = threading.Thread(target=monitor)
        self.monitor_thread.start()
    
    def stop_monitoring(self):
        """Stop the monitoring thread"""
        self.stop_monitor = True
        if self.monitor_thread:
            self.monitor_thread.join()

def profile_resources(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        monitor = ResourceMonitor()
        
        print("Starting resource monitoring...")
        monitor.start_monitoring()
        
        start_time = time.time()
        try:
            result = func(*args, **kwargs)
        finally:
            end_time = time.time()
            monitor.stop_monitoring()
            
            execution_time = end_time - start_time
            
            print("\n" + "="*50)
            # print(f"Function '{func.__name__}' executed in {execution_time:.4f} seconds")
            # print(f"Execution Time: {execution_time:.2f} sec")
            # print("\nResource Usage Summary:")
            # print(f"Max CPU Usage: {monitor.max_cpu_usage:.2f}%")
            # print(f"Max RAM Usage: {monitor.max_ram_usage:.2f}%")
            # print(f"CPU Usage: {monitor.max_cpu_usage:.1f}%")
            # print(f"Max Memory Usage: {monitor.max_ram_mb:.1f} MB")
            
            
            if monitor.has_gpu:
                print(f"Max GPU Usage: {monitor.max_gpu_usage:.2f}%")
                print(f"Max GPU Memory Usage: {monitor.max_gpu_mem_mb:.2f} MB")
            else:
                print("No GPU detected (nvidia-smi not available)")
            print("="*50 + "\n")
            
        return result
    return wrapper

## MVDR Beamforming (using SpeechBrain Library)

In [6]:
# initalize parameters for beamforming
N_MICS = 8
# MIC_ANGLE_VECTOR = np.array([270, 225, 0, 135, 315, 180, 45, 90])
MIC_DIAMETER = 0.5 # 0.3 for drone2, 0.5 for drone1
reorder_idx_list = np.array([6, 5, 0, 3, 7, 4, 1, 2])
MIC_GEOMETRY = circular_array_positions(MIC_DIAMETER/2, N_MICS, reorder_idx_list)

In [7]:
MIC_GEOMETRY

tensor([[-4.5924e-17, -2.5000e-01,  0.0000e+00],
        [-1.7678e-01, -1.7678e-01,  0.0000e+00],
        [ 2.5000e-01,  0.0000e+00,  0.0000e+00],
        [-1.7678e-01,  1.7678e-01,  0.0000e+00],
        [ 1.7678e-01, -1.7678e-01,  0.0000e+00],
        [-2.5000e-01,  3.0616e-17,  0.0000e+00],
        [ 1.7678e-01,  1.7678e-01,  0.0000e+00],
        [ 1.5308e-17,  2.5000e-01,  0.0000e+00]])

In [9]:
@profile_resources
def perform_beamforming():
    # read each file, perform mvdr beamforming
    print('Performing MVDR Beamforming using SpeechBrain')
    for file_name in file_list:
        print(f'File: {file_name}')
        audio_path = os.path.join(ROOT_PATH, 'preprocessed-audio', 'drone-with-source-recordings', file_name)
        noise_path = os.path.join(ROOT_PATH, 'preprocessed-audio', 'drone-only-recordings', file_name)
        # === Load multichannel audio ===
        audio_sig_orig = read_audio_signal(audio_path, fs)
        audio_sig = torch.tensor(audio_sig_orig, dtype=torch.float32) # convert to tensor
        audio_sig = audio_sig.unsqueeze(0) # dim: [1, time, channels]
        # === Load multichannel noise ===
        noise_sig_orig = read_audio_signal(noise_path, fs)
        # retain only a small sample noise -- taking samples in the middle to model stationary noise
        noise_sig_orig = noise_sig_orig[(30*fs):(40*fs), :] 
        noise_sig = torch.tensor(noise_sig_orig, dtype=torch.float32)
        noise_sig = noise_sig.unsqueeze(0)
        # === initialize modules ===
        stft = STFT(sample_rate=fs, n_fft=2048)
        cov = Covariance()
        istft = ISTFT(sample_rate=fs, n_fft=2048)
        mvdr = Mvdr()
        # === compute STFT and Covariance ===
        Xs = stft(audio_sig)
        Ns = stft(noise_sig)
        NNs = cov(Ns)
        # == match the number of time steps across noise and audio
        audio_time_steps = Xs.shape[1]
        noise_time_steps = NNs.shape[1]
        if noise_time_steps < audio_time_steps:
            num_repeats = (audio_time_steps // noise_time_steps) + 1
            NNs_repeated = NNs.repeat(1, num_repeats, 1, 1, 1)
            NNs_repeated = NNs_repeated[:, :audio_time_steps, :, :, :]
        assert Xs.shape[1] == NNs_repeated.shape[1], "Incompatible time steps!"
        # compute DOA from source location
        doas = compute_doa_from_location(speaker_str=speaker_dist,
                                        mic_str=mic_dist,
                                        mic_name_str=mic,
                                        num_windows=Xs.shape[1])
        # compute MVDR and obtain the beamformed signal
        Ys_mvdr = mvdr(Xs, NNs_repeated, doas, doa_mode=True, mics=MIC_GEOMETRY, fs=fs)
        beamformed_sig = istft(Ys_mvdr)
        bf_folder = os.path.join(ROOT_PATH, 'beamforming')
        os.makedirs(bf_folder, exist_ok=True)
        save_path = os.path.join(bf_folder, f'mvdr-{file_name}')
        write_audio_signal(save_path, beamformed_sig[0,:,0], fs)

profile_code(perform_beamforming)

Starting resource monitoring...
Performing MVDR Beamforming using SpeechBrain
File: mic3_8array-up-File1.wav
File: mic3_8array-up-File2.wav
File: mic3_8array-up-File3.wav
File: mic3_8array-up-File4.wav
File: mic3_8array-up-File5.wav
File: mic3_8array-up-File6.wav

Max GPU Usage: 0.00%
Max GPU Memory Usage: 58.00 MB

Execution Time: 94.61 sec
CPU Usage: 12.60%
Max Memory Usage: 19899.92578125 MB


## Spectral Gating (using NoiseReduce Library)

In [10]:
# initialize parameters for Spectral Gating
aggressiveness = 0.5

In [11]:
# %%memit
@profile_resources
def perform_spectral_gating():
    print('Performing Spectral Gating using NoiseReduce')
    for file_name in file_list:
        print(f'File: {file_name}')
        bf_path = os.path.join(ROOT_PATH, 'beamforming', f'mvdr-{file_name}')
        bf_sig = read_audio_signal(bf_path, fs)
        assert bf_sig.shape[1] == 1
        nr_sig = nr.reduce_noise(y=bf_sig[:,0], sr=fs, stationary=False, 
                                    thresh_n_mult_nonstationary=aggressiveness)
        # save audio
        nr_folder = os.path.join(ROOT_PATH, 'spectral-gating')
        os.makedirs(nr_folder, exist_ok=True)
        save_path = os.path.join(nr_folder, f'nr-{file_name}-agg{aggressiveness}.wav')
        write_audio_signal(save_path, nr_sig, fs)
profile_code(perform_spectral_gating)

Starting resource monitoring...
Performing Spectral Gating using NoiseReduce
File: mic3_8array-up-File1.wav
File: mic3_8array-up-File2.wav
File: mic3_8array-up-File3.wav
File: mic3_8array-up-File4.wav
File: mic3_8array-up-File5.wav
File: mic3_8array-up-File6.wav

Max GPU Usage: 0.00%
Max GPU Memory Usage: 58.00 MB

Execution Time: 5.52 sec
CPU Usage: 2.60%
Max Memory Usage: 727.58984375 MB


# MPSENET Neural after MVDR Beamforming (Hybrid)

In [12]:
import os
import shutil
import librosa
import soundfile as sf
import subprocess
import numpy as np
from collections import defaultdict

def get_num_channels(file_path):
    """Get the number of channels in a wav file."""
    data, sr = librosa.load(file_path, sr=None, mono=False)
    return data.shape[0] if len(data.shape) > 1 else 1

def split_wav_by_channels_and_time(file_path, output_folder):
    """Splits wav file into individual channels and then into 10-sec segments."""
    os.makedirs(output_folder, exist_ok=True)
    
    data, sr = librosa.load(file_path, sr=None, mono=False)
    # num_channels = 1 if "soundskrit" in file_path else (8 if "up" in file_path or "down" in file_path else 1)
    num_channels = 1
    if num_channels > data.shape[0]:
        num_channels = data.shape[0]  # Handle cases where actual channels are fewer
    
    split_files = []
    
    for ch in range(num_channels):
        if num_channels==1:
            channel_data = data#[0]
        else:
            channel_data = data[ch]
        duration = librosa.get_duration(y=channel_data, sr=sr)
        num_splits = int(np.ceil(duration / 10))
        
        for split_idx in range(num_splits):
            output_file = os.path.join(output_folder, f"{os.path.basename(file_path).replace('.wav', '')}_ch{ch+1}_split{split_idx+1}.wav")
            if os.path.exists(output_file):
                split_files.append(output_file)
                continue

            start_sample = split_idx * 10 * sr
            end_sample = min((split_idx + 1) * 10 * sr, len(channel_data))
            split_audio = channel_data[start_sample:end_sample]
            
            sf.write(output_file, split_audio, sr)
            split_files.append(output_file)
    
    return split_files

def process_mpsenet(input_folder, output_folder):
    """Run MPSENET on split wav files."""
    os.makedirs(output_folder, exist_ok=True)
    if any(f.endswith(".wav") for f in os.listdir(output_folder)):
        return  # Skip if already processed

    cmd = f"python mpsenet/inference.py --checkpoint_file best_ckpt/g_best_dns --input_noisy_wavs_dir {input_folder} --output_dir {output_folder}"
    subprocess.run(cmd, shell=True, check=True)



def stitch_wav_files(original_file, split_files_folder, output_folder):
    """
    Stitch processed split WAV files back into complete multichannel WAV files.
    
    Args:
        split_files_folder: Folder containing split WAV files with format:
            mic<micnumber>_<arraytype>-File<filenumber>_ch<channelnumber>_split<splitnumber>.wav
        output_folder: Folder to save stitched multichannel WAV files
    """
    os.makedirs(output_folder, exist_ok=True)
    
    # Group files by their base identifiers (mic, arraytype, filenumber)
    file_groups = defaultdict(lambda: defaultdict(list))
    
    # First, organize all split files by their base components
    for filename in os.listdir(split_files_folder):
        if not filename.endswith('.wav'):
            continue
            
        try:
            # Parse filename components
            parts = filename.split('_')
            mic_part = parts[0]  # <suffix>-mic<micnumber> eg. nr-mic1
            array_file_part = parts[1]  # <arraytype>-File<filenumber>
            channel_part = parts[2]  # ch<channelnumber>
            split_part = parts[3]  # split<splitnumber>.wav
            
            mic_number = mic_part[-4:]  # Extract micnumber
            array_type = array_file_part.split('-File')[0]  # Extract arraytype
            file_number = array_file_part.split('-File')[1]  # Extract filenumber
            channel_number = int(channel_part[2:])  # Extract channelnumber
            split_number = int(split_part[5:-4])  # Extract splitnumber
            
            # Create unique key for each original file
            file_key = (mic_number, array_type, file_number)
            
            # Add to our grouped dictionary
            file_groups[file_key][channel_number].append((split_number, filename))
            
        except (IndexError, ValueError) as e:
            print(f"Skipping malformed filename: {filename} - {str(e)}")
            continue
    
    # Process each file group
    for file_key, channel_data in file_groups.items():
        mic_number, array_type, file_number = file_key
        output_filename = f"mpsenetmvdr-{mic_number}_{array_type}-File{file_number}.wav"
        output_path = os.path.join(output_folder, output_filename)
        
        if os.path.exists(output_path):
            # print(f"Skipping existing file: {output_filename}")
            continue
        
        print(f"Processing {output_filename}...")
        
        # Determine number of channels from the data we found
        num_channels = len(channel_data.keys())
        if num_channels == 0:
            continue
            
        # For each channel, stitch its splits together in order
        stitched_channels = []
        sr = None
        max_length = 0
        
        for ch in sorted(channel_data.keys()):
            # Sort splits by their split number
            sorted_splits = sorted(channel_data[ch], key=lambda x: x[0])
            channel_pieces = []
            
            for split_num, split_file in sorted_splits:
                split_path = os.path.join(split_files_folder, split_file)
                try:
                    data, current_sr = librosa.load(split_path, sr=None, mono=True)
                    if sr is None:
                        sr = current_sr
                    elif current_sr != sr:
                        print(f"Warning: Sample rate mismatch in {split_file}")
                    channel_pieces.append(data)
                except Exception as e:
                    print(f"Error loading {split_file}: {str(e)}")
                    continue
            
            if channel_pieces:
                stitched_channel = np.concatenate(channel_pieces)
                stitched_channels.append(stitched_channel)
                max_length = max(max_length, len(stitched_channel))
        
        if not stitched_channels:
            print(f"No valid data for {output_filename}")
            continue
            
        # Pad all channels to equal length
        padded_channels = []
        for channel in stitched_channels:
            padding = max_length - len(channel)
            padded_channels.append(np.pad(channel, (0, padding), mode='constant'))
        
        # Combine channels into multichannel array
        multichannel_audio = np.vstack(padded_channels).T
        
        # Save the stitched file
        try:
            sf.write(output_path, multichannel_audio, sr)
            print(f"Successfully saved {output_filename}")
        except Exception as e:
            print(f"Error saving {output_filename}: {str(e)}")

@profile_resources
def main():
    source_root = ROOT_PATH+"beamforming/"
    target_root = ROOT_PATH+"mpsenet/"
    
    for root, _, files in os.walk(source_root):
    
        relative_path = os.path.relpath(root, source_root)
        new_root = os.path.join(target_root, relative_path)
        
        noisy_wav_folder = os.path.join(new_root, "noisy_wavfiles")
        processed_wav_folder = os.path.join(new_root, "generated_files")
        final_output_folder = new_root
        
        os.makedirs(noisy_wav_folder, exist_ok=True)
        os.makedirs(processed_wav_folder, exist_ok=True)
        os.makedirs(final_output_folder, exist_ok=True)
        
        ### skip if wav file already exists in output folder
        # List all files in the directory (not subdirectories)
        checkwavfiles = [f for f in os.listdir(final_output_folder) 
                if os.path.isfile(os.path.join(final_output_folder, f))]

        # Check if any file ends with .wav (case-insensitive)
        has_wav = any(f.lower().endswith('.wav') for f in checkwavfiles)

        if has_wav:
            print("skipping",final_output_folder) 
            continue
        #####

        for file in files:
            if "File" in file and file.endswith(".wav"):
                file_path = os.path.join(root, file)
                split_files = split_wav_by_channels_and_time(file_path, noisy_wav_folder)
        
        process_mpsenet(noisy_wav_folder, processed_wav_folder)
        
        for file in files:
            if "File" in file and file.endswith(".wav"):
                stitch_wav_files(os.path.join(root, file), processed_wav_folder, final_output_folder)
        
        
        shutil.rmtree(noisy_wav_folder)
        shutil.rmtree(processed_wav_folder)
       
    
if __name__ == "__main__":
    profile_code(main)
    


Starting resource monitoring...
Initializing Inference Process..
Loading 'best_ckpt/g_best_dns'
Complete.
[2KWorking... [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [35m100%[0m [33m0:00:24[0m00:01[0m00:01[0m
[?25hProcessing mpsenetmvdr-mic3_8array-up-File5.wav...
Successfully saved mpsenetmvdr-mic3_8array-up-File5.wav
Processing mpsenetmvdr-mic3_8array-up-File4.wav...
Successfully saved mpsenetmvdr-mic3_8array-up-File4.wav
Processing mpsenetmvdr-mic3_8array-up-File2.wav...
Successfully saved mpsenetmvdr-mic3_8array-up-File2.wav
Processing mpsenetmvdr-mic3_8array-up-File1.wav...
Successfully saved mpsenetmvdr-mic3_8array-up-File1.wav
Processing mpsenetmvdr-mic3_8array-up-File6.wav...
Successfully saved mpsenetmvdr-mic3_8array-up-File6.wav
Processing mpsenetmvdr-mic3_8array-up-File3.wav...
Successfully saved mpsenetmvdr-mic3_8array-up-File3.wav

Max GPU Usage: 100.00%
Max GPU Memory Usage: 13721.00 MB

Execution Time: 31.35 sec
CPU Usage: 3.50%
Max Memory Usage: 703.9375 MB


# Classification

In [4]:
import os
import sys
import csv
from pathlib import Path
import pandas as pd
import subprocess
import numpy as np
import soundfile as sf

# Configuration
ref_root = ROOT_PATH+'mpsenet/'
timestamps_root = "SourceTimestamps/"
main_dir = "SSLAM"
sslam_dirname = "SSLAM_Inference"
output_csv = ROOT_PATH+"classification/classification_results.csv"

# Add the inference directory to Python path
inference_path = os.path.join(main_dir, sslam_dirname, "inference")
if inference_path not in sys.path:
    sys.path.append(inference_path)

def get_lowest_rms_channel(audio_path):
    """Find the channel with lowest RMS in the audio file"""
    # audio_path = os.path.join(audio_folder, f"{filename_prefix}.wav")
    if not os.path.exists(audio_path):
        return None
    
    try:
        audio, sr = sf.read(audio_path)
        if len(audio.shape) == 1:  # Mono file
            return 1
        
        # Calculate RMS for each channel
        rms_values = [np.sqrt(np.mean(channel**2)) for channel in audio.T]
        return np.argmin(rms_values) + 1  # Channels are 1-indexed
    
    except Exception as e:
        print(f"Error processing {audio_path}: {str(e)}")
        return None
    
# Load the label mapping from CSV
def load_label_mapping(mapping_file):
    label_map = {}
    with open(mapping_file, 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            index = int(row[0])
            label = row[2]
            category = row[3]
            label_map[label] = category
    return label_map

# Load label mapping
label_mapping = load_label_mapping(os.path.join(main_dir, 'label_descriptors.csv'))

def analyze_results(prediction_results):
    """Analyze prediction results and determine H/NH classification."""
    if not prediction_results:
        return "Unknown"
    
    sorted_predictions = sorted(prediction_results.items(), key=lambda x: x[1], reverse=True)
    top_prediction = sorted_predictions[0][0]  # Get the top prediction label
    
    return top_prediction, label_mapping.get(top_prediction, "Unknown")

def process_audio_segment(audio_path, start_time, end_time, channel=None):
    """Process an audio segment with optional channel selection."""
    # Create temp segment file
    segment_file = f"/tmp/segment_{os.path.basename(audio_path)}"
    
    try:
        # Base ffmpeg command
        cmd = f"ffmpeg -i {audio_path} -ss {start_time} -to {end_time}"
        
        # Add channel selection if specified
        if channel is not None:
            # For 7.1 audio files, we need to use the pan filter to extract specific channels
            # Channel mapping for 7.1: 0=FL, 1=FR, 2=FC, 3=LFE, 4=BL, 5=BR, 6=SL, 7=SR
            cmd += f" -af 'pan=mono|c0=c{channel}'"
        
        cmd += f" {segment_file} -y"
        
        subprocess.run(cmd, shell=True, check=True)
        
        if not os.path.exists(segment_file):
            return None
            
        # Process the segment
        os.environ['CUDA_VISIBLE_DEVICES'] = '0'
        from inference import main
        
        original_argv = sys.argv
        try:
            sys.argv = [
                'inference.py',
                '--source_file', segment_file,
                '--label_file', os.path.join(main_dir, sslam_dirname, "inference", "labels.csv"),
                '--model_dir', os.path.join(main_dir, sslam_dirname),
                '--checkpoint_dir', os.path.join(main_dir, "checkpoint_best.pt"),
                '--target_length', '1024',
                '--top_k_prediction', '12',
                '--norm_mean', '-4.268',
                '--norm_std', '4.569'
            ]
            
            prediction_results = main()
            return prediction_results
        finally:
            sys.argv = original_argv
    except subprocess.CalledProcessError as e:
        print(f"Error processing segment: {e}")
        return None
    finally:
        if os.path.exists(segment_file):
            os.remove(segment_file)

def get_ground_truth_class(soundclass):
    """Map soundclass to ground truth (H or NH)"""
    human_classes = ['male', 'female', 'crying', 'humansounds']
    return "H" if soundclass in human_classes else "NH"

@profile_resources
def process_audio_files():
    """Process all audio files and generate classification results."""
    results = []
    
    # Find all relevant audio files in throttle-0 folders
    # throttle0_path = os.path.join(ref_root, "throttle-0")
    audio_files = []
    
    for root, _, files in os.walk(ref_root):
        for file in files:
            # if "throttle-0" not in root: continue
            if "noisy_wavfiles" in root or "generated_files" in root or "throttle-0" in root: continue
            if ("mic1_soundskrit-File" in file or 
                "mic2_8array-down-File" in file or 
                "mic3_8array-up-File" in file):
                audio_files.append(os.path.join(root, file))
    # print(audio_files)
    # Process each audio file
    for audio_path in audio_files:
        audio_name = os.path.basename(audio_path)
        base_file_name = audio_name.split('.')[0].split('-')[-1]

        
        # Determine channel to use
        channel = None
        # if "soundskrit" in audio_name.lower():
        #     channel = 0  # First channel
        # elif "down" in audio_name.lower() or "up" in audio_name.lower():
        #     channel = get_lowest_rms_channel(audio_path)
        
        # Find corresponding timestamp file
        # print(audio_name)
        # if 'agg' in audio_name:
        #     tfilename = audio_name.replace('nr-','').replace('-agg0.5','')
        timestamp_file = os.path.join(timestamps_root, f"{base_file_name}.txt")
        if not os.path.exists(timestamp_file):
            print(f"Timestamp file not found for {timestamp_file}")
            continue
            
        # Read timestamps and soundclasses
        try:
            with open(timestamp_file, 'r') as f:
                segments = [line.strip().split('\t') for line in f if line.strip()]
        except Exception as e:
            print(f"Error reading {timestamp_file}: {e}")
            continue
            
        # Process each segment
        for segment in segments:
            if len(segment) != 3:
                continue
                
            start_time, end_time, soundclass = segment
            ground_truth = get_ground_truth_class(soundclass)
            
            try:
                start_time = float(start_time)
                end_time = float(end_time)
            except ValueError:
                continue
                
            # Process the segment
            prediction_results = process_audio_segment(
                audio_path=audio_path,
                start_time=start_time,
                end_time=end_time,
                channel=channel
            )
            
            predicted_unmapped, predicted_class = analyze_results(prediction_results)
            
            results.append({
                'AudioFile': audio_path,
                'SegmentStart': start_time,
                'SegmentEnd': end_time,
                'GroundTruth': ground_truth,
                'Predicted': predicted_class,
                'GTSoundClass': soundclass,
                'PSoundClass': predicted_unmapped
            })
        # break
    # Save results to CSV
    if results:
        df = pd.DataFrame(results)
        df.to_csv(output_csv, index=False)
        print(f"Results saved to {output_csv}")
        
        # Print summary statistics
        # print("\nClassification Report:")
        # print("Ground Truth vs Predicted:")
        # print(pd.crosstab(df['GroundTruth'], df['Predicted']))
        
        # print("\nBy Sound Class:")
        # print(df.groupby(['SoundClass', 'GroundTruth', 'Predicted']).size().unstack())
    else:
        print("No valid segments processed")

if __name__ == "__main__":
    profile_code(process_audio_files)
    # process_audio_files()

Starting resource monitoring...


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File5.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


  from .autonotebook import tqdm as notebook_tqdm
2025-05-23 10:58:32 | INFO | SSLAM_Inference.models.EAT_pretraining | making target model


Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File5.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Speech                         0.580
Male speech, man speaking      0.171
Narration, monologue           0.131
Stomach rumble                 0.064
Music                          0.058
Silence                        0.056
Inside, small room             0.044
Scratching (performance technique) 0.040
Speech synthesizer             0.037
Clicking                       0.030
Writing                        0.023
Scratch                        0.021
**************************************************
Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File5.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File5.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Speech                         0.605
Narration, monologue           0.268
Female speech, woman speaking  0.232
Music                          0.102
Synthetic singing              0.094
Speech synthesizer             0.091
Inside, small room             0.081
Male singing                   0.059
Mantra                         0.046
Female singing                 0.037
Singing                        0.029
Vocal music                    0.027
**************************************************
Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File5.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File5.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Silence                        0.380
Speech                         0.141
Music                          0.107
Hands                          0.073
Inside, small room             0.053
Babbling                       0.036
Child speech, kid speaking     0.027
Writing                        0.022
Sigh                           0.019
Whispering                     0.015
Whoosh, swoosh, swish          0.014
Whistling                      0.013
**************************************************
Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File5.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File5.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Hands                          0.215
Plop                           0.093
Silence                        0.080
Writing                        0.078
Speech                         0.066
Whispering                     0.029
Hiccup                         0.027
Music                          0.026
Inside, small room             0.024
Scratch                        0.021
Bouncing                       0.020
Shuffling cards                0.017
**************************************************
Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File5.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File5.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Silence                        0.484
Hands                          0.155
Writing                        0.105
Speech                         0.082
Rub                            0.074
Inside, small room             0.048
Music                          0.041
Shuffling cards                0.026
Animal                         0.025
Electric shaver, electric razor 0.023
Domestic animals, pets         0.013
Hiccup                         0.013
**************************************************
Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File2.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File2.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Speech                         0.757
Narration, monologue           0.467
Speech synthesizer             0.309
Male speech, man speaking      0.289
Radio                          0.092
Conversation                   0.070
Television                     0.052
Inside, small room             0.046
Music                          0.045
Frog                           0.043
Female speech, woman speaking  0.039
Animal                         0.019
**************************************************
Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File2.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File2.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Speech                         0.764
Narration, monologue           0.406
Female speech, woman speaking  0.329
Inside, small room             0.117
Speech synthesizer             0.104
Child speech, kid speaking     0.100
Telephone                      0.058
Silence                        0.045
Music                          0.040
Hands                          0.038
Conversation                   0.036
Clicking                       0.034
**************************************************
Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File2.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File2.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Babbling                       0.729
Inside, small room             0.155
Speech                         0.141
Child speech, kid speaking     0.052
Music                          0.033
Baby cry, infant cry           0.032
Humming                        0.029
Baby laughter                  0.024
Silence                        0.021
Sigh                           0.021
Laughter                       0.021
Chuckle, chortle               0.018
**************************************************
Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File2.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File2.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Hands                          0.217
Speech                         0.165
Knock                          0.133
Plop                           0.088
Animal                         0.060
Domestic animals, pets         0.055
Chicken, rooster               0.054
Silence                        0.044
Cluck                          0.035
Music                          0.032
Whispering                     0.031
Child speech, kid speaking     0.029
**************************************************
Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File2.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File2.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Silence                        0.521
Speech                         0.091
Music                          0.060
Hands                          0.046
Inside, small room             0.036
Writing                        0.027
Sizzle                         0.021
Animal                         0.020
Hiss                           0.017
Electric shaver, electric razor 0.017
Whispering                     0.016
Spray                          0.015
**************************************************
Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File3.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File3.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Speech                         0.759
Narration, monologue           0.507
Male speech, man speaking      0.491
Speech synthesizer             0.263
Radio                          0.148
Mantra                         0.066
Television                     0.053
Music                          0.050
Inside, small room             0.049
Chant                          0.031
Silence                        0.023
Conversation                   0.023
**************************************************
Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File3.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File3.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Speech                         0.821
Female speech, woman speaking  0.581
Narration, monologue           0.476
Silence                        0.076
Inside, small room             0.072
Music                          0.068
Speech synthesizer             0.065
Male speech, man speaking      0.036
Hiccup                         0.035
Conversation                   0.032
Child speech, kid speaking     0.027
Hands                          0.016
**************************************************
Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File3.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File3.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Babbling                       0.393
Speech                         0.365
Silence                        0.137
Baby cry, infant cry           0.118
Inside, small room             0.105
Whimper                        0.099
Child speech, kid speaking     0.097
Crying, sobbing                0.086
Music                          0.076
Electric shaver, electric razor 0.030
Child singing                  0.023
Sigh                           0.021
**************************************************
Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File6.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File6.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Speech                         0.810
Inside, small room             0.158
Silence                        0.120
Television                     0.118
Speech synthesizer             0.081
Hands                          0.073
Narration, monologue           0.071
Music                          0.071
Female speech, woman speaking  0.060
Conversation                   0.049
Male speech, man speaking      0.042
Radio                          0.025
**************************************************
Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File6.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File6.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Speech                         0.893
Female speech, woman speaking  0.300
Narration, monologue           0.133
Hands                          0.133
Silence                        0.073
Inside, small room             0.065
Child speech, kid speaking     0.057
Conversation                   0.044
Music                          0.036
Television                     0.024
Male speech, man speaking      0.024
Hiccup                         0.023
**************************************************
Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File6.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File6.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Speech                         0.269
Silence                        0.207
Inside, small room             0.099
Whistling                      0.060
Hiccup                         0.056
Child speech, kid speaking     0.055
Whistle                        0.034
Hands                          0.031
Babbling                       0.029
Female speech, woman speaking  0.018
Domestic animals, pets         0.017
Whispering                     0.017
**************************************************
Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File6.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File6.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Silence                        0.408
Writing                        0.147
Hands                          0.099
Speech                         0.061
Inside, small room             0.048
Singing bowl                   0.034
Music                          0.026
Sine wave                      0.023
Animal                         0.023
Shuffle                        0.019
Rub                            0.017
Hum                            0.017
**************************************************
Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File6.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File6.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Silence                        0.173
Bee, wasp, etc.                0.141
Speech                         0.121
Fly, housefly                  0.119
Insect                         0.093
Writing                        0.068
Sine wave                      0.060
Music                          0.041
Singing bowl                   0.035
Buzz                           0.027
Inside, small room             0.026
Animal                         0.021
**************************************************
Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File1.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File1.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Speech                         0.772
Narration, monologue           0.362
Speech synthesizer             0.217
Female speech, woman speaking  0.204
Music                          0.117
Television                     0.111
Inside, small room             0.073
Synthetic singing              0.069
Male speech, man speaking      0.067
Silence                        0.054
Male singing                   0.041
Mantra                         0.033
**************************************************
Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File1.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File1.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Speech                         0.745
Female speech, woman speaking  0.536
Narration, monologue           0.523
Speech synthesizer             0.224
Child speech, kid speaking     0.078
Inside, small room             0.068
Music                          0.060
Male speech, man speaking      0.053
Conversation                   0.049
Mantra                         0.033
Child singing                  0.027
Television                     0.024
**************************************************
Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File1.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File1.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Speech                         0.271
Meow                           0.262
Cat                            0.238
Domestic animals, pets         0.176
Animal                         0.137
Silence                        0.119
Child speech, kid speaking     0.089
Inside, small room             0.067
Babbling                       0.058
Hands                          0.045
Music                          0.040
Female speech, woman speaking  0.032
**************************************************
Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File1.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File1.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Writing                        0.224
Silence                        0.191
Speech                         0.166
Singing bowl                   0.047
Bee, wasp, etc.                0.045
Inside, small room             0.038
Music                          0.034
Buzz                           0.030
Hands                          0.029
Female speech, woman speaking  0.026
Fly, housefly                  0.025
Rub                            0.022
**************************************************
Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File1.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File1.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Silence                        0.484
Speech                         0.148
Hands                          0.063
Inside, small room             0.048
Writing                        0.040
Coin (dropping)                0.034
Music                          0.033
Shuffling cards                0.028
Scissors                       0.021
Single-lens reflex camera      0.020
Sneeze                         0.018
Whispering                     0.015
**************************************************
Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File4.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File4.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Speech                         0.121
Sine wave                      0.108
Chirp tone                     0.094
Music                          0.088
Synthesizer                    0.044
Plop                           0.024
Inside, small room             0.024
Silence                        0.021
Singing bowl                   0.019
Musical instrument             0.019
Hands                          0.019
Tuning fork                    0.017
**************************************************
Namespace(source_file='/tmp/segment_mpsenetmvdr-mic3_8array-up-File4.wav', label_file='SSLAM/SSLAM_Inference/inference/labels.csv', model_dir='SSLAM/SSLAM_Inference', checkpoint_dir='SSLAM/checkpoint_best.pt', target_length=1024, top_k_prediction=12, norm_mean=-4.268, norm_std=4.569)


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

Original sample rate is already 16kHz in file /tmp/segment_mpsenetmvdr-mic3_8array-up-File4.wav
************ Acoustic Event Inference ************
LABEL                          PREDICTION
Silence                        0.183
Speech                         0.074
Writing                        0.062
Music                          0.048
Singing bowl                   0.048
Inside, small room             0.036
Bee, wasp, etc.                0.029
Buzz                           0.028
Hands                          0.022
Animal                         0.021
Fly, housefly                  0.018
Insect                         0.016
**************************************************
Results saved to ../ComputeResourcesCheck/classification/classification_results.csv

Max GPU Usage: 19.00%
Max GPU Memory Usage: 819.00 MB

Execution Time: 83.28 sec
CPU Usage: 3.20%
Max Memory Usage: 4215.703125 MB


In [14]:
!nvidia-smi

Thu May 22 21:20:03 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        On  | 00000000:1A:00.0 Off |                  N/A |
| 30%   28C    P8              13W / 350W |     58MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 3090        On  | 00000000:68:00.0 Off |  

In [15]:
import platform, psutil, GPUtil

print("=== System ===")
print(f"OS: {platform.system()} {platform.release()}")
print(f"Architecture: {platform.machine()}")
print(f"Python Version: {platform.python_version()}")

print("\n=== CPU ===")
print(f"Processor: {platform.processor()}")
print(f"Physical Cores: {psutil.cpu_count(logical=False)}")
print(f"Logical Cores: {psutil.cpu_count(logical=True)}")
print(f"Frequency: {psutil.cpu_freq().current:.2f} MHz")

print("\n=== RAM ===")
mem = psutil.virtual_memory()
print(f"Total RAM: {mem.total / 1e9:.2f} GB")

print("\n=== GPU ===")
try:
    GPUs = GPUtil.getGPUs()
    for gpu in GPUs:
        print(f"GPU {gpu.id} - {gpu.name}")
        print(f"  Load: {gpu.load * 100:.1f}%")
        print(f"  Memory: {gpu.memoryUsed}/{gpu.memoryTotal} MB")
        print(f"  Temperature: {gpu.temperature} °C")
except Exception as e:
    print("GPUtil not available or no GPU found:", e)

=== System ===
OS: Linux 5.15.0-119-generic
Architecture: x86_64
Python Version: 3.9.21

=== CPU ===
Processor: x86_64
Physical Cores: 18
Logical Cores: 36
Frequency: 2946.86 MHz

=== RAM ===
Total RAM: 134.73 GB

=== GPU ===
GPU 0 - NVIDIA GeForce RTX 3090
  Load: 0.0%
  Memory: 58.0/24576.0 MB
  Temperature: 28.0 °C
GPU 1 - NVIDIA GeForce RTX 3090
  Load: 0.0%
  Memory: 629.0/24576.0 MB
  Temperature: 30.0 °C
