This notebook is an update of 13.1-data-process-augmentation-with-batch.ipynb

Because I want to add mixed audio of different audio clips later, that is, stack multiple audios together, the length remains unchanged, but this audio will contain multiple types of birds.

But if I want to do this, I need to modify the output of audio label.

This is assuming that I have 182 categories, then I want to change the audiolabel of each data to [1,0,0,0,.....,0,0,0,0] tensor.shape=[182]

In this way, if I mix audio, then audio label can also be mixed

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
import random
from typing import List
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB

import torchaudio

import torch

from torch.utils.data import DataLoader,TensorDataset

import lightning as L

import datasets

from torch.utils.data import Dataset, DataLoader,WeightedRandomSampler

from pathlib import Path
import multiprocessing
import colorednoise as cn
import torch.nn as nn
import librosa
from torch.distributions import Beta
from torch_audiomentations import Compose, PitchShift, Shift, OneOf, AddColoredNoise

import timm
from torchinfo import summary

import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
metadata_path='../../data/train_metadata_new_add_rating.csv'

In [3]:
# Need to interpolate missing values ​​for ratings in metadata csv files

def rating_value_interplote(df:pd.DataFrame):
    '''
    interplote Nan values for rating col in metadata csv 

    parameters:
        df: the df of the metadata csv file

    rating col means the quality of the corresponding audio file
        5 is high quality
        1 is low quality
        0 is without defined quality level
    '''

    if df['rating'].isna().sum()>0: # with missing value
        df['rating'].fillna(0, inplace=True)

    # For all places where the value is 0, a random value is given, choosing from the specified choices.
    mask = df['rating'] == 0  # Create a boolean mask indicating which positions are 0

    choices=np.arange(0.5,5.1,0.5).tolist() # [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]
    random_values = np.random.choice(choices, size=mask.sum())  # Generate random numbers for these 0 values
    df.loc[mask, 'rating'] = random_values  # Fill the generated random numbers back into the corresponding positions of the original DataFrame

    return df

In [4]:
# Calculating the weight of each audio file by rating helps model training
def audio_weight(df):
    '''
    calculate the weight corresponding to each audio file through the rating value

    Because each audio has different quality level, we use weight to affect the inportance of each audio in models,
    the lower the quality of the audio, the lower the weight
    '''
    # Through rating, we calculate the credibility of each audio and express it through weight. 
    # The purpose of this is to improve the model by increasing the weight of high-quality audio and reducing the weight of low-quality audio.
    df["audio_weight"] = np.clip(df["rating"] / df["rating"].max(), 0.1, 1.0)

    return df



In [5]:
# Because this is an unbalanced dataset, the amount of data in each category is very different
# So I will calculate the weight of each category here
# **(-0.5) The purpose is to reduce the relative influence of high-frequency categories and increase the influence of low-frequency categories, 
# so as to help the model better learn those uncommon categories
# The purpose of calculating this is to build a WeightedRandomSampler, so that each time a batch is extracted using dataloader, it is more friendly to data of different categories.
def sampling_weight(df)->torch.Tensor:
    '''
    calculate the sampling weight of each audio file

    because this is imbalanced dataset
    we hope the category with less data has large probability to be picked.
    '''
    sample_weights = (df['primary_label'].value_counts() / df['primary_label'].value_counts().sum()) ** (-0.5)

    # Map weights to each row of the original data
    sample_weights_map = df['primary_label'].map(sample_weights)

    # Convert pandas Series to NumPy array
    sample_weights_np = sample_weights_map.to_numpy(dtype=np.float32)

    # Convert a NumPy array to a PyTorch tensor using torch.from_numpy
    sample_weights_tensor = torch.from_numpy(sample_weights_np)

    return sample_weights_tensor


In [6]:
df=pd.read_csv(metadata_path,header=0)

sample_weights_tensor=sampling_weight(df=df)
# Here we will build an argument sampler that will be used by the dataloader
# It should be noted that the order of weights in the constructed sampler needs to be consistent with the order of data passed into the dataloader, otherwise the weights will not match

# Create a sampler based on the newly obtained weight list
sampler = WeightedRandomSampler(sample_weights_tensor.type('torch.DoubleTensor'), len(sample_weights_tensor),replacement=True)

sampler

<torch.utils.data.sampler.WeightedRandomSampler at 0x29cbbef80>

In [7]:
def read_audio(path: str):
    """
    Read an OGG file using torchaudio and return the waveform tensor and sample rate.

    Parameters:
        path: Path to the .ogg file

    Returns:
        waveform: Tensor representing the waveform
        sample_rate: Sample rate of the audio file
    """
    audio, sample_rate = torchaudio.load(path)
    return audio, sample_rate


In [8]:
class AudioTransform:
    def __init__(self, always_apply=False, p=0.5):
        self.always_apply = always_apply
        self.p = p

    def __call__(self, y: np.ndarray):
        if self.always_apply:
            return self.apply(y)
        else:
            if np.random.rand() < self.p:
                return self.apply(y)
            else:
                return y

    def apply(self, y: np.ndarray):
        raise NotImplementedError


class CustomCompose:
    def __init__(self, transforms: list):
        self.transforms = transforms

    def __call__(self, y: np.ndarray):
        for trns in self.transforms:
            y = trns(y)
        return y


class CustomOneOf:
    def __init__(self, transforms: list, p=1.0):
        self.transforms = transforms
        self.p = p

    def __call__(self, y: np.ndarray):
        if np.random.rand() < self.p:
            n_trns = len(self.transforms)
            trns_idx = np.random.choice(n_trns)
            trns = self.transforms[trns_idx]
            y = trns(y)
        return y


class GaussianNoiseSNR(AudioTransform):
    def __init__(self, always_apply=False, p=0.5, min_snr=5.0, max_snr=40.0, **kwargs):
        super().__init__(always_apply, p)

        self.min_snr = min_snr
        self.max_snr = max_snr

    def apply(self, y: np.ndarray, **params):
        snr = np.random.uniform(self.min_snr, self.max_snr)
        a_signal = np.sqrt(y**2).max()
        a_noise = a_signal / (10 ** (snr / 20))

        white_noise = np.random.randn(len(y))
        a_white = np.sqrt(white_noise**2).max()
        augmented = (y + white_noise * 1 / a_white * a_noise).astype(y.dtype)
        return augmented


class PinkNoiseSNR(AudioTransform):
    def __init__(self, always_apply=False, p=0.5, min_snr=5.0, max_snr=20.0, **kwargs):
        super().__init__(always_apply, p)

        self.min_snr = min_snr
        self.max_snr = max_snr

    def apply(self, y: np.ndarray, **params):
        snr = np.random.uniform(self.min_snr, self.max_snr)
        a_signal = np.sqrt(y**2).max()
        a_noise = a_signal / (10 ** (snr / 20))

        pink_noise = cn.powerlaw_psd_gaussian(1, len(y))
        a_pink = np.sqrt(pink_noise**2).max()
        augmented = (y + pink_noise * 1 / a_pink * a_noise).astype(y.dtype)
        return augmented


class VolumeControl(AudioTransform):
    def __init__(self, always_apply=False, p=0.5, db_limit=10, mode="uniform"):
        super().__init__(always_apply, p)

        assert mode in [
            "uniform",
            "fade",
            "fade",
            "cosine",
            "sine",
        ], "`mode` must be one of 'uniform', 'fade', 'cosine', 'sine'"

        self.db_limit = db_limit
        self.mode = mode

    def apply(self, y: np.ndarray, **params):
        db = np.random.uniform(-self.db_limit, self.db_limit)
        if self.mode == "uniform":
            db_translated = 10 ** (db / 20)
        elif self.mode == "fade":
            lin = np.arange(len(y))[::-1] / (len(y) - 1)
            db_translated = 10 ** (db * lin / 20)
        elif self.mode == "cosine":
            cosine = np.cos(np.arange(len(y)) / len(y) * np.pi * 2)
            db_translated = 10 ** (db * cosine / 20)
        else:
            sine = np.sin(np.arange(len(y)) / len(y) * np.pi * 2)
            db_translated = 10 ** (db * sine / 20)
        augmented = y * db_translated
        return augmented


class NoiseInjection(AudioTransform):
    def __init__(self, always_apply=False, p=0.5, max_noise_level=0.5, sr=32000):
        super().__init__(always_apply, p)

        self.noise_level = (0.0, max_noise_level)
        self.sr = sr

    def apply(self, y: np.ndarray, **params):
        noise_level = np.random.uniform(*self.noise_level)
        noise = np.random.randn(len(y))
        augmented = (y + noise * noise_level).astype(y.dtype)
        return augmented


class GaussianNoise(AudioTransform):
    def __init__(self, always_apply=False, p=0.5, min_snr=5, max_snr=20, sr=32000):
        super().__init__(always_apply, p)

        self.min_snr = min_snr
        self.max_snr = max_snr
        self.sr = sr

    def apply(self, y: np.ndarray, **params):
        snr = np.random.uniform(self.min_snr, self.max_snr)
        a_signal = np.sqrt(y**2).max()
        a_noise = a_signal / (10 ** (snr / 20))

        white_noise = np.random.randn(len(y))
        a_white = np.sqrt(white_noise**2).max()
        augmented = (y + white_noise * 1 / a_white * a_noise).astype(y.dtype)
        return augmented


class PinkNoise(AudioTransform):
    def __init__(self, always_apply=False, p=0.5, min_snr=5, max_snr=20, sr=32000):
        super().__init__(always_apply, p)

        self.min_snr = min_snr
        self.max_snr = max_snr
        self.sr = sr

    def apply(self, y: np.ndarray, **params):
        snr = np.random.uniform(self.min_snr, self.max_snr)
        a_signal = np.sqrt(y**2).max()
        a_noise = a_signal / (10 ** (snr / 20))

        pink_noise = cn.powerlaw_psd_gaussian(1, len(y))
        a_pink = np.sqrt(pink_noise**2).max()
        augmented = (y + pink_noise * 1 / a_pink * a_noise).astype(y.dtype)
        return augmented


class TimeStretch(AudioTransform):
    def __init__(self, always_apply=False, p=0.5, max_rate=1, sr=32000):
        super().__init__(always_apply, p)
        self.max_rate = max_rate
        self.sr = sr

    def apply(self, y: np.ndarray, **params):
        rate = np.random.uniform(0, self.max_rate)
        augmented = librosa.effects.time_stretch(y, rate)
        return augmented


def _db2float(db: float, amplitude=True):
    if amplitude:
        return 10 ** (db / 20)
    else:
        return 10 ** (db / 10)


def volume_down(y: np.ndarray, db: float):
    """
    Low level API for decreasing the volume
    Parameters
    ----------
    y: numpy.ndarray
        stereo / monaural input audio
    db: float
        how much decibel to decrease
    Returns
    -------
    applied: numpy.ndarray
        audio with decreased volume
    """
    applied = y * _db2float(-db)
    return applied


def volume_up(y: np.ndarray, db: float):
    """
    Low level API for increasing the volume
    Parameters
    ----------
    y: numpy.ndarray
        stereo / monaural input audio
    db: float
        how much decibel to increase
    Returns
    -------
    applied: numpy.ndarray
        audio with increased volume
    """
    applied = y * _db2float(db)
    return applied


class RandomVolume(AudioTransform):
    def __init__(self, always_apply=False, p=0.5, limit=10):
        super().__init__(always_apply, p)
        self.limit = limit

    def apply(self, y: np.ndarray, **params):
        db = np.random.uniform(-self.limit, self.limit)
        if db >= 0:
            return volume_up(y, db)
        else:
            return volume_down(y, db)


class CosineVolume(AudioTransform):
    def __init__(self, always_apply=False, p=0.5, limit=10):
        super().__init__(always_apply, p)
        self.limit = limit

    def apply(self, y: np.ndarray, **params):
        db = np.random.uniform(-self.limit, self.limit)
        cosine = np.cos(np.arange(len(y)) / len(y) * np.pi * 2)
        dbs = _db2float(cosine * db)
        return y * dbs


class AddGaussianNoise(AudioTransform):
    """Add gaussian noise to the samples"""

    supports_multichannel = True

    def __init__(
        self, always_apply=False, min_amplitude=0.001, max_amplitude=0.015, p=0.5
    ):
        """
        :param min_amplitude: Minimum noise amplification factor
        :param max_amplitude: Maximum noise amplification factor
        :param p:
        """
        super().__init__(always_apply, p)
        assert min_amplitude > 0.0
        assert max_amplitude > 0.0
        assert max_amplitude >= min_amplitude
        self.min_amplitude = min_amplitude
        self.max_amplitude = max_amplitude

    def apply(self, samples: np.ndarray, sample_rate=32000):
        amplitude = np.random.uniform(self.min_amplitude, self.max_amplitude)
        noise = np.random.randn(*samples.shape).astype(np.float32)
        samples = samples + amplitude * noise
        return samples


class AddGaussianSNR(AudioTransform):
    """
    Add gaussian noise to the input. A random Signal to Noise Ratio (SNR) will be picked
    uniformly in the decibel scale. This aligns with human hearing, which is more
    logarithmic than linear.
    """

    supports_multichannel = True

    def __init__(
        self,
        always_apply=False,
        min_snr_in_db: float = 5.0,
        max_snr_in_db: float = 40.0,
        p: float = 0.5,
    ):
        """
        :param min_snr_in_db: Minimum signal-to-noise ratio in dB. A lower number means more noise.
        :param max_snr_in_db: Maximum signal-to-noise ratio in dB. A greater number means less noise.
        :param p: The probability of applying this transform
        """
        super().__init__(always_apply, p)
        self.min_snr_in_db = min_snr_in_db
        self.max_snr_in_db = max_snr_in_db

    def apply(self, samples: np.ndarray, sample_rate=32000):
        snr = np.random.uniform(self.min_snr_in_db, self.max_snr_in_db)

        clean_rms = np.sqrt(np.mean(np.square(samples)))

        a = float(snr) / 20
        noise_rms = clean_rms / (10**a)

        noise = np.random.normal(0.0, noise_rms, size=samples.shape).astype(np.float32)
        return samples + noise


class Normalize(AudioTransform):
    """
    Apply a constant amount of gain, so that highest signal level present in the sound becomes
    0 dBFS, i.e. the loudest level allowed if all samples must be between -1 and 1. Also known
    as peak normalization.
    """

    supports_multichannel = True

    def __init__(self, always_apply=False, apply_to: str = "all", p: float = 0.5):
        super().__init__(always_apply, p)
        assert apply_to in ("all", "only_too_loud_sounds")
        self.apply_to = apply_to

    def apply(self, samples: np.ndarray, sample_rate=32000):
        max_amplitude = np.amax(np.abs(samples))
        if self.apply_to == "only_too_loud_sounds" and max_amplitude < 1.0:
            return samples

        if max_amplitude > 0:
            return samples / max_amplitude
        else:
            return samples

class NormalizeMelSpec(nn.Module):
    def __init__(self, eps=1e-6):
        super().__init__()
        self.eps = eps

    def forward(self, X):
        mean = X.mean((1, 2), keepdim=True)
        std = X.std((1, 2), keepdim=True)
        Xstd = (X - mean) / (std + self.eps)
        norm_min, norm_max = Xstd.min(-1)[0].min(-1)[0], Xstd.max(-1)[0].max(-1)[0]
        fix_ind = (norm_max - norm_min) > self.eps * torch.ones_like(
            (norm_max - norm_min)
        )
        V = torch.zeros_like(Xstd)
        if fix_ind.sum():
            V_fix = Xstd[fix_ind]
            norm_max_fix = norm_max[fix_ind, None, None]
            norm_min_fix = norm_min[fix_ind, None, None]
            V_fix = torch.max(
                torch.min(V_fix, norm_max_fix),
                norm_min_fix,
            )
            # print(V_fix.shape, norm_min_fix.shape, norm_max_fix.shape)
            V_fix = (V_fix - norm_min_fix) / (norm_max_fix - norm_min_fix)
            V[fix_ind] = V_fix
        return V

The previous content is the same as 13.1-data-process-augmentation-with-batch.ipynb

Next, I will add the processing steps for audio labels in the Dataset

In [9]:
# First we need to get all the types
meta_df=pd.read_csv(metadata_path,header=0)
bird_cates=meta_df.primary_label.unique()

#Because the order is very important and needs to be matched one by one in the subsequent training, I will save these types here
# Save as .npy file
np.save("./temp_files/13-2-bird-cates.npy", bird_cates)

In [10]:
bird_cates

array(['malpar1', 'litgre1', 'houspa', 'indrob1', 'comtai1', 'grynig2',
       'rufwoo2', 'yebbul3', 'indpit1', 'gyhcaf1', 'ruftre2', 'wynlau1',
       'inpher1', 'comkin1', 'comior1', 'tibfly3', 'pomgrp2', 'oripip1',
       'indtit1', 'nutman', 'junmyn1', 'rutfly6', 'goflea1', 'litegr',
       'lesyel1', 'lewduc1', 'inbrob1', 'crfbar1', 'scamin3', 'shikra1',
       'gryfra', 'commoo3', 'grewar3', 'brodro1', 'rocpig', 'categr',
       'ingori1', 'plhpar1', 'sbeowl1', 'bwfshr1', 'junowl1', 'orihob2',
       'greegr', 'barswa', 'paisto1', 'moipig1', 'plapri1', 'forwag1',
       'maghor2', 'brasta1', 'lirplo', 'grecou1', 'kenplo1', 'bkcbul1',
       'grbeat1', 'junbab2', 'comsan', 'whbtre1', 'brnhao1', 'brcful1',
       'whcbar1', 'hoopoe', 'plaflo1', 'maltro1', 'piekin1', 'brnshr',
       'whiter2', 'brfowl1', 'pursun4', 'grehor1', 'pursun3', 'purswa3',
       'yebbab1', 'lblwar1', 'malwoo1', 'laudov1', 'grenig1', 'tilwar1',
       'heswoo1', 'compea', 'putbab1', 'smamin1', 'rorpar', 'gr

In [11]:
# load .npy file
loaded_array = np.load("./temp_files/13-2-bird-cates.npy",allow_pickle=True)

# Print the array contents to verify
print(loaded_array)
print(type(loaded_array))

['malpar1' 'litgre1' 'houspa' 'indrob1' 'comtai1' 'grynig2' 'rufwoo2'
 'yebbul3' 'indpit1' 'gyhcaf1' 'ruftre2' 'wynlau1' 'inpher1' 'comkin1'
 'comior1' 'tibfly3' 'pomgrp2' 'oripip1' 'indtit1' 'nutman' 'junmyn1'
 'rutfly6' 'goflea1' 'litegr' 'lesyel1' 'lewduc1' 'inbrob1' 'crfbar1'
 'scamin3' 'shikra1' 'gryfra' 'commoo3' 'grewar3' 'brodro1' 'rocpig'
 'categr' 'ingori1' 'plhpar1' 'sbeowl1' 'bwfshr1' 'junowl1' 'orihob2'
 'greegr' 'barswa' 'paisto1' 'moipig1' 'plapri1' 'forwag1' 'maghor2'
 'brasta1' 'lirplo' 'grecou1' 'kenplo1' 'bkcbul1' 'grbeat1' 'junbab2'
 'comsan' 'whbtre1' 'brnhao1' 'brcful1' 'whcbar1' 'hoopoe' 'plaflo1'
 'maltro1' 'piekin1' 'brnshr' 'whiter2' 'brfowl1' 'pursun4' 'grehor1'
 'pursun3' 'purswa3' 'yebbab1' 'lblwar1' 'malwoo1' 'laudov1' 'grenig1'
 'tilwar1' 'heswoo1' 'compea' 'putbab1' 'smamin1' 'rorpar' 'graher1'
 'ashpri1' 'piebus1' 'grnwar1' 'eurbla2' 'asikoe2' 'whbwat1' 'sqtbul1'
 'brwowl1' 'bncwoo3' 'ashwoo2' 'pabflo1' 'eaywag1' 'ashdro1' 'rerswa1'
 'emedov2' 'houcro1'

In [12]:
# Find the index of the target value in the array
index = np.where(loaded_array == 'gyhcaf1')[0][0]

index


9

In [13]:
a=torch.zeros(len(loaded_array))

a[9]=1

a

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.])

In [14]:
class BirdclefDataset(Dataset):
    def __init__(self,metadata_path:str,bird_category_dir:str,audio_dir:str='../../data/train_audio'):
        '''
        parameters:
            metadata_path: the directory of the metadata csv file
            bird_category_dir: the directory of the bird category array file (npy)
            audio_dir: the parent path where all audio files stored
        '''
        super().__init__()
        self.raw_df=pd.read_csv(metadata_path,header=0)
        # inperplote nan or 0 value of rating col
        self.raw_df=rating_value_interplote(df=self.raw_df)
        # Calculate the weight of each audio file by rating
        self.raw_df=audio_weight(self.raw_df)

        self.audio_dir=audio_dir

        self.bird_cate_array=np.load(bird_category_dir,allow_pickle=True)

    def get_audio_path(self,file_name:str) -> str:
        '''
        Get the audio path of the corresponding index through the provided train metadata csv file. 
        Since there is only one index, only one path will be returned.

        Parameters:
            file_name: in format category_type/XC-ID.ogg (asbfly/XC134896.ogg)

        Return:
            the single audio path string
        '''

        # concatenate parent path and child path
        return os.path.join(self.audio_dir,file_name)


    def target_clip(self,index:int,audio:torch.Tensor,sample_rate:int)->torch.Tensor:
        """
        calculate the index corresponding audio clip 

        information from the train metadata csv

        Parameters:
            audio: the raw audio in tensor [num_channels,length]
            sample_rate: audio sampling rate
        """
        # Get the audio start time corresponding to index
        clip_start_time=self.raw_df['clip_start_time'].iloc[index]
        duration_seconds=self.raw_df['duration'].iloc[index]

        # define clip length
        segment_duration = 5 * sample_rate

        # Total number of samples in the waveform
        total_samples = audio.shape[1]

        if clip_start_time<=duration_seconds:
            clip_start_point=clip_start_time*sample_rate
            # For the last clip, the original audio may not be long enough, so we need to use a mask to fill the sequence
            # The first step is to confirm whether the length is sufficient
            # The length is sufficient, no mask is needed
            if clip_start_point+segment_duration<=total_samples:
                clip=audio[:, clip_start_point:clip_start_point + segment_duration]

            # The length is not enough, a mask is required
            else:
                padding_length = clip_start_point+segment_duration - total_samples
                silence = torch.zeros(audio.shape[0], padding_length)
                # concat the last part of the raw audio with silence
                clip=torch.cat((audio[:,clip_start_point:],silence),dim=1)
                
        else:
            raise ValueError('The clip start time is out of raw audio length')

        return clip


    def random_audio_augmentation(self,audio:torch.Tensor):
        '''
        audio (torch.Tensor): A 2D tensor of audio samples with shape (1, N), where N is the number of samples.
        '''
        np_audio_transforms = CustomCompose(
            [
                CustomOneOf(
                    [
                        NoiseInjection(p=1, max_noise_level=0.04),
                        GaussianNoise(p=1, min_snr=5, max_snr=20),
                        PinkNoise(p=1, min_snr=5, max_snr=20),
                        AddGaussianNoise(min_amplitude=0.0001, max_amplitude=0.03, p=0.5),
                        AddGaussianSNR(min_snr_in_db=5, max_snr_in_db=15, p=0.5),
                    ],
                    p=0.3,  
                ),
            ]
        )

        audio_aug=np_audio_transforms(audio[0].numpy())

        # tranfer the array to 2D tensor and keep the num channel is 1
        # this step is to keep the input and output shape adn type are the same

        audio_aug_tensor=torch.from_numpy(audio_aug)
        audio_aug_tensor=audio_aug_tensor.unsqueeze(0)

        return audio_aug_tensor
    

    def audio_label_tensor_generator(self,true_label:str)-> torch.Tensor:
        '''
        Generate a tensor containing all categories based on the given real audio label

        Parameters:
            true lable: a label string

        Return:
            If have 10 class, and give a true lable
            the return should be tensor([0,1,0,0,0,0,0,0,0,0])
        '''
        # Find the index of the target value in the array
        idx = np.where(loaded_array == true_label)[0][0]
        
        # Create a tensor of all zeros, with length equal to the length of the array
        audio_label_tensor = torch.zeros(len(self.bird_cate_array))

        # Set the value of the corresponding index position to 1
        audio_label_tensor[idx] = 1

        return audio_label_tensor


    
    def __len__(self):
        return self.raw_df.shape[0]

    def __getitem__(self,index):
        row=self.raw_df.iloc[index]

        audio_label=row['primary_label']
        audio_weight=row['audio_weight']

        # Get the path of a single audio file
        single_audio_dir=self.get_audio_path(row['filename'])

        # Read the audio array according to the path
        audio, sr=read_audio(single_audio_dir)

        # augmentation
        audio_augmentation=self.random_audio_augmentation(audio=audio)

        # Get the audio clip corresponding to index
        clip=self.target_clip(index,audio=audio_augmentation,sample_rate=sr)

        # change audio label to one-hot tensor
        audio_label_tensor=self.audio_label_tensor_generator(true_label=audio_label)

        audio_label_tensor=torch.tensor(audio_label_tensor, dtype=torch.float32)
        clip=torch.tensor(clip, dtype=torch.float32)
        audio_weight=torch.tensor(audio_weight, dtype=torch.float32)

        
        return audio_label_tensor,clip,audio_weight

In [15]:
BD=BirdclefDataset(metadata_path=metadata_path,bird_category_dir="./temp_files/13-2-bird-cates.npy")

train_dataloader = DataLoader(dataset=BD, batch_size=32, sampler=sampler, pin_memory=True)

In [16]:
batch = next(iter(train_dataloader))
audio_label,clip,audio_weights = batch
print(audio_label)
print(type(audio_label))
print(audio_label.shape)
print(clip)
print(clip.shape)
print(audio_weights)
print(audio_weights.shape)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
<class 'torch.Tensor'>
torch.Size([32, 182])
tensor([[[-2.5385e-01, -2.5273e-01, -2.5611e-01,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]],

        [[ 1.4537e-02,  3.2269e-02,  3.5367e-02,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]],

        [[ 2.2610e-04,  1.9591e-04,  1.3957e-04,  ..., -1.2754e-04,
          -1.7877e-04, -1.6762e-04]],

        ...,

        [[-6.7133e-03, -3.1187e-03,  5.8716e-03,  ...,  4.1194e-03,
           6.2594e-03,  5.5284e-03]],

        [[-8.9340e-03,  2.0985e-02,  4.7529e-02,  ...,  1.3210e-02,
           6.0540e-02,  8.2885e-02]],

        [[ 7.2077e-03,  3.4099e-03,  1.9629e-03,  ..., -5.2462e-03,
          -1.3889e-02, -1.6666e-02]]])
torch.Size([32, 1, 160000])
tensor([0.8000, 0

In [17]:
audio_label[0]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.])

### After completing the update of the audio label, I need to complete the audio mixing step for data augmentation

In [18]:
class Mixup(nn.Module):
    def __init__(self, mix_beta, mixup_prob, mixup_double):
        super(Mixup, self).__init__()
        self.beta_distribution = Beta(mix_beta, mix_beta)
        self.mixup_prob = mixup_prob
        self.mixup_double = mixup_double

    def forward(self, X, Y, weight=None):
        p = torch.rand((1,))[0] # Generate a random number p and compare it with mixup_prob to decide whether to mix.
        if p < self.mixup_prob:
            bs = X.shape[0] # batch size
            n_dims = len(X.shape)
            perm = torch.randperm(bs) # Generate a random permutation for randomly selecting samples from the current batch for mixing.

            p1 = torch.rand((1,))[0] # If the random number p1 (determines whether to perform double mixing) is less than mixup_double, perform a single mix. Otherwise, perform double mixing:
            if p1 < self.mixup_double:
                X = X + X[perm]
                Y = Y + Y[perm]
                Y = torch.clamp(Y, 0, 1) # Use torch.clamp to clamp the values ​​of Y between 0 and 1 (suitable for probabilistic or binary labels).

                if weight is None:
                    return X, Y
                else:
                    weight = 0.5 * weight + 0.5 * weight[perm]
                    return X, Y, weight
            else:
                perm2 = torch.randperm(bs)
                X = X + X[perm] + X[perm2]
                Y = Y + Y[perm] + Y[perm2]
                Y = torch.clamp(Y, 0, 1)

                if weight is None:
                    return X, Y
                else:
                    weight = (
                        1 / 3 * weight + 1 / 3 * weight[perm] + 1 / 3 * weight[perm2]
                    )
                    return X, Y, weight
        else:
            if weight is None:
                return X, Y
            else:
                return X, Y, weight

In [19]:
mixup = Mixup(mix_beta=5,mixup_prob=0.7,mixup_double=0.5)

clip2, audio_label2,audio_weights2=mixup(X=clip,Y=audio_label,weight=audio_weights)

In [20]:
print(clip2)
print(clip2.shape)

tensor([[[-0.2086, -0.1543, -0.1756,  ..., -0.0482, -0.0108,  0.0306]],

        [[ 0.0445,  0.0647,  0.0697,  ..., -0.0068, -0.0083, -0.0079]],

        [[-0.0051, -0.0199,  0.0076,  ...,  0.0051, -0.0060,  0.0104]],

        ...,

        [[ 0.0493,  0.0224, -0.0208,  ...,  0.0076,  0.0044, -0.0039]],

        [[ 0.0587,  0.0666,  0.0812,  ..., -0.0239,  0.1142,  0.1555]],

        [[-0.2373, -0.1934, -0.1899,  ..., -0.0416, -0.0322, -0.0106]]])
torch.Size([32, 1, 160000])


In [21]:
print(audio_label2)
print(audio_label2.shape)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
torch.Size([32, 182])


In [22]:
print(audio_label2[0])
print(audio_label2[1])

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.])
tensor([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.,

### Use Compose to combine multiple audio transformation operations. These operations are applied to the input audio data to enhance the generalization and robustness of the model.

This helps the model learn how to process audio with different pitches and time offsets, and enhances the model's ability to recognize audio data in different environments and conditions.

In [23]:
audio_transforms = Compose(
            [
                # AddColoredNoise(p=0.5),
                PitchShift(
                    min_transpose_semitones=-4,
                    max_transpose_semitones=4,
                    sample_rate=32000,
                    p=0.4,
                ),
                Shift(min_shift=-0.5, max_shift=0.5, p=0.4),
            ]
        )

clip3=audio_transforms(clip2,sample_rate=32000)

In [24]:
print(clip3)

print(clip3.shape)

tensor([[[-0.1824, -0.1687, -0.1614,  ...,  0.0000,  0.0000,  0.0000]],

        [[-0.0088, -0.0056, -0.0065,  ..., -0.0182, -0.0152, -0.0106]],

        [[-0.0152, -0.0144, -0.0030,  ..., -0.0122, -0.0156, -0.0015]],

        ...,

        [[ 0.0493,  0.0224, -0.0208,  ...,  0.0076,  0.0044, -0.0039]],

        [[-0.0176,  0.0132,  0.0423,  ..., -0.0548, -0.0536, -0.0306]],

        [[-0.2373, -0.1934, -0.1899,  ..., -0.0416, -0.0322, -0.0106]]])
torch.Size([32, 1, 160000])


### After mixup, mel spec operation is required

In [25]:
## Convert audio data into mel spectrogram


def mel_transform(sample_rate:float,audio:torch.Tensor,window_size: float=0.04,hop_size:float=0.02,n_mels:int=40)->torch.Tensor:
    """
    transform audio data into mel sepctrogram
    """
    n_fft = int(window_size * sample_rate)  
    hop_length = int(hop_size * sample_rate)  

    # Calculate Mel Spectrogram
    # n_mels = 40 # Number of Mel filters

    # Set up Mel Spectrogram converter
    mel_transformer = MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        f_min=0,
        f_max=16000
    )

    melspec=mel_transformer(audio)

    return melspec

In [26]:
clip4=mel_transform(sample_rate=32000,audio=clip3)

In [27]:
print(clip4)
print(clip4.shape)

tensor([[[[2.9909e+03, 9.7385e+02, 1.6147e+03,  ..., 5.8038e-01,
           9.2708e-01, 1.2047e+00],
          [8.2873e+00, 5.4697e-01, 6.4207e+00,  ..., 3.0296e-01,
           4.2942e-01, 1.6211e+00],
          [1.2902e-01, 5.0722e-02, 1.1639e-01,  ..., 6.4557e-01,
           3.4999e-01, 1.2016e+00],
          ...,
          [5.4295e-02, 7.3266e-03, 5.2465e-02,  ..., 3.2567e-03,
           2.4021e-03, 7.3107e-02],
          [2.6392e-02, 3.2367e-03, 2.2114e-02,  ..., 2.1022e-03,
           1.4728e-03, 1.4756e-02],
          [6.0133e-02, 8.4195e-04, 1.9565e-03,  ..., 1.5346e-03,
           1.1414e-03, 9.5083e-03]]],


        [[[4.7839e+01, 3.2372e+01, 3.0774e+01,  ..., 2.1115e+01,
           1.7425e+01, 8.9048e+01],
          [4.3204e+01, 5.7600e+01, 5.6757e+01,  ..., 3.1537e+01,
           3.6056e+01, 2.6504e+01],
          [1.3618e+01, 4.1824e+01, 1.3072e+01,  ..., 2.2552e+01,
           1.0491e+01, 1.5789e+01],
          ...,
          [6.0389e-02, 5.4137e-02, 5.7484e-02,  ..., 4.73

In [28]:
##Convert the amplitude of Mel Spectrogram to decibel (Decibel, dB)

db_transform = torchaudio.transforms.AmplitudeToDB(stype="power", top_db=80)

clip5=db_transform(clip4)

In [29]:
print(clip5)
print(clip5.shape)

tensor([[[[ 34.7580,  29.8849,  32.0809,  ...,  -2.3629,  -0.3288,   0.8089],
          [  9.1842,  -2.6203,   8.0758,  ...,  -5.1862,  -3.6712,   2.0982],
          [ -8.8934, -12.9480,  -9.3409,  ...,  -1.9005,  -4.5595,   0.7976],
          ...,
          [-12.6524, -21.3510, -12.8013,  ..., -24.8722, -26.1940, -11.3604],
          [-15.7853, -24.8990, -16.5532,  ..., -26.7733, -28.3186, -18.3103],
          [-12.2089, -30.7471, -27.0853,  ..., -28.1400, -29.4255, -20.2190]]],


        [[[ 16.7978,  15.1017,  14.8819,  ...,  13.2459,  12.4118,  19.4962],
          [ 16.3553,  17.6042,  17.5402,  ...,  14.9881,  15.5697,  14.2330],
          [ 11.3412,  16.2143,  11.1634,  ...,  13.5319,  10.2081,  11.9835],
          ...,
          [-12.1904, -12.6650, -12.4046,  ..., -13.2452, -11.9240, -13.7735],
          [-14.1509, -13.9835, -13.7224,  ..., -13.4755, -13.1523, -12.4560],
          [-17.7454, -18.5337, -17.5318,  ..., -19.2270, -17.0064, -17.9550]]],


        [[[  2.8658,   1.2

### Normalize the data

In [30]:
clip6=(clip5+80)/80

In [31]:
print(clip6)
print(clip6.shape)

tensor([[[[1.4345, 1.3736, 1.4010,  ..., 0.9705, 0.9959, 1.0101],
          [1.1148, 0.9672, 1.1009,  ..., 0.9352, 0.9541, 1.0262],
          [0.8888, 0.8381, 0.8832,  ..., 0.9762, 0.9430, 1.0100],
          ...,
          [0.8418, 0.7331, 0.8400,  ..., 0.6891, 0.6726, 0.8580],
          [0.8027, 0.6888, 0.7931,  ..., 0.6653, 0.6460, 0.7711],
          [0.8474, 0.6157, 0.6614,  ..., 0.6483, 0.6322, 0.7473]]],


        [[[1.2100, 1.1888, 1.1860,  ..., 1.1656, 1.1551, 1.2437],
          [1.2044, 1.2201, 1.2193,  ..., 1.1874, 1.1946, 1.1779],
          [1.1418, 1.2027, 1.1395,  ..., 1.1691, 1.1276, 1.1498],
          ...,
          [0.8476, 0.8417, 0.8449,  ..., 0.8344, 0.8510, 0.8278],
          [0.8231, 0.8252, 0.8285,  ..., 0.8316, 0.8356, 0.8443],
          [0.7782, 0.7683, 0.7809,  ..., 0.7597, 0.7874, 0.7756]]],


        [[[1.0358, 1.0162, 1.1105,  ..., 1.1106, 1.0428, 1.1132],
          [1.0927, 1.1211, 1.1153,  ..., 1.0938, 1.0776, 1.0843],
          [1.0274, 1.1094, 1.0742,  ..

### Random mask part of the spectrogram, which helps the model learn to be robust to missing information in certain time periods.

In [32]:
time_mask_transform = torchaudio.transforms.TimeMasking(time_mask_param=20, iid_masks=True, p=0.3)

clip7 = time_mask_transform(clip6)

In [33]:
print(clip7)
print(clip7.shape)

tensor([[[[1.4345, 1.3736, 1.4010,  ..., 0.9705, 0.9959, 1.0101],
          [1.1148, 0.9672, 1.1009,  ..., 0.9352, 0.9541, 1.0262],
          [0.8888, 0.8381, 0.8832,  ..., 0.9762, 0.9430, 1.0100],
          ...,
          [0.8418, 0.7331, 0.8400,  ..., 0.6891, 0.6726, 0.8580],
          [0.8027, 0.6888, 0.7931,  ..., 0.6653, 0.6460, 0.7711],
          [0.8474, 0.6157, 0.6614,  ..., 0.6483, 0.6322, 0.7473]]],


        [[[1.2100, 1.1888, 1.1860,  ..., 1.1656, 1.1551, 1.2437],
          [1.2044, 1.2201, 1.2193,  ..., 1.1874, 1.1946, 1.1779],
          [1.1418, 1.2027, 1.1395,  ..., 1.1691, 1.1276, 1.1498],
          ...,
          [0.8476, 0.8417, 0.8449,  ..., 0.8344, 0.8510, 0.8278],
          [0.8231, 0.8252, 0.8285,  ..., 0.8316, 0.8356, 0.8443],
          [0.7782, 0.7683, 0.7809,  ..., 0.7597, 0.7874, 0.7756]]],


        [[[1.0358, 1.0162, 1.1105,  ..., 1.1106, 1.0428, 1.1132],
          [1.0927, 1.1211, 1.1153,  ..., 1.0938, 1.0776, 1.0843],
          [1.0274, 1.1094, 1.0742,  ..

### Calculate the first and second order differences of audio or other time series data, usually called delta and delta-delta (also called acceleration) features.

In [34]:
def compute_deltas(
    specgram: torch.Tensor, win_length: int = 5, mode: str = "replicate"
) -> torch.Tensor:
    r"""Compute delta coefficients of a tensor, usually a spectrogram:

    .. math::
       d_t = \frac{\sum_{n=1}^{\text{N}} n (c_{t+n} - c_{t-n})}{2 \sum_{n=1}^{\text{N}} n^2}

    where :math:`d_t` is the deltas at time :math:`t`,
    :math:`c_t` is the spectrogram coeffcients at time :math:`t`,
    :math:`N` is ``(win_length-1)//2``.

    Args:
        specgram (Tensor): Tensor of audio of dimension (..., freq, time)
        win_length (int, optional): The window length used for computing delta (Default: ``5``)
        mode (str, optional): Mode parameter passed to padding (Default: ``"replicate"``)

    Returns:
        Tensor: Tensor of deltas of dimension (..., freq, time)

    Example
        >>> specgram = torch.randn(1, 40, 1000)
        >>> delta = compute_deltas(specgram)
        >>> delta2 = compute_deltas(delta)
    """
    device = specgram.device
    print(device)
    dtype = specgram.dtype

    # pack batch
    shape = specgram.size()
    specgram = specgram.reshape(1, -1, shape[-1])

    assert win_length >= 3

    n = (win_length - 1) // 2

    # twice sum of integer squared
    denom = n * (n + 1) * (2 * n + 1) / 3

    specgram = torch.nn.functional.pad(specgram, (n, n), mode=mode)

    kernel = torch.arange(-n, n + 1, 1, device=device, dtype=dtype).repeat(
        specgram.shape[1], 1, 1
    )

    output = (
        torch.nn.functional.conv1d(specgram, kernel, groups=specgram.shape[1]) / denom
    )

    # unpack batch
    output = output.reshape(shape)

    return output


def make_delta(input_tensor: torch.Tensor):
    input_tensor = input_tensor.transpose(3, 2)
    input_tensor = compute_deltas(input_tensor)
    input_tensor = input_tensor.transpose(3, 2)
    return input_tensor


def image_delta(x):
    delta_1 = make_delta(x)
    delta_2 = make_delta(delta_1)
    x = torch.cat([x, delta_1, delta_2], dim=1)
    return x

In [35]:
clip8= image_delta(clip7)


cpu
cpu


In [36]:
clip8.shape

torch.Size([32, 3, 40, 251])

In [37]:
len(clip8.shape)

4

In [38]:
class Mixup2(nn.Module):
    def __init__(self, mix_beta, mixup2_prob):
        super(Mixup2, self).__init__()
        self.beta_distribution = Beta(mix_beta, mix_beta)
        self.mixup2_prob = mixup2_prob

    def forward(self, X, Y, weight=None):
        p = torch.rand((1,))[0]
        if p < self.mixup2_prob:
            bs = X.shape[0]
            n_dims = len(X.shape)
            perm = torch.randperm(bs)
            coeffs = self.beta_distribution.rsample(torch.Size((bs,))).to(X.device)

            if n_dims == 2:
                X = coeffs.view(-1, 1) * X + (1 - coeffs.view(-1, 1)) * X[perm]
            elif n_dims == 3:
                X = coeffs.view(-1, 1, 1) * X + (1 - coeffs.view(-1, 1, 1)) * X[perm]
            else:
                X = (
                    coeffs.view(-1, 1, 1, 1) * X
                    + (1 - coeffs.view(-1, 1, 1, 1)) * X[perm]
                )
            Y = coeffs.view(-1, 1) * Y + (1 - coeffs.view(-1, 1)) * Y[perm]
            # Y = Y + Y[perm]
            # Y = torch.clamp(Y, 0, 1)

            if weight is None:
                return X, Y
            else:
                weight = coeffs.view(-1) * weight + (1 - coeffs.view(-1)) * weight[perm]
                return X, Y, weight
        else:
            if weight is None:
                return X, Y
            else:
                return X, Y, weight

In [39]:
mixup2 = Mixup2(mix_beta=2, mixup2_prob=0.15)

clip9, audio_label9,audio_weights9 = mixup2(clip8, audio_label2, audio_weights2)

In [40]:
print(clip9)
print(clip9.shape)

tensor([[[[ 1.4345e+00,  1.3736e+00,  1.4010e+00,  ...,  9.7046e-01,
            9.9589e-01,  1.0101e+00],
          [ 1.1148e+00,  9.6725e-01,  1.1009e+00,  ...,  9.3517e-01,
            9.5411e-01,  1.0262e+00],
          [ 8.8883e-01,  8.3815e-01,  8.8324e-01,  ...,  9.7624e-01,
            9.4301e-01,  1.0100e+00],
          ...,
          [ 8.4184e-01,  7.3311e-01,  8.3998e-01,  ...,  6.8910e-01,
            6.7257e-01,  8.5800e-01],
          [ 8.0268e-01,  6.8876e-01,  7.9308e-01,  ...,  6.6533e-01,
            6.4602e-01,  7.7112e-01],
          [ 8.4739e-01,  6.1566e-01,  6.6143e-01,  ...,  6.4825e-01,
            6.3218e-01,  7.4726e-01]],

         [[-1.4110e-01, -1.4771e-01, -1.3356e-01,  ..., -2.3733e-03,
           -1.4755e-02,  1.5833e-03],
          [-1.6941e-01, -1.6631e-01, -1.6581e-01,  ...,  2.7224e-03,
           -1.7414e-02, -4.3255e-03],
          [-1.4138e-01, -1.1789e-01, -1.3459e-01,  ...,  1.1019e-02,
            1.2764e-03, -4.9612e-03],
          ...,
     

In [41]:
print(audio_label9)
print(audio_label9.shape)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
torch.Size([32, 182])


In [42]:
print(audio_label9[0])

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.])


In [43]:
print(audio_weights9)

tensor([0.8000, 0.7333, 0.6333, 0.5333, 0.7333, 0.9333, 0.7667, 0.8333, 0.8000,
        0.6333, 0.7333, 0.7333, 0.8000, 0.8000, 0.9333, 0.7333, 0.7333, 0.8667,
        0.8000, 0.8667, 0.7000, 0.8000, 0.7000, 0.8333, 0.7333, 0.7000, 0.6667,
        0.7000, 0.9333, 0.6333, 0.9333, 0.7667])


### Next, I need to put the prepared data into a pre-trained model and train it as a feature extractor

Here, the first choice is the efficientNet_s3_in21K pre-trained model

In [None]:
# Load the pre-trained model
model = timm.create_model('tf_efficientnetv2_s_in21k', pretrained=True,in_chans=3) # 可以通过传入argument in_chans来改变 预训练模型接受的数据通道

In [45]:
summary(model,input_size=(128,3,40,251))

Layer (type:depth-idx)                        Output Shape              Param #
EfficientNet                                  [128, 21843]              --
├─Conv2dSame: 1-1                             [128, 24, 20, 126]        648
├─BatchNormAct2d: 1-2                         [128, 24, 20, 126]        48
│    └─Identity: 2-1                          [128, 24, 20, 126]        --
│    └─SiLU: 2-2                              [128, 24, 20, 126]        --
├─Sequential: 1-3                             [128, 256, 2, 8]          --
│    └─Sequential: 2-3                        [128, 24, 20, 126]        --
│    │    └─ConvBnAct: 3-1                    [128, 24, 20, 126]        5,232
│    │    └─ConvBnAct: 3-2                    [128, 24, 20, 126]        5,232
│    └─Sequential: 2-4                        [128, 48, 10, 63]         --
│    │    └─EdgeResidual: 3-3                 [128, 48, 10, 63]         25,632
│    │    └─EdgeResidual: 3-4                 [128, 48, 10, 63]         92,640
│    

In [46]:
# Because we want to customize the shape of the data, we need to change the last few layers of this pre-trained model
# First check the last few layers of the pre-trained model

layers_last4 = list(model.children())[-4:]

layers_last4

[Conv2d(256, 1280, kernel_size=(1, 1), stride=(1, 1), bias=False),
 BatchNormAct2d(
   1280, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
   (drop): Identity()
   (act): SiLU(inplace=True)
 ),
 SelectAdaptivePool2d(pool_type=avg, flatten=Flatten(start_dim=1, end_dim=-1)),
 Linear(in_features=1280, out_features=21843, bias=True)]

In [47]:
# The last two layers are an adaptive pooling layer and a fully connected layer
# Here I choose to replace these two layers. First remove these two layers

layers = list(model.children())[:-2]

In [48]:
encoder = nn.Sequential(*layers) # Encapsulate multiple layers in order

In [49]:
encoder

Sequential(
  (0): Conv2dSame(3, 24, kernel_size=(3, 3), stride=(2, 2), bias=False)
  (1): BatchNormAct2d(
    24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
    (drop): Identity()
    (act): SiLU(inplace=True)
  )
  (2): Sequential(
    (0): Sequential(
      (0): ConvBnAct(
        (conv): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNormAct2d(
          24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
          (drop): Identity()
          (act): SiLU(inplace=True)
        )
        (drop_path): Identity()
      )
      (1): ConvBnAct(
        (conv): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNormAct2d(
          24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
          (drop): Identity()
          (act): SiLU(inplace=True)
        )
        (drop_path): Identity()
      )
    )
    (1): Sequential(
      (0): EdgeResid

In [50]:
model.classifier # classifier is the last fully connected layer of the model, out_features represents the number of categories

Linear(in_features=1280, out_features=21843, bias=True)

In [51]:
print(model.classifier.in_features)
print(model.classifier.out_features)

1280
21843


In [52]:
## The pre-trained model will be used for feature extraction

clip10=clip9.permute((0,1,3,2))

print(clip10.shape)

torch.Size([32, 3, 251, 40])


In [53]:
# Get the time frame of each time segment
frames_num=clip10.shape[2] 

frames_num

251

In [54]:
clip10=clip10.transpose(2,3)

print(clip10)
print(clip10.shape)

tensor([[[[ 1.4345e+00,  1.3736e+00,  1.4010e+00,  ...,  9.7046e-01,
            9.9589e-01,  1.0101e+00],
          [ 1.1148e+00,  9.6725e-01,  1.1009e+00,  ...,  9.3517e-01,
            9.5411e-01,  1.0262e+00],
          [ 8.8883e-01,  8.3815e-01,  8.8324e-01,  ...,  9.7624e-01,
            9.4301e-01,  1.0100e+00],
          ...,
          [ 8.4184e-01,  7.3311e-01,  8.3998e-01,  ...,  6.8910e-01,
            6.7257e-01,  8.5800e-01],
          [ 8.0268e-01,  6.8876e-01,  7.9308e-01,  ...,  6.6533e-01,
            6.4602e-01,  7.7112e-01],
          [ 8.4739e-01,  6.1566e-01,  6.6143e-01,  ...,  6.4825e-01,
            6.3218e-01,  7.4726e-01]],

         [[-1.4110e-01, -1.4771e-01, -1.3356e-01,  ..., -2.3733e-03,
           -1.4755e-02,  1.5833e-03],
          [-1.6941e-01, -1.6631e-01, -1.6581e-01,  ...,  2.7224e-03,
           -1.7414e-02, -4.3255e-03],
          [-1.4138e-01, -1.1789e-01, -1.3459e-01,  ...,  1.1019e-02,
            1.2764e-03, -4.9612e-03],
          ...,
     

In [55]:
# Use the pre-trained model (excluding the last two layers) for calculation
clip10=encoder(clip10)

In [56]:
print(clip10)
print(clip10.shape)

tensor([[[[-1.6569e-02, -4.2813e-02, -1.1728e-01,  ...,  5.5723e+00,
            7.6509e+00,  8.5630e+00],
          [-1.3333e-02, -2.3326e-02, -6.4651e-03,  ...,  5.3766e+00,
            5.7731e+00,  5.5217e+00]],

         [[-1.2503e-02, -7.5873e-03, -1.3628e-02,  ..., -1.4496e-01,
            2.3696e-01,  2.6163e+00],
          [-8.0162e-04, -6.1119e-03, -9.9317e-05,  ..., -4.8325e-02,
            1.5870e+00,  1.8131e+00]],

         [[-7.0111e-04, -9.0271e-05, -2.0155e-03,  ..., -2.2952e-01,
           -2.7716e-01, -2.6387e-01],
          [-3.0181e-03, -4.2751e-04, -1.1265e-03,  ..., -3.4564e-02,
           -1.0372e-01, -1.4868e-01]],

         ...,

         [[-1.2487e-01, -9.2145e-02,  1.8384e+00,  ...,  7.5718e+00,
            2.1197e+00, -1.7752e-02],
          [-4.5718e-02, -3.4824e-02,  2.2021e+00,  ...,  5.6971e+00,
            1.4370e+00, -2.2556e-01]],

         [[-1.9169e-01, -7.0096e-02, -1.9271e-01,  ..., -3.4245e-06,
           -4.9262e-04, -4.1760e-04],
          [-2.

In [57]:
# Calculate the mean of each frequency band and merge them to compress the dimension
clip10 = torch.mean(clip10, dim=2)

In [58]:
print(clip10)
print(clip10.shape)

tensor([[[-1.4951e-02, -3.3069e-02, -6.1874e-02,  ...,  5.4745e+00,
           6.7120e+00,  7.0424e+00],
         [-6.6524e-03, -6.8496e-03, -6.8636e-03,  ..., -9.6645e-02,
           9.1196e-01,  2.2147e+00],
         [-1.8596e-03, -2.5889e-04, -1.5710e-03,  ..., -1.3204e-01,
          -1.9044e-01, -2.0628e-01],
         ...,
         [-8.5292e-02, -6.3484e-02,  2.0203e+00,  ...,  6.6345e+00,
           1.7784e+00, -1.2166e-01],
         [-2.1667e-01, -9.0521e-02, -1.3207e-01,  ..., -2.5900e-06,
          -3.0996e-04, -4.0122e-04],
         [-2.4330e-01, -3.0074e-02, -8.8959e-03,  ..., -4.5032e-02,
          -1.5404e-01, -4.6568e-02]],

        [[ 5.6472e-01, -1.7435e-01, -2.6220e-01,  ..., -2.4952e-01,
          -1.1131e-01,  2.3009e-02],
         [-1.4909e-02, -1.7898e-02, -8.3015e-03,  ..., -3.2845e-02,
          -6.2784e-02,  2.0154e+00],
         [-7.0096e-02, -2.0195e-01, -2.5979e-01,  ..., -1.3397e-03,
          -5.9477e-03, -2.6249e-01],
         ...,
         [-2.6221e-01, -2

In [59]:
# channel smoothing
x1 = F.max_pool1d(clip10, kernel_size=3, stride=1, padding=1)
x2 = F.avg_pool1d(clip10, kernel_size=3, stride=1, padding=1)
x = x1 + x2

In [60]:
print(x)
print(x.shape)

tensor([[[-3.0958e-02, -5.1583e-02, -1.1883e-01,  ...,  1.0742e+01,
           1.3452e+01,  1.1627e+01],
         [-1.1153e-02, -1.3441e-02, -9.2220e-03,  ...,  1.1510e+00,
           3.2247e+00,  3.2569e+00],
         [-9.6505e-04, -1.4887e-03, -1.1265e-02,  ..., -2.8853e-01,
          -3.0830e-01, -3.2268e-01],
         ...,
         [-1.1308e-01,  2.6441e+00,  7.1042e+00,  ...,  1.1490e+01,
           9.3982e+00,  2.3306e+00],
         [-1.9292e-01, -2.3694e-01, -2.0165e-01,  ..., -8.5077e-04,
          -2.4051e-04, -5.4703e-04],
         [-1.2120e-01, -1.0299e-01, -2.1358e-02,  ..., -1.6043e-01,
          -1.2691e-01, -1.1344e-01]],

        [[ 6.9485e-01,  6.0745e-01, -4.0838e-01,  ...,  4.4500e-01,
          -8.9598e-02, -6.4246e-03],
         [-2.5845e-02, -2.2004e-02, -2.1451e-02,  ..., -4.3633e-02,
           2.6553e+00,  2.6663e+00],
         [-1.6078e-01, -2.4737e-01, -3.9151e-01,  ..., -1.2847e-02,
          -9.1266e-02, -9.5428e-02],
         ...,
         [-3.5961e-01, -4

In [61]:
x = F.dropout(x, p=0.3, training=True)

In [62]:
x

tensor([[[-4.4226e-02, -0.0000e+00, -0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [-1.5933e-02, -1.9201e-02, -1.3174e-02,  ...,  1.6443e+00,
           0.0000e+00,  4.6527e+00],
         [-1.3786e-03, -0.0000e+00, -0.0000e+00,  ..., -4.1219e-01,
          -4.4042e-01, -0.0000e+00],
         ...,
         [-1.6154e-01,  0.0000e+00,  1.0149e+01,  ...,  1.6414e+01,
           1.3426e+01,  3.3295e+00],
         [-2.7560e-01, -3.3849e-01, -0.0000e+00,  ..., -0.0000e+00,
          -0.0000e+00, -7.8146e-04],
         [-0.0000e+00, -1.4712e-01, -3.0512e-02,  ..., -2.2918e-01,
          -1.8130e-01, -1.6205e-01]],

        [[ 9.9264e-01,  8.6778e-01, -0.0000e+00,  ...,  6.3571e-01,
          -1.2800e-01, -9.1779e-03],
         [-3.6922e-02, -3.1435e-02, -3.0645e-02,  ..., -6.2332e-02,
           0.0000e+00,  0.0000e+00],
         [-2.2968e-01, -3.5339e-01, -5.5930e-01,  ..., -1.8352e-02,
          -1.3038e-01, -0.0000e+00],
         ...,
         [-5.1373e-01, -6

In [63]:
x = x.transpose(1, 2)

In [64]:
print(x.shape)

torch.Size([32, 8, 1280])


In [65]:
# Create a fully connected layer
fc1 = nn.Linear(in_features=model.classifier.in_features, out_features=model.classifier.in_features, bias=True)

# Initialize the parameters of the fully connected layer

def init_layer(layer):
    nn.init.xavier_uniform_(layer.weight) # Initialize the weights and biases of the network layer

    if hasattr(layer, "bias"): # Check if the layer has a bias attribute
        if layer.bias is not None: # and bias is not None
            layer.bias.data.fill_(0.0) # If there is a bias, initialize it to 0

init_layer(fc1)

In [66]:
x = F.relu_(fc1(x))

In [67]:
print(x)
print(x.shape)

tensor([[[ 0.0000,  0.0000,  0.0000,  ...,  0.8493,  0.0000,  2.2463],
         [ 1.4956,  0.0000,  0.0000,  ...,  1.7019,  0.0000,  1.2083],
         [ 2.9852,  0.0000,  0.0000,  ...,  1.7144,  0.0000,  7.2559],
         ...,
         [ 0.0000,  4.5035,  0.0000,  ...,  0.4537,  0.0000,  7.8644],
         [ 3.3951,  6.4179,  0.0000,  ...,  2.0159,  1.4925,  8.1388],
         [ 1.4708,  3.8314,  1.2186,  ...,  1.6265,  3.6036,  2.7818]],

        [[ 3.1623,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.7630,  0.3800,  0.0000,  ...,  1.6521,  0.0000,  0.0000],
         [ 0.4159,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.0000,  1.9986,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.6193,  0.0000,  ...,  2.4450,  0.0000,  6.3322],
         [ 0.0000,  0.0000,  0.0000,  ...,  1.6864,  0.0000,  0.0000]],

        [[ 0.6995,  0.0000,  0.0000,  ...,  8.2243,  0.0000,  0.0000],
         [ 0.0000,  0.4590,  0.0000,  ...,  4

In [68]:
x = x.transpose(1, 2)

In [69]:
print(x)
print(x.shape)

tensor([[[ 0.0000,  1.4956,  2.9852,  ...,  0.0000,  3.3951,  1.4708],
         [ 0.0000,  0.0000,  0.0000,  ...,  4.5035,  6.4179,  3.8314],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  1.2186],
         ...,
         [ 0.8493,  1.7019,  1.7144,  ...,  0.4537,  2.0159,  1.6265],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  1.4925,  3.6036],
         [ 2.2463,  1.2083,  7.2559,  ...,  7.8644,  8.1388,  2.7818]],

        [[ 3.1623,  0.7630,  0.4159,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.3800,  0.0000,  ...,  1.9986,  0.6193,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.0000,  1.6521,  0.0000,  ...,  0.0000,  2.4450,  1.6864],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  6.3322,  0.0000]],

        [[ 0.6995,  0.0000,  0.0000,  ...,  0.0000,  1.0040,  0.0000],
         [ 0.0000,  0.4590,  0.0000,  ...,  3

In [70]:
x = F.dropout(x, p=0.3, training=True)

In [71]:
print(x)
print(x.shape)

tensor([[[ 0.0000,  2.1366,  4.2645,  ...,  0.0000,  4.8502,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  6.4335,  0.0000,  5.4734],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  1.7409],
         ...,
         [ 0.0000,  0.0000,  2.4492,  ...,  0.6481,  2.8799,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  5.1480],
         [ 3.2090,  1.7262,  0.0000,  ..., 11.2349, 11.6268,  3.9740]],

        [[ 4.5176,  1.0900,  0.5942,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.5429,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.0000,  2.3601,  0.0000,  ...,  0.0000,  3.4928,  2.4091],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  9.0460,  0.0000]],

        [[ 0.9993,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.6557,  0.0000,  ...,  4

In [72]:
frames_num

251

In [73]:
# Later we want to pass the acquired high-dimensional features into an attention module

class AttBlockV2(nn.Module):
    def __init__(self, in_features: int, out_features: int, activation="linear"):
        super().__init__()

        self.activation = activation
        self.att = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )
        self.cla = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )

        self.init_weights()

    def init_weights(self):
        init_layer(self.att)
        init_layer(self.cla)

    def forward(self, x):
        # x: (n_samples, n_in, n_time)
        norm_att = torch.softmax(torch.tanh(self.att(x)), dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla
        # x: This is the final output after the attention weights and classification layer.
        # shape: (n_samples, out_features). Since the time dimension is summed and compressed, each sample and each output feature ends up having a single value.
        # norm_att: This is the output of the attention layer (att) after the softmax and tanh functions, which shows which parts of the input sequence the model should focus on. Normalization ensures that the attention weights for all time steps add up to 1, which makes it easier to interpret the importance of each time step.
        # shape: (n_samples, out_features, n_time), where out_features is the number of output features of the att convolutional layer, which is the same as the out_features argument of the input. Each time step and each output feature has a normalized weight.
        # cla: This is the output of the classification layer (cla), which is obtained by processing the input features through another 1D convolutional layer. This output layer is often used to directly predict task-related outputs, such as the probability of a class label.
        # Shape: (n_samples, out_features, n_time), same shape as norm_att. This means that each output feature corresponding to each time step has a value processed by the activation function.
    def nonlinear_transform(self, x):
        if self.activation == "linear":
            return x
        elif self.activation == "sigmoid":
            return torch.sigmoid(x)

In [74]:
att_block=AttBlockV2(in_features=model.classifier.in_features, out_features=182, activation="sigmoid")


(clipwise_output, norm_att, segmentwise_output) = att_block(x)

In [75]:
print(clipwise_output)
print(clipwise_output.shape)

tensor([[0.7606, 0.3648, 0.1297,  ..., 0.6144, 0.5173, 0.3312],
        [0.1332, 0.4221, 0.1250,  ..., 0.1752, 0.4603, 0.2061],
        [0.3576, 0.3903, 0.2409,  ..., 0.0723, 0.3924, 0.6862],
        ...,
        [0.1907, 0.4924, 0.4743,  ..., 0.0145, 0.5626, 0.7469],
        [0.3258, 0.2783, 0.3120,  ..., 0.0737, 0.5454, 0.4005],
        [0.2966, 0.3888, 0.1759,  ..., 0.0054, 0.5776, 0.5563]],
       grad_fn=<SumBackward1>)
torch.Size([32, 182])


In [76]:
print(norm_att)
print(norm_att.shape)

tensor([[[0.0706, 0.5008, 0.0730,  ..., 0.0691, 0.0691, 0.0691],
         [0.2468, 0.0998, 0.2456,  ..., 0.0348, 0.0494, 0.0337],
         [0.0472, 0.1451, 0.1458,  ..., 0.1446, 0.0950, 0.1317],
         ...,
         [0.0682, 0.0682, 0.0682,  ..., 0.0682, 0.0883, 0.5012],
         [0.1257, 0.1206, 0.1257,  ..., 0.1257, 0.1251, 0.1257],
         [0.1830, 0.0559, 0.0558,  ..., 0.0558, 0.3998, 0.1381]],

        [[0.0944, 0.0687, 0.0688,  ..., 0.4915, 0.0687, 0.0701],
         [0.0717, 0.1065, 0.0428,  ..., 0.2911, 0.0430, 0.0831],
         [0.0611, 0.3514, 0.2741,  ..., 0.0517, 0.0518, 0.0517],
         ...,
         [0.0420, 0.0426, 0.0430,  ..., 0.0439, 0.3059, 0.2237],
         [0.1413, 0.1417, 0.1302,  ..., 0.1417, 0.1417, 0.1417],
         [0.1131, 0.1132, 0.1135,  ..., 0.1131, 0.1130, 0.1961]],

        [[0.0287, 0.0287, 0.0286,  ..., 0.2110, 0.2109, 0.2109],
         [0.1527, 0.1257, 0.0219,  ..., 0.1527, 0.1060, 0.1494],
         [0.2474, 0.2474, 0.0335,  ..., 0.0336, 0.0336, 0.

In [77]:
print(segmentwise_output)
print(segmentwise_output.shape)

tensor([[[4.3260e-01, 7.6388e-01, 9.8241e-01,  ..., 9.9195e-01,
          8.3926e-01, 9.2274e-01],
         [9.2157e-01, 4.2898e-01, 9.4977e-02,  ..., 8.6080e-01,
          6.6333e-02, 1.0297e-02],
         [5.0828e-01, 4.3726e-02, 8.2208e-04,  ..., 3.2264e-04,
          8.8152e-04, 3.2245e-02],
         ...,
         [1.6388e-02, 5.7675e-01, 3.0566e-02,  ..., 4.0920e-01,
          8.9302e-02, 9.4412e-01],
         [2.3313e-01, 9.9041e-02, 4.7421e-01,  ..., 9.9710e-01,
          9.3333e-01, 9.9261e-01],
         [9.4480e-01, 2.2029e-01, 3.5299e-01,  ..., 9.9298e-01,
          8.4289e-03, 9.1082e-02]],

        [[2.9726e-01, 5.3759e-02, 1.4502e-01,  ..., 7.6063e-02,
          7.7011e-03, 6.6838e-01],
         [8.5680e-01, 7.5084e-01, 9.1159e-01,  ..., 7.0751e-01,
          6.9029e-03, 2.7388e-01],
         [1.1977e-01, 7.3634e-02, 1.2665e-02,  ..., 4.7414e-02,
          6.2078e-01, 9.6099e-01],
         ...,
         [1.4380e-01, 6.6253e-02, 1.4006e-01,  ..., 3.5708e-01,
          1.143

In [78]:
segmentwise_logit = att_block.cla(x).transpose(1, 2)

In [79]:
print(segmentwise_logit)
print(segmentwise_logit.shape)

tensor([[[ -0.2713,   2.4638,   0.0331,  ...,  -4.0947,  -1.1907,   2.8400],
         [  1.1741,  -0.2860,  -3.0851,  ...,   0.3095,  -2.2079,  -1.2640],
         [  4.0224,  -2.2543,  -7.1029,  ...,  -3.4568,  -0.1033,  -0.6059],
         ...,
         [  4.8146,   1.8220,  -8.0387,  ...,  -0.3673,   5.8394,   4.9515],
         [  1.6528,  -2.6444,  -7.0330,  ...,  -2.3222,   2.6389,  -4.7676],
         [  2.4801,  -4.5656,  -3.4016,  ...,   2.8271,   4.9005,  -2.3005]],

        [[ -0.8604,   1.7889,  -1.9946,  ...,  -1.7841,   0.4416,  -2.3004],
         [ -2.8680,   1.1031,  -2.5322,  ...,  -2.6457,  -1.3676,  -0.0779],
         [ -1.7742,   2.3332,  -4.3562,  ...,  -1.8148,   2.4901,  -5.8333],
         ...,
         [ -2.4971,   0.8833,  -3.0003,  ...,  -0.5880,   0.2213,  -0.9300],
         [ -4.8587,  -4.9689,   0.4929,  ...,  -4.4594,  -8.3747,  -2.3255],
         [  0.7008,  -0.9750,   3.2043,  ...,  -4.5710,  -4.9341,  -0.3553]],

        [[  4.7997,  -2.9259,  -1.5754,  ...

In [80]:
segmentwise_output = segmentwise_output.transpose(1, 2)

In [81]:
print(segmentwise_output)
print(segmentwise_output.shape)

tensor([[[4.3260e-01, 9.2157e-01, 5.0828e-01,  ..., 1.6388e-02,
          2.3313e-01, 9.4480e-01],
         [7.6388e-01, 4.2898e-01, 4.3726e-02,  ..., 5.7675e-01,
          9.9041e-02, 2.2029e-01],
         [9.8241e-01, 9.4977e-02, 8.2208e-04,  ..., 3.0566e-02,
          4.7421e-01, 3.5299e-01],
         ...,
         [9.9195e-01, 8.6080e-01, 3.2264e-04,  ..., 4.0920e-01,
          9.9710e-01, 9.9298e-01],
         [8.3926e-01, 6.6333e-02, 8.8152e-04,  ..., 8.9302e-02,
          9.3333e-01, 8.4289e-03],
         [9.2274e-01, 1.0297e-02, 3.2245e-02,  ..., 9.4412e-01,
          9.9261e-01, 9.1082e-02]],

        [[2.9726e-01, 8.5680e-01, 1.1977e-01,  ..., 1.4380e-01,
          6.0863e-01, 9.1089e-02],
         [5.3759e-02, 7.5084e-01, 7.3634e-02,  ..., 6.6253e-02,
          2.0301e-01, 4.8054e-01],
         [1.4502e-01, 9.1159e-01, 1.2665e-02,  ..., 1.4006e-01,
          9.2344e-01, 2.9200e-03],
         ...,
         [7.6063e-02, 7.0751e-01, 4.7414e-02,  ..., 3.5708e-01,
          5.551

In [82]:
interpolate_ratio = frames_num // segmentwise_output.size(1)

In [83]:
interpolate_ratio

31

In [84]:
loss_function = nn.BCEWithLogitsLoss(reduction="none")

In [85]:
loss = 0.5 * loss_function(torch.logit(clipwise_output), audio_label9) + 0.5 * loss_function(segmentwise_logit.max(1)[0], audio_label9)

In [86]:
print(loss)
print(loss.shape)

tensor([[3.1261, 1.4997, 0.4244,  ..., 1.9188, 3.2854, 2.6804],
        [0.6234, 1.4871, 1.6888,  ..., 0.4304, 1.5933, 0.4429],
        [2.6252, 2.9437, 3.4807,  ..., 1.3277, 3.1519, 3.7862],
        ...,
        [1.0462, 2.4043, 2.9479,  ..., 0.0399, 3.6667, 5.3508],
        [1.4409, 4.1939, 1.7318,  ..., 0.3947, 3.7390, 2.5161],
        [4.2805, 2.6539, 1.1483,  ..., 0.0284, 3.1180, 2.7050]],
       grad_fn=<AddBackward0>)
torch.Size([32, 182])


In [87]:
loss = loss.sum(dim=1) * audio_weights9

In [88]:
print(loss)
print(loss.shape)

tensor([449.2588, 332.0229, 472.7906, 256.2851, 380.4775, 404.8902, 427.2510,
        352.7689, 374.0200, 363.3866, 334.1092, 368.8346, 509.0506, 405.9546,
        423.9656, 378.8881, 476.6589, 594.4567, 451.1944, 451.7960, 395.6176,
        354.9887, 274.0173, 597.6990, 409.6134, 340.1158, 289.0433, 355.9919,
        683.0813, 348.1871, 465.6527, 419.7467], grad_fn=<MulBackward0>)
torch.Size([32])


In [89]:
loss = loss.sum()

In [90]:
loss

tensor(13141.8154, grad_fn=<SumBackward0>)

In [91]:
# y_pred
torch.logit(clipwise_output)

tensor([[ 1.1558, -0.5544, -1.9038,  ...,  0.4658,  0.0692, -0.7026],
        [-1.8725, -0.3140, -1.9461,  ..., -1.5493, -0.1590, -1.3483],
        [-0.5859, -0.4461, -1.1479,  ..., -2.5514, -0.4370,  0.7826],
        ...,
        [-1.4455, -0.0305, -0.1029,  ..., -4.2208,  0.2518,  1.0819],
        [-0.7273, -0.9530, -0.7908,  ..., -2.5312,  0.1820, -0.4033],
        [-0.8633, -0.4523, -1.5447,  ..., -5.2091,  0.3130,  0.2260]],
       grad_fn=<LogitBackward0>)

In [92]:
audio_label9

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [93]:
torch.logit(clipwise_output).shape

torch.Size([32, 182])

In [94]:
audio_label9.shape

torch.Size([32, 182])

In [95]:
torch.logit(clipwise_output)[0]

tensor([ 1.1558, -0.5544, -1.9038,  0.5842,  0.6793, -0.3103,  0.3622,  2.6720,
         1.1968,  1.9605, -0.1440,  0.6012,  0.3439,  0.7290,  0.4987, -1.0556,
        -3.2401,  0.9414, -0.8551, -0.3132, -1.2216, -0.4961,  1.8152,  1.6972,
        -2.4760,  0.1262,  0.0644, -0.2398, -2.1218, -3.0737, -1.4441, -0.2646,
         0.1405,  2.3220, -2.3530, -2.5800,  1.3406,  2.7800, -0.3509, -1.0579,
         0.8010, -0.0457,  0.5179, -0.2395,  0.6103,  0.1513,  0.5079,  0.4382,
         0.1196,  0.6469, -5.4115,  1.2755,  1.3952,  1.7183, -1.7108, -0.9875,
        -0.0209,  0.3060,  0.6401,  2.0920, -1.9121, -1.0388, -0.9372, -1.3631,
        -0.5262,  0.0459,  0.4150, -1.3921, -2.8248,  6.8728, -1.5901,  3.2996,
        -0.9241, -2.2736, -1.0838,  2.5152, -1.7841, -0.7900,  3.3089,  0.5002,
         0.5658,  0.1768, -0.3029, -0.9915, -0.4425,  0.1313, -0.6286, -1.0696,
        -0.8901,  2.1551, -0.1422, -0.7597,  1.1850, -4.9617,  1.8083,  0.1927,
         0.1561,  1.0944,  2.1176, -0.08

In [96]:
torch.logit(clipwise_output)[0].sum()

tensor(-0.8000, grad_fn=<SumBackward0>)

In [97]:
clipwise_output[0]

tensor([0.7606, 0.3648, 0.1297, 0.6420, 0.6636, 0.4230, 0.5896, 0.9354, 0.7680,
        0.8766, 0.4641, 0.6459, 0.5851, 0.6746, 0.6221, 0.2581, 0.0377, 0.7194,
        0.2984, 0.4223, 0.2277, 0.3785, 0.8600, 0.8452, 0.0776, 0.5315, 0.5161,
        0.4403, 0.1070, 0.0442, 0.1909, 0.4342, 0.5351, 0.9107, 0.0868, 0.0704,
        0.7926, 0.9416, 0.4132, 0.2577, 0.6902, 0.4886, 0.6267, 0.4404, 0.6480,
        0.5378, 0.6243, 0.6078, 0.5299, 0.6563, 0.0044, 0.7817, 0.8014, 0.8479,
        0.1531, 0.2714, 0.4948, 0.5759, 0.6548, 0.8901, 0.1287, 0.2614, 0.2815,
        0.2037, 0.3714, 0.5115, 0.6023, 0.1991, 0.0560, 0.9990, 0.1694, 0.9644,
        0.2841, 0.0933, 0.2528, 0.9252, 0.1438, 0.3122, 0.9647, 0.6225, 0.6378,
        0.5441, 0.4248, 0.2706, 0.3911, 0.5328, 0.3478, 0.2555, 0.2911, 0.8961,
        0.4645, 0.3187, 0.7658, 0.0070, 0.8592, 0.5480, 0.5389, 0.7492, 0.8926,
        0.4792, 0.6310, 0.8490, 0.4859, 0.2177, 0.8572, 0.7966, 0.6972, 0.3264,
        0.5655, 0.9707, 0.3625, 0.4967, 

In [100]:
clipwise_output.shape

torch.Size([32, 182])

In [105]:
clipwise_output.sigmoid().detach()

tensor([[0.6815, 0.5902, 0.5324,  ..., 0.6489, 0.6265, 0.5821],
        [0.5333, 0.6040, 0.5312,  ..., 0.5437, 0.6131, 0.5514],
        [0.5885, 0.5964, 0.5599,  ..., 0.5181, 0.5969, 0.6651],
        ...,
        [0.5475, 0.6207, 0.6164,  ..., 0.5036, 0.6371, 0.6785],
        [0.5807, 0.5691, 0.5774,  ..., 0.5184, 0.6331, 0.5988],
        [0.5736, 0.5960, 0.5439,  ..., 0.5014, 0.6405, 0.6356]])

In [101]:
a=clipwise_output.sigmoid().detach().numpy()
a

array([[0.68147707, 0.5902113 , 0.5323743 , ..., 0.6489438 , 0.6265168 ,
        0.5820622 ],
       [0.53326297, 0.60399425, 0.53120345, ..., 0.5436858 , 0.6130908 ,
        0.551355  ],
       [0.58845305, 0.5963501 , 0.5599288 , ..., 0.51807535, 0.59687227,
        0.6651295 ],
       ...,
       [0.5475301 , 0.6206659 , 0.61640054, ..., 0.50361854, 0.63705707,
        0.67849326],
       [0.5807343 , 0.5691229 , 0.5773728 , ..., 0.51841724, 0.63306177,
        0.59881085],
       [0.57362306, 0.5959976 , 0.5438503 , ..., 0.50135916, 0.64051944,
        0.6355869 ]], dtype=float32)

In [102]:
a.shape

(32, 182)

In [99]:
audio_label9[0]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.])