This notebook is a modified version of notebooks 13, 13.1 and 13.2.

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
import random
from typing import List
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB

import torchaudio

import torch

from torch.utils.data import DataLoader,TensorDataset

import lightning as L

import datasets

from torch.utils.data import Dataset, DataLoader,WeightedRandomSampler

from pathlib import Path
import multiprocessing
import colorednoise as cn
import torch.nn as nn
import librosa
from torch.distributions import Beta
from torch_audiomentations import Compose, PitchShift, Shift, OneOf, AddColoredNoise

import timm
from torchinfo import summary

import torch.nn.functional as F

from torch.optim.lr_scheduler import (
    CosineAnnealingLR,
    CosineAnnealingWarmRestarts,
    ReduceLROnPlateau,
    OneCycleLR,
)
from lightning.pytorch.callbacks  import ModelCheckpoint, EarlyStopping

from lightning.pytorch.loggers import MLFlowLogger

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import gc

In [3]:
# Location of the metadata CSV that the cells below read to build the train/validation split
metadata_path='../../data/train_metadata_new_add_rating.csv'

In [4]:
# Build the train/validation split from the metadata CSV.
# The dataset is unbalanced, so the split is done per class:
# one randomly chosen row of every class goes to the validation set, all remaining rows go to the training set.
# NOTE(review): rows are split individually. The metadata has a `clip_start_time` column, so several rows
# may presumably belong to the same recording and could land on both sides of the split — confirm this is acceptable.

raw_df=pd.read_csv(metadata_path,header=0)

# One list of row index labels per `primary_label` value
class_indices = raw_df.groupby('primary_label').apply(lambda x: x.index.tolist())

# Row index labels collected for the training set and the validation set
train_indices = []
val_indices = []


# Pick one row per class for validation (fixed random_state, so the pick is repeatable); the rest go to training.
# A class with a single row therefore contributes nothing to the training set.
for indices in class_indices:
    val_sample = pd.Series(indices).sample(n=1, random_state=42).tolist()
    val_indices.extend(val_sample)
    train_indices.extend(set(indices) - set(val_sample))


# Select both subsets from the full frame by index label
train_df = raw_df.loc[train_indices]
val_df = raw_df.loc[val_indices]

In [5]:
# Randomly select 20,000 rows from the training set (fixed random_state, so the selection is repeatable).
# NOTE: this cell reassigns `train_df` and `val_df`, so running it a second time moves another 20,000 rows;
# re-run the split cell above before executing this cell again.
additional_val_samples = train_df.sample(n=20000, random_state=42)

# Add these samples to the validation set
val_df = pd.concat([val_df, additional_val_samples])

# Remove these samples from the training set
train_df = train_df.drop(additional_val_samples.index)

In [6]:
# Ratings can be missing (NaN) or undefined (0) in the metadata CSV, so they are filled in before use

def rating_value_interplote(df:pd.DataFrame):
    '''
    Replace missing or undefined ratings in the metadata frame with random valid ratings.

    The `rating` column describes the quality of the corresponding audio file:
        5 is high quality
        1 is low quality
        0 means no quality level was defined

    NaN entries are first treated as 0; every 0 entry is then replaced by a value
    drawn at random from 0.5, 1.0, ..., 5.0 (using NumPy's global random state).

    parameters:
        df: the DataFrame of the metadata csv file; its `rating` column is updated in place

    returns:
        the same DataFrame object, with no NaN or 0 left in `rating`
    '''
    # Assign the filled column back instead of calling `fillna(..., inplace=True)` on
    # `df['rating']`: the chained in-place form works on an intermediate object, is
    # deprecated, and does not update `df` at all under pandas Copy-on-Write.
    # The float cast guarantees that half-step values such as 0.5 can be stored
    # even when the column was read as integers.
    df['rating'] = df['rating'].fillna(0).astype(float)

    # Boolean mask of the rows whose rating is still undefined
    mask = df['rating'] == 0

    choices = np.arange(0.5, 5.1, 0.5).tolist()  # [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]
    random_values = np.random.choice(choices, size=mask.sum())  # one random rating per undefined row
    df.loc[mask, 'rating'] = random_values

    return df

In [7]:
# Turn the rating of each audio file into a weight that can scale its influence during training
def audio_weight(df):
    '''
    Add an `audio_weight` column derived from the `rating` column.

    Audio files have different quality levels; the weight expresses how much each file
    should count in the model: the lower the rating, the lower the weight.
    The weight is the rating divided by the highest rating found in `df`,
    limited to the range [0.1, 1.0].

    The column is written into `df` itself, which is also returned.
    '''
    best_rating = df["rating"].max()
    relative_quality = df["rating"] / best_rating

    # The lower bound keeps badly rated audio from being ignored completely
    df["audio_weight"] = np.clip(relative_quality, 0.1, 1.0)

    return df

In [8]:
# The dataset is unbalanced: the number of rows per category differs a lot.
# Each category therefore gets a weight of (relative frequency) ** (-0.5), which lowers the relative
# influence of frequent categories and raises that of rare ones.
# These weights feed a WeightedRandomSampler, so that batches drawn by the DataLoader
# contain rare categories more often than plain random sampling would.

def sampling_weight(df)->torch.Tensor:
    '''
    Compute one sampling weight per row of `df`.

    Every row receives the weight of its `primary_label`: the label's relative
    frequency in `df` raised to the power -0.5, so rows of rare categories
    have a larger chance of being picked.

    Returns a float32 tensor with one entry per row, in the row order of `df`.
    '''
    label_counts = df['primary_label'].value_counts()
    class_weights = (label_counts / label_counts.sum()) ** (-0.5)

    # Look up the class weight of every row, then hand the values to torch without an extra copy
    per_row_weights = df['primary_label'].map(class_weights).to_numpy(dtype=np.float32)

    return torch.from_numpy(per_row_weights)

In [9]:
# The metadata has been split into a train frame and a validation frame, so each split gets its own sampler.
# The sampler is later passed to the DataLoader. The order of the weights must match the row order of the
# data handed to that DataLoader, otherwise weights and rows no longer correspond.

# train df
sample_weights_tensor = sampling_weight(df=train_df)
train_sampler = WeightedRandomSampler(
    weights=sample_weights_tensor.double(),
    num_samples=len(sample_weights_tensor),
    replacement=True,
)

print(train_sampler)

# val df
sample_weights_tensor = sampling_weight(df=val_df)
val_sampler = WeightedRandomSampler(
    weights=sample_weights_tensor.double(),
    num_samples=len(sample_weights_tensor),
    replacement=True,
)

print(val_sampler)

<torch.utils.data.sampler.WeightedRandomSampler object at 0x1144d6800>
<torch.utils.data.sampler.WeightedRandomSampler object at 0x11441b9d0>


In [10]:
def read_audio(path: str):
    """
    Load an audio file (e.g. an .ogg recording) with torchaudio.

    Parameters:
        path: Path to the audio file

    Returns:
        A tuple (waveform, sample_rate) exactly as produced by `torchaudio.load`:
        the waveform tensor and the sample rate of the file.
    """
    return torchaudio.load(path)

In [11]:
class AudioTransform:
    """
    Base class for waveform augmentations that are applied with a given probability.

    Subclasses implement `apply`. Calling the transform runs `apply` either always
    (`always_apply=True`) or with probability `p`; otherwise the input is returned untouched.
    """

    def __init__(self, always_apply=False, p=0.5):
        self.always_apply = always_apply
        self.p = p

    def __call__(self, y: np.ndarray):
        # `or` short-circuits, so no random number is drawn when the transform is forced
        if self.always_apply or np.random.rand() < self.p:
            return self.apply(y)
        return y

    def apply(self, y: np.ndarray):
        """Perform the actual augmentation; must be provided by subclasses."""
        raise NotImplementedError


class CustomCompose:
    """Chain several transforms: each one receives the output of the previous one."""

    def __init__(self, transforms: list):
        self.transforms = transforms

    def __call__(self, y: np.ndarray):
        result = y
        for transform in self.transforms:
            result = transform(result)
        return result


class CustomOneOf:
    """
    With probability `p`, apply exactly one transform chosen at random from `transforms`;
    otherwise return the input unchanged.
    """

    def __init__(self, transforms: list, p=1.0):
        self.transforms = transforms
        self.p = p

    def __call__(self, y: np.ndarray):
        # First decide whether anything is applied at all
        if not (np.random.rand() < self.p):
            return y

        # Then pick one of the candidates with equal probability
        chosen = self.transforms[np.random.choice(len(self.transforms))]
        return chosen(y)


class GaussianNoiseSNR(AudioTransform):
    """
    Add white Gaussian noise whose peak lies a random number of decibels
    (between `min_snr` and `max_snr`) below the peak of the signal.
    """

    def __init__(self, always_apply=False, p=0.5, min_snr=5.0, max_snr=40.0, **kwargs):
        super().__init__(always_apply, p)

        self.min_snr = min_snr
        self.max_snr = max_snr

    def apply(self, y: np.ndarray, **params):
        snr_db = np.random.uniform(self.min_snr, self.max_snr)

        # Peak amplitude of the signal and the peak the noise must have for this SNR
        signal_peak = np.sqrt(y**2).max()
        noise_peak = signal_peak / (10 ** (snr_db / 20))

        noise = np.random.randn(len(y))
        # Rescale the noise to a peak of 1 before giving it the target peak
        unit_peak_noise = noise / np.sqrt(noise**2).max()

        # Cast back so the output keeps the dtype of the input
        return (y + unit_peak_noise * noise_peak).astype(y.dtype)


class PinkNoiseSNR(AudioTransform):
    """
    Add power-law noise (exponent 1, generated with `colorednoise`) whose peak lies a random
    number of decibels (between `min_snr` and `max_snr`) below the peak of the signal.
    """

    def __init__(self, always_apply=False, p=0.5, min_snr=5.0, max_snr=20.0, **kwargs):
        super().__init__(always_apply, p)

        self.min_snr = min_snr
        self.max_snr = max_snr

    def apply(self, y: np.ndarray, **params):
        snr_db = np.random.uniform(self.min_snr, self.max_snr)

        # Peak amplitude of the signal and the peak the noise must have for this SNR
        signal_peak = np.sqrt(y**2).max()
        noise_peak = signal_peak / (10 ** (snr_db / 20))

        pink = cn.powerlaw_psd_gaussian(1, len(y))
        # Rescale the noise to a peak of 1 before giving it the target peak
        unit_peak_pink = pink / np.sqrt(pink**2).max()

        # Cast back so the output keeps the dtype of the input
        return (y + unit_peak_pink * noise_peak).astype(y.dtype)


class VolumeControl(AudioTransform):
    """
    Change the gain of a waveform by a random number of decibels drawn from
    [-db_limit, db_limit].

    mode selects how the gain evolves over the clip:
        "uniform": the same gain for every sample
        "fade":    the gain in dB goes linearly from the drawn value (first sample) to 0 dB (last sample)
        "cosine":  the gain in dB follows one cosine period over the clip
        "sine":    the gain in dB follows one sine period over the clip
    """

    def __init__(self, always_apply=False, p=0.5, db_limit=10, mode="uniform"):
        super().__init__(always_apply, p)

        assert mode in [
            "uniform",
            "fade",
            "cosine",
            "sine",
        ], "`mode` must be one of 'uniform', 'fade', 'cosine', 'sine'"

        self.db_limit = db_limit
        self.mode = mode

    def apply(self, y: np.ndarray, **params):
        db = np.random.uniform(-self.db_limit, self.db_limit)
        if self.mode == "uniform":
            db_translated = 10 ** (db / 20)
        elif self.mode == "fade":
            # max(..., 1) avoids 0/0 (a NaN gain) when the input has a single sample;
            # for two or more samples the ramp is unchanged
            lin = np.arange(len(y))[::-1] / max(len(y) - 1, 1)
            db_translated = 10 ** (db * lin / 20)
        elif self.mode == "cosine":
            cosine = np.cos(np.arange(len(y)) / len(y) * np.pi * 2)
            db_translated = 10 ** (db * cosine / 20)
        else:
            sine = np.sin(np.arange(len(y)) / len(y) * np.pi * 2)
            db_translated = 10 ** (db * sine / 20)
        augmented = y * db_translated
        return augmented


class NoiseInjection(AudioTransform):
    """
    Add white Gaussian noise multiplied by a random level drawn from [0, max_noise_level).
    The level is absolute, i.e. it does not depend on the loudness of the signal.
    """

    def __init__(self, always_apply=False, p=0.5, max_noise_level=0.5, sr=32000):
        super().__init__(always_apply, p)

        self.noise_level = (0.0, max_noise_level)
        self.sr = sr

    def apply(self, y: np.ndarray, **params):
        lowest, highest = self.noise_level
        level = np.random.uniform(lowest, highest)
        gaussian = np.random.randn(len(y))

        # Cast back so the output keeps the dtype of the input
        return (y + gaussian * level).astype(y.dtype)


class GaussianNoise(AudioTransform):
    """
    Add white Gaussian noise at a random peak signal-to-noise ratio.

    The ratio (in dB) is drawn uniformly between `min_snr` and `max_snr` and is measured
    between the peak of the signal and the peak of the noise. `sr` is only stored.
    """

    def __init__(self, always_apply=False, p=0.5, min_snr=5, max_snr=20, sr=32000):
        super().__init__(always_apply, p)

        self.min_snr = min_snr
        self.max_snr = max_snr
        self.sr = sr

    def apply(self, y: np.ndarray, **params):
        drawn_snr = np.random.uniform(self.min_snr, self.max_snr)

        peak_signal = np.sqrt(y**2).max()
        target_noise_peak = peak_signal / (10 ** (drawn_snr / 20))

        raw_noise = np.random.randn(len(y))
        raw_noise_peak = np.sqrt(raw_noise**2).max()

        # Normalise the noise to peak 1, scale it to the target peak, keep the input dtype
        scaled_noise = raw_noise / raw_noise_peak * target_noise_peak
        return (y + scaled_noise).astype(y.dtype)


class PinkNoise(AudioTransform):
    """
    Add power-law noise (exponent 1, generated with `colorednoise`) at a random peak
    signal-to-noise ratio.

    The ratio (in dB) is drawn uniformly between `min_snr` and `max_snr` and is measured
    between the peak of the signal and the peak of the noise. `sr` is only stored.
    """

    def __init__(self, always_apply=False, p=0.5, min_snr=5, max_snr=20, sr=32000):
        super().__init__(always_apply, p)

        self.min_snr = min_snr
        self.max_snr = max_snr
        self.sr = sr

    def apply(self, y: np.ndarray, **params):
        drawn_snr = np.random.uniform(self.min_snr, self.max_snr)

        peak_signal = np.sqrt(y**2).max()
        target_noise_peak = peak_signal / (10 ** (drawn_snr / 20))

        raw_noise = cn.powerlaw_psd_gaussian(1, len(y))
        raw_noise_peak = np.sqrt(raw_noise**2).max()

        # Normalise the noise to peak 1, scale it to the target peak, keep the input dtype
        scaled_noise = raw_noise / raw_noise_peak * target_noise_peak
        return (y + scaled_noise).astype(y.dtype)


class TimeStretch(AudioTransform):
    """
    Stretch a waveform in time by a random rate drawn uniformly from [0, max_rate).

    The rate is handed to `librosa.effects.time_stretch`; `sr` is only stored.
    NOTE(review): rates close to 0 presumably produce very long outputs — consider a
    larger lower bound if this transform is put into a pipeline.
    """

    def __init__(self, always_apply=False, p=0.5, max_rate=1, sr=32000):
        super().__init__(always_apply, p)
        self.max_rate = max_rate
        self.sr = sr

    def apply(self, y: np.ndarray, **params):
        rate = np.random.uniform(0, self.max_rate)
        if rate <= 0:
            # librosa rejects non-positive rates; leave the audio untouched instead of failing
            return y
        # `rate` must be passed by keyword: it is keyword-only in librosa >= 0.10,
        # and the keyword form is also accepted by older releases
        augmented = librosa.effects.time_stretch(y, rate=rate)
        return augmented


def _db2float(db: float, amplitude=True):
    if amplitude:
        return 10 ** (db / 20)
    else:
        return 10 ** (db / 10)


def volume_down(y: np.ndarray, db: float):
    """
    Low level API for decreasing the volume
    Parameters
    ----------
    y: numpy.ndarray
        stereo / monaural input audio
    db: float
        by how many decibels the volume is decreased
    Returns
    -------
    applied: numpy.ndarray
        audio with decreased volume
    """
    # Amplitude factor for a change of -db decibels
    gain = 10 ** (-db / 20)
    return y * gain


def volume_up(y: np.ndarray, db: float):
    """
    Low level API for increasing the volume
    Parameters
    ----------
    y: numpy.ndarray
        stereo / monaural input audio
    db: float
        by how many decibels the volume is increased
    Returns
    -------
    applied: numpy.ndarray
        audio with increased volume
    """
    # Amplitude factor for a change of +db decibels
    gain = 10 ** (db / 20)
    return y * gain


class RandomVolume(AudioTransform):
    """
    Raise or lower the volume by a random number of decibels drawn uniformly
    from [-limit, limit): positive draws amplify, negative draws attenuate.
    """

    def __init__(self, always_apply=False, p=0.5, limit=10):
        super().__init__(always_apply, p)
        self.limit = limit

    def apply(self, y: np.ndarray, **params):
        db = np.random.uniform(-self.limit, self.limit)
        if db >= 0:
            return volume_up(y, db)
        else:
            # volume_down expects the (positive) amount to decrease by. Passing the
            # negative draw itself would flip the sign twice and amplify the audio.
            return volume_down(y, -db)


class CosineVolume(AudioTransform):
    """
    Modulate the volume along the clip: the gain in decibels follows one cosine period
    whose amplitude is drawn uniformly from [-limit, limit).
    """

    def __init__(self, always_apply=False, p=0.5, limit=10):
        super().__init__(always_apply, p)
        self.limit = limit

    def apply(self, y: np.ndarray, **params):
        db = np.random.uniform(-self.limit, self.limit)

        # One full cosine period spread over the samples of the clip
        phase = np.arange(len(y)) / len(y) * np.pi * 2
        gain_db = np.cos(phase) * db

        return y * _db2float(gain_db)


class AddGaussianNoise(AudioTransform):
    """Add gaussian noise, scaled by a random amplitude factor, to the samples"""

    supports_multichannel = True

    def __init__(
        self, always_apply=False, min_amplitude=0.001, max_amplitude=0.015, p=0.5
    ):
        """
        :param always_apply: Apply the transform on every call, ignoring `p`
        :param min_amplitude: Minimum noise amplification factor (must be positive)
        :param max_amplitude: Maximum noise amplification factor (must not be below the minimum)
        :param p: The probability of applying this transform
        """
        super().__init__(always_apply, p)
        assert min_amplitude > 0.0
        assert max_amplitude > 0.0
        assert max_amplitude >= min_amplitude
        self.min_amplitude = min_amplitude
        self.max_amplitude = max_amplitude

    def apply(self, samples: np.ndarray, sample_rate=32000):
        factor = np.random.uniform(self.min_amplitude, self.max_amplitude)
        # Standard-normal noise with the same shape as the input
        gaussian = np.random.randn(*samples.shape).astype(np.float32)
        return samples + factor * gaussian


class AddGaussianSNR(AudioTransform):
    """
    Add gaussian noise to the input. A random Signal to Noise Ratio (SNR) is picked
    uniformly on the decibel scale, which matches human hearing better than a linear
    scale. The SNR is measured between the RMS of the input and the RMS of the noise.
    """

    supports_multichannel = True

    def __init__(
        self,
        always_apply=False,
        min_snr_in_db: float = 5.0,
        max_snr_in_db: float = 40.0,
        p: float = 0.5,
    ):
        """
        :param always_apply: Apply the transform on every call, ignoring `p`
        :param min_snr_in_db: Minimum signal-to-noise ratio in dB. A lower number means more noise.
        :param max_snr_in_db: Maximum signal-to-noise ratio in dB. A greater number means less noise.
        :param p: The probability of applying this transform
        """
        super().__init__(always_apply, p)
        self.min_snr_in_db = min_snr_in_db
        self.max_snr_in_db = max_snr_in_db

    def apply(self, samples: np.ndarray, sample_rate=32000):
        snr_db = np.random.uniform(self.min_snr_in_db, self.max_snr_in_db)

        # RMS of the clean input and the noise RMS required for the drawn SNR
        signal_rms = np.sqrt(np.mean(np.square(samples)))
        noise_rms = signal_rms / (10 ** (float(snr_db) / 20))

        gaussian = np.random.normal(0.0, noise_rms, size=samples.shape)
        return samples + gaussian.astype(np.float32)


class Normalize(AudioTransform):
    """
    Peak normalization: apply a constant gain so that the highest signal level present in
    the sound becomes 0 dBFS, i.e. the loudest level allowed if all samples must lie
    between -1 and 1.

    With `apply_to="only_too_loud_sounds"` inputs whose peak is already below 1 are left as they are.
    """

    supports_multichannel = True

    def __init__(self, always_apply=False, apply_to: str = "all", p: float = 0.5):
        super().__init__(always_apply, p)
        assert apply_to in ("all", "only_too_loud_sounds")
        self.apply_to = apply_to

    def apply(self, samples: np.ndarray, sample_rate=32000):
        peak = np.amax(np.abs(samples))

        quiet_enough = self.apply_to == "only_too_loud_sounds" and peak < 1.0
        # A silent input has no peak to normalise by, so it is returned unchanged as well
        if quiet_enough or not (peak > 0):
            return samples

        return samples / peak

class NormalizeMelSpec(nn.Module):
    """
    Standardise every item of a batch and rescale it to the range [0, 1].

    Each item (first dimension) is standardised with the mean and standard deviation taken
    over dimensions 1 and 2, then min-max scaled with its own minimum and maximum.
    Items whose standardised value range is not larger than `eps` come out as all zeros.
    """

    def __init__(self, eps=1e-6):
        super().__init__()
        self.eps = eps

    def forward(self, X):
        # Per-item standardisation; eps protects against a zero standard deviation
        item_mean = X.mean((1, 2), keepdim=True)
        item_std = X.std((1, 2), keepdim=True)
        Xstd = (X - item_mean) / (item_std + self.eps)

        # Per-item extrema over the last two dimensions
        lowest = Xstd.min(-1)[0].min(-1)[0]
        highest = Xstd.max(-1)[0].max(-1)[0]
        value_range = highest - lowest

        # Only items with a non-degenerate value range can be rescaled
        rescalable = value_range > self.eps * torch.ones_like(value_range)

        V = torch.zeros_like(Xstd)
        if rescalable.sum():
            low_sel = lowest[rescalable, None, None]
            high_sel = highest[rescalable, None, None]
            bounded = torch.max(torch.min(Xstd[rescalable], high_sel), low_sel)
            V[rescalable] = (bounded - low_sel) / (high_sel - low_sel)
        return V

In [12]:
# Collect every category (`primary_label`) that occurs in the metadata
meta_df=pd.read_csv(metadata_path,header=0)
bird_cates=meta_df.primary_label.unique()

# The order of this array matters: it has to be matched one by one in the subsequent training,
# so the array is saved as a .npy file to keep that order fixed.
# np.save does not create missing folders, so make sure the target folder exists first.
os.makedirs("./temp_files", exist_ok=True)
np.save("./temp_files/13-2-bird-cates.npy", bird_cates)

In [13]:
# Show the category array that was just saved
bird_cates

array(['malpar1', 'litgre1', 'houspa', 'indrob1', 'comtai1', 'grynig2',
       'rufwoo2', 'yebbul3', 'indpit1', 'gyhcaf1', 'ruftre2', 'wynlau1',
       'inpher1', 'comkin1', 'comior1', 'tibfly3', 'pomgrp2', 'oripip1',
       'indtit1', 'nutman', 'junmyn1', 'rutfly6', 'goflea1', 'litegr',
       'lesyel1', 'lewduc1', 'inbrob1', 'crfbar1', 'scamin3', 'shikra1',
       'gryfra', 'commoo3', 'grewar3', 'brodro1', 'rocpig', 'categr',
       'ingori1', 'plhpar1', 'sbeowl1', 'bwfshr1', 'junowl1', 'orihob2',
       'greegr', 'barswa', 'paisto1', 'moipig1', 'plapri1', 'forwag1',
       'maghor2', 'brasta1', 'lirplo', 'grecou1', 'kenplo1', 'bkcbul1',
       'grbeat1', 'junbab2', 'comsan', 'whbtre1', 'brnhao1', 'brcful1',
       'whcbar1', 'hoopoe', 'plaflo1', 'maltro1', 'piekin1', 'brnshr',
       'whiter2', 'brfowl1', 'pursun4', 'grehor1', 'pursun3', 'purswa3',
       'yebbab1', 'lblwar1', 'malwoo1', 'laudov1', 'grenig1', 'tilwar1',
       'heswoo1', 'compea', 'putbab1', 'smamin1', 'rorpar', 'gr

In [14]:
# Load the category array back from the .npy file written above.
# NOTE(review): allow_pickle=True is presumably required because the label strings are stored
# as an object array — confirm; pickled files should only be loaded from trusted sources.
loaded_array = np.load("./temp_files/13-2-bird-cates.npy",allow_pickle=True)

# Print the array contents and its type to verify the round trip
print(loaded_array)
print(type(loaded_array))

['malpar1' 'litgre1' 'houspa' 'indrob1' 'comtai1' 'grynig2' 'rufwoo2'
 'yebbul3' 'indpit1' 'gyhcaf1' 'ruftre2' 'wynlau1' 'inpher1' 'comkin1'
 'comior1' 'tibfly3' 'pomgrp2' 'oripip1' 'indtit1' 'nutman' 'junmyn1'
 'rutfly6' 'goflea1' 'litegr' 'lesyel1' 'lewduc1' 'inbrob1' 'crfbar1'
 'scamin3' 'shikra1' 'gryfra' 'commoo3' 'grewar3' 'brodro1' 'rocpig'
 'categr' 'ingori1' 'plhpar1' 'sbeowl1' 'bwfshr1' 'junowl1' 'orihob2'
 'greegr' 'barswa' 'paisto1' 'moipig1' 'plapri1' 'forwag1' 'maghor2'
 'brasta1' 'lirplo' 'grecou1' 'kenplo1' 'bkcbul1' 'grbeat1' 'junbab2'
 'comsan' 'whbtre1' 'brnhao1' 'brcful1' 'whcbar1' 'hoopoe' 'plaflo1'
 'maltro1' 'piekin1' 'brnshr' 'whiter2' 'brfowl1' 'pursun4' 'grehor1'
 'pursun3' 'purswa3' 'yebbab1' 'lblwar1' 'malwoo1' 'laudov1' 'grenig1'
 'tilwar1' 'heswoo1' 'compea' 'putbab1' 'smamin1' 'rorpar' 'graher1'
 'ashpri1' 'piebus1' 'grnwar1' 'eurbla2' 'asikoe2' 'whbwat1' 'sqtbul1'
 'brwowl1' 'bncwoo3' 'ashwoo2' 'pabflo1' 'eaywag1' 'ashdro1' 'rerswa1'
 'emedov2' 'houcro1'

In [15]:
# Find the position of one example category ('gyhcaf1') in the loaded category array;
# [0][0] takes the first matching position returned by np.where
index = np.where(loaded_array == 'gyhcaf1')[0][0]

index

9

In [16]:
# Demonstrate the one-hot encoding of a label: a zero vector with one entry per category ...
a=torch.zeros(len(loaded_array))

# ... with a 1 at the position of the category.
# NOTE(review): 9 is the value of `index` found in the previous cell for 'gyhcaf1'; it is hard-coded here.
a[9]=1

a

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.])

In [17]:
# Per-category weight computed on the training frame: (relative frequency) ** (-0.5)
label_counts = train_df['primary_label'].value_counts()
sample_weights = (label_counts / label_counts.sum()) ** (-0.5)

# Turn the Series into a two-column DataFrame for easier processing
sample_weights_df = sample_weights.reset_index()
sample_weights_df.columns = ['label', 'weight']

# Make `label` an ordered categorical that follows the order of loaded_array,
# so that sorting by `label` reproduces that order
sample_weights_df['label'] = pd.Categorical(
    sample_weights_df['label'],
    categories=loaded_array,
    ordered=True,
)

# Before sorting: rows are still in value_counts order
print(sample_weights_df)

# Sort the DataFrame according to the category order
sample_weights_df = sample_weights_df.sort_values('label').reset_index(drop=True)

# After sorting: the weights are in the order of loaded_array
print(sample_weights_df)

       label      weight
0    blrwar1    3.300550
1     houspa    4.865550
2    grewar3    5.655309
3     commyn    5.852887
4     hoopoe    6.044591
..       ...         ...
177  blaeag1   90.727614
178  darter2   99.387122
179  asiope1  114.762363
180   integr  118.790331
181  niwpig1  134.013568

[182 rows x 2 columns]
       label     weight
0    malpar1  39.286209
1    litgre1   8.817445
2     houspa   4.865550
3    indrob1  26.328292
4    comtai1  10.093835
..       ...        ...
177  lobsun2  34.497774
178  rossta2  18.799193
179  bkrfla1  44.898525
180  indrol2  31.667370
181  cregos1  19.585808

[182 rows x 2 columns]


In [18]:
# Per-category weights as a tensor, taken from the DataFrame that was sorted into the order of loaded_array above
cate_weight=torch.tensor(sample_weights_df['weight'].values)
cate_weight

tensor([ 39.2862,   8.8174,   4.8656,  26.3283,  10.0938,  43.3761,  28.6310,
         29.3077,  21.5855,  10.6493,  15.4279,  53.1247,  81.1492,   8.0219,
         11.0841,  16.3281,  83.9974,  23.0448,  47.9287,  16.3391,  49.3859,
         63.4961,  20.9526,   9.7787,  30.3835,  32.5031,  17.5283,  49.3859,
         64.1541,  18.2369,  28.9327,   9.0166,   5.6553,  13.4442,  11.8117,
         10.0731,  21.7399,  32.8564,  29.8311,  30.9679,  22.7710,  27.4596,
         12.8201,   7.1209,  57.8654,  35.5863,  13.7364,  22.7710,  27.3037,
         52.3816,   7.6656,  13.8024,  13.2047,  33.6954,  26.9007,  17.2619,
          8.6473,  34.0895,  13.9238,  16.4959,  26.6577,   6.0446,  52.3816,
         50.6524,  17.6107,  13.5752,  10.1516,  32.1609,  15.8639,  27.5122,
         22.5938,  31.2730,  34.6022,  18.0704,  61.0530,  15.7937,  28.0549,
         34.7075,  36.1707,  25.7909,  10.0964,  48.4959,   8.0760,   7.1091,
         17.5968,  17.2749,  12.7409,  37.9739,  10.0602,  11.19

In [19]:
class BirdclefDataset(Dataset):
    """
    Dataset that yields one fixed-length audio clip per row of the metadata frame.

    Every item is a tuple ``(label, clip, weight)``:
        label:  one-hot float16 tensor over the categories of the category file
        clip:   float16 tensor [num_channels, CLIP_SECONDS * sample_rate]
                (training items only carry the first channel, see `random_audio_augmentation`)
        weight: float16 scalar tensor taken from the `audio_weight` column
    """

    # Length of every returned clip, in seconds
    CLIP_SECONDS = 5

    def __init__(
        self,
        df: pd.DataFrame,
        bird_category_dir: str,
        audio_dir: str = "../../data/train_audio",
        train: bool = True,
    ):
        """
        parameters:
            df: the dataframe of metadata (train/val); the `rating` column is filled and an
                `audio_weight` column is added to this very frame by the helper functions
            bird_category_dir: the path of the bird category array file (npy)
            audio_dir: the parent path where all audio files are stored
            train: whether the Dataset serves the train set (augmentation on) or the val set
        """
        super().__init__()
        # whether the Dataset is used for training or validation
        self.train = train
        self.raw_df = df

        # interpolate NaN or 0 values of the rating col
        self.raw_df = rating_value_interplote(df=self.raw_df)
        # calculate the weight of each audio file from its rating
        self.raw_df = audio_weight(self.raw_df)

        self.audio_dir = audio_dir

        # category order stored in this file defines the position of each class in the one-hot label
        self.bird_cate_array = np.load(bird_category_dir, allow_pickle=True)

        # initialize the data augmentation pipeline
        self.np_audio_transforms = self.setup_transforms()

    def setup_transforms(self):
        """
        Build the augmentation pipeline: with probability 0.3 exactly one of the
        noise transforms below is picked for a waveform.
        """
        return CustomCompose(
            [
                CustomOneOf(
                    [
                        NoiseInjection(p=1, max_noise_level=0.04),
                        GaussianNoise(p=1, min_snr=5, max_snr=20),
                        PinkNoise(p=1, min_snr=5, max_snr=20),
                        AddGaussianNoise(
                            min_amplitude=0.0001, max_amplitude=0.03, p=0.5
                        ),
                        AddGaussianSNR(min_snr_in_db=5, max_snr_in_db=15, p=0.5),
                    ],
                    p=0.3,
                ),
            ]
        )

    def get_audio_path(self, file_name: str) -> str:
        """
        Build the path of a single audio file from the `filename` value of a metadata row.

        Parameters:
            file_name: in format category_type/XC-ID.ogg (asbfly/XC134896.ogg)

        Return:
            the single audio path string
        """
        # concatenate parent path and child path
        return os.path.join(self.audio_dir, file_name)

    def target_clip(
        self, index: int, audio: torch.Tensor, sample_rate: int
    ) -> torch.Tensor:
        """
        Cut the clip that belongs to metadata row `index` out of the raw audio.

        The clip starts at the row's `clip_start_time` (seconds) and always has
        CLIP_SECONDS * sample_rate samples; if the audio ends earlier, the clip is
        padded with silence (zeros) at the end.

        Parameters:
            index: positional index of the metadata row
            audio: the raw audio in tensor [num_channels,length]
            sample_rate: audio sampling rate

        Raises:
            ValueError: if the start time is negative or lies after the row's `duration`
        """
        clip_start_time = self.raw_df["clip_start_time"].iloc[index]
        duration_seconds = self.raw_df["duration"].iloc[index]

        if not (0 <= clip_start_time <= duration_seconds):
            raise ValueError("The clip start time is out of raw audio length")

        # Metadata values may be numpy integers or floats, while slicing needs plain ints
        segment_length = int(self.CLIP_SECONDS * sample_rate)
        clip_start_point = int(round(clip_start_time * sample_rate))

        # Slicing stops at the end of the audio, so the last clip of a file may come out short
        clip = audio[:, clip_start_point : clip_start_point + segment_length]

        # Pad up to exactly `segment_length` samples. Measuring the shortfall on the clip itself
        # keeps the length fixed even when the start lies beyond the decoded audio.
        padding_length = segment_length - clip.shape[1]
        if padding_length > 0:
            silence = audio.new_zeros((audio.shape[0], padding_length))
            clip = torch.cat((clip, silence), dim=1)

        return clip

    def random_audio_augmentation(self, audio: torch.Tensor):
        """
        Run the augmentation pipeline on the first channel of `audio`.

        audio (torch.Tensor): A 2D tensor of audio samples with shape (1, N), where N is the number of samples.

        Returns a float16 tensor of shape (1, N).
        """
        audio_aug = self.np_audio_transforms(audio[0].numpy())

        # back to a 2D tensor with a single channel, so input and output have the same layout
        audio_aug_tensor = torch.from_numpy(audio_aug)
        audio_aug_tensor = audio_aug_tensor.unsqueeze(0).to(dtype=torch.float16)

        return audio_aug_tensor

    def audio_label_tensor_generator(self, true_label: str) -> torch.Tensor:
        """
        Generate the one-hot tensor over all categories for the given audio label.

        Parameters:
            true_label: a label string

        Return:
            With 10 classes and a label stored at position 1,
            the return is tensor([0,1,0,0,0,0,0,0,0,0]) (dtype float16)

        Raises:
            IndexError: if the label is not contained in the category array
        """
        # position of the label in the category array
        idx = np.where(self.bird_cate_array == true_label)[0][0]

        # all zeros, one entry per category
        audio_label_tensor = torch.zeros(len(self.bird_cate_array), dtype=torch.float16)

        # mark the position of the true label
        audio_label_tensor[idx] = 1

        return audio_label_tensor

    def __len__(self):
        return self.raw_df.shape[0]

    def __getitem__(self, index):
        row = self.raw_df.iloc[index]

        # read the whole audio file that belongs to this row
        audio, sr = read_audio(self.get_audio_path(row["filename"]))

        # augmentation is only used for the train set
        # NOTE(review): the augmentation runs on the whole recording before the clip is cut out;
        # clipping first would be cheaper but changes how the noise level is derived.
        if self.train:
            audio = self.random_audio_augmentation(audio=audio)

        # get the audio clip corresponding to index
        clip = self.target_clip(index, audio=audio, sample_rate=sr)

        # the generator already returns a float16 tensor, no re-wrapping needed
        audio_label_tensor = self.audio_label_tensor_generator(
            true_label=row["primary_label"]
        )

        # copy=True detaches the clip from the storage of the full recording;
        # re-wrapping an existing tensor with torch.tensor() would raise a UserWarning
        clip = clip.to(dtype=torch.float16, copy=True)
        weight = torch.tensor(row["audio_weight"], dtype=torch.float16)

        return audio_label_tensor, clip, weight

In [20]:
# Build the training dataset (augmentation enabled) and a DataLoader that draws its rows through `train_sampler`
BD=BirdclefDataset(df=train_df,bird_category_dir="./temp_files/13-2-bird-cates.npy",train=True)
train_dataloader = DataLoader(dataset=BD, batch_size=32, sampler=train_sampler, pin_memory=True)

In [21]:
# Sanity check: pull a single batch and inspect the content, type and shape of its three parts
# (one-hot labels, audio clips, per-audio weights)
batch = next(iter(train_dataloader))
audio_label,clip,audio_weights = batch
print(audio_label)
print(type(audio_label))
print(audio_label.shape)
print(clip)
print(clip.shape)
print(audio_weights)
print(audio_weights.shape)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float16)
<class 'torch.Tensor'>
torch.Size([32, 182])
tensor([[[-3.6499e-02, -2.0172e-02, -4.3945e-03,  ..., -2.1088e-02,
          -1.9821e-02, -1.3184e-02]],

        [[-1.1091e-03, -8.6746e-03, -1.6012e-03,  ...,  6.3591e-03,
          -1.2703e-02, -3.5583e-02]],

        [[ 6.0499e-05,  4.7207e-05, -6.4373e-06,  ..., -9.2745e-04,
          -6.9523e-04, -2.4796e-04]],

        ...,

        [[ 3.1352e-05, -8.7023e-06,  5.6028e-06,  ..., -8.1682e-04,
          -2.3212e-03, -3.0594e-03]],

        [[-2.5249e-04,  8.3876e-04,  1.0834e-03,  ...,  2.8312e-05,
           1.6108e-03, -2.1839e-03]],

        [[ 5.1856e-06,  2.9802e-06, -3.6359e-06,  ...,  1.6174e-02,
           5.2147e-03, -5.3453e-04]]], dtype=torch.float16)
tor

In [22]:
# One-hot label of the first item of the batch
audio_label[0]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.], dtype=torch.float16)

In [23]:
class Mixup(nn.Module):
    """Additive batch mixup: each sample is summed with one or two shuffled partners.

    Args:
        mix_beta: Concentration used for both parameters of the stored Beta
            distribution. The distribution is kept as an attribute but is not
            sampled by ``forward``.
        mixup_prob: Probability that a batch is mixed at all.
        mixup_double: Probability that a mixed batch uses ONE partner
            (``X + X[perm]``); otherwise two partners are added
            (``X + X[perm] + X[perm2]``).
    """

    def __init__(self, mix_beta, mixup_prob, mixup_double):
        super().__init__()
        self.beta_distribution = Beta(mix_beta, mix_beta)
        self.mixup_prob = mixup_prob
        self.mixup_double = mixup_double

    def forward(self, X, Y, weight=None):
        """Mix a batch along its first dimension.

        Args:
            X: Batch of inputs; dimension 0 is the batch dimension.
            Y: Batch of labels aligned with ``X``; the summed labels are clamped to [0, 1].
            weight: Optional per-sample weights; mixed as the plain average of
                the samples that were summed.

        Returns:
            ``(X, Y)`` when ``weight`` is None, otherwise ``(X, Y, weight)``.
        """
        has_weight = weight is not None

        # First draw decides whether this batch is mixed at all.
        if not (torch.rand((1,))[0] < self.mixup_prob):
            return (X, Y, weight) if has_weight else (X, Y)

        batch_size = X.shape[0]
        partners = [torch.randperm(batch_size)]

        # Second draw: below `mixup_double` -> one partner, otherwise add a second one.
        if not (torch.rand((1,))[0] < self.mixup_double):
            partners.append(torch.randperm(batch_size))

        # Every summed sample contributes equally to the mixed weight (1/2 or 1/3 each).
        share = 1 / (len(partners) + 1)

        mixed_X, mixed_Y = X, Y
        mixed_weight = share * weight if has_weight else None
        for order in partners:
            mixed_X = mixed_X + X[order]
            mixed_Y = mixed_Y + Y[order]
            if has_weight:
                mixed_weight = mixed_weight + share * weight[order]

        # Summed labels can exceed 1 when partners share a class; clamp back to [0, 1].
        mixed_Y = torch.clamp(mixed_Y, 0, 1)

        if has_weight:
            return mixed_X, mixed_Y, mixed_weight
        return mixed_X, mixed_Y

In [24]:
# Mixup was originally applied inside the LightningModule. To improve training efficiency,
# it is moved out of the LightningModule and into the collate_fn passed to the DataLoader.
mixup_layer = Mixup(mix_beta=5,mixup_prob=0.7,mixup_double=0.5)

def mixup_collate(batch,mixup_layer):
    """Collate samples into a batch and pass the batch through ``mixup_layer``.

    Args:
        batch: List of per-sample tuples ordered as (labels, clip, weights).
            ``weights`` may be None for every sample.
        mixup_layer: Callable invoked as ``mixup_layer(X=..., Y=..., weight=...)``.

    Returns:
        Whatever ``mixup_layer`` returns for the stacked audio (X), stacked
        labels (Y) and stacked weights (or None).
    """
    label_items, audio_items, weight_items = zip(*batch)

    stacked_labels = torch.stack(label_items)
    stacked_audios = torch.stack(audio_items)

    # The first sample decides whether the batch carries weights at all.
    stacked_weights = None
    if weight_items[0] is not None:
        stacked_weights = torch.stack(weight_items)

    return mixup_layer(X=stacked_audios, Y=stacked_labels, weight=stacked_weights)
    



# Build the training dataset and a DataLoader whose collate_fn stacks each batch and applies mixup_layer to it.
BD=BirdclefDataset(df=train_df,bird_category_dir="./temp_files/13-2-bird-cates.npy",train=True)
train_dataloader = DataLoader(dataset=BD, batch_size=32, sampler=train_sampler, pin_memory=True,collate_fn=lambda batch: mixup_collate(batch, mixup_layer))


In [25]:
# Pull one batch. mixup_collate returns the mixup layer's output, ordered (audio, labels, weights).
batch = next(iter(train_dataloader))
clip,audio_label,audio_weights = batch
# print(audio_label)
print(type(audio_label))
print(audio_label.shape)
# print(clip)
print(clip.shape)
print(audio_weights)
print(audio_weights.shape)

<class 'torch.Tensor'>
torch.Size([32, 182])
torch.Size([32, 1, 160000])
tensor([0.7998, 0.8999, 0.6001, 0.7998, 0.6001, 0.7998, 0.5000, 0.8999, 0.2000,
        0.7998, 1.0000, 0.6001, 1.0000, 0.7002, 1.0000, 1.0000, 0.8999, 1.0000,
        0.7998, 0.5000, 0.8999, 0.7998, 0.7002, 0.7002, 0.7998, 0.7998, 0.7002,
        0.3999, 0.7002, 0.7998, 0.5000, 0.6001], dtype=torch.float16)
torch.Size([32])


In [26]:
# Display the label vector of the first sample of the mixed batch.
audio_label[0]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.], dtype=torch.float16)

In [27]:
def mel_transform(sample_rate:float,audio:torch.Tensor,window_size: float=0.04,hop_size:float=0.01,n_mels:int=40,f_min:float=0.0,f_max:float=16000.0)->torch.Tensor:
    """
    Transform audio data into a mel spectrogram.

    Args:
        sample_rate: Sampling rate of ``audio`` in Hz.
        audio: Waveform tensor handed to torchaudio's ``MelSpectrogram``.
        window_size: FFT window length in seconds (``n_fft = int(window_size * sample_rate)``).
        hop_size: Hop between frames in seconds (``hop_length = int(hop_size * sample_rate)``).
        n_mels: Number of mel filterbanks.
        f_min: Lowest frequency covered by the filterbank, in Hz.
        f_max: Highest frequency covered by the filterbank, in Hz. Values above
            the Nyquist frequency are capped to ``sample_rate / 2``.

    Returns:
        The output of ``MelSpectrogram`` for ``audio`` (power mel spectrogram
        with ``n_mels`` mel bins).
    """
    n_fft = int(window_size * sample_rate)
    hop_length = int(hop_size * sample_rate)

    # There are no FFT bins above Nyquist, so a larger f_max would only
    # create empty mel filters. For 32 kHz audio the default is unchanged.
    f_max = min(f_max, sample_rate / 2)

    mel_transformer = MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        f_min=f_min,
        f_max=f_max,
    ).to(audio.device)  # keep the transform's buffers on the same device as the audio

    melspec = mel_transformer(audio)

    return melspec

In [28]:
def compute_deltas(specgram: torch.Tensor, win_length: int = 5, mode: str = "replicate") -> torch.Tensor:
    """Compute delta coefficients along the last dimension of a tensor.

    The tensor is usually a spectrogram. The kernel is created with the
    input's dtype and on the input's device.

    Args:
        specgram (Tensor): Tensor of dimension (..., freq, time); deltas are
            taken along the last dimension.
        win_length (int, optional): Window length used for the delta (Default: 5)
        mode (str, optional): Padding mode passed to ``torch.nn.functional.pad``
            (Default: "replicate")

    Returns:
        Tensor: Deltas with the same shape as ``specgram``.
    """
    original_shape = specgram.size()

    # Collapse all leading dimensions into one channel axis: (1, rows, time).
    flat = specgram.reshape(1, -1, original_shape[-1])
    num_rows = flat.shape[1]

    assert win_length >= 3
    half = (win_length - 1) // 2
    # Normaliser of the regression formula: 2 * sum_{k=1..half} k^2.
    norm = half * (half + 1) * (2 * half + 1) / 3

    padded = torch.nn.functional.pad(flat, (half, half), mode=mode)

    # One identical [-half, ..., half] filter per row, applied as a grouped convolution.
    taps = torch.arange(-half, half + 1, 1, dtype=specgram.dtype, device=specgram.device)
    weight = taps.repeat(num_rows, 1, 1)
    deltas = torch.nn.functional.conv1d(padded, weight, groups=num_rows) / norm

    # Restore the caller's original layout.
    return deltas.reshape(original_shape)



def make_delta(input_tensor: torch.Tensor):
    """Compute delta coefficients along dimension 2 of a 4-D tensor.

    Dimensions 2 and 3 are swapped so that ``compute_deltas`` (which works on
    the last dimension) runs along the original dimension 2, then swapped back.

    NOTE(review): the collate functions in this notebook appear to pass
    (batch, channel, n_mels, time) tensors, in which case the difference is
    taken across mel bins rather than across time frames. Confirm this is the
    intended axis.
    """
    swapped = input_tensor.transpose(3, 2)
    return compute_deltas(swapped).transpose(3, 2)


def image_delta(x):
    """Stack ``x`` with its first- and second-order deltas along dimension 1.

    The second-order delta is the delta of the first-order delta, so the
    result has three times as many entries along dimension 1 as ``x``.
    """
    first_order = make_delta(x)
    second_order = make_delta(first_order)
    return torch.cat([x, first_order, second_order], dim=1)

In [29]:
class Mixup2(nn.Module):
    """Convex batch mixup with one Beta-distributed coefficient per sample.

    Args:
        mix_beta: Concentration used for both parameters of the Beta
            distribution the mixing coefficients are drawn from.
        mixup2_prob: Probability that a batch is mixed at all.
    """

    def __init__(self, mix_beta, mixup2_prob):
        super().__init__()
        self.beta_distribution = Beta(mix_beta, mix_beta)
        self.mixup2_prob = mixup2_prob

    @staticmethod
    def _per_sample(coeffs, like):
        """Shape (bs,) coefficients as (bs, 1, ..., 1) on ``like``'s device.

        One trailing singleton axis is added per non-batch dimension of
        ``like``, so the coefficients broadcast per sample for any rank.
        """
        trailing = [1] * (like.dim() - 1)
        return coeffs.to(like.device).view(-1, *trailing)

    def forward(self, X, Y, weight=None):
        """Blend every sample with a randomly chosen partner from the batch.

        Args:
            X: Batch of inputs of any rank >= 1; dimension 0 is the batch dimension.
            Y: Batch of labels aligned with ``X``.
            weight: Optional per-sample weights aligned with ``X``.

        Returns:
            ``(X, Y)`` when ``weight`` is None, otherwise ``(X, Y, weight)``.
            Each output is ``c * original + (1 - c) * partner`` with the same
            per-sample coefficient ``c`` for X, Y and weight.
        """
        p = torch.rand((1,))[0]
        if p < self.mixup2_prob:
            bs = X.shape[0]
            perm = torch.randperm(bs)
            coeffs = self.beta_distribution.rsample(torch.Size((bs,)))

            # Reshape the coefficients from each tensor's own rank rather than
            # a fixed 2-D/3-D/4-D ladder, so any input rank mixes correctly.
            cx = self._per_sample(coeffs, X)
            X = cx * X + (1 - cx) * X[perm]

            cy = self._per_sample(coeffs, Y)
            Y = cy * Y + (1 - cy) * Y[perm]

            if weight is None:
                return X, Y

            cw = self._per_sample(coeffs, weight)
            weight = cw * weight + (1 - cw) * weight[perm]
            return X, Y, weight

        if weight is None:
            return X, Y
        return X, Y, weight

In [30]:
# The collate function above shows that mixup can be moved out of the LightningModule and into the DataLoader.
# Next, the remaining data-conversion steps are added to the collate function as well.
# Everything configured in this cell is used by the train DataLoader.

mixup_layer = Mixup(mix_beta=5, mixup_prob=0.7, mixup_double=0.5)
mixup2_layer = Mixup2(mix_beta=2, mixup2_prob=0.15)

# Waveform-level augmentations, each applied with probability p=0.4.
audio_transforms = Compose(
    [
        # AddColoredNoise(p=0.5),
        PitchShift(
            min_transpose_semitones=-4,
            max_transpose_semitones=4,
            sample_rate=32000,
            p=0.4,
        ),
        Shift(min_shift=-0.5, max_shift=0.5, p=0.4),
    ]
)


def trainloader_collate(batch, mixup_layer):
    """
    Collate training samples into a batch and run the training-time
    augmentation / feature pipeline on it.

    Relies on the notebook-level ``audio_transforms`` and ``mixup2_layer``.

    parameters:
        batch: is a list of tuples with (labels, clip, weights);
               ``weights`` may be None for every sample
        mixup_layer: waveform-level mixup module, called as
               ``mixup_layer(X=clips, Y=labels, weight=weights)``

    returns:
        (clips, labels, weights) where ``clips`` is the 3-channel
        spectrogram batch (original + delta + delta-delta) and ``weights``
        is None when the samples carry no weights
    """
    # Unpack each individual sample in the batch
    labels, clips, weights = zip(*batch)

    # Stack the data into new batches
    labels = torch.stack(labels).float()
    clips = torch.stack(clips).float()

    weights = torch.stack(weights) if weights[0] is not None else None

    # The mixup layers return (X, Y) when no weights are given and
    # (X, Y, weight) otherwise, so unpack by length instead of assuming three values.
    mixed = mixup_layer(X=clips, Y=labels, weight=weights)
    clips, labels = mixed[0], mixed[1]
    weights = mixed[2] if len(mixed) > 2 else None

    # Waveform augmentations (pitch shift / time shift) to improve generalization and robustness.
    clips = audio_transforms(clips, sample_rate=32000)

    # Convert audio data into mel spectrogram
    clips = mel_transform(sample_rate=32000, audio=clips)

    # Convert the power mel spectrogram to decibels (dB)
    clips = torchaudio.transforms.AmplitudeToDB(stype="power", top_db=80)(clips)

    # normalization
    clips = (clips + 80) / 80

    # Randomly mask part of the time axis so the model becomes robust to missing segments.
    clips = torchaudio.transforms.TimeMasking(
        time_mask_param=20, iid_masks=True, p=0.3
    )(clips)

    # Append first- and second-order delta features as extra channels.
    clips = image_delta(clips)

    # Spectrogram-level mixup (same 2-tuple / 3-tuple return convention as above).
    mixed = mixup2_layer(X=clips, Y=labels, weight=weights)
    clips, labels = mixed[0], mixed[1]
    weights = mixed[2] if len(mixed) > 2 else None

    return clips, labels, weights

In [31]:
# Rebuild the training DataLoader so every batch goes through trainloader_collate (mixup, augmentation, mel, deltas).
BD=BirdclefDataset(df=train_df,bird_category_dir="./temp_files/13-2-bird-cates.npy",train=True)
train_dataloader = DataLoader(dataset=BD, batch_size=32, sampler=train_sampler, pin_memory=True,collate_fn=lambda batch: trainloader_collate(batch, mixup_layer))

In [32]:
# Pull one batch to check the shapes produced by trainloader_collate, ordered (clips, labels, weights).
batch = next(iter(train_dataloader))
clip,audio_label,audio_weights = batch
# print(audio_label)
print(type(audio_label))
print(audio_label.shape)
# print(clip)
print(clip.shape)
print(audio_weights)
print(audio_weights.shape)

<class 'torch.Tensor'>
torch.Size([32, 182])
torch.Size([32, 3, 40, 501])
tensor([0.7002, 0.8501, 1.0000, 0.8999, 0.7002, 0.5498, 0.7500, 0.4502, 0.8999,
        0.9502, 0.7002, 0.8496, 1.0000, 0.7500, 0.7002, 0.8496, 0.7500, 0.7998,
        0.7998, 0.3501, 0.7500, 0.5498, 0.7998, 0.5498, 0.8999, 0.8501, 0.7998,
        0.7998, 0.5996, 0.7500, 0.8496, 0.6001], dtype=torch.float16)
torch.Size([32])


In [33]:
# Display the label vector of the first sample after the full training collate pipeline.
audio_label[0]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0.])

In [34]:
# Force a garbage-collection pass; the displayed value is what gc.collect() returns.
gc.collect()

16

### Next, I will analyze the layers of the pre-trained model, select which layers to use as feature extractors, and redefine the new model for training.

In [35]:
# Load the pre-trained model
model = timm.create_model('tf_efficientnetv2_s_in21k', pretrained=True,in_chans=3) # in_chans sets the number of input channels the pre-trained model accepts

In [36]:
# Print the layer-by-layer summary for a (128, 3, 40, 501) input, the spectrogram layout produced by the collate function.
summary(model,input_size=(128,3,40,501))

Layer (type:depth-idx)                        Output Shape              Param #
EfficientNet                                  [128, 21843]              --
├─Conv2dSame: 1-1                             [128, 24, 20, 251]        648
├─BatchNormAct2d: 1-2                         [128, 24, 20, 251]        48
│    └─Identity: 2-1                          [128, 24, 20, 251]        --
│    └─SiLU: 2-2                              [128, 24, 20, 251]        --
├─Sequential: 1-3                             [128, 256, 2, 16]         --
│    └─Sequential: 2-3                        [128, 24, 20, 251]        --
│    │    └─ConvBnAct: 3-1                    [128, 24, 20, 251]        5,232
│    │    └─ConvBnAct: 3-2                    [128, 24, 20, 251]        5,232
│    └─Sequential: 2-4                        [128, 48, 10, 126]        --
│    │    └─EdgeResidual: 3-3                 [128, 48, 10, 126]        25,632
│    │    └─EdgeResidual: 3-4                 [128, 48, 10, 126]        92,640
│    

In [37]:
# `model` is the complete pre-trained EfficientNet loaded above.
# Keep every top-level child module except the last three and chain them into a feature extractor.
# NOTE(review): the extractor output printed further down has 1280 channels, so the slice presumably
# ends at the head convolution and drops whatever follows it (normalisation/activation, pooling,
# classifier). Verify the cut point against the full model summary.
feature_extractor = torch.nn.Sequential(
    *list(model.children())[:-3]  # drop the last three children; adjust if the model structure changes
)

In [38]:
# Display the module structure of the feature extractor.
feature_extractor

Sequential(
  (0): Conv2dSame(3, 24, kernel_size=(3, 3), stride=(2, 2), bias=False)
  (1): BatchNormAct2d(
    24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
    (drop): Identity()
    (act): SiLU(inplace=True)
  )
  (2): Sequential(
    (0): Sequential(
      (0): ConvBnAct(
        (conv): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNormAct2d(
          24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
          (drop): Identity()
          (act): SiLU(inplace=True)
        )
        (drop_path): Identity()
      )
      (1): ConvBnAct(
        (conv): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNormAct2d(
          24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
          (drop): Identity()
          (act): SiLU(inplace=True)
        )
        (drop_path): Identity()
      )
    )
    (1): Sequential(
      (0): EdgeResid

In [39]:
model.classifier # classifier is the final fully connected layer of the model; out_features is the number of categories it predicts

Linear(in_features=1280, out_features=21843, bias=True)

In [40]:
# Input and output widths of the pre-trained classifier layer.
print(model.classifier.in_features)
print(model.classifier.out_features)

1280
21843


In [41]:
# Run the batch through the pre-trained model without its last 3 layers.
# NOTE: this rebinds `clip`, so the cell is not idempotent; re-running it feeds features back into the extractor.
clip=feature_extractor(clip)

In [42]:
# Shape of the extracted feature maps.
print(clip.shape)

torch.Size([32, 1280, 2, 16])


In [43]:
# I want to separate feature extractor from lightningmodule and add it to dataloader as part of data processing
def trainloader_collate(batch, mixup_layer,feature_extractor):
    """
    Collate training samples into a batch, run the training-time
    augmentation pipeline and extract features with a pre-trained backbone.

    Relies on the notebook-level ``audio_transforms`` and ``mixup2_layer``.

    parameters:
        batch: is a list of tuples with (labels, clip, weights);
               ``weights`` may be None for every sample
        mixup_layer: waveform-level mixup module, called as
               ``mixup_layer(X=clips, Y=labels, weight=weights)``
        feature_extractor: module applied to the 3-channel spectrogram batch

    returns:
        (clips, labels, weights) where ``clips`` holds the extracted feature
        maps (no autograd graph attached) and ``weights`` is None when the
        samples carry no weights
    """
    # Unpack each individual sample in the batch
    labels, clips, weights = zip(*batch)

    # Stack the data into new batches
    labels = torch.stack(labels).float()
    clips = torch.stack(clips).float()

    weights = torch.stack(weights) if weights[0] is not None else None

    # The mixup layers return (X, Y) when no weights are given and
    # (X, Y, weight) otherwise, so unpack by length instead of assuming three values.
    mixed = mixup_layer(X=clips, Y=labels, weight=weights)
    clips, labels = mixed[0], mixed[1]
    weights = mixed[2] if len(mixed) > 2 else None

    # Waveform augmentations (pitch shift / time shift) to improve generalization and robustness.
    clips = audio_transforms(clips, sample_rate=32000)

    # Convert audio data into mel spectrogram
    clips = mel_transform(sample_rate=32000, audio=clips)

    # Convert the power mel spectrogram to decibels (dB)
    clips = torchaudio.transforms.AmplitudeToDB(stype="power", top_db=80)(clips)

    # normalization
    clips = (clips + 80) / 80

    # Randomly mask part of the time axis so the model becomes robust to missing segments.
    clips = torchaudio.transforms.TimeMasking(
        time_mask_param=20, iid_masks=True, p=0.3
    )(clips)

    # Append first- and second-order delta features as extra channels.
    clips = image_delta(clips)

    # Spectrogram-level mixup (same 2-tuple / 3-tuple return convention as above).
    mixed = mixup2_layer(X=clips, Y=labels, weight=weights)
    clips, labels = mixed[0], mixed[1]
    weights = mixed[2] if len(mixed) > 2 else None

    # Feature extraction is pure preprocessing here: run it without autograd so
    # the batch does not drag the backbone's computation graph into training.
    # NOTE(review): unless the caller has put the extractor in eval() mode, its
    # batch-norm layers presumably still run in training mode. Confirm that is intended.
    with torch.no_grad():
        clips = feature_extractor(clips)

    return clips, labels, weights

In [44]:
# Rebuild the training DataLoader with the collate function that also runs the pre-trained feature extractor.
BD=BirdclefDataset(df=train_df,bird_category_dir="./temp_files/13-2-bird-cates.npy",train=True)
train_dataloader = DataLoader(dataset=BD, batch_size=32, sampler=train_sampler, pin_memory=True,collate_fn=lambda batch: trainloader_collate(batch, mixup_layer,feature_extractor))

In [45]:
# Pull one training batch to check the feature-map, label and weight shapes.
batch = next(iter(train_dataloader))
clip,audio_label,audio_weights = batch
# print(audio_label)
print(type(audio_label))
print(audio_label.shape)
# print(clip)
print(clip.shape)
print(audio_weights)
print(audio_weights.shape)

<class 'torch.Tensor'>
torch.Size([32, 182])
torch.Size([32, 1280, 2, 16])
tensor([0.7998, 0.8496, 0.6499, 0.7002, 0.7500, 0.8999, 0.7500, 0.5996, 0.8501,
        0.7500, 0.6499, 0.7002, 0.7500, 0.8496, 0.6499, 0.6001, 0.7002, 0.7002,
        0.8999, 0.7002, 0.8496, 0.8496, 0.5498, 0.7500, 0.8999, 0.8496, 0.7998,
        0.6001, 0.7500, 0.7500, 0.6499, 0.7998], dtype=torch.float16)
torch.Size([32])


In [46]:
# I want to separate feature extractor from lightningmodule and add it to dataloader as part of data processing

def valloader_collate(batch, feature_extractor):
    """
    Collate validation samples into a batch and extract features with a
    pre-trained backbone.

    Unlike the training collate function, no mixup, waveform augmentation or
    time masking is applied.

    parameters:
        batch: is a list of tuples with (labels, clip, weights);
               ``weights`` may be None for every sample
        feature_extractor: module applied to the 3-channel spectrogram batch

    returns:
        (clips, labels, weights) where ``clips`` holds the extracted feature
        maps (no autograd graph attached) and ``weights`` is None when the
        samples carry no weights
    """
    # Unpack each individual sample in the batch
    labels, clips, weights = zip(*batch)

    # Stack the data into new batches
    labels = torch.stack(labels).float()
    clips = torch.stack(clips).float()

    weights = torch.stack(weights) if weights[0] is not None else None

    # Convert audio data into mel spectrogram
    clips = mel_transform(sample_rate=32000, audio=clips)

    # Convert the power mel spectrogram to decibels (dB)
    clips = torchaudio.transforms.AmplitudeToDB(stype="power", top_db=80)(clips)

    # normalization
    clips = (clips + 80) / 80

    # Append first- and second-order delta features as extra channels.
    clips = image_delta(clips)

    # Validation never back-propagates: run the extractor without autograd so
    # no computation graph is built or kept alive with the batch.
    # NOTE(review): unless the caller has put the extractor in eval() mode, its
    # batch-norm layers presumably still run in training mode. Confirm that is intended.
    with torch.no_grad():
        clips = feature_extractor(clips)

    return clips, labels, weights

In [47]:
# Build the validation dataset and DataLoader; batches go through valloader_collate (no augmentation).
# NOTE(review): the dataset is created with train=True although it wraps val_df. Confirm against
# BirdclefDataset that this flag does not enable training-only behaviour.
BD = BirdclefDataset(
    df=val_df, bird_category_dir="./temp_files/13-2-bird-cates.npy", train=True
)
val_dataloader = DataLoader(
    dataset=BD,
    batch_size=32,
    sampler=val_sampler,
    pin_memory=True,
    collate_fn=lambda batch: valloader_collate(batch, feature_extractor),
)

In [48]:
# Pull one validation batch to check the feature-map, label and weight shapes.
batch = next(iter(val_dataloader))
clip,audio_label,audio_weights = batch
# print(audio_label)
print(type(audio_label))
print(audio_label.shape)
# print(clip)
print(clip.shape)
print(audio_weights)
print(audio_weights.shape)

<class 'torch.Tensor'>
torch.Size([32, 182])
torch.Size([32, 1280, 2, 16])
tensor([0.7998, 1.0000, 0.8999, 0.7998, 0.2000, 1.0000, 0.7998, 0.2000, 0.8999,
        0.8999, 1.0000, 0.7002, 1.0000, 1.0000, 1.0000, 0.7998, 0.5000, 0.7998,
        0.7998, 1.0000, 1.0000, 1.0000, 0.7998, 1.0000, 1.0000, 0.7998, 0.7002,
        0.7998, 1.0000, 0.7002, 0.8999, 1.0000], dtype=torch.float16)
torch.Size([32])


In [49]:
# Merge the last two dimensions of the feature maps into a single sequence axis.
x_flattened = torch.flatten(clip, start_dim=2)  # [32, 1280, 2, 16] -> [32, 1280, 32]

print(x_flattened)
print(x_flattened.shape)

tensor([[[ 2.2469e+01,  6.2483e+00, -8.8653e+00,  ..., -4.4712e+00,
          -7.1122e+00, -7.2261e+00],
         [ 1.2141e+01,  5.6110e+00,  4.7072e+00,  ..., -2.1433e+00,
           1.0140e+01,  2.1438e+01],
         [ 2.5661e-01,  2.7368e+00,  1.2372e+01,  ...,  1.1742e+01,
           1.0400e+01,  1.8145e+00],
         ...,
         [ 1.9727e+01,  1.4484e+01,  3.8275e+00,  ...,  1.5333e-01,
           5.3791e+00,  1.2065e+01],
         [-2.0544e+01, -1.3588e+01,  3.1236e+00,  ..., -1.7952e+00,
          -4.4621e+00, -9.4821e-01],
         [ 5.9797e+00, -2.0643e+00,  2.6429e+00,  ...,  6.0352e+00,
           5.7987e+00,  2.4760e+00]],

        [[ 3.8677e+00, -1.1622e+00,  4.7125e+00,  ..., -5.4461e+00,
          -5.6394e+00, -7.5521e+00],
         [-1.6453e+01, -1.6136e+01, -8.6477e+00,  ...,  1.5164e+00,
          -1.5115e-02, -3.2839e+00],
         [ 9.8462e-01,  4.6436e+00, -1.7860e+00,  ...,  9.0861e+00,
           1.2655e+01,  5.7870e+00],
         ...,
         [ 6.9145e+00, -2

In [50]:
class ChronoNet(nn.Module):
    """Stack of four GRUs with dense (concatenating) skip connections.

    Args:
        input_size: Feature size of each sequence step (channel count of the
            incoming feature maps).
        hidden_size: Hidden size of every GRU.
        seq_len: Sequence length of the input. The batch-norm layers receive
            (batch, seq_len, hidden_size) tensors, and ``BatchNorm1d`` treats
            dimension 1 as its channel axis, so they are sized by ``seq_len``.
        num_classes: Number of output logits.
        dropout: Dropout probability applied before the final linear layer.

    The defaults reproduce the original hard-coded configuration
    (1280 -> 128, sequence length 32, 182 classes, dropout 0.3).
    """

    def __init__(self, input_size=1280, hidden_size=128, seq_len=32, num_classes=182, dropout=0.3):
        super().__init__()
        self.gru1 = nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=1, batch_first=True)
        self.bn1 = nn.BatchNorm1d(num_features=seq_len)
        self.gru2 = nn.GRU(input_size=hidden_size, hidden_size=hidden_size, num_layers=1, batch_first=True)
        self.bn2 = nn.BatchNorm1d(num_features=seq_len)
        # gru3 consumes the concatenation of two hidden-size streams, gru4 of three.
        self.gru3 = nn.GRU(input_size=2 * hidden_size, hidden_size=hidden_size, num_layers=1, batch_first=True)
        self.bn3 = nn.BatchNorm1d(num_features=seq_len)
        self.gru4 = nn.GRU(input_size=3 * hidden_size, hidden_size=hidden_size, num_layers=1, batch_first=True)
        # bn4 is not used in forward(); it is kept so the module's parameter
        # names and state_dict layout stay the same as before.
        self.bn4 = nn.BatchNorm1d(num_features=seq_len)
        self.dropout1 = nn.Dropout(dropout)
        self.fc1 = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Map (batch, input_size, seq_len) features to (batch, num_classes) logits."""
        # The GRUs are batch_first and expect (batch, seq_len, feature_size),
        # but the incoming tensor is (batch, feature_size, seq_len).
        x = x.permute(0, 2, 1)
        gru_out1, _ = self.gru1(x)
        x1 = self.bn1(gru_out1)
        gru_out2, _ = self.gru2(x1)
        x2 = self.bn2(gru_out2)
        # Dense connections: later GRUs see the concatenation, along the
        # feature dimension, of the normalised outputs of the earlier ones.
        x3 = torch.cat((x1, x2), dim=2)
        gru_out3, _ = self.gru3(x3)
        x4 = self.bn3(gru_out3)
        x5 = torch.cat((x1, x2, x4), dim=2)
        gru_out4, _ = self.gru4(x5)
        # Use the output of the last time step as the sequence summary.
        x6 = self.dropout1(gru_out4[:, -1, :])
        out = self.fc1(x6)

        return out

In [51]:
# Instantiate the GRU head and run the flattened features through it.
# NOTE: this rebinds `model` (previously the timm EfficientNet) and `clip` (previously the extracted features).
model=ChronoNet()
clip=model(x_flattened)

print(clip.shape)

torch.Size([32, 182])


In [52]:
# Logits produced by ChronoNet for the first sample of the batch.
print(clip[0])

tensor([-0.0392,  0.3155,  0.4806,  0.4337,  0.1109,  0.1359, -0.4806, -0.1386,
         0.4318, -0.4446, -0.2128, -0.3190,  0.3317,  0.1431,  0.4492, -0.0396,
        -0.5315,  0.3659, -0.6303, -0.5312,  0.1577,  0.3573,  0.4484, -0.5918,
         0.4210, -0.1279,  0.2047,  0.1297,  0.1064, -0.2537,  0.3900,  0.6682,
        -0.0828, -0.2345, -0.3453,  0.1298,  0.4748, -0.0998, -0.1111,  0.1332,
        -0.1437, -0.1141, -0.3809,  0.2559,  0.6969, -0.5416, -0.3398, -0.1797,
         0.8385, -0.3311,  0.6701,  0.4356,  0.6088, -0.2598, -0.3122,  0.4474,
         0.0239, -0.0103,  0.3292, -0.0232,  1.1892, -0.8816, -0.2917,  0.7434,
        -0.2166, -0.7863,  0.0615, -0.1152,  0.5993,  0.1749,  0.3961,  0.7628,
        -0.3389,  0.0672,  0.0211,  0.2785,  0.1851,  0.1546, -0.0641,  0.0270,
        -0.2108, -0.2117,  0.3912,  0.6857, -0.6390,  0.2813,  0.0495,  0.4485,
        -0.1133,  0.2673, -0.1169, -0.0609, -0.8051,  0.1734, -0.0587,  0.2924,
         0.1399,  0.6104, -0.2507,  0.16

In [53]:
class FocalLoss(nn.Module):
    """Focal loss built on top of ``F.cross_entropy``.

    Args:
        gamma: Focusing exponent applied to ``(1 - p_t)``.
        weight: Optional class weights forwarded to ``F.cross_entropy``.
        sample_weight: Optional default per-sample weights. Because these
            normally change with every batch, prefer passing them to
            ``forward`` instead.
        reduction: ``'mean'``, ``'sum'``, or anything else for the unreduced
            per-sample loss.
    """

    def __init__(self, gamma=2.0, weight=None, sample_weight=None,reduction='mean'):
        super().__init__()
        self.gamma = gamma
        self.weight = weight  # Class weights
        self.sample_weight = sample_weight
        self.reduction = reduction

    def forward(self, inputs, targets, sample_weight=None):
        """Compute the focal loss.

        Args:
            inputs: Logits, as accepted by ``F.cross_entropy``.
            targets: Targets, as accepted by ``F.cross_entropy``.
            sample_weight: Optional per-sample weights for this call. When
                omitted, the weights given at construction (if any) are used.

        Returns:
            The reduced loss, or the per-sample loss when ``reduction`` is
            neither ``'mean'`` nor ``'sum'``.
        """
        ce_loss = F.cross_entropy(inputs, targets, reduction='none', weight=self.weight)
        # NOTE(review): ce_loss already includes the class weights, so with
        # `weight` set, exp(-ce_loss) is not the plain predicted probability.
        # Confirm this is the intended modulating factor.
        p_t = torch.exp(-ce_loss)  # Modulating factor
        loss = (1 - p_t) ** self.gamma * ce_loss

        if sample_weight is None:
            sample_weight = self.sample_weight

        if sample_weight is not None:
            # Out-of-place multiply: an in-place `*=` fails when the weights
            # have a wider dtype than the loss (e.g. float64 vs float32).
            loss = loss * sample_weight

        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        else:
            return loss

In [54]:
# Evaluate the focal loss on the ChronoNet logits (`clip`) against the batch labels, using the
# class weights in cate_weight and the per-sample weights of the current batch.
focal=FocalLoss(weight=cate_weight,sample_weight=audio_weights)

loss=focal(inputs=clip,targets=audio_label)

In [55]:
# Display the resulting loss value.
loss

tensor(64.7562, dtype=torch.float64, grad_fn=<MeanBackward0>)