reference for 13.2-data-process-augmentation-with-batch-add-mix.ipynb

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
import random
from typing import List
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB

import torchaudio

import torch

from torch.utils.data import DataLoader,TensorDataset

import lightning as L

import datasets

from torch.utils.data import Dataset, DataLoader,WeightedRandomSampler

from pathlib import Path
import multiprocessing
import colorednoise as cn
import torch.nn as nn
import librosa
from torch.distributions import Beta
from torch_audiomentations import Compose, PitchShift, Shift, OneOf, AddColoredNoise

import timm
from torchinfo import summary

import torch.nn.functional as F

from torch.optim.lr_scheduler import (
    CosineAnnealingLR,
    CosineAnnealingWarmRestarts,
    ReduceLROnPlateau,
    OneCycleLR,
)
from lightning.pytorch.callbacks  import ModelCheckpoint, EarlyStopping

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# import timm
# import torch

# # Creating a pre-trained model
# pretrained_model_name = "tf_efficientnetv2_s_in21k"  # model name
# model = timm.create_model(pretrained_model_name, pretrained=True, in_chans=3)

# # save model weights
# save_path='/Users/yiding/personal_projects/ML/github_repo/birdcief/model/backbones/tf-efficientnetv2_s_in21k/'
# torch.save(model.state_dict(), save_path+pretrained_model_name + '_weights.pth')

In [3]:
# # Loading model weights in an offline environment
# model = timm.create_model(pretrained_model_name, pretrained=False, in_chans=3)
# model.load_state_dict(torch.load(save_path+pretrained_model_name + '_weights.pth'))

In [4]:
has_mps = torch.backends.mps.is_built()
device = "mps" if has_mps else "cuda" if torch.cuda.is_available() else "cpu"
print(device)

mps


In [5]:
metadata_path='../../data/train_metadata_new_add_rating_2500.csv'

In [6]:
# I need to do a train test split on the data first
# Because this dataset is unbalanced
# Randomly select a sample from each category to add to the validation set, and the rest to the training set

raw_df=pd.read_csv(metadata_path,header=0)

# Find the index of each category
class_indices = raw_df.groupby('primary_label').apply(lambda x: x.index.tolist())

# Initialize the training set and validation set
train_indices = []
val_indices = []


# Random select a sample from each category to join the validation set, and the rest to join the training set
for indices in class_indices:
    val_sample = pd.Series(indices).sample(n=1, random_state=42).tolist()
    val_indices.extend(val_sample)
    train_indices.extend(set(indices) - set(val_sample))


# Divide the dataset by index
train_df = raw_df.loc[train_indices]
val_df = raw_df.loc[val_indices]

In [7]:
train_df.shape

(2338, 5)

In [8]:
val_df.shape

(162, 5)

In [9]:
# Need to interpolate missing values ​​for ratings in metadata csv files

def rating_value_interplote(df:pd.DataFrame):
    '''
    interplote Nan values for rating col in metadata csv 

    parameters:
        df: the df of the metadata csv file

    rating col means the quality of the corresponding audio file
        5 is high quality
        1 is low quality
        0 is without defined quality level
    '''

    if df['rating'].isna().sum()>0: # with missing values
        df['rating'].fillna(0, inplace=True)

    # Random assign a value to all places where the value is 0, choosing from the specified choices
    mask = df['rating'] == 0  # Create a boolean mask indicating which positions are 0

    choices=np.arange(0.5,5.1,0.5).tolist() # [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]
    random_values = np.random.choice(choices, size=mask.sum())  # Generate random numbers for these 0 values 
    df.loc[mask, 'rating'] = random_values  # Fill the generated random numbers back into the corresponding positions of the original DataFrame

    return df

In [10]:
# Calculating the weight of each audio file by rating helps model training
def audio_weight(df):
    '''
    calculate the weight corresponding to each audio file through the rating value

    Because each audio has different quality level, we use weight to affect the inportance of each audio in models,
    the lower the quality of the audio, the lower the weight
    '''
    #Through rating, we calculate the credibility of each audio and express it through weight. 
    # The purpose of this is to improve the model by increasing the weight of high-quality audio and reducing the weight of low-quality audio.
    df["audio_weight"] = np.clip(df["rating"] / df["rating"].max(), 0.1, 1.0)

    return df



In [11]:
# Because this is an unbalanced dataset, the amount of data in each category is very different
# So I will calculate the weight of each category here
# **(-0.5) The purpose is to reduce the relative influence of high-frequency categories and increase the influence of low-frequency categories, 
# so as to help the model better learn those uncommon categories
# The purpose of calculating this is to build a WeightedRandomSampler, so that each time a batch is extracted using dataloader, it is more friendly to data of different categories.

def sampling_weight(df)->torch.Tensor:
    '''
    calculate the sampling weight of each audio file

    because this is imbalanced dataset
    we hope the category with less data has large probability to be picked.
    '''
    sample_weights = (df['primary_label'].value_counts() / df['primary_label'].value_counts().sum()) ** (-0.5)

    # 将权重映射到原始数据的每一行
    sample_weights_map = df['primary_label'].map(sample_weights)

    # Convert pandas Series to NumPy array
    sample_weights_np = sample_weights_map.to_numpy(dtype=np.float32)

    # Convert a NumPy array to a PyTorch tensor using torch.from_numpy
    sample_weights_tensor = torch.from_numpy(sample_weights_np)

    return sample_weights_tensor


In [12]:
# df=pd.read_csv(metadata_path,header=0)

sample_weights_tensor=sampling_weight(df=train_df)
# Here we will build an argument sampler that will be used by the dataloader
# Note that the order of weights in the constructed sampler must be consistent with the order of data passed into the dataloader, otherwise the weights will not match

# Create a sampler based on the newly obtained weight list
sampler = WeightedRandomSampler(sample_weights_tensor.type('torch.DoubleTensor'), len(sample_weights_tensor),replacement=True)

sampler

<torch.utils.data.sampler.WeightedRandomSampler at 0x107da6e60>

In [13]:
def read_audio(path: str):
    """
    Read an OGG file using torchaudio and return the waveform tensor and sample rate.

    Parameters:
        path: Path to the .ogg file

    Returns:
        waveform: Tensor representing the waveform
        sample_rate: Sample rate of the audio file
    """
    audio, sample_rate = torchaudio.load(path)
    return audio, sample_rate


In [14]:
class AudioTransform:
    def __init__(self, always_apply=False, p=0.5):
        self.always_apply = always_apply
        self.p = p

    def __call__(self, y: np.ndarray):
        if self.always_apply:
            return self.apply(y)
        else:
            if np.random.rand() < self.p:
                return self.apply(y)
            else:
                return y

    def apply(self, y: np.ndarray):
        raise NotImplementedError


class CustomCompose:
    def __init__(self, transforms: list):
        self.transforms = transforms

    def __call__(self, y: np.ndarray):
        for trns in self.transforms:
            y = trns(y)
        return y


class CustomOneOf:
    def __init__(self, transforms: list, p=1.0):
        self.transforms = transforms
        self.p = p

    def __call__(self, y: np.ndarray):
        if np.random.rand() < self.p:
            n_trns = len(self.transforms)
            trns_idx = np.random.choice(n_trns)
            trns = self.transforms[trns_idx]
            y = trns(y)
        return y


class GaussianNoiseSNR(AudioTransform):
    def __init__(self, always_apply=False, p=0.5, min_snr=5.0, max_snr=40.0, **kwargs):
        super().__init__(always_apply, p)

        self.min_snr = min_snr
        self.max_snr = max_snr

    def apply(self, y: np.ndarray, **params):
        snr = np.random.uniform(self.min_snr, self.max_snr)
        a_signal = np.sqrt(y**2).max()
        a_noise = a_signal / (10 ** (snr / 20))

        white_noise = np.random.randn(len(y))
        a_white = np.sqrt(white_noise**2).max()
        augmented = (y + white_noise * 1 / a_white * a_noise).astype(y.dtype)
        return augmented


class PinkNoiseSNR(AudioTransform):
    def __init__(self, always_apply=False, p=0.5, min_snr=5.0, max_snr=20.0, **kwargs):
        super().__init__(always_apply, p)

        self.min_snr = min_snr
        self.max_snr = max_snr

    def apply(self, y: np.ndarray, **params):
        snr = np.random.uniform(self.min_snr, self.max_snr)
        a_signal = np.sqrt(y**2).max()
        a_noise = a_signal / (10 ** (snr / 20))

        pink_noise = cn.powerlaw_psd_gaussian(1, len(y))
        a_pink = np.sqrt(pink_noise**2).max()
        augmented = (y + pink_noise * 1 / a_pink * a_noise).astype(y.dtype)
        return augmented


class VolumeControl(AudioTransform):
    def __init__(self, always_apply=False, p=0.5, db_limit=10, mode="uniform"):
        super().__init__(always_apply, p)

        assert mode in [
            "uniform",
            "fade",
            "fade",
            "cosine",
            "sine",
        ], "`mode` must be one of 'uniform', 'fade', 'cosine', 'sine'"

        self.db_limit = db_limit
        self.mode = mode

    def apply(self, y: np.ndarray, **params):
        db = np.random.uniform(-self.db_limit, self.db_limit)
        if self.mode == "uniform":
            db_translated = 10 ** (db / 20)
        elif self.mode == "fade":
            lin = np.arange(len(y))[::-1] / (len(y) - 1)
            db_translated = 10 ** (db * lin / 20)
        elif self.mode == "cosine":
            cosine = np.cos(np.arange(len(y)) / len(y) * np.pi * 2)
            db_translated = 10 ** (db * cosine / 20)
        else:
            sine = np.sin(np.arange(len(y)) / len(y) * np.pi * 2)
            db_translated = 10 ** (db * sine / 20)
        augmented = y * db_translated
        return augmented


class NoiseInjection(AudioTransform):
    def __init__(self, always_apply=False, p=0.5, max_noise_level=0.5, sr=32000):
        super().__init__(always_apply, p)

        self.noise_level = (0.0, max_noise_level)
        self.sr = sr

    def apply(self, y: np.ndarray, **params):
        noise_level = np.random.uniform(*self.noise_level)
        noise = np.random.randn(len(y))
        augmented = (y + noise * noise_level).astype(y.dtype)
        return augmented


class GaussianNoise(AudioTransform):
    def __init__(self, always_apply=False, p=0.5, min_snr=5, max_snr=20, sr=32000):
        super().__init__(always_apply, p)

        self.min_snr = min_snr
        self.max_snr = max_snr
        self.sr = sr

    def apply(self, y: np.ndarray, **params):
        snr = np.random.uniform(self.min_snr, self.max_snr)
        a_signal = np.sqrt(y**2).max()
        a_noise = a_signal / (10 ** (snr / 20))

        white_noise = np.random.randn(len(y))
        a_white = np.sqrt(white_noise**2).max()
        augmented = (y + white_noise * 1 / a_white * a_noise).astype(y.dtype)
        return augmented


class PinkNoise(AudioTransform):
    def __init__(self, always_apply=False, p=0.5, min_snr=5, max_snr=20, sr=32000):
        super().__init__(always_apply, p)

        self.min_snr = min_snr
        self.max_snr = max_snr
        self.sr = sr

    def apply(self, y: np.ndarray, **params):
        snr = np.random.uniform(self.min_snr, self.max_snr)
        a_signal = np.sqrt(y**2).max()
        a_noise = a_signal / (10 ** (snr / 20))

        pink_noise = cn.powerlaw_psd_gaussian(1, len(y))
        a_pink = np.sqrt(pink_noise**2).max()
        augmented = (y + pink_noise * 1 / a_pink * a_noise).astype(y.dtype)
        return augmented


class TimeStretch(AudioTransform):
    def __init__(self, always_apply=False, p=0.5, max_rate=1, sr=32000):
        super().__init__(always_apply, p)
        self.max_rate = max_rate
        self.sr = sr

    def apply(self, y: np.ndarray, **params):
        rate = np.random.uniform(0, self.max_rate)
        augmented = librosa.effects.time_stretch(y, rate)
        return augmented


def _db2float(db: float, amplitude=True):
    if amplitude:
        return 10 ** (db / 20)
    else:
        return 10 ** (db / 10)


def volume_down(y: np.ndarray, db: float):
    """
    Low level API for decreasing the volume
    Parameters
    ----------
    y: numpy.ndarray
        stereo / monaural input audio
    db: float
        how much decibel to decrease
    Returns
    -------
    applied: numpy.ndarray
        audio with decreased volume
    """
    applied = y * _db2float(-db)
    return applied


def volume_up(y: np.ndarray, db: float):
    """
    Low level API for increasing the volume
    Parameters
    ----------
    y: numpy.ndarray
        stereo / monaural input audio
    db: float
        how much decibel to increase
    Returns
    -------
    applied: numpy.ndarray
        audio with increased volume
    """
    applied = y * _db2float(db)
    return applied


class RandomVolume(AudioTransform):
    def __init__(self, always_apply=False, p=0.5, limit=10):
        super().__init__(always_apply, p)
        self.limit = limit

    def apply(self, y: np.ndarray, **params):
        db = np.random.uniform(-self.limit, self.limit)
        if db >= 0:
            return volume_up(y, db)
        else:
            return volume_down(y, db)


class CosineVolume(AudioTransform):
    def __init__(self, always_apply=False, p=0.5, limit=10):
        super().__init__(always_apply, p)
        self.limit = limit

    def apply(self, y: np.ndarray, **params):
        db = np.random.uniform(-self.limit, self.limit)
        cosine = np.cos(np.arange(len(y)) / len(y) * np.pi * 2)
        dbs = _db2float(cosine * db)
        return y * dbs


class AddGaussianNoise(AudioTransform):
    """Add gaussian noise to the samples"""

    supports_multichannel = True

    def __init__(
        self, always_apply=False, min_amplitude=0.001, max_amplitude=0.015, p=0.5
    ):
        """
        :param min_amplitude: Minimum noise amplification factor
        :param max_amplitude: Maximum noise amplification factor
        :param p:
        """
        super().__init__(always_apply, p)
        assert min_amplitude > 0.0
        assert max_amplitude > 0.0
        assert max_amplitude >= min_amplitude
        self.min_amplitude = min_amplitude
        self.max_amplitude = max_amplitude

    def apply(self, samples: np.ndarray, sample_rate=32000):
        amplitude = np.random.uniform(self.min_amplitude, self.max_amplitude)
        noise = np.random.randn(*samples.shape).astype(np.float32)
        samples = samples + amplitude * noise
        return samples


class AddGaussianSNR(AudioTransform):
    """
    Add gaussian noise to the input. A random Signal to Noise Ratio (SNR) will be picked
    uniformly in the decibel scale. This aligns with human hearing, which is more
    logarithmic than linear.
    """

    supports_multichannel = True

    def __init__(
        self,
        always_apply=False,
        min_snr_in_db: float = 5.0,
        max_snr_in_db: float = 40.0,
        p: float = 0.5,
    ):
        """
        :param min_snr_in_db: Minimum signal-to-noise ratio in dB. A lower number means more noise.
        :param max_snr_in_db: Maximum signal-to-noise ratio in dB. A greater number means less noise.
        :param p: The probability of applying this transform
        """
        super().__init__(always_apply, p)
        self.min_snr_in_db = min_snr_in_db
        self.max_snr_in_db = max_snr_in_db

    def apply(self, samples: np.ndarray, sample_rate=32000):
        snr = np.random.uniform(self.min_snr_in_db, self.max_snr_in_db)

        clean_rms = np.sqrt(np.mean(np.square(samples)))

        a = float(snr) / 20
        noise_rms = clean_rms / (10**a)

        noise = np.random.normal(0.0, noise_rms, size=samples.shape).astype(np.float32)
        return samples + noise


class Normalize(AudioTransform):
    """
    Apply a constant amount of gain, so that highest signal level present in the sound becomes
    0 dBFS, i.e. the loudest level allowed if all samples must be between -1 and 1. Also known
    as peak normalization.
    """

    supports_multichannel = True

    def __init__(self, always_apply=False, apply_to: str = "all", p: float = 0.5):
        super().__init__(always_apply, p)
        assert apply_to in ("all", "only_too_loud_sounds")
        self.apply_to = apply_to

    def apply(self, samples: np.ndarray, sample_rate=32000):
        max_amplitude = np.amax(np.abs(samples))
        if self.apply_to == "only_too_loud_sounds" and max_amplitude < 1.0:
            return samples

        if max_amplitude > 0:
            return samples / max_amplitude
        else:
            return samples

class NormalizeMelSpec(nn.Module):
    def __init__(self, eps=1e-6):
        super().__init__()
        self.eps = eps

    def forward(self, X):
        mean = X.mean((1, 2), keepdim=True)
        std = X.std((1, 2), keepdim=True)
        Xstd = (X - mean) / (std + self.eps)
        norm_min, norm_max = Xstd.min(-1)[0].min(-1)[0], Xstd.max(-1)[0].max(-1)[0]
        fix_ind = (norm_max - norm_min) > self.eps * torch.ones_like(
            (norm_max - norm_min)
        )
        V = torch.zeros_like(Xstd)
        if fix_ind.sum():
            V_fix = Xstd[fix_ind]
            norm_max_fix = norm_max[fix_ind, None, None]
            norm_min_fix = norm_min[fix_ind, None, None]
            V_fix = torch.max(
                torch.min(V_fix, norm_max_fix),
                norm_min_fix,
            )
            # print(V_fix.shape, norm_min_fix.shape, norm_max_fix.shape)
            V_fix = (V_fix - norm_min_fix) / (norm_max_fix - norm_min_fix)
            V[fix_ind] = V_fix
        return V

In [15]:
# First we need to get all the types
meta_df=pd.read_csv(metadata_path,header=0)
bird_cates=meta_df.primary_label.unique()

#Because the order is very important and needs to be matched one by one in the subsequent training, I will save these types here
# Save as .npy file
np.save("./external_files/3-bird-cates.npy", bird_cates)

In [16]:
class BirdclefDataset(Dataset):
    def __init__(self,df:pd.DataFrame,bird_category_dir:str,audio_dir:str='../../data/train_audio',train:bool=True):
        '''
        parameters:
            df: the dataframe of metadata (train/val)
            bird_category_dir: the directory of the bird category array file (npy)
            audio_dir: the parent path where all audio files stored
            train: If the Datset for train set or val set
        '''
        super().__init__()
        # if the Dataset for training or validation
        self.train=train
        self.raw_df=df

        # inperplote nan or 0 value of rating col
        self.raw_df=rating_value_interplote(df=self.raw_df)
        # Calculate the weight of each audio file by rating
        self.raw_df=audio_weight(self.raw_df)

        self.audio_dir=audio_dir

        self.bird_cate_array=np.load(bird_category_dir,allow_pickle=True)

    def get_audio_path(self,file_name:str) -> str:
        '''
        Get the audio path of the corresponding index through the provided train metadata csv file. 
        Since there is only one index, only one path will be returned.

        Parameters:
            file_name: in format category_type/XC-ID.ogg (asbfly/XC134896.ogg)

        Return:
            the single audio path string
        '''

        # concatenate parent path and child path
        return os.path.join(self.audio_dir,file_name)


    def target_clip(self,index:int,audio:torch.Tensor,sample_rate:int)->torch.Tensor:
        """
        calculate the index corresponding audio clip 

        information from the train metadata csv

        Parameters:
            audio: the raw audio in tensor [num_channels,length]
            sample_rate: audio sampling rate
        """
        # Get the audio start time corresponding to index
        clip_start_time=self.raw_df['clip_start_time'].iloc[index]
        duration_seconds=self.raw_df['duration'].iloc[index]

        # define clip length
        segment_duration = 5 * sample_rate

        # Total number of samples in the waveform
        total_samples = audio.shape[1]

        if clip_start_time<=duration_seconds:
            clip_start_point=clip_start_time*sample_rate
            # For the last clip, the original audio may not be long enough, so we need to use a mask to fill the sequence
            # The first step is to confirm whether the length is sufficient
            # The length is sufficient, no mask is needed
            if clip_start_point+segment_duration<=total_samples:
                clip=audio[:, clip_start_point:clip_start_point + segment_duration]

            # Not long enough, a mask is needed
            else:
                padding_length = clip_start_point+segment_duration - total_samples
                silence = torch.zeros(audio.shape[0], padding_length)
                # concat the last segment of raw audio with silence
                clip=torch.cat((audio[:,clip_start_point:],silence),dim=1)
                
        else:
            raise ValueError('The clip start time is out of raw audio length')

        return clip


    def random_audio_augmentation(self,audio:torch.Tensor):
        '''
        audio (torch.Tensor): A 2D tensor of audio samples with shape (1, N), where N is the number of samples.
        '''
        np_audio_transforms = CustomCompose(
            [
                CustomOneOf(
                    [
                        NoiseInjection(p=1, max_noise_level=0.04),
                        GaussianNoise(p=1, min_snr=5, max_snr=20),
                        PinkNoise(p=1, min_snr=5, max_snr=20),
                        AddGaussianNoise(min_amplitude=0.0001, max_amplitude=0.03, p=0.5),
                        AddGaussianSNR(min_snr_in_db=5, max_snr_in_db=15, p=0.5),
                    ],
                    p=0.3,  
                ),
            ]
        )

        audio_aug=np_audio_transforms(audio[0].numpy())

        # tranfer the array to 2D tensor and keep the num channel is 1
        # this step is to keep the input and output shape adn type are the same

        audio_aug_tensor=torch.from_numpy(audio_aug)
        audio_aug_tensor=audio_aug_tensor.unsqueeze(0)

        return audio_aug_tensor
    

    def audio_label_tensor_generator(self,true_label:str)-> torch.Tensor:
        '''
        Generate a tensor containing all categories based on the given real audio label

        Parameters:
            true lable: a label string

        Return:
            If have 10 class, and give a true lable
            the return should be tensor([0,1,0,0,0,0,0,0,0,0])
        '''
        # Find the index of the target value in the array
        idx = np.where(self.bird_cate_array == true_label)[0][0]
        
        # Create a tensor of all zeros, with length equal to the length of the array
        audio_label_tensor = torch.zeros(len(self.bird_cate_array))

        # Set the value of the corresponding index position to 1
        audio_label_tensor[idx] = 1

        return audio_label_tensor


    
    def __len__(self):
        return self.raw_df.shape[0]

    def __getitem__(self,index):
        row=self.raw_df.iloc[index]

        audio_label=row['primary_label']
        audio_weight=row['audio_weight']

        # Get the path of a single audio file
        single_audio_dir=self.get_audio_path(row['filename'])

        # Read the audio array according to the path
        audio, sr=read_audio(single_audio_dir)

        # augmentation
        if self.train:
            audio_augmentation=self.random_audio_augmentation(audio=audio)
            # Get the audio clip corresponding to index
            clip=self.target_clip(index,audio=audio_augmentation,sample_rate=sr)
        else:
            clip=self.target_clip(index,audio=audio,sample_rate=sr)

        # change audio label to one-hot tensor
        audio_label_tensor=self.audio_label_tensor_generator(true_label=audio_label)

        audio_label_tensor=torch.tensor(audio_label_tensor, dtype=torch.float16)
        clip=torch.tensor(clip, dtype=torch.float16)
        audio_weight=torch.tensor(audio_weight, dtype=torch.float16)

        
        return audio_label_tensor.to(device),clip.to(device),audio_weight.to(device)

In [17]:
# define DatasetModule

class BirdclefDatasetModule(L.LightningDataModule):

    def __init__(self,sampler,train_df:pd.DataFrame,val_df:pd.DataFrame,bird_category_dir:str,audio_dir: str = '../../data/train_audio',batch_size:int=128):
        super().__init__()
        self.train_df=train_df
        self.val_df=val_df
        self.bird_category_dir=bird_category_dir
        self.audio_dir=audio_dir
        self.batch_size=batch_size
        self.sampler=sampler

    
    def train_dataloader(self):
        BD=BirdclefDataset(df=self.train_df,bird_category_dir=self.bird_category_dir,audio_dir=self.audio_dir,train=True)
        loader = DataLoader(dataset=BD, batch_size=self.batch_size, sampler=self.sampler, pin_memory=True)

        return loader

    def val_dataloader(self):
        BD=BirdclefDataset(df=self.val_df,bird_category_dir=self.bird_category_dir,audio_dir=self.audio_dir,train=False)
        loader = DataLoader(dataset=BD, batch_size=self.batch_size, pin_memory=True)

        return loader

In [18]:
class Mixup(nn.Module):
    def __init__(self, mix_beta, mixup_prob, mixup_double):
        super(Mixup, self).__init__()
        self.beta_distribution = Beta(mix_beta, mix_beta)
        self.mixup_prob = mixup_prob
        self.mixup_double = mixup_double

    def forward(self, X, Y, weight=None):
        p = torch.rand((1,))[0] # Generate a random number p and compare it with mixup_prob to decide whether to mix.
        if p < self.mixup_prob:
            bs = X.shape[0] # batch size
            n_dims = len(X.shape)
            perm = torch.randperm(bs) # Generate a random permutation for randomly selecting samples from the current batch for mixing.

            p1 = torch.rand((1,))[0] # If the random number p1 (determines whether to perform double mixing) is less than mixup_double, perform a single mix. Otherwise, perform double mixing:
            if p1 < self.mixup_double:
                X = X + X[perm]
                Y = Y + Y[perm]
                Y = torch.clamp(Y, 0, 1) 

                if weight is None:
                    return X, Y
                else:
                    weight = 0.5 * weight + 0.5 * weight[perm]
                    return X, Y, weight
            else:
                perm2 = torch.randperm(bs)
                X = X + X[perm] + X[perm2]
                Y = Y + Y[perm] + Y[perm2]
                Y = torch.clamp(Y, 0, 1)

                if weight is None:
                    return X, Y
                else:
                    weight = (
                        1 / 3 * weight + 1 / 3 * weight[perm] + 1 / 3 * weight[perm2]
                    )
                    return X, Y, weight
        else:
            if weight is None:
                return X, Y
            else:
                return X, Y, weight

In [19]:
def mel_transform(sample_rate:float,audio:torch.Tensor,window_size: float=0.04,hop_size:float=0.02,n_mels:int=40)->torch.Tensor:
    """
    transform audio data into mel sepctrogram
    """
    # Determine window size and frame shift
    # window_size = 0.04 # 40 milliseconds
    # hop_size = 0.02 # 20 milliseconds, usually half the window size
    n_fft = int(window_size * sample_rate)  # Convert the window size to the number of sampling points
    hop_length = int(hop_size * sample_rate)  # Convert frame shift to sampling point number

    mel_transformer = MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        f_min=0,
        f_max=16000
    ).to(device)

    melspec=mel_transformer(audio)

    return melspec.to(device)

In [20]:
def compute_deltas(specgram: torch.Tensor, win_length: int = 5, mode: str = "replicate") -> torch.Tensor:
    """Compute delta coefficients of a tensor, usually a spectrogram.

    Args:
        specgram (Tensor): Tensor of audio of dimension (..., freq, time)
        win_length (int, optional): The window length used for computing delta (Default: 5)
        mode (str, optional): Mode parameter passed to padding (Default: "replicate")

    Returns:
        Tensor: Tensor of deltas of dimension (..., freq, time)
    """
    device = specgram.device  # Get the device of the input tensor
    dtype = specgram.dtype

    # pack batch
    shape = specgram.size()
    specgram = specgram.reshape(1, -1, shape[-1])

    assert win_length >= 3
    n = (win_length - 1) // 2
    denom = n * (n + 1) * (2 * n + 1) / 3

    specgram = torch.nn.functional.pad(specgram, (n, n), mode=mode)

    # Create the kernel tensor, making sure it is on the same device as the input tensor
    kernel = torch.arange(-n, n + 1, 1, dtype=dtype,device=device).repeat(specgram.shape[1], 1, 1)

    output = (
        torch.nn.functional.conv1d(specgram, kernel, groups=specgram.shape[1]) / denom
    )

    # unpack batch
    output = output.reshape(shape)

    return output



def make_delta(input_tensor: torch.Tensor):
    input_tensor = input_tensor.transpose(3, 2)
    input_tensor = compute_deltas(input_tensor)
    input_tensor = input_tensor.transpose(3, 2)
    return input_tensor


def image_delta(x):
    delta_1 = make_delta(x)
    delta_2 = make_delta(delta_1)
    x = torch.cat([x, delta_1, delta_2], dim=1)
    return x

In [21]:
class Mixup2(nn.Module):
    def __init__(self, mix_beta, mixup2_prob):
        super(Mixup2, self).__init__()
        self.beta_distribution = Beta(mix_beta, mix_beta)
        self.mixup2_prob = mixup2_prob

    def forward(self, X, Y, weight=None):
        p = torch.rand((1,))[0]
        if p < self.mixup2_prob:
            bs = X.shape[0]
            n_dims = len(X.shape)
            perm = torch.randperm(bs)
            coeffs = self.beta_distribution.rsample(torch.Size((bs,))).to(device)

            if n_dims == 2:
                X = coeffs.view(-1, 1) * X + (1 - coeffs.view(-1, 1)) * X[perm]
            elif n_dims == 3:
                X = coeffs.view(-1, 1, 1) * X + (1 - coeffs.view(-1, 1, 1)) * X[perm]
            else:
                X = (
                    coeffs.view(-1, 1, 1, 1) * X
                    + (1 - coeffs.view(-1, 1, 1, 1)) * X[perm]
                )
            Y = coeffs.view(-1, 1) * Y + (1 - coeffs.view(-1, 1)) * Y[perm]
            # Y = Y + Y[perm]
            # Y = torch.clamp(Y, 0, 1)

            if weight is None:
                return X, Y
            else:
                weight = coeffs.view(-1) * weight + (1 - coeffs.view(-1)) * weight[perm]
                return X, Y, weight
        else:
            if weight is None:
                return X, Y
            else:
                return X, Y, weight

In [22]:
def init_layer(layer):
    '''
    Initialize the parameters of the fully connected layer
    '''
    nn.init.xavier_uniform_(layer.weight) # Initialize the weights and biases of the network layer

    if hasattr(layer, "bias"): # Check if the layer has a bias attribute
        if layer.bias is not None: # and bias is not None
            layer.bias.data.fill_(0.0) # If there is a bias, initialize it to 0

In [23]:
# Later we want to pass the acquired high-dimensional features into an attention module

class AttBlockV2(nn.Module):
    def __init__(self, in_features: int, out_features: int, activation="linear"):
        super().__init__()

        self.activation = activation
        self.att = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )
        self.cla = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )

        self.init_weights()

    def init_weights(self):
        init_layer(self.att)
        init_layer(self.cla)

    def forward(self, x):
        # x: (n_samples, n_in, n_time)
        norm_att = torch.softmax(torch.tanh(self.att(x)), dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla
        # x: This is the final output after the attention weights and classification layer.
        # shape: (n_samples, out_features). Since the time dimension is summed and compressed, each sample and each output feature ends up having a single value.
        # norm_att: This is the output of the attention layer (att) after the softmax and tanh functions, 
        # which shows which parts of the input sequence the model should focus on. Normalization ensures that the attention weights 
        # for all time steps add up to 1, which makes it easier to interpret the importance of each time step.
        # shape: (n_samples, out_features, n_time), where out_features is the number of output features of the att convolutional layer, 
        # which is the same as the out_features argument of the input. Each time step and each output feature has a normalized weight.
        # cla: This is the output of the classification layer (cla), which is obtained by processing the input features through another 1D convolutional layer. 
        # This output layer is often used to directly predict task-related outputs, such as the probability of a class label.
        # Shape: (n_samples, out_features, n_time), same shape as norm_att. 
        # This means that each output feature corresponding to each time step has a value processed by the activation function.
    def nonlinear_transform(self, x):
        if self.activation == "linear":
            return x
        elif self.activation == "sigmoid":
            return torch.sigmoid(x)

In [24]:
class BirdModelModule(L.LightningModule):

    def __init__(self,sample_rate:int=32000,pretrained_model_name:str='tf_efficientnetv2_s_in21k',class_num:int=182):
        super().__init__()
        self.sample_rate=sample_rate
        self.class_num=class_num

        self.audio_transforms = Compose(
            [
                # AddColoredNoise(p=0.5),
                PitchShift(
                    min_transpose_semitones=-4,
                    max_transpose_semitones=4,
                    sample_rate=32000,
                    p=0.4,
                ),
                Shift(min_shift=-0.5, max_shift=0.5, p=0.4),
            ]
        )

        # load pretrained model
        pretrained_model = timm.create_model(pretrained_model_name, pretrained=True,in_chans=3)

        # The last two layers are an adaptive pooling layer and a fully connected layer
        # Here I choose to replace these two layers. First remove these two layers
        layers = list(pretrained_model.children())[:-2]

        self.encoder = nn.Sequential(*layers).to(device) # Encapsulate multiple layers in order

        self.in_features=pretrained_model.classifier.in_features # classifier is the last fully connected layer of the model, out_features represents the number of categories

        # create a dense layer
        self.fc1 = nn.Linear(in_features=self.in_features, out_features=self.in_features, bias=True).to(device)

        # add attention block
        self.att_block=AttBlockV2(in_features=self.in_features, out_features=self.class_num, activation="sigmoid").to(device)

        # Initialize the weights and biases of the fully connected layer
        init_layer(self.fc1)

        # loss function
        self.loss_function = nn.BCEWithLogitsLoss(reduction="none")


        # freeze part of parameters
        self.freeze()



    def freeze(self):
        self.encoder.eval()
        # self.fc1.eval()
        for param in self.encoder.parameters():
            param.requires_grad = False
        # for param in self.fc1.parameters():
        #     param.requires_grad = False
        return



    def forward(self,clip):

        # Calculation using the pre-trained model (excluding the last two layers)
        clip=self.encoder(clip.to(device)) # feature extractor

        # Calculate the mean of each frequency band and merge them Dimensionality compression
        clip = torch.mean(clip, dim=2)

        # channel smoothing
        x1 = F.max_pool1d(clip, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(clip, kernel_size=3, stride=1, padding=1)
        x = x1 + x2

        x = F.dropout(x, p=0.3, training=True)

        x = x.transpose(1, 2)

        x = F.relu_(self.fc1(x))

        x = x.transpose(1, 2)

        x = F.dropout(x, p=0.3, training=True)

        target_pred, norm_att, segmentwise_output = self.att_block(x)

        
        return target_pred




    def training_step(self,batch,batch_idx):


        audio_label=batch[0]
        clip=batch[1]
        audio_weights=batch[2]
    
        audio_label=audio_label.to(device)
        clip=clip.to(device)
        audio_weights=audio_weights.to(device)

        # mix audio up
        mixup = Mixup(mix_beta=5,mixup_prob=0.7,mixup_double=0.5)

        clip, audio_label,audio_weights=mixup(X=clip,Y=audio_label,weight=audio_weights)

        # Use Compose to combine multiple audio transformation operations. 
        # These operations are applied to the input audio data to improve the generalization and robustness of the model.
        # clip=self.audio_transforms(clip,sample_rate=self.sample_rate)

        # Convert audio data into mel spectrogram
        clip=mel_transform(sample_rate=self.sample_rate,audio=clip).to(device)

        ##Convert the amplitude of Mel Spectrogram to decibel (dB)
        db_transform = torchaudio.transforms.AmplitudeToDB(stype="power", top_db=80)

        clip=db_transform(clip).to(device)

        #generalization
        clip=(clip+80)/80

        # Random mask part of the Spectrogram, which helps the model learn to be robust when information is missing in certain time periods.

        time_mask_transform = torchaudio.transforms.TimeMasking(time_mask_param=20, iid_masks=True, p=0.3)

        clip = time_mask_transform(clip)

        # Calculate the first and second order differences of audio or other time series data, usually called delta and delta-delta (also called acceleration) features.
        clip= image_delta(clip.to(device))

        # mix audio up
        mixup2 = Mixup2(mix_beta=2, mixup2_prob=0.15)

        clip, audio_label,audio_weights = mixup2(clip, audio_label, audio_weights)

        # predictions
        target_pred=self(clip.to(device))

        loss = self.loss_function(torch.logit(target_pred), audio_label)

        loss = loss.sum(dim=1) * audio_weights

        loss = loss.sum()

        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return loss


    def validation_step(self,batch,batch_idx):
        audio_label=batch[0]
        clip=batch[1]
        audio_weights=batch[2]

        audio_label=audio_label.to(device)
        clip=clip.to(device)
        audio_weights=audio_weights.to(device)

        # convert audio to mel spectrogram
        clip=mel_transform(sample_rate=self.sample_rate,audio=clip).to(device)

        db_transform = torchaudio.transforms.AmplitudeToDB(stype="power", top_db=80)

        clip=db_transform(clip).to(device)

        #generalization
        clip=(clip+80)/80

        # Calculate the first and second order differences of audio or other time series data, usually called delta and delta-delta (also called acceleration) features.
        clip= image_delta(clip.to(device))

        # predictions
        target_pred=self(clip.to(device))

        loss = self.loss_function(torch.logit(target_pred), audio_label)

        loss = loss.sum(dim=1) * audio_weights

        loss = loss.sum()

        self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return loss

        

    def configure_optimizers(self):
        model_optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, self.parameters()),
            lr=0.001,
            weight_decay=0.001,
        )
        interval = "epoch"

        lr_scheduler = CosineAnnealingWarmRestarts(
            model_optimizer, T_0=10, T_mult=1, eta_min=1e-6, last_epoch=-1
        )

        return {
            "optimizer": model_optimizer,
            "lr_scheduler": {
                "scheduler": lr_scheduler,
                "interval": interval,
                "monitor": "val_loss",
                "frequency": 1,
            },
        }

    def on_train_epoch_end(self):
        pass


    def on_validation_epoch_end(self):
        pass

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        pass

In [25]:
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',  
    dirpath='/Users/yiding/personal_projects/ML/github_repo/birdcief/code/model-training/checkpoints/',
    filename='chrononet-{epoch:02d}-{val_loss:.2f}',
    save_top_k=1, 
    mode='min',  
    auto_insert_metric_name=False  
)

early_stop_callback = EarlyStopping(
    monitor='val_loss', 
    min_delta=0.00,
    patience=3, 
    verbose=True,
    mode='min'
)

In [26]:


# # Previous we used a separate dataloader to feed the model
# # Here we encapsulate the dataloader and use this class to read data for training

# bdm=BirdclefDatasetModule(sampler=sampler,train_df=train_df,val_df=val_df,bird_category_dir='./external_files/3-bird-cates.npy',batch_size=128)


# class_num=len(np.load('./external_files/3-bird-cates.npy',allow_pickle=True))
# BirdModelModule=BirdModelModule(class_num=class_num).to(device)


# trainer=L.Trainer(
#     max_epochs=10,
#     # accelerator="auto", # set to 'auto' or 'gpu' to use gpu if possible
#     # devices='auto', # use all gpus if applicable like value=1 or "auto"
#     default_root_dir='/Users/yiding/personal_projects/ML/github_repo/birdcief/code/model-training/',
#     # logger=CSVLogger(save_dir='/Users/yiding/personal_projects/ML/github_repo/birdcief/code/model-training/log/',name='chrononet')
#     callbacks=[checkpoint_callback, early_stop_callback], 
# )

# # train the model
# trainer.fit(
#     model=BirdModelModule,
#     datamodule=bdm 
# )

### prediction

In [27]:
class BirdModelModule(L.LightningModule):

    def __init__(self,sample_rate:int=32000,pretrained_model_name:str='tf_efficientnetv2_s_in21k',class_num:int=182):
        super().__init__()
        self.sample_rate=sample_rate
        self.class_num=class_num

        self.audio_transforms = Compose(
            [
                # AddColoredNoise(p=0.5),
                PitchShift(
                    min_transpose_semitones=-4,
                    max_transpose_semitones=4,
                    sample_rate=32000,
                    p=0.4,
                ),
                Shift(min_shift=-0.5, max_shift=0.5, p=0.4),
            ]
        )

        # load pretrained model
        pretrained_model = timm.create_model(pretrained_model_name, pretrained=True,in_chans=3)

        # The last two layers are an adaptive pooling layer and a fully connected layer.
        # Here I choose to replace these two layers. First remove these two layers
        layers = list(pretrained_model.children())[:-2]

        self.encoder = nn.Sequential(*layers).to(device) # Encapsulate multiple layers in order

        self.in_features=pretrained_model.classifier.in_features # classifier is the last fully connected layer of the model, out_features represents the number of categories

        # create a dense layer
        self.fc1 = nn.Linear(in_features=self.in_features, out_features=self.in_features, bias=True).to(device)

        # add attention block
        self.att_block=AttBlockV2(in_features=self.in_features, out_features=self.class_num, activation="sigmoid").to(device)

        # Initialize the weights and biases of the fully connected layer
        init_layer(self.fc1)

        # loss function
        self.loss_function = nn.BCEWithLogitsLoss(reduction="none")


        # freeze parameters
        self.freeze()



    def freeze(self):
        self.encoder.eval()
        # self.fc1.eval()
        for param in self.encoder.parameters():
            param.requires_grad = False
        # for param in self.fc1.parameters():
        #     param.requires_grad = False
        return



    def forward(self,clip):

        # Use the pre-trained model (excluding the last two layers) for calculation
        clip=self.encoder(clip.to(device)) # feature extractor

        # Calculate the mean of each frequency band and merge them to compress the dimension
        clip = torch.mean(clip, dim=2)

        # channel smoothing
        x1 = F.max_pool1d(clip, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(clip, kernel_size=3, stride=1, padding=1)
        x = x1 + x2

        x = F.dropout(x, p=0.3, training=True)

        x = x.transpose(1, 2)

        x = F.relu_(self.fc1(x))

        x = x.transpose(1, 2)

        x = F.dropout(x, p=0.3, training=True)

        target_pred, norm_att, segmentwise_output = self.att_block(x)

        
        return target_pred




    def training_step(self,batch,batch_idx):


        audio_label=batch[0]
        clip=batch[1]
        audio_weights=batch[2]
    
        audio_label=audio_label.to(device)
        clip=clip.to(device)
        audio_weights=audio_weights.to(device)

        # mix audio up
        mixup = Mixup(mix_beta=5,mixup_prob=0.7,mixup_double=0.5)

        clip, audio_label,audio_weights=mixup(X=clip,Y=audio_label,weight=audio_weights)

        # Use Compose to combine multiple audio transformation operations. These operations are applied to the input audio data to enhance the generalization and robustness of the model.
        # clip=self.audio_transforms(clip,sample_rate=self.sample_rate)

        #Convert audio data into mel spectrogram
        clip=mel_transform(sample_rate=self.sample_rate,audio=clip).to(device)

        db_transform = torchaudio.transforms.AmplitudeToDB(stype="power", top_db=80)

        clip=db_transform(clip).to(device)

        #generalization
        clip=(clip+80)/80

        # Random masking part of the spectrogram helps the model learn to be robust to missing information in certain time periods.

        time_mask_transform = torchaudio.transforms.TimeMasking(time_mask_param=20, iid_masks=True, p=0.3)

        clip = time_mask_transform(clip)

        # Calculate the first and second order differences of audio or other time series data, usually called delta and delta-delta (also called acceleration) features.
        clip= image_delta(clip.to(device))

        mixup2 = Mixup2(mix_beta=2, mixup2_prob=0.15)

        clip, audio_label,audio_weights = mixup2(clip, audio_label, audio_weights)

        # predictions
        target_pred=self(clip.to(device))

        loss = self.loss_function(torch.logit(target_pred), audio_label)

        loss = loss.sum(dim=1) * audio_weights

        loss = loss.sum()

        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return loss


    def validation_step(self,batch,batch_idx):
        audio_label=batch[0]
        clip=batch[1]
        audio_weights=batch[2]

        audio_label=audio_label.to(device)
        clip=clip.to(device)
        audio_weights=audio_weights.to(device)

        # Convert audio data into mel spectrogram
        clip=mel_transform(sample_rate=self.sample_rate,audio=clip).to(device)

        ##Convert the amplitude of Mel Spectrogram to decibel (dB)
        db_transform = torchaudio.transforms.AmplitudeToDB(stype="power", top_db=80)

        clip=db_transform(clip).to(device)

        clip=(clip+80)/80

        # Calculate the first and second order differences of audio or other time series data, usually called delta and delta-delta (also called acceleration) features.
        clip= image_delta(clip.to(device))

        # predictions
        target_pred=self(clip.to(device))

        loss = self.loss_function(torch.logit(target_pred), audio_label)

        loss = loss.sum(dim=1) * audio_weights

        loss = loss.sum()

        self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return loss

        

    def configure_optimizers(self):
        model_optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, self.parameters()),
            lr=0.001,
            weight_decay=0.001,
        )
        interval = "epoch"

        lr_scheduler = CosineAnnealingWarmRestarts(
            model_optimizer, T_0=10, T_mult=1, eta_min=1e-6, last_epoch=-1
        )

        return {
            "optimizer": model_optimizer,
            "lr_scheduler": {
                "scheduler": lr_scheduler,
                "interval": interval,
                "monitor": "val_loss",
                "frequency": 1,
            },
        }

    def on_train_epoch_end(self):
        pass


    def on_validation_epoch_end(self):
        pass

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        # If you have only one tensor (feature) in your TensorDataset , batch will be a tuple containing the tensor and an empty tuple (since there are no labels)
        features= batch
        features=features.to(self.device)
        predictions = self(features)
        # Because what our model ultimately wants is the probability of an object corresponding to all categories, 
        # the sigmoid function is used here because we want to treat each class as a separate probability, so softmax is not used.
        probabilities = predictions.sigmoid().detach()

        return probabilities

In [28]:
# 1. load checkpoint
class_num=len(np.load('./external_files/3-bird-cates.npy',allow_pickle=True))

model = BirdModelModule.load_from_checkpoint(
    checkpoint_path="./checkpoints/sed_s21k_v1-03-291.84.ckpt",
    class_num=182
)

In [29]:
pred_dir = Path("../../data/predict")
pred_files = pred_dir.glob("*.ogg")

In [30]:
def split_audio(audio: torch.Tensor, segment_length:int):

    '''
    split raw audio tensor into multiple clips with 5 seconds long.

    Parameters:
        audio: the raw audio tensor
        segment_length: the audio length of each 5 seconds

    return:
        parts: list includes all clips
        end_time_list: the list of all clips' end time in seconds
    '''

    length_audio = audio.shape[1]
    parts = []
    # For example, if this is the first 5 seconds of audio, then the end time is 5. If it is 5-10 seconds, the end time is 10.
    end_time_list=[]
    end_time=5
    for i in range(0, length_audio, segment_length):
        part = audio[0][i:i + segment_length]
        # if len(part) == segment_length:  # Ensure the fragment lengths are consistent
        parts.append(part)  #Store the raw bytes of audio data
        end_time_list.append(end_time)
        end_time+=5

        

    return parts,end_time_list

In [31]:
# Regarding the data of a single audio, some audio information needs to be paid attention to, such as audio duration and number of channels.


def audio_info(audio: torch.Tensor, sample_rate: int):
    """
    Grab all information of the input audio loaded by torchaudio.

    Parameters:
        audio: Tensor representing the waveform
        sample_rate: Sample rate of the audio file

    Return:
        duration_seconds: Duration of the audio in seconds
        num_channels: Number of audio channels
    """
    # The audio duration time (seconds)
    duration_seconds = audio.shape[1] / sample_rate

    # The number of channels
    num_channels = audio.shape[0]


    return duration_seconds, num_channels

In [32]:
audio_clips_list=[]
clip_names_list=[]

for path in pred_files:
    # read audio as tensor
    audio,sr=read_audio(path=path)

    # get audio corresponding informatino
    duration_seconds,num_channels=audio_info(audio=audio,sample_rate=sr)

    # split audio into multi clips with 5 seconds
    audio_clips,end_time_list=split_audio(audio=audio,segment_length=5*sr)

    # generate each label name for each clip
    soundscape_id=path.stem
    clip_name=[f'soundscape_{soundscape_id}_{end_time}' for end_time in end_time_list]

    audio_clips_list.extend(audio_clips)
    
    clip_names_list.extend(clip_name)

    

In [33]:
clip_names_list

['soundscape_1000170626_5',
 'soundscape_1000170626_10',
 'soundscape_1000170626_15',
 'soundscape_1000170626_20',
 'soundscape_1000170626_25',
 'soundscape_1000170626_30',
 'soundscape_1000170626_35',
 'soundscape_1000170626_40',
 'soundscape_1000170626_45',
 'soundscape_1000170626_50',
 'soundscape_1000170626_55',
 'soundscape_1000170626_60',
 'soundscape_1000170626_65',
 'soundscape_1000170626_70',
 'soundscape_1000170626_75',
 'soundscape_1000170626_80',
 'soundscape_1000170626_85',
 'soundscape_1000170626_90',
 'soundscape_1000170626_95',
 'soundscape_1000170626_100',
 'soundscape_1000170626_105',
 'soundscape_1000170626_110',
 'soundscape_1000170626_115',
 'soundscape_1000170626_120',
 'soundscape_1000170626_125',
 'soundscape_1000170626_130',
 'soundscape_1000170626_135',
 'soundscape_1000170626_140',
 'soundscape_1000170626_145',
 'soundscape_1000170626_150',
 'soundscape_1000170626_155',
 'soundscape_1000170626_160',
 'soundscape_1000170626_165',
 'soundscape_1000170626_170',


In [34]:
from datasets import Dataset

In [35]:
# create Dataset
dataset = Dataset.from_dict({'audio_clip': audio_clips_list})

In [36]:
dataset

Dataset({
    features: ['audio_clip'],
    num_rows: 96
})

In [37]:
def pred_transform(batch):
    """
    Transform audio data into normalized mel spectrogram in decibel scale.
    """
    n_fft = int(0.04 * 32000)  # Convert window size to sample points
    hop_length = int(0.02 * 32000)  # Convert hop size to sample points
    n_mels = 40  # Number of Mel filters

    # Create Mel Spectrogram transformer
    mel_transformer = MelSpectrogram(
        sample_rate=32000,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        f_min=0,
        f_max=16000
    )
    
    # Create dB transformer
    db_transform = AmplitudeToDB(stype="power", top_db=80)

    audio_clip_batch = batch['audio_clip']
    melspec_list = []

    for audio_clip in audio_clip_batch:
        # Convert audio clip to tensor and add a new dimension
        audio_clip = torch.tensor(audio_clip).unsqueeze(0)

        # Generate Mel Spectrogram
        melspec = mel_transformer(audio_clip)
        
        # Convert Mel Spectrogram to dB
        db_melspec = db_transform(melspec)
        
        # Normalize the spectrogram
        normalized_melspec = (db_melspec + 80) / 80
        
        # Move the normalized spectrogram to the desired device
        normalized_melspec = normalized_melspec.to(device)

        melspec_list.append(normalized_melspec)

    return {'audio_mel': melspec_list}



In [38]:
dataset_mel=dataset.map(pred_transform, batched=True)

Map:   0%|          | 0/96 [00:00<?, ? examples/s]

Map: 100%|██████████| 96/96 [00:04<00:00, 22.20 examples/s]


In [39]:
del dataset

In [40]:
dataset_mel_single=dataset_mel.remove_columns('audio_clip')

In [41]:
torch.tensor(dataset_mel_single['audio_mel'][0]).shape

torch.Size([1, 40, 251])

In [42]:
dataset_mel_single

Dataset({
    features: ['audio_mel'],
    num_rows: 96
})

In [43]:
def image_delta(batch):
    batch=torch.tensor(batch['audio_mel'])
    delta_1 = make_delta(batch)
    delta_2 = make_delta(delta_1)
    x = torch.cat([batch, delta_1, delta_2], dim=1)
    
    return {'clip_delta':x}

In [44]:
dataset_delta=dataset_mel_single.map(image_delta, batched=True)

Map: 100%|██████████| 96/96 [00:00<00:00, 101.73 examples/s]


In [45]:
torch.tensor(dataset_delta['clip_delta']).shape

torch.Size([96, 3, 40, 251])

In [46]:
del dataset_mel_single

In [47]:
dataset_delta_single=dataset_delta.remove_columns('audio_mel')

In [48]:
dataset_delta_single

Dataset({
    features: ['clip_delta'],
    num_rows: 96
})

In [49]:
torch.tensor(dataset_delta_single['clip_delta'][0]).shape

torch.Size([3, 40, 251])

In [50]:
from torch.utils.data import Dataset

In [51]:
class PredDataset(Dataset):
    def __init__(self,dataset):
        super().__init__()
        self.dataset=dataset

    def __len__(self):

        return len(self.dataset)
    
    def __getitem__(self, index):
        audio_melspec=self.dataset['clip_delta'][index]

        audio_tensor=torch.tensor(audio_melspec)

        return audio_tensor

In [52]:
PD=PredDataset(dataset=dataset_delta_single)

dataloader = DataLoader(dataset=PD, batch_size=32, shuffle=False, num_workers=0)

In [53]:
# 3. load model for prediction
trainer = L.Trainer(
    accelerator="gpu", 
    devices=1
)

# Use trainer to make predictions
predictions = trainer.predict(model, dataloaders=dataloader)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Predicting DataLoader 0: 100%|██████████| 3/3 [00:35<00:00,  0.08it/s]


In [54]:
predictions

[tensor([[0.5008, 0.5087, 0.5071,  ..., 0.5011, 0.5012, 0.5029],
         [0.5005, 0.5076, 0.5082,  ..., 0.5014, 0.5011, 0.5026],
         [0.5008, 0.5089, 0.5090,  ..., 0.5015, 0.5016, 0.5037],
         ...,
         [0.5011, 0.5107, 0.5096,  ..., 0.5012, 0.5016, 0.5034],
         [0.5008, 0.5078, 0.5084,  ..., 0.5010, 0.5016, 0.5031],
         [0.5006, 0.5074, 0.5071,  ..., 0.5007, 0.5016, 0.5025]]),
 tensor([[0.5011, 0.5066, 0.5097,  ..., 0.5010, 0.5014, 0.5038],
         [0.5004, 0.5057, 0.5062,  ..., 0.5006, 0.5009, 0.5021],
         [0.5007, 0.5079, 0.5082,  ..., 0.5008, 0.5014, 0.5029],
         ...,
         [0.5017, 0.5071, 0.5097,  ..., 0.5012, 0.5020, 0.5048],
         [0.5006, 0.5074, 0.5079,  ..., 0.5008, 0.5012, 0.5026],
         [0.5011, 0.5073, 0.5089,  ..., 0.5015, 0.5016, 0.5040]]),
 tensor([[0.5010, 0.5096, 0.5098,  ..., 0.5016, 0.5021, 0.5041],
         [0.5007, 0.5075, 0.5070,  ..., 0.5007, 0.5010, 0.5026],
         [0.5004, 0.5051, 0.5049,  ..., 0.5004, 0.5007, 0.

In [55]:
['1']+['2']

['1', '2']

In [56]:
submission=pd.DataFrame(columns=['row_id']+np.load('./external_files/3-bird-cates.npy',allow_pickle=True).tolist())

In [57]:
submission

Unnamed: 0,row_id,insbab1,whiter2,rocpig,blakit1,asbfly,litegr,houspa,comros,grnwar1,...,bncwoo3,malpar1,crbsun2,insowl1,chbeat1,vehpar1,sttwoo1,eurbla2,junmyn1,oripip1


In [58]:
# Convert each tensor to a NumPy array and use them as rows of the DataFrame
data_frames = [pd.DataFrame(tensor.numpy()) for tensor in predictions]

In [59]:
# Merge all DataFrames into one big DataFrame
# Each tensor forms a block of the DataFrame
df = pd.concat(data_frames, ignore_index=True)

In [60]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,172,173,174,175,176,177,178,179,180,181
0,0.500847,0.508715,0.507131,0.500731,0.504230,0.500931,0.500945,0.500395,0.501287,0.504073,...,0.506515,0.500559,0.501985,0.504159,0.501750,0.500453,0.501527,0.501061,0.501202,0.502913
1,0.500545,0.507622,0.508183,0.501068,0.504058,0.500899,0.500944,0.500286,0.501162,0.503265,...,0.506215,0.500726,0.501299,0.503370,0.501399,0.500514,0.501542,0.501441,0.501074,0.502585
2,0.500816,0.508852,0.509012,0.500980,0.504600,0.501387,0.501179,0.500673,0.501862,0.504559,...,0.508769,0.500878,0.502046,0.504065,0.502175,0.500769,0.501870,0.501455,0.501611,0.503708
3,0.500460,0.507444,0.505193,0.500502,0.502649,0.500780,0.500567,0.500248,0.500807,0.503195,...,0.505132,0.500455,0.501061,0.502447,0.501107,0.500447,0.500970,0.500538,0.500802,0.501826
4,0.500870,0.508636,0.509370,0.501127,0.505013,0.501530,0.501477,0.500645,0.502033,0.504597,...,0.509093,0.500943,0.502345,0.504458,0.502089,0.501030,0.502110,0.501276,0.501500,0.503278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,0.500338,0.505972,0.506470,0.500438,0.502620,0.500555,0.500543,0.500143,0.500649,0.503359,...,0.504548,0.500394,0.501050,0.502213,0.501090,0.500320,0.500877,0.500380,0.500803,0.501748
92,0.500480,0.507602,0.507627,0.500755,0.503814,0.500723,0.501146,0.500300,0.501320,0.503300,...,0.505877,0.500651,0.501638,0.503184,0.501508,0.500566,0.501323,0.500768,0.501412,0.502094
93,0.500352,0.505996,0.505472,0.500510,0.502127,0.500595,0.500700,0.500194,0.500704,0.502782,...,0.504038,0.500653,0.501016,0.502739,0.500857,0.500304,0.500849,0.500579,0.500652,0.501967
94,0.500439,0.505509,0.506158,0.500442,0.503384,0.500635,0.500833,0.500338,0.501091,0.502910,...,0.505003,0.500456,0.501603,0.502521,0.501381,0.500385,0.501152,0.500656,0.500901,0.501962


In [61]:
df.columns=np.load('./external_files/3-bird-cates.npy',allow_pickle=True).tolist()

ValueError: Length mismatch: Expected axis has 182 elements, new values have 162 elements

In [None]:
df

Unnamed: 0,insbab1,whiter2,rocpig,blakit1,asbfly,litegr,houspa,comros,grnwar1,wynlau1,...,bncwoo3,malpar1,crbsun2,insowl1,chbeat1,vehpar1,sttwoo1,eurbla2,junmyn1,oripip1
0,0.500001,0.502747,0.500176,0.500365,0.500028,0.500471,0.508517,0.532191,0.500003,0.500020,...,0.500008,0.504719,0.500000,0.5,0.5,0.500003,0.5,0.5,0.500001,0.5
1,0.500004,0.533823,0.500019,0.500132,0.500004,0.509141,0.516256,0.533277,0.500019,0.500001,...,0.500000,0.500295,0.500000,0.5,0.5,0.500000,0.5,0.5,0.500000,0.5
2,0.500003,0.503245,0.500991,0.500040,0.500003,0.504204,0.532776,0.537239,0.500001,0.500001,...,0.500000,0.502176,0.500004,0.5,0.5,0.500000,0.5,0.5,0.500000,0.5
3,0.500005,0.509151,0.500025,0.501754,0.500065,0.506159,0.523961,0.536590,0.500011,0.500058,...,0.500000,0.502940,0.500005,0.5,0.5,0.500000,0.5,0.5,0.500000,0.5
4,0.500001,0.505027,0.500007,0.501812,0.500005,0.539216,0.504708,0.535431,0.500001,0.500002,...,0.500000,0.510557,0.500003,0.5,0.5,0.500000,0.5,0.5,0.500000,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,0.500452,0.527358,0.500230,0.501527,0.500256,0.524080,0.501026,0.510450,0.500033,0.500004,...,0.500000,0.511648,0.500009,0.5,0.5,0.500000,0.5,0.5,0.500000,0.5
92,0.500001,0.503903,0.500130,0.515409,0.500005,0.522831,0.523866,0.512087,0.500008,0.500001,...,0.500005,0.504960,0.500007,0.5,0.5,0.500000,0.5,0.5,0.500000,0.5
93,0.500027,0.500318,0.500018,0.500239,0.500004,0.509084,0.502225,0.506571,0.500001,0.500032,...,0.500000,0.501020,0.500000,0.5,0.5,0.500000,0.5,0.5,0.500000,0.5
94,0.500031,0.505785,0.500026,0.502344,0.500052,0.515034,0.524997,0.519913,0.500009,0.500007,...,0.500000,0.543843,0.500000,0.5,0.5,0.500000,0.5,0.5,0.500000,0.5


In [None]:
# Create a new Series
new_column = pd.Series(clip_names_list, name='row_id')

In [None]:
df.insert(0,'row_id',new_column)

In [None]:
df

Unnamed: 0,row_id,insbab1,whiter2,rocpig,blakit1,asbfly,litegr,houspa,comros,grnwar1,...,bncwoo3,malpar1,crbsun2,insowl1,chbeat1,vehpar1,sttwoo1,eurbla2,junmyn1,oripip1
0,soundscape_1000170626_5,0.500001,0.502747,0.500176,0.500365,0.500028,0.500471,0.508517,0.532191,0.500003,...,0.500008,0.504719,0.500000,0.5,0.5,0.500003,0.5,0.5,0.500001,0.5
1,soundscape_1000170626_10,0.500004,0.533823,0.500019,0.500132,0.500004,0.509141,0.516256,0.533277,0.500019,...,0.500000,0.500295,0.500000,0.5,0.5,0.500000,0.5,0.5,0.500000,0.5
2,soundscape_1000170626_15,0.500003,0.503245,0.500991,0.500040,0.500003,0.504204,0.532776,0.537239,0.500001,...,0.500000,0.502176,0.500004,0.5,0.5,0.500000,0.5,0.5,0.500000,0.5
3,soundscape_1000170626_20,0.500005,0.509151,0.500025,0.501754,0.500065,0.506159,0.523961,0.536590,0.500011,...,0.500000,0.502940,0.500005,0.5,0.5,0.500000,0.5,0.5,0.500000,0.5
4,soundscape_1000170626_25,0.500001,0.505027,0.500007,0.501812,0.500005,0.539216,0.504708,0.535431,0.500001,...,0.500000,0.510557,0.500003,0.5,0.5,0.500000,0.5,0.5,0.500000,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,soundscape_1000389428_220,0.500452,0.527358,0.500230,0.501527,0.500256,0.524080,0.501026,0.510450,0.500033,...,0.500000,0.511648,0.500009,0.5,0.5,0.500000,0.5,0.5,0.500000,0.5
92,soundscape_1000389428_225,0.500001,0.503903,0.500130,0.515409,0.500005,0.522831,0.523866,0.512087,0.500008,...,0.500005,0.504960,0.500007,0.5,0.5,0.500000,0.5,0.5,0.500000,0.5
93,soundscape_1000389428_230,0.500027,0.500318,0.500018,0.500239,0.500004,0.509084,0.502225,0.506571,0.500001,...,0.500000,0.501020,0.500000,0.5,0.5,0.500000,0.5,0.5,0.500000,0.5
94,soundscape_1000389428_235,0.500031,0.505785,0.500026,0.502344,0.500052,0.515034,0.524997,0.519913,0.500009,...,0.500000,0.543843,0.500000,0.5,0.5,0.500000,0.5,0.5,0.500000,0.5
