In order to use workers to speed up data processing, I separated some blocks of 3-efficient-in21k-feature-extractor into separate packages

Specifically, you can find the package in common/

In [1]:
!pip install -q packages/torchaudio-2.3.0-cp310-cp310-manylinux1_x86_64.whl
!pip install -q packages/lightning-2.2.0-py3-none-any.whl
!pip install -q packages/colorednoise-2.2.0-py3-none-any.whl
!pip install -q packages/librosa-0.10.2-py3-none-any.whl
!pip install -q packages/torch_audiomentations-0.11.1-py3-none-any.whl
!pip install -q packages/torchinfo-1.8.0-py3-none-any.whl
!pip install -q wandb

In [2]:
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
import random
from typing import List
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB

import torchaudio

import torch

from torch.utils.data import DataLoader,TensorDataset

import lightning as L

import datasets

from torch.utils.data import Dataset, DataLoader,WeightedRandomSampler

from pathlib import Path
import multiprocessing
import colorednoise as cn
import torch.nn as nn
import librosa
from torch.distributions import Beta
from torch_audiomentations import Compose, PitchShift, Shift, OneOf, AddColoredNoise

import timm
from torchinfo import summary

import torch.nn.functional as F

from torch.optim.lr_scheduler import (
    CosineAnnealingLR,
    CosineAnnealingWarmRestarts,
    ReduceLROnPlateau,
    OneCycleLR,
)
from lightning.pytorch.callbacks  import ModelCheckpoint, EarlyStopping

from lightning.pytorch.loggers import MLFlowLogger
from lightning.pytorch.loggers import WandbLogger

  warn(
2024-05-11 14:01:51.055250: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-11 14:01:51.102700: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
from common.audiotransform import rating_value_interplote,audio_weight, Mixup,image_delta,Mixup2,mel_transform
from common.audioprocess import read_audio 
from common.audioprocess import CustomCompose,CustomOneOf,NoiseInjection,GaussianNoise,PinkNoise,AddGaussianNoise,AddGaussianSNR
from common.audiodatasets import BirdclefDataset

In [5]:
# check cuda and select device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
if device == "cuda":
    # get the num of devices
    num_devices = torch.cuda.device_count()
    print("Number of CUDA devices:", num_devices)

    # Iterate over each CUDA device and print its core count
    for i in range(num_devices):
        print("CUDA device", i, "cores:", torch.cuda.get_device_properties(i).multi_processor_count)
else:
    print("CUDA is not available.")

cuda
Number of CUDA devices: 1
CUDA device 0 cores: 40


In [6]:
metadata_path='data/train_metadata_new_add_rating.csv'

In [7]:
# I need to do a train test split on the data first
# Because this dataset is unbalanced
# Randomly select a sample from each category to add to the validation set, and the rest to the training set

raw_df=pd.read_csv(metadata_path,header=0)

# find the index of each category
class_indices = raw_df.groupby('primary_label').apply(lambda x: x.index.tolist())

# initilize tran and val sets
train_indices = []
val_indices = []

# random select a sample into val set and other part into train set.
for indices in class_indices:
    val_sample = pd.Series(indices).sample(n=1, random_state=42).tolist()
    val_indices.extend(val_sample)
    train_indices.extend(set(indices) - set(val_sample))

# split dataset based off index
train_df = raw_df.loc[train_indices]
val_df = raw_df.loc[val_indices]

In [8]:
# select 20000 pieces data from trainset
additional_val_samples = train_df.sample(n=20000, random_state=42)

# add these selected data into valset
val_df = pd.concat([val_df, additional_val_samples])

# drop these data out of trainset
train_df = train_df.drop(additional_val_samples.index)

In [9]:
train_df.shape

(197556, 5)

In [10]:
val_df.shape

(20182, 5)

In [11]:
# Because this is an unbalanced dataset, the amount of data in each category is very different
# So I will calculate the weight of each category here
# **(-0.5) The purpose is to reduce the relative influence of high-frequency categories and increase the influence of low-frequency categories, so as to help the model better learn those uncommon categories
# The purpose of calculating this is to build a WeightedRandomSampler, so that each time a batch is extracted using dataloader, it is more friendly to data of different categories.

def sampling_weight(df)->torch.Tensor:
    '''
    calculate the sampling weight of each audio file

    because this is imbalanced dataset
    we hope the category with less data has large probability to be picked.
    '''
    sample_weights = (df['primary_label'].value_counts() / df['primary_label'].value_counts().sum()) ** (-0.5)

    # Map weights to each row of the original data
    sample_weights_map = df['primary_label'].map(sample_weights)

    # Convert a pandas Series to a NumPy array
    sample_weights_np = sample_weights_map.to_numpy(dtype=np.float32)

    # Convert NumPy arrays to PyTorch tensors using torch.from_numpy
    sample_weights_tensor = torch.from_numpy(sample_weights_np)

    return sample_weights_tensor

In [12]:
# df=pd.read_csv(metadata_path,header=0)
sample_weights_tensor=sampling_weight(df=train_df)
# Here we will build an argument sampler that dataloader will use
# It should be noted that the order of weights in the constructed sampler needs to be consistent with the order of data passed into the dataloader, otherwise the weights will not match

#Create a sampler based on the newly obtained weight list
sampler = WeightedRandomSampler(sample_weights_tensor.type('torch.DoubleTensor'), len(sample_weights_tensor),replacement=True)

sampler

<torch.utils.data.sampler.WeightedRandomSampler at 0x7fcff408fdf0>

In [13]:
# First we need to get all the types
meta_df=pd.read_csv(metadata_path,header=0)
bird_cates=meta_df.primary_label.unique()

#Because the order of this is very important and needs to be matched one by one in subsequent training, I will save these categories here
# save as .npy file
np.save("./external/3-bird-cates.npy", bird_cates)

In [14]:
# define DatasetModule

class BirdclefDatasetModule(L.LightningDataModule):

    def __init__(self,sampler,train_df:pd.DataFrame,val_df:pd.DataFrame,bird_category_dir:str,audio_dir: str = 'data/audio',batch_size:int=128,workers=4):
        super().__init__()
        self.train_df=train_df
        self.val_df=val_df
        self.bird_category_dir=bird_category_dir
        self.audio_dir=audio_dir
        self.batch_size=batch_size
        self.sampler=sampler
        self.workers=workers

    
    def train_dataloader(self):
        BD=BirdclefDataset(df=self.train_df,bird_category_dir=self.bird_category_dir,audio_dir=self.audio_dir,train=True)
        loader = DataLoader(dataset=BD, batch_size=self.batch_size, sampler=self.sampler, pin_memory=True,num_workers=self.workers)
        return loader

    def val_dataloader(self):
        BD=BirdclefDataset(df=self.val_df,bird_category_dir=self.bird_category_dir,audio_dir=self.audio_dir,train=False)
        loader = DataLoader(dataset=BD, batch_size=self.batch_size, pin_memory=True,num_workers=self.workers)
        return loader

In [15]:
def init_layer(layer):
    '''
    initilize dense layer parameters
    '''
    nn.init.xavier_uniform_(layer.weight) # initilize net layers weight and bias

    if hasattr(layer, "bias"): # check if layer has bias value
        if layer.bias is not None: # and bias is not none
            layer.bias.data.fill_(0.0) # if existing bias, set as 0

In [16]:
# then we want to put the high dimentional fetures grabbed into a attention block

class AttBlockV2(nn.Module):
    def __init__(self, in_features: int, out_features: int, activation="linear"):
        super().__init__()

        self.activation = activation
        self.att = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )
        self.cla = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )

        self.init_weights()

    def init_weights(self):
        init_layer(self.att)
        init_layer(self.cla)

    def forward(self, x):
        # x: (n_samples, n_in, n_time)
        norm_att = torch.softmax(torch.tanh(self.att(x)), dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla
    # x： This is the final output after processing by the attention weighting and classification layer
    # shape：(n_samples, out_features)。Since the time dimension is compressed by summation, we end up with a single value for each sample and each output feature.
    # norm_att: This is the output of the attention layer (att) processed with softmax and tanh functions, which shows which parts of the input sequence the model should focus on. Normalization ensures that the attention weights of all time steps add up to 1, which makes it easier to intuitively interpret the importance of each time step.
    # shape：(n_samples, out_features, n_time)，where out_features is the number of output features of the att convolutional layer, which is the same as the out_features parameter of the input. Each time step and each output feature has a normalized weight.
    # cla: This is the output of the classification layer (cla), which is obtained by processing the input features through another 1D convolutional layer. This output layer is usually used to directly predict task-related outputs, such as the probability of a class label.
    # shape：(n_samples, out_features, n_time)，Same shape as norm_att . This means that each output feature at each time step has a value processed by the activation function.
    def nonlinear_transform(self, x):
        if self.activation == "linear":
            return x
        elif self.activation == "sigmoid":
            return torch.sigmoid(x)

In [17]:
class BirdModelModule(L.LightningModule):

    def __init__(self,sample_rate:int=32000,pretrained_model_name:str='tf_efficientnetv2_s_in21k',class_num:int=182):
        super().__init__()
        self.sample_rate=sample_rate
        self.class_num=class_num

        self.audio_transforms = Compose(
            [
                # AddColoredNoise(p=0.5),
                PitchShift(
                    min_transpose_semitones=-4,
                    max_transpose_semitones=4,
                    sample_rate=32000,
                    p=0.4,
                ),
                Shift(min_shift=-0.5, max_shift=0.5, p=0.4),
            ]
        )

        # load pretrained model
        pretrained_model = timm.create_model(pretrained_model_name, pretrained=True,in_chans=3)

        # The last two layers are an adaptive pooling layer and a fully connected layer.
        # Here I choose to replace these two layers. First remove these two layers
        layers = list(pretrained_model.children())[:-2]

        self.encoder = nn.Sequential(*layers).to(device) # Encapsulate multiple layers in sequence

        self.in_features=pretrained_model.classifier.in_features # classifier is the last fully connected layer of the model, out_features represents the number of categories

        # create a fully connected layer
        self.fc1 = nn.Linear(in_features=self.in_features, out_features=self.in_features, bias=True).to(device)

        # add attention block
        self.att_block=AttBlockV2(in_features=self.in_features, out_features=self.class_num, activation="sigmoid").to(device)

        # Initialize the weights and biases of the fully connected layer
        init_layer(self.fc1)

        # loss function
        self.loss_function = nn.BCEWithLogitsLoss(reduction="none")


        # freeze part parameters
        self.freeze()



    def freeze(self):
        self.encoder.eval()
        # self.fc1.eval()
        for param in self.encoder.parameters():
            param.requires_grad = False
        # for param in self.fc1.parameters():
        #     param.requires_grad = False
        return
    

    def preprocess(self, clip):
        clip = mel_transform(sample_rate=self.sample_rate, audio=clip)
        clip = torchaudio.transforms.AmplitudeToDB()(clip)
        clip = (clip + 80) / 80  # normalization
        clip = torchaudio.transforms.TimeMasking(time_mask_param=20, iid_masks=True, p=0.3)(clip)
        clip = image_delta(clip)
        return clip



    def forward(self,clip):
        # use pre-trained model (exclude the last two layers) for computation
        clip=self.encoder(clip.to(device)) # feature extractor

        # Calculate the mean of each frequency band and merge them Dimensionality compression
        clip = torch.mean(clip, dim=2)

        # channel smoothing
        x1 = F.max_pool1d(clip, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(clip, kernel_size=3, stride=1, padding=1)
        x = x1 + x2

        x = x.transpose(1, 2)

        x = F.relu_(self.fc1(x))

        x = x.transpose(1, 2)

        target_pred, norm_att, segmentwise_output = self.att_block(x)

        
        return target_pred




    def training_step(self,batch,batch_idx):


        audio_label=batch[0]
        clip=batch[1]
        audio_weights=batch[2]
    
        # audio_label=audio_label.to(device)
        # clip=clip.to(device)
        # audio_weights=audio_weights.to(device)

        # mixup audio
        mixup = Mixup(mix_beta=5,mixup_prob=0.7,mixup_double=0.5)

        clip, audio_label,audio_weights=mixup(X=clip,Y=audio_label,weight=audio_weights)

        # Use Compose to combine multiple audio transformation operations. 
        # These operations are applied to the input audio data to improve the generalization and robustness of the model.
        # clip=self.audio_transforms(clip,sample_rate=self.sample_rate)

        # # Convert audio data into mel spectrogram
        # clip=mel_transform(sample_rate=self.sample_rate,audio=clip).to(device)

        # ##Convert the amplitude of Mel Spectrogram to decibel (dB)
        # db_transform = torchaudio.transforms.AmplitudeToDB(stype="power", top_db=80)

        # clip=db_transform(clip).to(device)

        # #normalization
        # clip=(clip+80)/80

        # # Randomly masking part of the spectrogram helps the model learn to be robust to missing information in certain time periods.

        # time_mask_transform = torchaudio.transforms.TimeMasking(time_mask_param=20, iid_masks=True, p=0.3)

        # clip = time_mask_transform(clip)

        # # Calculate the first and second order differences of audio or other time series data, usually called delta and delta-delta (also called acceleration) features.
        # clip= image_delta(clip.to(device))

        clip=self.preprocess(clip)

        # audio mix up
        mixup2 = Mixup2(mix_beta=2, mixup2_prob=0.15)

        clip, audio_label,audio_weights = mixup2(clip, audio_label, audio_weights)

        # predictions
        target_pred=self(clip.to(device))

        loss = self.loss_function(torch.logit(target_pred), audio_label)

        loss = loss.sum(dim=1) * audio_weights

        loss = loss.sum()

        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        # clean up memory
        del audio_label, clip, audio_weights, target_pred
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return loss


    def validation_step(self,batch,batch_idx):
        audio_label=batch[0]
        clip=batch[1]
        audio_weights=batch[2]

        # audio_label=audio_label.to(device)
        # clip=clip.to(device)
        # audio_weights=audio_weights.to(device)

        # Convert audio data into mel spectrogram
        clip=mel_transform(sample_rate=self.sample_rate,audio=clip)

        ##Convert the amplitude of Mel Spectrogram to decibel (dB)
        db_transform = torchaudio.transforms.AmplitudeToDB(stype="power", top_db=80)

        clip=db_transform(clip)

        # normalization
        clip=(clip+80)/80

        # Calculate the first and second order differences of audio or other time series data, usually called delta and delta-delta (also called acceleration) features.
        clip= image_delta(clip)

        # predictions
        target_pred=self(clip).detach()

        loss = self.loss_function(torch.logit(target_pred), audio_label)

        loss = loss.sum(dim=1) * audio_weights

        loss = loss.sum()

        self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        # clean up memory
        del audio_label, clip, audio_weights, target_pred
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return loss

        

    def configure_optimizers(self):
        model_optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, self.parameters()),
            lr=0.001,
            weight_decay=0.001,
        )
        interval = "epoch"

        lr_scheduler = CosineAnnealingWarmRestarts(
            model_optimizer, T_0=10, T_mult=1, eta_min=1e-6, last_epoch=-1
        )

        return {
            "optimizer": model_optimizer,
            "lr_scheduler": {
                "scheduler": lr_scheduler,
                "interval": interval,
                "monitor": "val_loss",
                "frequency": 1,
            },
        }

    def on_train_epoch_end(self):
        pass


    def on_validation_epoch_end(self):
        pass

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        pass

In [None]:
if __name__=="__main__":

    num_workers = multiprocessing.cpu_count()

    logger = WandbLogger(project='BirdClef-2024', name='sef_s21_v1')

    # set up checkpoint（ModelCheckpoint）
    checkpoint_callback = ModelCheckpoint(
        monitor='val_loss',  # monitor val loss
        dirpath='models/checkpoints',
        filename='sed_s21k_v1-{epoch:02d}-{val_loss:.2f}',
        save_top_k=1,  # Only save the best model, the one with the lowest validation loss
        mode='min',  # Specifying ‘min’ means the smaller the better, e.g. the smaller the loss the better
        auto_insert_metric_name=False  # Prevent automatic insertion of index names into path names
    )

    # EarlyStopping
    early_stop_callback = EarlyStopping(
        monitor='val_loss', 
        min_delta=0.00,
        patience=3,  # If the validation set loss does not improve within 3 epochs, stop training early
        verbose=True,
        mode='min'  # 'min' is valid for loss, if you are monitoring indicators such as accuracy, you should use 'max'
    )



    # Previously we used a separate dataloader to feed the model
    # Here we encapsulate the dataloader and use this class to read data for training

    bdm=BirdclefDatasetModule(sampler=sampler,train_df=train_df,val_df=val_df,bird_category_dir='external_files/3-bird-cates.npy',batch_size=64,workers=8)


    class_num=len(np.load('external_files/3-bird-cates.npy',allow_pickle=True))
    BirdModelModule=BirdModelModule(class_num=class_num).to(device)


    trainer=L.Trainer(
        # setup Trainer，Enable mixed precision
        precision=16,
        # Set up Trainer, use gradient accumulation, and update parameters after accumulating gradients every 128 batches
        accumulate_grad_batches=256,
        max_epochs=45,
        # accelerator="auto", # set to 'auto' or 'gpu' to use gpu if possible
        # devices='auto', # use all gpus if applicable like value=1 or "auto"
        default_root_dir='models/model_training',
        # logger=CSVLogger(save_dir='/Users/yiding/personal_projects/ML/github_repo/birdcief/code/model-training/log/',name='chrononet')
        logger=logger, 
        callbacks=[checkpoint_callback, early_stop_callback], 
    )

    # train the model
    trainer.fit(
        model=BirdModelModule,
        datamodule=bdm # DM can automatically find the corresponding dataloader from the object for training without specifying
    )

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/sagemaker-user/.netrc


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type              | Params
-------------------------------------------------------
0 | audio_transforms | Compose           | 0     
1 | encoder          | Sequential        | 20.2 M
2 | fc1              | Linear            | 1.6 M 
3 | att_block        | AttBlockV2        | 466 K 
4 | loss_function    | BCEWithLogitsLoss | 0     
-------------------------------------------------------
2.1 M     Trainable params
20.2 M    Non-trainable params
22.3 M    Total params
89.134    Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Monitored metric val_loss = nan is not finite. Previous best value was inf. Signaling Trainer to stop.
