In order to use workers to speed up data processing, I separated some blocks of 3-efficient-in21k-feature-extractor into separate packages

Specific packages can be found in common/v1

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
import random
from typing import List
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB

import torchaudio

import torch

from torch.utils.data import DataLoader,TensorDataset

import lightning as L

import datasets

from torch.utils.data import Dataset, DataLoader,WeightedRandomSampler

from pathlib import Path
import multiprocessing
import colorednoise as cn
import torch.nn as nn
import librosa
from torch.distributions import Beta
from torch_audiomentations import Compose, PitchShift, Shift, OneOf, AddColoredNoise

import timm
from torchinfo import summary

import torch.nn.functional as F

from torch.optim.lr_scheduler import (
    CosineAnnealingLR,
    CosineAnnealingWarmRestarts,
    ReduceLROnPlateau,
    OneCycleLR,
)
from lightning.pytorch.callbacks  import ModelCheckpoint, EarlyStopping

from lightning.pytorch.loggers import MLFlowLogger

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys

module_path = '../../'

if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
from common.sed_s21k.audiotransform import rating_value_interplote,audio_weight, Mixup,image_delta,Mixup2,mel_transform
from common.sed_s21k.audioprocess import read_audio 
from common.sed_s21k.audioprocess import CustomCompose,CustomOneOf,NoiseInjection,GaussianNoise,PinkNoise,AddGaussianNoise,AddGaussianSNR
from common.sed_s21k.audiodatasets import BirdclefDataset

In [4]:
has_mps = torch.backends.mps.is_built()
device = "mps" if has_mps else "cuda" if torch.cuda.is_available() else "cpu"
print(device)

mps


In [5]:
metadata_path='../../data/train_metadata_new_add_rating.csv'

In [6]:
# I need to do a train test split on the data first
# Because this dataset is unbalanced
# Randomly select a sample from each category to add to the validation set, and the rest to the training set

raw_df=pd.read_csv(metadata_path,header=0)

# Find the index of each category
class_indices = raw_df.groupby('primary_label').apply(lambda x: x.index.tolist())

# Initialize training set and validation set
train_indices = []
val_indices = []


# Randomly select a sample from each category to add to the validation set, and the rest to the training set
for indices in class_indices:
    val_sample = pd.Series(indices).sample(n=1, random_state=42).tolist()
    val_indices.extend(val_sample)
    train_indices.extend(set(indices) - set(val_sample))


# Divide the dataset by index
train_df = raw_df.loc[train_indices]
val_df = raw_df.loc[val_indices]

In [7]:
# Random select 20,000 data from the training set
additional_val_samples = train_df.sample(n=2000, random_state=42)

# Add these samples to the validation set
val_df = pd.concat([val_df, additional_val_samples])

# Remove these samples from the training set
train_df = train_df.drop(additional_val_samples.index)


# reduce train_df 
additional_val_samples = train_df.sample(n=150000, random_state=42)
# Remove these samples from the training set
train_df = train_df.drop(additional_val_samples.index)

In [8]:
train_df.shape

(65556, 5)

In [9]:
val_df.shape

(2182, 5)

In [10]:
# Because this is an unbalanced dataset, the amount of data in each category is very different
# So I will calculate the weight of each category here
# **(-0.5) The purpose is to reduce the relative influence of high-frequency categories and increase the influence of low-frequency categories, 
# so as to help the model better learn those uncommon categories
# The purpose of calculating this is to build a WeightedRandomSampler, 
# so that each time a batch is extracted using dataloader, it is more friendly to data of different categories.

def sampling_weight(df)->torch.Tensor:
    '''
    calculate the sampling weight of each audio file

    because this is imbalanced dataset
    we hope the category with less data has large probability to be picked.
    '''
    sample_weights = (df['primary_label'].value_counts() / df['primary_label'].value_counts().sum()) ** (-0.5)

    # Map weights to each row of the original data
    sample_weights_map = df['primary_label'].map(sample_weights)

    # Convert pandas Series to NumPy array
    sample_weights_np = sample_weights_map.to_numpy(dtype=np.float32)

    # Convert a NumPy array to a PyTorch tensor using torch.from_numpy
    sample_weights_tensor = torch.from_numpy(sample_weights_np)

    return sample_weights_tensor

In [11]:
# df=pd.read_csv(metadata_path,header=0)

sample_weights_tensor=sampling_weight(df=train_df)
# Here we will build an argument sampler that will be used by the dataloader
# Note that the order of weights in the constructed sampler must be consistent with the order of data passed into the dataloader, otherwise the weights will not match

# Create a sampler based on the newly obtained weight list
sampler = WeightedRandomSampler(sample_weights_tensor.type('torch.DoubleTensor'), len(sample_weights_tensor),replacement=True)

sampler

<torch.utils.data.sampler.WeightedRandomSampler at 0x10618f130>

In [12]:
# First we need to get all the types
meta_df=pd.read_csv(metadata_path,header=0)
bird_cates=meta_df.primary_label.unique()

#Because the order is very important and needs to be matched one by one in the subsequent training, I will save these types here
# Save as .npy file
np.save("./external_files/3-bird-cates.npy", bird_cates)

In [13]:
# define DatasetModule

class BirdclefDatasetModule(L.LightningDataModule):

    def __init__(self,sampler,train_df:pd.DataFrame,val_df:pd.DataFrame,bird_category_dir:str,audio_dir: str = '../../data/train_audio',batch_size:int=128,workers=4):
        super().__init__()
        self.train_df=train_df
        self.val_df=val_df
        self.bird_category_dir=bird_category_dir
        self.audio_dir=audio_dir
        self.batch_size=batch_size
        self.sampler=sampler
        self.workers=workers

    
    def train_dataloader(self):
        BD=BirdclefDataset(df=self.train_df,bird_category_dir=self.bird_category_dir,audio_dir=self.audio_dir,train=True)
        loader = DataLoader(dataset=BD, batch_size=self.batch_size, sampler=self.sampler, pin_memory=True,num_workers=self.workers)
        return loader

    def val_dataloader(self):
        BD=BirdclefDataset(df=self.val_df,bird_category_dir=self.bird_category_dir,audio_dir=self.audio_dir,train=False)
        loader = DataLoader(dataset=BD, batch_size=self.batch_size, pin_memory=True,num_workers=self.workers)
        return loader

In [14]:
def init_layer(layer):
    '''
    initilize layers' parameters
    '''
    nn.init.xavier_uniform_(layer.weight) # Initialize the weights and biases of the network layer

    if hasattr(layer, "bias"): # Check if the layer has a bias attribute
        if layer.bias is not None: # and bias is not None
            layer.bias.data.fill_(0.0) # If there is a bias, initialize it to 0

In [15]:
# Later we want to pass the acquired high-dimensional features into an attention module

class AttBlockV2(nn.Module):
    def __init__(self, in_features: int, out_features: int, activation="linear"):
        super().__init__()

        self.activation = activation
        self.att = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )
        self.cla = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )

        self.init_weights()

    def init_weights(self):
        init_layer(self.att)
        init_layer(self.cla)

    def forward(self, x):
        # x: (n_samples, n_in, n_time)
        norm_att = torch.softmax(torch.tanh(self.att(x)), dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla
        # x: This is the final output after the attention weights and classification layer.
        # shape: (n_samples, out_features). Since the time dimension is summed and compressed, each sample and each output feature ends up having a single value.
        # norm_att: This is the output of the attention layer (att) after the softmax and tanh functions, 
        # which shows which parts of the input sequence the model should focus on. Normalization ensures that the attention weights for all time steps add up to 1,
        #  which makes it easier to interpret the importance of each time step.
        # shape: (n_samples, out_features, n_time), where out_features is the number of output features of the att convolutional layer, 
        # which is the same as the out_features argument of the input. Each time step and each output feature has a normalized weight.
        # cla: This is the output of the classification layer (cla), which is obtained by processing the input features through another 1D convolutional layer.
        # This output layer is often used to directly predict task-related outputs, such as the probability of a class label.
        # Shape: (n_samples, out_features, n_time), same shape as norm_att. 
        # This means that each output feature corresponding to each time step has a value processed by the activation function.
    def nonlinear_transform(self, x):
        if self.activation == "linear":
            return x
        elif self.activation == "sigmoid":
            return torch.sigmoid(x)

In [16]:
class BirdModelModule(L.LightningModule):

    def __init__(self,sample_rate:int=32000,pretrained_model_name:str='tf_efficientnetv2_s_in21k',class_num:int=182):
        super().__init__()
        self.sample_rate=sample_rate
        self.class_num=class_num

        self.audio_transforms = Compose(
            [
                # AddColoredNoise(p=0.5),
                PitchShift(
                    min_transpose_semitones=-4,
                    max_transpose_semitones=4,
                    sample_rate=32000,
                    p=0.4,
                ),
                Shift(min_shift=-0.5, max_shift=0.5, p=0.4),
            ]
        )

        # load pretrained model
        pretrained_model = timm.create_model(pretrained_model_name, pretrained=True,in_chans=3)

        # The last two layers are an adaptive pooling layer and a fully connected layer
        # Here I choose to replace these two layers. First remove these two layers
        layers = list(pretrained_model.children())[:-2]

        self.encoder = nn.Sequential(*layers).to(device) # Encapsulate multiple layers in order

        self.in_features=pretrained_model.classifier.in_features # classifier is the last fully connected layer of the model, out_features represents the number of categories

        # create a dense layer
        self.fc1 = nn.Linear(in_features=self.in_features, out_features=self.in_features, bias=True).to(device)

        # add attention block
        self.att_block=AttBlockV2(in_features=self.in_features, out_features=self.class_num, activation="sigmoid").to(device)

        # Initialize the weights and biases of the fully connected layer
        init_layer(self.fc1)

        # loss function
        self.loss_function = nn.BCEWithLogitsLoss(reduction="none")


        # freeze parameters
        self.freeze()



    def freeze(self):
        self.encoder.eval()
        # self.fc1.eval()
        for param in self.encoder.parameters():
            param.requires_grad = False
        # for param in self.fc1.parameters():
        #     param.requires_grad = False
        return



    def forward(self,clip):

        # Calculation using the pre-trained model (excluding the last two layers)
        clip=self.encoder(clip.to(device)) # feature extractor

        # Calculate the mean of each frequency band and merge them Dimensionality compression
        clip = torch.mean(clip, dim=2)

        # channel smoothing
        x1 = F.max_pool1d(clip, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(clip, kernel_size=3, stride=1, padding=1)
        x = x1 + x2

        x = F.dropout(x, p=0.3, training=True)

        x = x.transpose(1, 2)

        x = F.relu_(self.fc1(x))

        x = x.transpose(1, 2)

        x = F.dropout(x, p=0.3, training=True)

        target_pred, norm_att, segmentwise_output = self.att_block(x)

        
        return target_pred




    def training_step(self,batch,batch_idx):


        audio_label=batch[0]
        clip=batch[1]
        audio_weights=batch[2]
    
        audio_label=audio_label.to(device)
        clip=clip.to(device)
        audio_weights=audio_weights.to(device)

        # mix audio up
        mixup = Mixup(mix_beta=5,mixup_prob=0.7,mixup_double=0.5)

        clip, audio_label,audio_weights=mixup(X=clip,Y=audio_label,weight=audio_weights)

        # Use Compose to combine multiple audio transformation operations. 
        # These operations are applied to the input audio data to enhance the generalization and robustness of the model.
        # clip=self.audio_transforms(clip,sample_rate=self.sample_rate)

        # Convert audio data into mel spectrogram
        clip=mel_transform(sample_rate=self.sample_rate,audio=clip).to(device)

        db_transform = torchaudio.transforms.AmplitudeToDB(stype="power", top_db=80)

        clip=db_transform(clip).to(device)

        #generalization
        clip=(clip+80)/80

        # Random masking part of the spectrogram helps the model learn to be robust to missing information in certain time periods.

        time_mask_transform = torchaudio.transforms.TimeMasking(time_mask_param=20, iid_masks=True, p=0.3)

        clip = time_mask_transform(clip)

        # Calculate the first and second order differences of audio or other time series data, usually called delta and delta-delta (also called acceleration) features.
        clip= image_delta(clip.to(device))

        # mix audio up
        mixup2 = Mixup2(mix_beta=2, mixup2_prob=0.15)

        clip, audio_label,audio_weights = mixup2(clip, audio_label, audio_weights)

        # predictions
        target_pred=self(clip.to(device))

        loss = self.loss_function(torch.logit(target_pred), audio_label)

        loss = loss.sum(dim=1) * audio_weights

        loss = loss.sum()

        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        # clean memory
        del audio_label, clip, audio_weights, target_pred
        if torch.cuda.is_available():
            print('allocated memory:',torch.cuda.memory_allocated())
            torch.cuda.empty_cache()
            print('allocated memory after empty cache:',torch.cuda.memory_allocated())

        return loss


    def validation_step(self,batch,batch_idx):
        audio_label=batch[0]
        clip=batch[1]
        audio_weights=batch[2]

        audio_label=audio_label.to(device)
        clip=clip.to(device)
        audio_weights=audio_weights.to(device)

        # Convert audio data into mel spectrogram
        clip=mel_transform(sample_rate=self.sample_rate,audio=clip).to(device)

        db_transform = torchaudio.transforms.AmplitudeToDB(stype="power", top_db=80)

        clip=db_transform(clip).to(device)

        #generalization
        clip=(clip+80)/80

        # Calculate the first and second order differences of audio or other time series data, 
        # usually called delta and delta-delta (also called acceleration) features.
        clip= image_delta(clip.to(device))

        # predictions
        target_pred=self(clip.to(device))

        loss = self.loss_function(torch.logit(target_pred), audio_label)

        loss = loss.sum(dim=1) * audio_weights

        loss = loss.sum()

        self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        # clean memory
        del audio_label, clip, audio_weights, target_pred
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return loss

        

    def configure_optimizers(self):
        model_optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, self.parameters()),
            lr=0.001,
            weight_decay=0.001,
        )
        interval = "epoch"

        lr_scheduler = CosineAnnealingWarmRestarts(
            model_optimizer, T_0=10, T_mult=1, eta_min=1e-6, last_epoch=-1
        )

        return {
            "optimizer": model_optimizer,
            "lr_scheduler": {
                "scheduler": lr_scheduler,
                "interval": interval,
                "monitor": "val_loss",
                "frequency": 1,
            },
        }

    def on_train_epoch_end(self):
        pass


    def on_validation_epoch_end(self):
        pass

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        pass

In [17]:
if __name__=="__main__":

    num_workers = multiprocessing.cpu_count()

    # setup MLflow logger
    mlflow_logger = MLFlowLogger(
        experiment_name="BirdClef_Experiment",
        tracking_uri="file:///Users/yiding/personal_projects/ML/github_repo/birdcief/code/model-training/mlruns"
    )

    checkpoint_callback = ModelCheckpoint(
        monitor='val_loss',  
        dirpath='/Users/yiding/personal_projects/ML/github_repo/birdcief/code/model-training/checkpoints/',
        filename='sed_s21k_v1-{epoch:02d}-{val_loss:.2f}',
        save_top_k=1,  
        mode='min',  
        auto_insert_metric_name=False  
    )


    early_stop_callback = EarlyStopping(
        monitor='val_loss',  
        min_delta=0.00,
        patience=3,  
        verbose=True,
        mode='min'  
    )



    # we used a separate dataloader to feed the model
    # Here we encapsulate the dataloader and use this class to read data for training
    bdm=BirdclefDatasetModule(sampler=sampler,train_df=train_df,val_df=val_df,bird_category_dir='./external_files/3-bird-cates.npy',batch_size=64,workers=num_workers)


    class_num=len(np.load('./external_files/3-bird-cates.npy',allow_pickle=True))
    BirdModelModule=BirdModelModule(class_num=class_num).to(device)


    trainer=L.Trainer(
        max_epochs=45,
        # accelerator="auto", # set to 'auto' or 'gpu' to use gpu if possible
        # devices='auto', # use all gpus if applicable like value=1 or "auto"
        default_root_dir='/Users/yiding/personal_projects/ML/github_repo/birdcief/code/model-training/',
        # logger=CSVLogger(save_dir='/Users/yiding/personal_projects/ML/github_repo/birdcief/code/model-training/log/',name='chrononet')
        logger=mlflow_logger,  # use MLflow logger
        callbacks=[checkpoint_callback, early_stop_callback], 
    )

    # train the model
    trainer.fit(
        model=BirdModelModule,
        datamodule=bdm 
    )

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name             | Type              | Params
-------------------------------------------------------
0 | audio_transforms | Compose           | 0     
1 | encoder          | Sequential        | 20.2 M
2 | fc1              | Linear            | 1.6 M 
3 | att_block        | AttBlockV2        | 466 K 
4 | loss_function    | BCEWithLogitsLoss | 0     
-------------------------------------------------------
2.1 M     Trainable params
20.2 M    Non-trainable params
22.3 M    Total params
89.134    Total estimated model params size (MB)


Epoch 0: 100%|██████████| 1025/1025 [53:29<00:00,  0.32it/s, v_num=8b07, train_loss_step=167.0, val_loss_step=29.60, val_loss_epoch=327.0, train_loss_epoch=588.0]

Metric val_loss improved. New best score: 326.578


Epoch 1: 100%|██████████| 1025/1025 [41:36<00:00,  0.41it/s, v_num=8b07, train_loss_step=248.0, val_loss_step=26.10, val_loss_epoch=296.0, train_loss_epoch=552.0]

Metric val_loss improved by 30.445 >= min_delta = 0.0. New best score: 296.132


Epoch 3: 100%|██████████| 1025/1025 [50:07<00:00,  0.34it/s, v_num=8b07, train_loss_step=174.0, val_loss_step=26.70, val_loss_epoch=292.0, train_loss_epoch=544.0] 

Metric val_loss improved by 4.297 >= min_delta = 0.0. New best score: 291.836


Epoch 5:  61%|██████    | 626/1025 [43:12<27:32,  0.24it/s, v_num=8b07, train_loss_step=756.0, val_loss_step=30.40, val_loss_epoch=320.0, train_loss_epoch=556.0]  