Because chrononet layers were used in 4.2 notebook, but the desired effect was not achieved, I tried to replace them with the attention layer used before.

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
import random
from typing import List
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB

import torchaudio

import torch

from torch.utils.data import DataLoader,TensorDataset

import lightning as L

import datasets

from torch.utils.data import Dataset, DataLoader,WeightedRandomSampler

from pathlib import Path
import multiprocessing
import colorednoise as cn
import torch.nn as nn
import librosa
from torch.distributions import Beta
from torch_audiomentations import Compose, PitchShift, Shift, OneOf, AddColoredNoise

import timm
from torchinfo import summary

import torch.nn.functional as F

from torch.optim.lr_scheduler import (
    CosineAnnealingLR,
    CosineAnnealingWarmRestarts,
    ReduceLROnPlateau,
    OneCycleLR,
)
from lightning.pytorch.callbacks  import ModelCheckpoint, EarlyStopping

from lightning.pytorch.loggers import MLFlowLogger

from sklearn.metrics import roc_auc_score
from lightning.pytorch.loggers import WandbLogger

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys

module_path = '../../'

if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
from common.sed_s21k_v4.audioprocess import rating_value_interplote, audio_weight, sampling_weight, dataloader_sampler_generate,class_weight_generate
from common.sed_s21k_v4.audiotransform import read_audio, Mixup, mel_transform,image_delta, Mixup2
from common.sed_s21k_v4.audiotransform import CustomCompose,CustomOneOf,NoiseInjection,GaussianNoise,PinkNoise,AddGaussianNoise,AddGaussianSNR
from common.sed_s21k_v4.audiodatasets_preprepared import BirdclefDataset
from common.sed_s21k_v4.audiodatasets import trainloader_collate,valloader_collate
from common.sed_s21k_v4.modelmeasurements import FocalLoss,compute_roc_auc


In [4]:
has_mps = torch.backends.mps.is_built()
device = "mps" if has_mps else "cuda" if torch.cuda.is_available() else "cpu"
print(device)

mps


In [5]:
# 定义DatasetModule


class BirdclefDatasetModule(L.LightningDataModule):

    def __init__(
        self,
        batch_size: int = 1,
        workers=4,
    ):
        super().__init__()
        self.batch_size = batch_size
        self.workers = workers

    def train_dataloader(self):
        BD = BirdclefDataset(
            data_dir='/Users/yiding/personal_projects/ML/github_repo/birdcief/data/preprepared/train'
        )
        loader = DataLoader(
            dataset=BD,
            batch_size=self.batch_size,
            pin_memory=True,
            num_workers=self.workers,
            prefetch_factor=2,
            # shuffle=True,
        )
        return loader

    def val_dataloader(self):
        BD = BirdclefDataset(
            data_dir='/Users/yiding/personal_projects/ML/github_repo/birdcief/data/preprepared/train'
        )
        loader = DataLoader(
            dataset=BD,
            batch_size=self.batch_size,
            pin_memory=True,
            num_workers=self.workers,
            prefetch_factor=2,
            # shuffle=False
        )
        return loader

In [6]:
def init_layer(layer):
    '''
    初始化 全联接层的参数
    '''
    nn.init.xavier_uniform_(layer.weight) # Initialize the weights and biases of the network layer

    if hasattr(layer, "bias"): # Check if the layer has a bias attribute
        if layer.bias is not None: # and bias is not None
            layer.bias.data.fill_(0.0) # initilize as 0 if bias exists

In [7]:
# we want to pass the acquired high-dimensional features into an attention module

class AttBlockV2(nn.Module):
    def __init__(self, in_features: int, out_features: int, activation="linear"):
        super().__init__()

        self.activation = activation
        self.att = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )
        self.cla = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )

        self.init_weights()

    def init_weights(self):
        init_layer(self.att)
        init_layer(self.cla)

    def forward(self, x):
        # x: (n_samples, n_in, n_time)
        norm_att = torch.softmax(torch.tanh(self.att(x)), dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla
        # x: This is the final output after the attention weights and classification layer.
        # shape: (n_samples, out_features). Since the time dimension is summed and compressed, each sample and each output feature ends up having a single value.
        # norm_att: This is the output of the attention layer (att) after the softmax and tanh functions, 
        # which shows which parts of the input sequence the model should focus on. Normalization ensures that the attention weights for all time steps add up to 1, 
        # which makes it easier to interpret the importance of each time step.
        # shape: (n_samples, out_features, n_time), where out_features is the number of output features of the att convolutional layer, 
        # which is the same as the out_features argument of the input. Each time step and each output feature has a normalized weight.
        # cla: This is the output of the classification layer (cla), which is obtained by processing the input features through another 1D convolutional layer. 
        # This output layer is often used to directly predict task-related outputs, such as the probability of a class label.
        # Shape: (n_samples, out_features, n_time), same shape as norm_att. This means that each output feature corresponding to each time step has a value processed by the activation function.
    def nonlinear_transform(self, x):
        if self.activation == "linear":
            return x
        elif self.activation == "sigmoid":
            return torch.sigmoid(x)

In [8]:
class BirdModelModule(L.LightningModule):

    def __init__(self,sample_rate:int=32000,pretrained_model_name:str='tf_efficientnetv2_s_in21k',class_num:int=182):
        super().__init__()
        self.sample_rate=sample_rate
        self.class_num=class_num

        # load pretrained model
        pretrained_model = timm.create_model(pretrained_model_name, pretrained=True,in_chans=3)

        # The last two layers are an adaptive pooling layer and a fully connected layer
        # Here I choose to replace these two layers. First remove these two layers
        layers = list(pretrained_model.children())[:-2]

        self.encoder = nn.Sequential(*layers).to(device) # Encapsulate multiple layers in order

        self.in_features=pretrained_model.classifier.in_features # classifier is the last fully connected layer of the model, out_features represents the number of categories

        # Create a dense layer
        self.fc1 = nn.Linear(in_features=self.in_features, out_features=self.in_features, bias=True).to(device)

        # add attention block
        self.att_block=AttBlockV2(in_features=self.in_features, out_features=self.class_num, activation="sigmoid").to(device)

        # Initialize the weights and biases of the fully connected layer
        init_layer(self.fc1)

        # loss function
        self.loss_function = nn.BCEWithLogitsLoss(reduction="none")


    def forward(self,clip):

        # Calculate the mean of each frequency band and merge them to compress the dimension
        clip = torch.mean(clip, dim=2)

        # channel smoothing
        x1 = F.max_pool1d(clip, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(clip, kernel_size=3, stride=1, padding=1)
        x = x1 + x2

        x = F.dropout(x, p=0.3, training=True)

        x = x.transpose(1, 2)

        x = F.relu_(self.fc1(x))

        x = x.transpose(1, 2)

        x = F.dropout(x, p=0.3, training=True)

        target_pred, norm_att, segmentwise_output = self.att_block(x)

        
        return target_pred




    def training_step(self,batch,batch_idx):


        audio_label=batch[1]
        clip=batch[0]
        audio_weights=batch[2]
    
        audio_label=audio_label.to(device)
        clip=clip.to(device)
        audio_weights=audio_weights.to(device)

        # predictions
        target_pred=self(clip.to(device))

        loss = self.loss_function(torch.logit(target_pred), audio_label)

        loss = loss.sum(dim=1) * audio_weights

        loss = loss.sum()

        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        # clean memory
        del audio_label, clip, audio_weights, target_pred
        # if torch.cuda.is_available():
        #     print('allocated memory:',torch.cuda.memory_allocated())
        #     torch.cuda.empty_cache()
        #     print('allocated memory after empty cache:',torch.cuda.memory_allocated())

        return loss


    def validation_step(self,batch,batch_idx):
        audio_label=batch[1]
        clip=batch[0]
        audio_weights=batch[2]

        audio_label=audio_label.to(device)
        clip=clip.to(device)
        audio_weights=audio_weights.to(device)

        # predictions
        target_pred=self(clip.to(device))

        loss = self.loss_function(torch.logit(target_pred), audio_label)

        loss = loss.sum(dim=1) * audio_weights

        loss = loss.sum()

        self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        # clean memory
        del audio_label, clip, audio_weights, target_pred
        # if torch.cuda.is_available():
        #     torch.cuda.empty_cache()

        return loss

        

    def configure_optimizers(self):
        model_optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, self.parameters()),
            lr=0.001,
            weight_decay=0.001,
        )
        interval = "epoch"

        lr_scheduler = CosineAnnealingWarmRestarts(
            model_optimizer, T_0=10, T_mult=1, eta_min=1e-6, last_epoch=-1
        )

        return {
            "optimizer": model_optimizer,
            "lr_scheduler": {
                "scheduler": lr_scheduler,
                "interval": interval,
                "monitor": "val_loss",
                "frequency": 1,
            },
        }

    def on_train_epoch_end(self):
        pass


    def on_validation_epoch_end(self):
        pass

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        pass

In [9]:
if __name__ == "__main__":

    num_workers = multiprocessing.cpu_count()


    # # initilize collate_fn
    # valloader_collate=valloader_collate()
    # trainloader_collate=trainloader_collate()

    logger = WandbLogger(project='BirdClef-mac', name='sef_s21_v2_mac')

    checkpoint_callback = ModelCheckpoint(
        monitor="val_loss",  
        dirpath="models/checkpoints",
        filename="sed_s21k_43-{epoch:02d}-{val_loss:.2f}",
        save_top_k=1,  
        mode="min", 
        auto_insert_metric_name=False,  
    )

    early_stop_callback = EarlyStopping(
        monitor="val_loss", 
        min_delta=0.00,
        patience=3,  
        verbose=True,
        mode="min",  
    )

    # we used a separate dataloader to feed the model
    # Here we encapsulate the dataloader and use this class to read data for training

    bdm = BirdclefDatasetModule(
        batch_size=None,
        workers=4,
    )

    class_num = len(np.load("external_files/13-2-bird-cates.npy", allow_pickle=True))
    # initilize model
    # chrononet = ChronoNet(class_nums=class_num)

    BirdModelModule = BirdModelModule(
        class_num=class_num,
    )

    trainer = L.Trainer(
        # enable mixed precision
        precision=16,
        # Set up Trainer, use gradient accumulation, and update parameters after accumulating gradients every 64*4 batches
        accumulate_grad_batches=4,
        max_epochs=45,
        # accelerator="auto", # set to 'auto' or 'gpu' to use gpu if possible
        # devices='auto', # use all gpus if applicable like value=1 or "auto"
        default_root_dir="models/model_training",
        # logger=CSVLogger(save_dir='/Users/yiding/personal_projects/ML/github_repo/birdcief/code/model-training/log/',name='chrononet')
        logger=logger,  # use MLflow logger
        callbacks=[checkpoint_callback, early_stop_callback], 
    )

    # train the model
    trainer.fit(
        model=BirdModelModule,
        datamodule=bdm,  
    )

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdydifferent[0m. Use [1m`wandb login --relogin`[0m to force relogin



  | Name          | Type              | Params
----------------------------------------------------
0 | encoder       | Sequential        | 20.2 M
1 | fc1           | Linear            | 1.6 M 
2 | att_block     | AttBlockV2        | 466 K 
3 | loss_function | BCEWithLogitsLoss | 0     
----------------------------------------------------
2.1 M     Trainable params
20.2 M    Non-trainable params
22.3 M    Total params
89.134    Total estimated model params size (MB)


Epoch 0: 100%|██████████| 3087/3087 [01:24<00:00, 36.63it/s, v_num=9v9l, train_loss_step=713.0, val_loss_step=707.0, val_loss_epoch=504.0, train_loss_epoch=552.0]

Metric val_loss improved. New best score: 504.471


Epoch 1: 100%|██████████| 3087/3087 [01:59<00:00, 25.79it/s, v_num=9v9l, train_loss_step=713.0, val_loss_step=698.0, val_loss_epoch=490.0, train_loss_epoch=500.0]

Metric val_loss improved by 14.071 >= min_delta = 0.0. New best score: 490.400


Epoch 2: 100%|██████████| 3087/3087 [01:59<00:00, 25.84it/s, v_num=9v9l, train_loss_step=711.0, val_loss_step=688.0, val_loss_epoch=483.0, train_loss_epoch=490.0]

Metric val_loss improved by 7.843 >= min_delta = 0.0. New best score: 482.557


Epoch 3: 100%|██████████| 3087/3087 [01:59<00:00, 25.84it/s, v_num=9v9l, train_loss_step=694.0, val_loss_step=687.0, val_loss_epoch=477.0, train_loss_epoch=483.0]

Metric val_loss improved by 5.331 >= min_delta = 0.0. New best score: 477.227


Epoch 4: 100%|██████████| 3087/3087 [01:59<00:00, 25.84it/s, v_num=9v9l, train_loss_step=694.0, val_loss_step=677.0, val_loss_epoch=473.0, train_loss_epoch=478.0]

Metric val_loss improved by 4.706 >= min_delta = 0.0. New best score: 472.520


Epoch 5: 100%|██████████| 3087/3087 [02:03<00:00, 25.07it/s, v_num=9v9l, train_loss_step=670.0, val_loss_step=673.0, val_loss_epoch=469.0, train_loss_epoch=473.0]

Metric val_loss improved by 3.466 >= min_delta = 0.0. New best score: 469.054


Epoch 6: 100%|██████████| 3087/3087 [02:00<00:00, 25.55it/s, v_num=9v9l, train_loss_step=670.0, val_loss_step=675.0, val_loss_epoch=465.0, train_loss_epoch=468.0]

Metric val_loss improved by 3.756 >= min_delta = 0.0. New best score: 465.299


Epoch 7: 100%|██████████| 3087/3087 [02:00<00:00, 25.62it/s, v_num=9v9l, train_loss_step=663.0, val_loss_step=665.0, val_loss_epoch=462.0, train_loss_epoch=464.0]

Metric val_loss improved by 3.181 >= min_delta = 0.0. New best score: 462.117


Epoch 8: 100%|██████████| 3087/3087 [01:58<00:00, 26.14it/s, v_num=9v9l, train_loss_step=669.0, val_loss_step=661.0, val_loss_epoch=460.0, train_loss_epoch=461.0]

Metric val_loss improved by 2.099 >= min_delta = 0.0. New best score: 460.018


Epoch 9: 100%|██████████| 3087/3087 [02:01<00:00, 25.47it/s, v_num=9v9l, train_loss_step=664.0, val_loss_step=661.0, val_loss_epoch=459.0, train_loss_epoch=459.0]

Metric val_loss improved by 1.323 >= min_delta = 0.0. New best score: 458.695


Epoch 12: 100%|██████████| 3087/3087 [02:00<00:00, 25.72it/s, v_num=9v9l, train_loss_step=683.0, val_loss_step=682.0, val_loss_epoch=464.0, train_loss_epoch=468.0]

Monitored metric val_loss did not improve in the last 3 records. Best score: 458.695. Signaling Trainer to stop.


Epoch 12: 100%|██████████| 3087/3087 [02:00<00:00, 25.72it/s, v_num=9v9l, train_loss_step=683.0, val_loss_step=682.0, val_loss_epoch=464.0, train_loss_epoch=468.0]
