In [1]:
!pip install -q packages/torchaudio-2.3.0-cp310-cp310-manylinux1_x86_64.whl
!pip install -q packages/lightning-2.2.0-py3-none-any.whl
!pip install -q packages/colorednoise-2.2.0-py3-none-any.whl
!pip install -q packages/librosa-0.10.2-py3-none-any.whl
!pip install -q packages/torch_audiomentations-0.11.1-py3-none-any.whl
!pip install -q packages/torchinfo-1.8.0-py3-none-any.whl
!pip install -q wandb

In [2]:
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
import random
from typing import List
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB

import torchaudio

import torch

from torch.utils.data import DataLoader,TensorDataset

import lightning as L

import datasets

from torch.utils.data import Dataset, DataLoader,WeightedRandomSampler

from pathlib import Path
import multiprocessing
import colorednoise as cn
import torch.nn as nn
import librosa
from torch.distributions import Beta
from torch_audiomentations import Compose, PitchShift, Shift, OneOf, AddColoredNoise

import timm
from torchinfo import summary

import torch.nn.functional as F

from torch.optim.lr_scheduler import (
    CosineAnnealingLR,
    CosineAnnealingWarmRestarts,
    ReduceLROnPlateau,
    OneCycleLR,
)
from lightning.pytorch.callbacks  import ModelCheckpoint, EarlyStopping

from lightning.pytorch.loggers import MLFlowLogger

from sklearn.metrics import roc_auc_score

from lightning.pytorch.loggers import WandbLogger

  warn(
2024-05-15 06:11:29.374098: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-15 06:11:29.423499: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
from common.audioprocess import rating_value_interplote, audio_weight, sampling_weight, dataloader_sampler_generate,class_weight_generate
from common.audiotransform import read_audio, Mixup, mel_transform,image_delta, Mixup2
from common.audiotransform import CustomCompose,CustomOneOf,NoiseInjection,GaussianNoise,PinkNoise,AddGaussianNoise,AddGaussianSNR
from common.audiodatasets import BirdclefDataset
from common.audiodatasets import trainloader_collate,valloader_collate
from common.modelmeasurements import FocalLoss,compute_roc_auc


In [4]:
# check cuda and select device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
if device == "cuda":
    # get the num of devices
    num_devices = torch.cuda.device_count()
    print("Number of CUDA devices:", num_devices)

    # Iterate over each CUDA device and print its core count
    for i in range(num_devices):
        print("CUDA device", i, "cores:", torch.cuda.get_device_properties(i).multi_processor_count)
else:
    print("CUDA is not available.")

cuda
Number of CUDA devices: 1
CUDA device 0 cores: 40


In [None]:
metadata_path='./data/train_metadata_new_add_rating.csv'

In [6]:
# I need to do a train test split on the data first
# Because this dataset is unbalanced
# Randomly select a sample from each category to add to the validation set, and the rest to the training set

raw_df=pd.read_csv(metadata_path,header=0)

# find the index of each category
class_indices = raw_df.groupby('primary_label').apply(lambda x: x.index.tolist())

# initilize tran and val sets
train_indices = []
val_indices = []


# random select a sample into val set and other part into train set.
for indices in class_indices:
    val_sample = pd.Series(indices).sample(n=1, random_state=42).tolist()
    val_indices.extend(val_sample)
    train_indices.extend(set(indices) - set(val_sample))


# split dataset based off index
train_df = raw_df.loc[train_indices]
val_df = raw_df.loc[val_indices]

In [7]:
# select 20000 pieces data from trainset
additional_val_samples = train_df.sample(n=20000, random_state=42)

# add these selected data into valset
val_df = pd.concat([val_df, additional_val_samples])

# drop these data out of trainset
train_df = train_df.drop(additional_val_samples.index)

In [8]:
# prepare dataloader sampler

train_sampler=dataloader_sampler_generate(df=train_df)
val_sampler=dataloader_sampler_generate(df=val_df)

In [9]:
# First we need to get all the types
meta_df=pd.read_csv(metadata_path,header=0)
bird_cates=meta_df.primary_label.unique()

#Because the order of this is very important and needs to be matched one by one in subsequent training, 
# I will save these categories here
# save as .npy file
np.save("./external_files/bird-cates.npy", bird_cates)

In [10]:
# load .npy file
loaded_array = np.load("./external_files/bird-cates.npy",allow_pickle=True)

In [11]:
loss_train_class_weights=class_weight_generate(df=train_df,loaded_array=loaded_array)
loss_val_class_weights=class_weight_generate(df=val_df,loaded_array=loaded_array)

In [12]:
# define DatasetModule


class BirdclefDatasetModule(L.LightningDataModule):

    def __init__(
        self,
        train_sampler,
        val_sampler,
        train_df: pd.DataFrame,
        val_df: pd.DataFrame,
        bird_category_dir: str,
        audio_dir: str = "data/audio",
        batch_size: int = 128,
        workers=4,
    ):
        super().__init__()
        self.train_df = train_df
        self.val_df = val_df
        self.bird_category_dir = bird_category_dir
        self.audio_dir = audio_dir
        self.batch_size = batch_size
        self.train_sampler = train_sampler
        self.val_sampler = val_sampler
        self.workers = workers

    def train_dataloader(self):
        BD = BirdclefDataset(
            df=self.train_df,
            bird_category_dir=self.bird_category_dir,
            audio_dir=self.audio_dir,
            train=True,
        )
        loader = DataLoader(
            dataset=BD,
            batch_size=self.batch_size,
            sampler=self.train_sampler,
            pin_memory=True,
            num_workers=self.workers,
            collate_fn=trainloader_collate
        )
        return loader

    def val_dataloader(self):
        BD = BirdclefDataset(
            df=self.val_df,
            bird_category_dir=self.bird_category_dir,
            audio_dir=self.audio_dir,
            train=False,
        )
        loader = DataLoader(
            dataset=BD,
            batch_size=self.batch_size,
            sampler=self.val_sampler,
            pin_memory=True,
            num_workers=self.workers,
            collate_fn=valloader_collate
        )
        return loader

In [13]:
class ChronoNet(nn.Module):
    def __init__(self,class_nums:int=182):
        super().__init__()
        self.gru1 = nn.GRU(
            input_size=1280, hidden_size=128, num_layers=1, batch_first=True
        )
        self.bn1 = nn.BatchNorm1d(num_features=32)
        self.gru2 = nn.GRU(
            input_size=128, hidden_size=128, num_layers=1, batch_first=True
        )
        self.bn2 = nn.BatchNorm1d(num_features=32)
        self.gru3 = nn.GRU(
            input_size=256, hidden_size=128, num_layers=1, batch_first=True
        )
        self.bn3 = nn.BatchNorm1d(num_features=32)
        self.gru4 = nn.GRU(
            input_size=384, hidden_size=128, num_layers=1, batch_first=True
        )
        self.bn4 = nn.BatchNorm1d(num_features=32)
        self.dropout1 = nn.Dropout(0.3)
        self.fc1 = nn.Linear(in_features=128, out_features=class_nums)

    def forward(self, x):
        # Because the input shape required by gru is (batch_size, sequence length, feature_size)
        # But the result of the previous conversion calculation is (batchsize, feature_size, sequence length)
        # I need to change the shape
        x = x.permute(0, 2, 1)
        gru_out1, _ = self.gru1(x)
        x1 = self.bn1(gru_out1)
        gru_out2, _ = self.gru2(x1)
        x2 = self.bn2(gru_out2)
        # According to the chrononet architecture, 
        # we need to connect the calculations of the two layers of GRU according to the feature-size dimension
        x3 = torch.cat((x1, x2), dim=2)
        gru_out3, _ = self.gru3(x3)
        x4 = self.bn3(gru_out3)
        x5 = torch.cat((x1, x2, x4), dim=2)
        gru_out4, _ = self.gru4(x5)
        x6 = self.dropout1(gru_out4[:, -1, :])  #Usually take the final output of GRU
        out = self.fc1(x6)

        return out

In [14]:
class BirdModelModule(L.LightningModule):

    def __init__(
        self,
        model,
        train_class_weight: torch.Tensor,
        val_class_weight: torch.Tensor,
        sample_rate: int = 32000,
        class_num: int = 182,
        lr: float = 0.001
    ):
        """
        Parameters:
            model: the defined model module
            train_class_weight: the argument is used for Focal Loss Function, focal loss needs a sequence of class weights to calculate the loss
            val_class_weight: the argument is also used for Focal loss function, for validation step
        """
        super().__init__()
        self.model = model.to(device)
        self.train_class_weight = train_class_weight.to(device)
        self.val_class_weight = val_class_weight.to(device)
        self.sample_rate = sample_rate
        self.class_num = class_num
        self.lr = lr

    def forward(self, clips):

        return self.model(clips)

    def training_step(self, batch, batch_idx):

        clips = batch[0]
        labels = batch[1]
        weights = batch[2]

        labels = labels.to(device)
        clips = clips.to(device)
        weights = weights.to(device)

        # Use flatten to combine the last two dimensions
        clips = torch.flatten(clips, start_dim=2)

        # predictions
        # target_pred=self(clip.to(device))
        target_pred = self(clips)
        # print("train", weights.shape)
        # initialize loss fn
        loss_fn = FocalLoss(weight=self.train_class_weight, sample_weight=weights)

        loss = loss_fn(inputs=target_pred, targets=labels)

        # Compute ROC-AUC and log it
        # roc_auc = compute_roc_auc(preds=target_pred, targets=labels)

        self.log(
            "train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True
        )
        # self.log(
        #     "train_roc_auc",
        #     roc_auc,
        #     on_step=True,
        #     on_epoch=True,
        #     prog_bar=True,
        #     logger=True,
        # )

        # # clean up memory
        # del labels, clips, weights, target_pred
        # if torch.cuda.is_available():
        #     torch.cuda.empty_cache()

        return loss

    def validation_step(self, batch, batch_idx):
        clips = batch[0]
        labels = batch[1]
        weights = batch[2]

        labels = labels.to(device)
        clips = clips.to(device)
        weights = weights.to(device)

        # Use flatten to combine the last two dimensions
        clips = torch.flatten(clips, start_dim=2)

        # predictions
        target_pred = self(clips).detach()

        # initialize loss fn
        print("val", weights.shape)
        loss_fn = FocalLoss(weight=self.val_class_weight, sample_weight=weights)

        loss = loss_fn(inputs=target_pred, targets=labels)

        # Compute ROC-AUC and log it
        roc_auc = compute_roc_auc(preds=target_pred, targets=labels)

        self.log(
            "val_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True
        )

        self.log(
            "val_roc_auc",
            roc_auc,
            on_step=True,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )

        # # clean up memory
        # del labels, clips, weights, target_pred
        # if torch.cuda.is_available():
        #     torch.cuda.empty_cache()

        return loss

    def configure_optimizers(self):
        model_optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, self.parameters()),
            lr=self.lr,
            weight_decay=0.001,
        )
        interval = "epoch"

        lr_scheduler = CosineAnnealingWarmRestarts(
            model_optimizer, T_0=10, T_mult=1, eta_min=1e-6, last_epoch=-1
        )

        return {
            "optimizer": model_optimizer,
            "lr_scheduler": {
                "scheduler": lr_scheduler,
                "interval": interval,
                "monitor": "val_loss",
                "frequency": 1,
            },
        }

    def on_train_epoch_end(self):
        pass

    def on_validation_epoch_end(self):
        pass

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        pass

In [None]:
if __name__ == "__main__":

    num_workers = multiprocessing.cpu_count()


    # # initilize collate_fn
    # valloader_collate=valloader_collate()
    # trainloader_collate=trainloader_collate()

    logger = WandbLogger(project='BirdClef-2024', name='sef_s21_v2')

    # setup checkpoint（ModelCheckpoint）
    checkpoint_callback = ModelCheckpoint(
        monitor="val_loss",  # monitor val set loss
        dirpath="models/check",
        filename="sed_s21k_v2-{epoch:02d}-{val_loss:.2f}",
        save_top_k=1,  # Only save the best model, the one with the lowest validation loss
        mode="min",  
        auto_insert_metric_name=False, 
    )

    # EarlyStopping
    early_stop_callback = EarlyStopping(
        monitor="val_loss",  
        min_delta=0.00,
        patience=3, 
        verbose=True,
        mode="min", 
    )
    # we used a separate dataloader to feed the model Previously
    # Here we encapsulate the dataloader and use this class to read data for training

    bdm = BirdclefDatasetModule(
        train_sampler=train_sampler,
        val_sampler=val_sampler,
        train_df=train_df,
        val_df=val_df,
        bird_category_dir="./external_files/bird-cates.npy",
        audio_dir="./data/audio",
        batch_size=64,
        workers=8,
    )

    class_num = len(np.load("external_files/bird-cates.npy", allow_pickle=True))
    # initilize model
    chrononet = ChronoNet(class_nums=class_num)

    BirdModelModule = BirdModelModule(
        model=chrononet,
        train_class_weight=loss_train_class_weights,
        val_class_weight=loss_val_class_weights,
        class_num=class_num,
    )

    trainer = L.Trainer(
        # 设置 Trainer，enable mixed precision
        precision=16,
        # Set up Trainer, use gradient accumulation, 
        # and update parameters after accumulating gradients every 512 batches
        accumulate_grad_batches=512,
        max_epochs=45,
        # accelerator="auto", # set to 'auto' or 'gpu' to use gpu if possible
        # devices='auto', # use all gpus if applicable like value=1 or "auto"
        default_root_dir="models/model_training",
        # logger=CSVLogger(save_dir='/Users/yiding/personal_projects/ML/github_repo/birdcief/code/model-training/log/',name='chrononet')
        logger=logger,  # logger
        callbacks=[checkpoint_callback, early_stop_callback], 
    )

    # train the model
    trainer.fit(
        model=BirdModelModule,
        datamodule=bdm,
    )

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[34m[1mwandb[0m: Currently logged in as: [33mdydifferent[0m. Use [1m`wandb login --relogin`[0m to force relogin


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type      | Params
------------------------------------
0 | model | ChronoNet | 1.0 M 
------------------------------------
1.0 M     Trainable params
0         Non-trainable params
1.0 M     Total params
4.039     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

val torch.Size([64])
val torch.Size([64])


Training: |          | 0/? [00:00<?, ?it/s]

In [None]:
import gc
gc.collect()