This notebook will call the module to facilitate the use of multiple workers to accelerate training

For specific modules, please refer to common/sed-s21k-v4

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
import random
from typing import List
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB

import torchaudio

import torch

from torch.utils.data import DataLoader,TensorDataset

import lightning as L

import datasets

from torch.utils.data import Dataset, DataLoader,WeightedRandomSampler

from pathlib import Path
import multiprocessing
import colorednoise as cn
import torch.nn as nn
import librosa
from torch.distributions import Beta
from torch_audiomentations import Compose, PitchShift, Shift, OneOf, AddColoredNoise

import timm
from torchinfo import summary

import torch.nn.functional as F

from torch.optim.lr_scheduler import (
    CosineAnnealingLR,
    CosineAnnealingWarmRestarts,
    ReduceLROnPlateau,
    OneCycleLR,
)
from lightning.pytorch.callbacks  import ModelCheckpoint, EarlyStopping

from lightning.pytorch.loggers import MLFlowLogger

from sklearn.metrics import roc_auc_score
from lightning.pytorch.loggers import WandbLogger

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys

module_path = '../../'

if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
from common.sed_s21k_v4.audioprocess import rating_value_interplote, audio_weight, sampling_weight, dataloader_sampler_generate,class_weight_generate
from common.sed_s21k_v4.audiotransform import read_audio, Mixup, mel_transform,image_delta, Mixup2
from common.sed_s21k_v4.audiotransform import CustomCompose,CustomOneOf,NoiseInjection,GaussianNoise,PinkNoise,AddGaussianNoise,AddGaussianSNR
from common.sed_s21k_v4.audiodatasets import BirdclefDataset
from common.sed_s21k_v4.audiodatasets import trainloader_collate,valloader_collate
from common.sed_s21k_v4.modelmeasurements import FocalLoss,compute_roc_auc


In [4]:
has_mps = torch.backends.mps.is_built()
device = "mps" if has_mps else "cuda" if torch.cuda.is_available() else "cpu"
print(device)

mps


In [5]:
metadata_path='../../data/train_metadata_new_add_rating.csv'

In [6]:
# I need to do a train test split on the data first
# Because this dataset is unbalanced
# Randomly select a sample from each category to add to the validation set, and the rest to the training set

raw_df=pd.read_csv(metadata_path,header=0)

# Find the index of each category
class_indices = raw_df.groupby('primary_label').apply(lambda x: x.index.tolist())

# Initialize training set and validation set
train_indices = []
val_indices = []


# Random select a sample from each category to add to the validation set, and the rest to the training set
for indices in class_indices:
    val_sample = pd.Series(indices).sample(n=1, random_state=42).tolist()
    val_indices.extend(val_sample)
    train_indices.extend(set(indices) - set(val_sample))


# Divide the dataset by index
train_df = raw_df.loc[train_indices]
val_df = raw_df.loc[val_indices]

In [7]:
# Random select 20,000 data from the training set
additional_val_samples = train_df.sample(n=20000, random_state=42)

# Add these samples to the validation set
val_df = pd.concat([val_df, additional_val_samples])

# Remove these samples from the training set
train_df = train_df.drop(additional_val_samples.index)

In [8]:
# prepare dataloader sampler

train_sampler=dataloader_sampler_generate(df=train_df)
val_sampler=dataloader_sampler_generate(df=val_df)

In [9]:
# First we need to get all the types
meta_df=pd.read_csv(metadata_path,header=0)
bird_cates=meta_df.primary_label.unique()

#Because the order is very important and needs to be matched one by one in the subsequent training, I will save these types here
# Save as .npy file
np.save("./external_files/13-2-bird-cates.npy", bird_cates)

In [10]:
# load .npy file
loaded_array = np.load("./external_files/13-2-bird-cates.npy",allow_pickle=True)

In [11]:
loss_train_class_weights=class_weight_generate(df=train_df,loaded_array=loaded_array)
loss_val_class_weights=class_weight_generate(df=val_df,loaded_array=loaded_array)

In [12]:
# mixup_layer = Mixup(mix_beta=5, mixup_prob=0.7, mixup_double=0.5)
# mixup2_layer = Mixup2(mix_beta=2, mixup2_prob=0.15)

# audio_transforms = Compose(
#     [
#         # AddColoredNoise(p=0.5),
#         PitchShift(
#             min_transpose_semitones=-4,
#             max_transpose_semitones=4,
#             sample_rate=32000,
#             p=0.4,
#         ),
#         Shift(min_shift=-0.5, max_shift=0.5, p=0.4),
#     ]
# )

In [13]:
# # load pretrained model
# model = timm.create_model('tf_efficientnetv2_s_in21k', pretrained=True,in_chans=3) # You can change the data channel accepted by the pre-trained model by passing in argument in_chans

In [14]:
# # Assume that model is the loaded complete EfficientNet model
# # Use the output of the first set of InvertedResidual
# feature_extractor = torch.nn.Sequential(
#     *list(model.children())[:-3]  # Remove the last three layers, which needs to be adjusted according to the actual model structure
# )
# feature_extractor.eval()

In [15]:
# # I want to separate feature extractor from lightningmodule and add it to dataloader as part of data processing

# def trainloader_collate(batch):
#     """
#     When creating data batches, define how each batch should be stacked
#     parameters:
#         batch: is a list of tuples with (labels, clip, weights)
#         feature_extractor: use a pretrained model as a feature extractor
#     """
#     # Unpack each individual sample in the batch
#     labels, clips, weights = zip(*batch)

#     # Stack the data into new batches
#     labels = torch.stack(labels).float()
#     clips = torch.stack(clips).float()

#     weights = torch.stack(weights) if weights[0] is not None else None

#     clips, labels, weights = mixup_layer(X=clips, Y=labels, weight=weights)

#     # Use Compose to combine multiple audio transformation operations. 
#     # These operations are applied to the input audio data to improve the generalization and robustness of the model.
#     clips = audio_transforms(clips, sample_rate=32000)

#     # Convert audio data into mel spectrogram
#     clips = mel_transform(sample_rate=32000, audio=clips)

#     clips = torchaudio.transforms.AmplitudeToDB(stype="power", top_db=80)(clips)

#     # generalization
#     clips = (clips + 80) / 80

#     # Random masking part of the spectrogram helps the model learn to be robust to missing information in certain time periods.
#     clips = torchaudio.transforms.TimeMasking(
#         time_mask_param=20, iid_masks=True, p=0.3
#     )(clips)

#     # Calculate the first and second order differences of audio or other time series data, usually called delta and delta-delta (also called acceleration) features.
#     clips = image_delta(clips)

#     # mix audio up
#     clips, labels,weights = mixup2_layer(X=clips, Y=labels, weight=weights)

#     # feature extractor
#     # Use torch.no_grad() to ensure feature extraction does not preserve gradients
#     with torch.no_grad():
#         clips=feature_extractor(clips)

#     return clips, labels, weights

In [16]:
# # I want to separate feature extractor from lightningmodule and add it to dataloader as part of data processing.


# def valloader_collate(batch):
#     """
#     When creating data batches, define how each batch should be stacked
#     parameters:
#         batch: is a list of tuples with (labels, clip, weights)
#         feature_extractor: use a pretrained model as a feature extractor
#     """
#     # Unpack each individual sample in the batch
#     labels, clips, weights = zip(*batch)

#     # Stack the data into new batches
#     labels = torch.stack(labels).float()
#     clips = torch.stack(clips).float()

#     weights = torch.stack(weights) if weights[0] is not None else None

#     # Convert audio data into mel spectrogram
#     clips = mel_transform(sample_rate=32000, audio=clips)

#     ##Convert the amplitude of Mel Spectrogram to decibel (dB)
#     clips = torchaudio.transforms.AmplitudeToDB(stype="power", top_db=80)(clips)

#     # generalization
#     clips = (clips + 80) / 80

#     # Calculate the first and second order differences of audio or other time series data, usually called delta and delta-delta (also called acceleration) features.
#     clips = image_delta(clips)

#     # feature extractor
#     # Use torch.no_grad() to ensure feature extraction does not preserve gradients
#     with torch.no_grad():
#         clips = feature_extractor(clips)

#     return clips, labels, weights

In [17]:
# define DatasetModule


class BirdclefDatasetModule(L.LightningDataModule):

    def __init__(
        self,
        train_sampler,
        val_sampler,
        train_df: pd.DataFrame,
        val_df: pd.DataFrame,
        bird_category_dir: str,
        audio_dir: str = "data/audio",
        batch_size: int = 128,
        workers=4,
    ):
        super().__init__()
        self.train_df = train_df
        self.val_df = val_df
        self.bird_category_dir = bird_category_dir
        self.audio_dir = audio_dir
        self.batch_size = batch_size
        self.train_sampler = train_sampler
        self.val_sampler = val_sampler
        self.workers = workers

    def train_dataloader(self):
        BD = BirdclefDataset(
            df=self.train_df,
            bird_category_dir=self.bird_category_dir,
            audio_dir=self.audio_dir,
            train=True,
        )
        loader = DataLoader(
            dataset=BD,
            batch_size=self.batch_size,
            sampler=self.train_sampler,
            pin_memory=True,
            num_workers=self.workers,
            prefetch_factor=64,
            collate_fn=trainloader_collate
        )
        return loader

    def val_dataloader(self):
        BD = BirdclefDataset(
            df=self.val_df,
            bird_category_dir=self.bird_category_dir,
            audio_dir=self.audio_dir,
            train=False,
        )
        loader = DataLoader(
            dataset=BD,
            batch_size=self.batch_size,
            sampler=self.val_sampler,
            pin_memory=True,
            num_workers=self.workers,
            prefetch_factor=64,
            collate_fn=valloader_collate
        )
        return loader

In [18]:
class ChronoNet(nn.Module):
    def __init__(self,class_nums:int=182):
        super().__init__()
        self.gru1 = nn.GRU(
            input_size=1280, hidden_size=128, num_layers=1, batch_first=True
        )
        self.bn1 = nn.BatchNorm1d(num_features=32)
        self.gru2 = nn.GRU(
            input_size=128, hidden_size=128, num_layers=1, batch_first=True
        )
        self.bn2 = nn.BatchNorm1d(num_features=32)
        self.gru3 = nn.GRU(
            input_size=256, hidden_size=128, num_layers=1, batch_first=True
        )
        self.bn3 = nn.BatchNorm1d(num_features=32)
        self.gru4 = nn.GRU(
            input_size=384, hidden_size=128, num_layers=1, batch_first=True
        )
        self.bn4 = nn.BatchNorm1d(num_features=32)
        self.dropout1 = nn.Dropout(0.3)
        self.fc1 = nn.Linear(in_features=128, out_features=class_nums)

    def forward(self, x):
        # Because the input shape required by gru is (batch_size, sequence length, feature_size)
        # But the result of the previous conversion calculation is (batchsize, feature_size, sequence length)
        # I need to change the shape
        x = x.permute(0, 2, 1)
        gru_out1, _ = self.gru1(x)
        x1 = self.bn1(gru_out1)
        gru_out2, _ = self.gru2(x1)
        x2 = self.bn2(gru_out2)
        # According to the chrononet architecture, we need to connect the calculations of the two layers of GRU according to the feature-size dimension
        x3 = torch.cat((x1, x2), dim=2)
        gru_out3, _ = self.gru3(x3)
        x4 = self.bn3(gru_out3)
        x5 = torch.cat((x1, x2, x4), dim=2)
        gru_out4, _ = self.gru4(x5)
        x6 = self.dropout1(gru_out4[:, -1, :]) 
        out = self.fc1(x6)

        return out

In [19]:
class BirdModelModule(L.LightningModule):

    def __init__(
        self,
        model,
        train_class_weight: torch.Tensor,
        val_class_weight: torch.Tensor,
        sample_rate: int = 32000,
        class_num: int = 182,
        lr: float = 0.001
    ):
        """
        Parameters:
            model: the defined model module
            train_class_weight: the argument is used for Focal Loss Function, focal loss needs a sequence of class weights to calculate the loss
            val_class_weight: the argument is also used for Focal loss function, for validation step
        """
        super().__init__()
        self.model = model.to(device)
        self.train_class_weight = train_class_weight.to(device)
        self.val_class_weight = val_class_weight.to(device)
        self.sample_rate = sample_rate
        self.class_num = class_num
        self.lr = lr

    def forward(self, clips):

        return self.model(clips)

    def training_step(self, batch, batch_idx):

        clips = batch[0]
        labels = batch[1]
        weights = batch[2]

        labels = labels.to(device)
        clips = clips.to(device)
        weights = weights.to(device)

        # Use flatten to combine the last two dimensions
        clips = torch.flatten(clips, start_dim=2)

        # predictions
        # target_pred=self(clip.to(device))
        target_pred = self(clips)
        # print("train", weights.shape)
        # initialize loss fn
        loss_fn = FocalLoss(weight=self.train_class_weight, sample_weight=weights)

        loss = loss_fn(inputs=target_pred, targets=labels)

        # Compute ROC-AUC and log it
        # roc_auc = compute_roc_auc(preds=target_pred, targets=labels)

        self.log(
            "train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True
        )
        # self.log(
        #     "train_roc_auc",
        #     roc_auc,
        #     on_step=True,
        #     on_epoch=True,
        #     prog_bar=True,
        #     logger=True,
        # )

        # clean memory
        del labels, clips, weights, target_pred
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return loss

    def validation_step(self, batch, batch_idx):
        clips = batch[0]
        labels = batch[1]
        weights = batch[2]

        labels = labels.to(device)
        clips = clips.to(device)
        weights = weights.to(device)

        # Use flatten to combine the last two dimensions
        clips = torch.flatten(clips, start_dim=2)

        # predictions
        target_pred = self(clips).detach()

        # initialize loss fn
        print("val", weights.shape)
        loss_fn = FocalLoss(weight=self.val_class_weight, sample_weight=weights)

        loss = loss_fn(inputs=target_pred, targets=labels)

        # Compute ROC-AUC and log it
        # roc_auc = compute_roc_auc(preds=target_pred, targets=labels)

        self.log(
            "val_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True
        )

        # self.log(
        #     "val_roc_auc",
        #     roc_auc,
        #     on_step=True,
        #     on_epoch=True,
        #     prog_bar=True,
        #     logger=True,
        # )

        # clean memory
        del labels, clips, weights, target_pred
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return loss

    def configure_optimizers(self):
        model_optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, self.parameters()),
            lr=self.lr,
            weight_decay=0.001,
        )
        interval = "epoch"

        lr_scheduler = CosineAnnealingWarmRestarts(
            model_optimizer, T_0=10, T_mult=1, eta_min=1e-6, last_epoch=-1
        )

        return {
            "optimizer": model_optimizer,
            "lr_scheduler": {
                "scheduler": lr_scheduler,
                "interval": interval,
                "monitor": "val_loss",
                "frequency": 1,
            },
        }

    def on_train_epoch_end(self):
        pass

    def on_validation_epoch_end(self):
        pass

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        pass

In [20]:
if __name__ == "__main__":

    num_workers = multiprocessing.cpu_count()


    # # initilize collate_fn
    # valloader_collate=valloader_collate()
    # trainloader_collate=trainloader_collate()

    logger = WandbLogger(project='BirdClef-mac', name='sef_s21_v1_mac')

    checkpoint_callback = ModelCheckpoint(
        monitor="val_loss", 
        dirpath="models/checkpoints",
        filename="sed_s21k_v1-{epoch:02d}-{val_loss:.2f}",
        save_top_k=1,  
        mode="min", 
        auto_insert_metric_name=False, 
    )

    early_stop_callback = EarlyStopping(
        monitor="val_loss", 
        min_delta=0.00,
        patience=3, 
        verbose=True,
        mode="min", 
    )

    # Previously we used a separate dataloader to feed the model
    # Here we encapsulate the dataloader and use this class to read data for training

    bdm = BirdclefDatasetModule(
        train_sampler=train_sampler,
        val_sampler=val_sampler,
        train_df=train_df,
        val_df=val_df,
        bird_category_dir="./external_files/13-2-bird-cates.npy",
        audio_dir="../../data/train_audio",
        batch_size=64,
        workers=6,
    )

    class_num = len(np.load("external_files/13-2-bird-cates.npy", allow_pickle=True))
    # initilize model
    chrononet = ChronoNet(class_nums=class_num)

    BirdModelModule = BirdModelModule(
        model=chrononet,
        train_class_weight=loss_train_class_weights,
        val_class_weight=loss_val_class_weights,
        class_num=class_num,
    )

    trainer = L.Trainer(
        # Set up Trainer and enable mixed precision
        precision=16,
        # Set up Trainer, use gradient accumulation, and update parameters after accumulating gradients every 512 batches
        accumulate_grad_batches=512,
        max_epochs=45,
        # accelerator="auto", # set to 'auto' or 'gpu' to use gpu if possible
        # devices='auto', # use all gpus if applicable like value=1 or "auto"
        default_root_dir="models/model_training",
        # logger=CSVLogger(save_dir='/Users/yiding/personal_projects/ML/github_repo/birdcief/code/model-training/log/',name='chrononet')
        logger=logger,  # use MLflow logger
        callbacks=[checkpoint_callback, early_stop_callback],  
    )

    # train the model
    trainer.fit(
        model=BirdModelModule,
        datamodule=bdm, 
    )

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/yiding/.netrc



  | Name  | Type      | Params
------------------------------------
0 | model | ChronoNet | 1.0 M 
------------------------------------
1.0 M     Trainable params
0         Non-trainable params
1.0 M     Total params
4.039     Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]val torch.Size([64])
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  4.46it/s]val torch.Size([64])
Epoch 0:   1%|          | 20/3087 [03:35<9:10:35,  0.09it/s, v_num=q8b2, train_loss_step=65.20] 

In [21]:
import gc
gc.collect()

40