Here I choose to introduce a pre-trained model as a feature extractor, and here I choose to use tf_efficientnetv2_s.in21k

Refer to the data processing section of 2-chrononet-with-torchdataset-melspecs.ipynb. The same data processing steps are used here.

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
import random
from typing import List
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB

import torchaudio

import torch

from torch.utils.data import Dataset, DataLoader,TensorDataset,random_split

import lightning as L

from sklearn.preprocessing import LabelEncoder

import torch.nn as nn
import torch
import torch.nn.functional as F


import torchmetrics

from lightning.pytorch.callbacks  import ModelCheckpoint, EarlyStopping
import torch.optim as optim

In [2]:
import timm
from torchinfo import summary

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
has_mps = torch.backends.mps.is_built()
device = "mps" if has_mps else "cuda" if torch.cuda.is_available() else "cpu"
print(device)

mps


In [4]:
labels_path='../../data/train_metadata_new_tiny.csv'

In [5]:
# initialize label encoder

encoder=LabelEncoder()

In [6]:
raw_df=pd.read_csv(labels_path,header=0)

labels_all=raw_df.primary_label.unique().tolist()

print(labels_all)
print(len(labels_all))

labels_encoded=encoder.fit_transform(labels_all)

print(labels_encoded)

# If needed, you can view the mapping of original labels to encodings
label_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print(label_mapping)

['lblwar1', 'blrwar1', 'bkskit1', 'comros', 'houcro1', 'houspa', 'graher1', 'blhori1', 'mawthr1', 'grewar3', 'hoopoe', 'indpit1', 'litspi1', 'wemhar1', 'laudov1', 'litgre1', 'rocpig', 'grecou1', 'whbwoo2', 'barswa', 'gyhcaf1', 'purher1', 'litegr', 'commyn', 'lirplo', 'putbab1', 'cregos1', 'bkwsti', 'gybpri1', 'commoo3', 'categr', 'asbfly', 'brwowl1', 'marsan', 'maghor2', 'zitcis1', 'bcnher', 'woosan', 'greegr', 'grtdro1', 'comtai1', 'eaywag1', 'grejun2', 'placuc3', 'grnsan', 'eucdov', 'comkin1', 'junowl1', 'ingori1', 'emedov2', 'sttwoo1', 'rorpar', 'thbwar1', 'comsan', 'goflea1', 'indrol2', 'cohcuc1', 'blakit1', 'comgre', 'eurcoo', 'whbbul2', 'rewlap1', 'inbrob1', 'brnshr', 'rerswa1', 'plapri1', 'whiter2', 'whbwat1', 'labcro1', 'plaflo1', 'grywag', 'spodov', 'redspu1', 'spepic1', 'yebbul3', 'gargan', 'spoowl1', 'aspswi1', 'eurbla2', 'brodro1', 'rewbul', 'stbkin1', 'ashdro1', 'lobsun2', 'rossta2', 'tilwar1', 'grefla1', 'compea', 'sbeowl1', 'barfly1', 'crseag1', 'comior1', 'grenig1', 'ru

In [7]:
del raw_df,labels_all,labels_encoded,label_mapping

In [8]:
def read_audio(path: str):
    """
    Read an OGG file using torchaudio and return the waveform tensor and sample rate.

    Parameters:
        path: Path to the .ogg file

    Returns:
        waveform: Tensor representing the waveform
        sample_rate: Sample rate of the audio file
    """
    audio, sample_rate = torchaudio.load(path)
    return audio, sample_rate

In [9]:
# Regarding the data of a single audio, some audio information needs to be paid attention to, such as audio duration and number of channels.


def audio_info(audio: torch.Tensor, sample_rate: int):
    """
    Grab all information of the input audio loaded by torchaudio.

    Parameters:
        audio: Tensor representing the waveform
        sample_rate: Sample rate of the audio file

    Return:
        duration_seconds: Duration of the audio in seconds
        num_channels: Number of audio channels
    """
    # The audio duration time (seconds)
    duration_seconds = audio.shape[1] / sample_rate

    # The number of channels
    num_channels = audio.shape[0]


    return duration_seconds, num_channels

In [10]:
## do mel spectrogram transform


def mel_transform(sample_rate:float,audio:torch.Tensor,window_size: float=0.04,hop_size:float=0.02,n_mels:int=40)->torch.Tensor:
    """
    transform audio data into mel sepctrogram
    """
    # Determine window size and frame shift
    # window_size = 0.04 # 40 milliseconds
    # hop_size = 0.02 # 20 milliseconds, usually half the window size
    n_fft = int(window_size * sample_rate)  
    hop_length = int(hop_size * sample_rate) 

    # Calculate Mel Spectrogram
    # n_mels = 40 # Number of Mel filters

    # Set up Mel Spectrogram converter
    mel_transformer = MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        f_min=0,
        f_max=16000
    )

    melspec=mel_transformer(audio)

    return melspec



In [11]:
class BirdclefDataset(Dataset):
    def __init__(self,
                 encoder:LabelEncoder,
                 audio_dir:str='../../data/train_audio',
                 labels_path:str=None,
                 ):
        """
        Parameters:
            encoder: label encoder
            audio_dir: the parent path where all audio files stored
            labels_path: the file including all corresponding labels
        """

        super().__init__()
        self.encoder=encoder
        self.audio_dir=audio_dir
        # read data into dataframe
        self.labels_df=pd.read_csv(labels_path,header=0)


    def get_audio_path(self,index) -> str:
        '''
        Get the audio path of the corresponding index through the provided train metadata csv file. 
        Since there is only one index, only one path will be returned.

        Parameters:
            index: the index of labels metadata file

        Return:
            the single audio path string
        '''
        # Get the child path of audio from labels_df
        audio_child_path=self.labels_df['filename'].iloc[index]

        # concatenate parent path and child path
        return os.path.join(self.audio_dir,audio_child_path)
    

    def get_audio_label(self,index)->str:
        '''
        According to the provided index, get the corresponding label from the train metadata file

        Parameters:
            index: the index of labels metadata file
        '''

        return self.labels_df['primary_label'].iloc[index]
    

    def target_clip(self,index:int,audio:torch.Tensor,sample_rate:int, duration_seconds:float)->torch.Tensor:
        """
        calculate the index corresponding audio clip 

        information from the train metadata csv

        Parameters:
            audio: the raw audio in tensor [num_channels,length]
            sample_rate: audio sampling rate
            duration_seconds: audio duration in seconds
        """
        # Get the audio start time corresponding to index
        clip_start_time=self.labels_df['clip_start_time'].iloc[index]

        # define clip length
        segment_duration = 5 * sample_rate

        # Total number of samples in the waveform
        total_samples = audio.shape[1]

        if clip_start_time<=duration_seconds:
            clip_start_point=clip_start_time*sample_rate
            # For the last clip, the original audio may not be long enough, so we need to use a mask to fill the sequence
            # The first step is to confirm whether the length is sufficient
            # The length is sufficient, no mask is needed
            if clip_start_point+segment_duration<=total_samples:
                clip=audio[:, clip_start_point:clip_start_point + segment_duration]

            # Not long enough, a mask is needed
            else:
                padding_length = clip_start_point+segment_duration - total_samples
                silence = torch.zeros(audio.shape[0], padding_length)
                # concat the last segment of raw audio with silence
                clip=torch.cat((audio[:,clip_start_point:],silence),dim=1)

            # Calculate mean and standard deviation
            mean_vals = clip.mean(dim=1, keepdim=True)
            std_vals = clip.std(dim=1, keepdim=True)

            # standardization
            standardized_clip = (clip - mean_vals) / std_vals

            # # Map normalized data to the range 0-255
            # scaled_clip = 255 * (standardized_clip - standardized_clip.min()) / (standardized_clip.max() - standardized_clip.min())
            # final_clip = scaled_clip.round().to(torch.uint8)  # Round and convert to an 8-bit unsigned integer

                
        else:
            raise ValueError('The clip start time is out of raw audio length')
        


        return standardized_clip


    def __len__(self):
        #  return the size of the dataset by many Sampler implementations and the default options of DataLoader.
    
        return len(self.labels_df)

    def __getitem__(self, index):
        # a_list[1] -> a_list.__getitems__(1)
        # Get the path to a single audio file
        single_audio_dir=self.get_audio_path(index)
        # Get the corresponding label value
        audio_label=self.encoder.transform([self.get_audio_label(index)])[0]

        # Read audio array according to path
        audio, sr=read_audio(single_audio_dir)
        
        # Read the duration and number of channels corresponding to the audio
        duration_seconds, num_channels=audio_info(audio,sample_rate=sr)

        # Get the audio clip corresponding to index
        clip=self.target_clip(index,audio,sample_rate=sr, duration_seconds=duration_seconds)

        # mel spectrogram transformation
        mel_spec=mel_transform(sample_rate=sr,audio=clip)

        # # Calculates the minimum and maximum values ​​of a tensor
        # min_val = mel_spec.min()
        # max_val = mel_spec.max()

        # # Normalize the data to the range 0-1
        # normalized_tensor = (mel_spec - min_val) / (max_val - min_val)

        # # Scale the data in the 0-1 range to the 0-255 range
        # scaled_tensor = normalized_tensor * 255

        # # Round and convert to integer
        # final_tensor = scaled_tensor.round().to(torch.float32)

        return audio_label, mel_spec

In [12]:
BD=BirdclefDataset(encoder=encoder,labels_path=labels_path)
dataloader = DataLoader(dataset=BD, batch_size=128, shuffle=True, num_workers=0)



In [13]:
batch = next(iter(dataloader))
labels, mel_specs = batch
print(labels)
print(mel_specs)
print(mel_specs.shape)

tensor([  0, 113,  87,  32,  66,  34,  92,  11, 125, 145,  47,  52,  66,  96,
         16,  16,  98,  52, 152,  93,  33,  16,  52,  20,  36,  34,  32,  43,
        153,  33,  21, 149,  16,  87,  36,  34,  16, 128,  34,  16,  16,  31,
        159,  36,  16,  72,  34,  36, 126, 113, 139,  88,  94,  68,  52, 115,
         68,  43,  72,  34,  15,   7, 126,  27,  68,   8,  16,  11,  37,   8,
         61,   9,  30,  89,  88,  67, 145, 130,  55,  23,  93,  76, 125,   7,
         32,  45,  39,   0, 112,  68, 154,  66,  43,  93,  33, 153, 143,  34,
         92, 146,  41,  15, 141,  36,  95,  92,  30,  16, 122,  60,  72, 122,
        140,  45,  72, 154, 124, 139, 116,  72, 159,  13, 118,  45,  25,  16,
         41,   8])
tensor([[[[2.5582e-06, 7.1976e-05, 2.2219e+00,  ..., 4.1435e-08,
           3.7231e-08, 4.1788e+01],
          [2.3467e-06, 2.6229e-04, 2.5768e+00,  ..., 1.4192e-07,
           1.5844e-07, 4.5789e+01],
          [3.9678e-06, 2.9553e-04, 3.7667e+00,  ..., 3.2753e-04,
           1

In [14]:
# load pretrained model
model = timm.create_model('tf_efficientnetv2_s_in21k', pretrained=True)

  model = create_fn(


In [15]:
# The tf_efficientnetv2_s_in21k model expects the input data to be a three-channel image (usually an RGB image) by default
# So here, temporarily change ch_num to 3 to view the information of the pre-trained model
summary(model,input_size=(128,3,40,251))

Layer (type:depth-idx)                        Output Shape              Param #
EfficientNet                                  [128, 21843]              --
├─Conv2dSame: 1-1                             [128, 24, 20, 126]        648
├─BatchNormAct2d: 1-2                         [128, 24, 20, 126]        48
│    └─Identity: 2-1                          [128, 24, 20, 126]        --
│    └─SiLU: 2-2                              [128, 24, 20, 126]        --
├─Sequential: 1-3                             [128, 256, 2, 8]          --
│    └─Sequential: 2-3                        [128, 24, 20, 126]        --
│    │    └─ConvBnAct: 3-1                    [128, 24, 20, 126]        5,232
│    │    └─ConvBnAct: 3-2                    [128, 24, 20, 126]        5,232
│    └─Sequential: 2-4                        [128, 48, 10, 63]         --
│    │    └─EdgeResidual: 3-3                 [128, 48, 10, 63]         25,632
│    │    └─EdgeResidual: 3-4                 [128, 48, 10, 63]         92,640
│    

In [44]:
model.classifier.in_features

1280

In [16]:
# Modify the model to accept 1-channel input
first_conv_layer = model.conv_stem

print(first_conv_layer)

model.conv_stem = nn.Conv2d(in_channels=1, out_channels=first_conv_layer.out_channels, 
                            kernel_size=first_conv_layer.kernel_size, 
                            stride=first_conv_layer.stride, 
                            padding=first_conv_layer.padding, bias=False)

Conv2dSame(3, 24, kernel_size=(3, 3), stride=(2, 2), bias=False)


In [17]:
summary(model,input_size=(128,1,40,251))

Layer (type:depth-idx)                        Output Shape              Param #
EfficientNet                                  [128, 21843]              --
├─Conv2d: 1-1                                 [128, 24, 19, 125]        216
├─BatchNormAct2d: 1-2                         [128, 24, 19, 125]        48
│    └─Identity: 2-1                          [128, 24, 19, 125]        --
│    └─SiLU: 2-2                              [128, 24, 19, 125]        --
├─Sequential: 1-3                             [128, 256, 2, 8]          --
│    └─Sequential: 2-3                        [128, 24, 19, 125]        --
│    │    └─ConvBnAct: 3-1                    [128, 24, 19, 125]        5,232
│    │    └─ConvBnAct: 3-2                    [128, 24, 19, 125]        5,232
│    └─Sequential: 2-4                        [128, 48, 10, 63]         --
│    │    └─EdgeResidual: 3-3                 [128, 48, 10, 63]         25,632
│    │    └─EdgeResidual: 3-4                 [128, 48, 10, 63]         92,640
│    

In [18]:
# Set the model to evaluation mode
model.eval()

EfficientNet(
  (conv_stem): Conv2d(1, 24, kernel_size=(3, 3), stride=(2, 2), bias=False)
  (bn1): BatchNormAct2d(
    24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
    (drop): Identity()
    (act): SiLU(inplace=True)
  )
  (blocks): Sequential(
    (0): Sequential(
      (0): ConvBnAct(
        (conv): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNormAct2d(
          24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
          (drop): Identity()
          (act): SiLU(inplace=True)
        )
        (drop_path): Identity()
      )
      (1): ConvBnAct(
        (conv): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNormAct2d(
          24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
          (drop): Identity()
          (act): SiLU(inplace=True)
        )
        (drop_path): Identity()
      )
    )
    (1): Sequential(
      (

In [19]:
# # Extract features using the model
# features_list = []
# with torch.no_grad():
#     for inputs in dataloader:
#         # Assume your model is slightly modified for feature extraction
#         features = model(inputs[1])
#         features_list.append(features)

In [20]:
# Optionally convert the feature list into a large tensor
# features_tensor = torch.cat(features_list, dim=0)


In [21]:
# features_tensor.shape

It can be seen that if the output layer of the pre-trained model is not modified, the result is the same as the definition of the original model itself, and 21,843 categories will be returned.

But we only have 182 categories. If a small data set is used, there are only 160 categories. I need to redefine the output layer and retrain the output layer.

In [22]:
# Replace the last classification layer, the number of output features of the new layer is 160
model.classifier = nn.Linear(in_features=model.classifier.in_features, out_features=160)

In [23]:
# Next, you need to train the new output layer. You can choose to freeze the previous layers and only train this new classification layer, 
# or adjust the training settings of some layers as needed.

# Freeze the parameters of all previous layers and only train the new classification layer
for param in model.parameters():
    param.requires_grad = False

# Enable gradient updates only for the last classification layer
for param in model.classifier.parameters():
    param.requires_grad = True


In [24]:
# import torch

# def train_model(model, dataloader, criterion, optimizer, num_epochs=10):
#     model.train()  # Set model to training mode
    
#     for epoch in range(num_epochs):
#         running_loss = 0.0
#         correct_predictions = 0
        
#         for inputs, labels in dataloader:
#             inputs, labels = inputs.to(device), labels.to(device)
            
#             optimizer.zero_grad()  # Clear previous gradients
            
#             # Forward Propagation
#             outputs = model(inputs)
#             loss = criterion(outputs, labels)
            
#             # Backpropagation and Optimization
#             loss.backward()
#             optimizer.step()
            
#             # statistics
#             running_loss += loss.item()
#             _, predictions = torch.max(outputs, 1)
#             correct_predictions += (predictions == labels).sum().item()
        
#         epoch_loss = running_loss / len(dataloader.dataset)
#         epoch_acc = correct_predictions / len(dataloader.dataset)
        
#         print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}')
        
#     return model

# # Training the model
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# model = model.to(device)
# trained_model = train_model(model, train_loader, criterion, optimizer, num_epochs=10)


In [25]:
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, weight=None, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.weight = weight  # category weights
        self.reduction = reduction

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none', weight=self.weight)
        p_t = torch.exp(-ce_loss) # Modulating Factor
        loss = (1 - p_t) ** self.gamma * ce_loss

        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        else:
            return loss


In [40]:
## Use lightning to encapsulate the training steps

class effnetv2tune(L.LightningModule):
    def __init__(self,model,learning_rate=0.0001):
        super().__init__()
        self.model=model
        self.lr=learning_rate
        self.train_acc=torchmetrics.Accuracy(task='multiclass',num_classes=160)
        self.val_acc=torchmetrics.Accuracy(task='multiclass',num_classes=160)
        self.focal_loss = FocalLoss(gamma=2.0)

    def forward(self,x):
        return self.model(x)


    def training_step(self,batch,batch_idx):
        labels,features=batch
        # Send data to GPU for training
        features=features.to(self.device)
        labels=labels.to(self.device)

        outputs=self(features)
        # print(outputs)
        # print('-------------')
        # loss=F.cross_entropy(outputs,labels)
        loss = self.focal_loss(outputs, labels)

        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        acc=self.train_acc(outputs,labels)
        self.log("train_acc", acc, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return loss
        

    def validation_step(self,batch,batch_idx):

        labels,features=batch
        # Send data to GPU for training
        features=features.to(self.device)
        labels=labels.to(self.device)

        outputs=self(features)
        # loss=F.cross_entropy(outputs,labels)
        loss = self.focal_loss(outputs, labels)
        # self.log('val_loss',loss)

        # # Calculate accuracy
        # _, predictions = torch.max(outputs, 1)
        # acc = torch.tensor(torch.sum(predictions == labels).item() / len(predictions))
        # self.log('val_acc', acc, on_step=False, on_epoch=True)

        self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        acc=self.val_acc(outputs,labels)
        self.log("val_acc", acc, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.model.classifier.parameters(), lr=self.lr)
        return optimizer
    
    


In [27]:
# I introduce lightningDataModule here to distinguish trainset and valset

class ChronoNetDataModule(L.LightningDataModule):
    def __init__(self,dataset:Dataset,pred=None,batch_size:int=128):
        super().__init__()
        self.dataset=dataset
        self.batch_size=batch_size

        self.pred=pred

    def setup(self,stage:str):
        # assign train/val splits for use in dataloaders
        if stage=='fit':
            # self.train_dataset,self.val_dataset=random_split(self.dataset,[0.8,0.2],generator=torch.Generator().manual_seed(41))
            self.train_dataset,self.val_dataset=random_split(self.dataset,[0.8,0.2],generator=torch.Generator())
        # if stage=='predict':
            

    def train_dataloader(self):
        loader= DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

        return loader
    
    def val_dataloader(self):
        loader= DataLoader(self.val_dataset, batch_size=self.batch_size,shuffle=False)

        return loader

    def predict_dataloader(self):
        loader=DataLoader(self.dataset,batch_size=self.batch_size,shuffle=False)

        return loader


In [28]:
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',  # monitor val set loss
    dirpath='/Users/yiding/personal_projects/ML/github_repo/birdcief/code/feature-extractor/checkpoints/',
    filename='chrononet-{epoch:02d}-{val_loss:.2f}',
    save_top_k=1,  
    mode='min',  
    auto_insert_metric_name=False  
)

# 设置早停（EarlyStopping）
early_stop_callback = EarlyStopping(
    monitor='val_loss',  
    min_delta=0.00,
    patience=3,  
    verbose=True,
    mode='min'  
)

In [29]:
#initialize Dataset first

BD=BirdclefDataset(encoder=encoder,labels_path=labels_path)
# Previously we used a separate dataloader to feed the model
# Here we encapsulate the dataloader and use this class to read data for training

dm=ChronoNetDataModule(dataset=BD,batch_size=128)
print(dm)



<__main__.ChronoNetDataModule object at 0x2890eb0a0>


In [30]:
effnetv2tune=effnetv2tune(model=model)

trainer=L.Trainer(
    max_epochs=30,
    accelerator="gpu", # set to 'auto' or 'gpu' to use gpu if possible
    devices=1, # use all gpus if applicable like value=1 or "auto"
    default_root_dir='/Users/yiding/personal_projects/ML/github_repo/birdcief/code/feature-extractor/',
    # logger=CSVLogger(save_dir='/Users/yiding/personal_projects/ML/github_repo/birdcief/code/model-training/log/',name='chrononet')
    callbacks=[checkpoint_callback, early_stop_callback],  
)

# train the model
trainer.fit(
    model=effnetv2tune,
    datamodule=dm 
)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/opt/homebrew/Caskroom/miniforge/base/envs/birdclef/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:653: Checkpoint directory /Users/yiding/personal_projects/ML/github_repo/birdcief/code/feature-extractor/checkpoints exists and is not empty.

  | Name       | Type               | Params
--------------------------------------------------
0 | model      | EfficientNet       | 20.4 M
1 | train_acc  | MulticlassAccuracy | 0     
2 | val_acc    | MulticlassAccuracy | 0     
3 | focal_loss | FocalLoss          | 0     
--------------------------------------------------
204 K     Trainable params
20.2 M    Non-trainable params
20.4 M    Total params
81.528    Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/opt/homebrew/Caskroom/miniforge/base/envs/birdclef/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


                                                                           

/opt/homebrew/Caskroom/miniforge/base/envs/birdclef/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/opt/homebrew/Caskroom/miniforge/base/envs/birdclef/lib/python3.10/site-packages/lightning/pytorch/loops/fit_loop.py:298: The number of training batches (14) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 0: 100%|██████████| 14/14 [03:29<00:00,  0.07it/s, v_num=11, train_loss_step=6.85e+4, train_acc_step=0.0256, val_loss_step=7.06e+4, val_acc_step=0.0196, val_loss_epoch=7.25e+4, val_acc_epoch=0.0115, train_loss_epoch=8.75e+4, train_acc_epoch=0.00976]

Metric val_loss improved. New best score: 72491.164


Epoch 1: 100%|██████████| 14/14 [03:25<00:00,  0.07it/s, v_num=11, train_loss_step=8.25e+4, train_acc_step=0.0385, val_loss_step=6.21e+4, val_acc_step=0.0392, val_loss_epoch=6.54e+4, val_acc_epoch=0.0276, train_loss_epoch=6.53e+4, train_acc_epoch=0.023]  

Metric val_loss improved by 7056.953 >= min_delta = 0.0. New best score: 65434.211


Epoch 2: 100%|██████████| 14/14 [03:26<00:00,  0.07it/s, v_num=11, train_loss_step=5.7e+4, train_acc_step=0.0128, val_loss_step=6.01e+4, val_acc_step=0.0588, val_loss_epoch=6.33e+4, val_acc_epoch=0.0299, train_loss_epoch=5.81e+4, train_acc_epoch=0.0344]

Metric val_loss improved by 2156.781 >= min_delta = 0.0. New best score: 63277.430


Epoch 3: 100%|██████████| 14/14 [03:30<00:00,  0.07it/s, v_num=11, train_loss_step=5.62e+4, train_acc_step=0.0256, val_loss_step=5.97e+4, val_acc_step=0.0588, val_loss_epoch=6.17e+4, val_acc_epoch=0.0276, train_loss_epoch=5.18e+4, train_acc_epoch=0.0339]

Metric val_loss improved by 1610.840 >= min_delta = 0.0. New best score: 61666.590


Epoch 4: 100%|██████████| 14/14 [03:27<00:00,  0.07it/s, v_num=11, train_loss_step=4.85e+4, train_acc_step=0.000, val_loss_step=5.85e+4, val_acc_step=0.0588, val_loss_epoch=6.01e+4, val_acc_epoch=0.0276, train_loss_epoch=4.66e+4, train_acc_epoch=0.0396] 

Metric val_loss improved by 1530.258 >= min_delta = 0.0. New best score: 60136.332


Epoch 5: 100%|██████████| 14/14 [03:27<00:00,  0.07it/s, v_num=11, train_loss_step=4.15e+4, train_acc_step=0.0513, val_loss_step=5.88e+4, val_acc_step=0.0588, val_loss_epoch=5.94e+4, val_acc_epoch=0.0276, train_loss_epoch=4.2e+4, train_acc_epoch=0.0499] 

Metric val_loss improved by 768.602 >= min_delta = 0.0. New best score: 59367.730


Epoch 6: 100%|██████████| 14/14 [03:27<00:00,  0.07it/s, v_num=11, train_loss_step=3.46e+4, train_acc_step=0.0897, val_loss_step=5.83e+4, val_acc_step=0.0784, val_loss_epoch=5.84e+4, val_acc_epoch=0.0299, train_loss_epoch=3.78e+4, train_acc_epoch=0.0643]

Metric val_loss improved by 934.332 >= min_delta = 0.0. New best score: 58433.398


Epoch 7: 100%|██████████| 14/14 [03:27<00:00,  0.07it/s, v_num=11, train_loss_step=2.83e+4, train_acc_step=0.0513, val_loss_step=5.83e+4, val_acc_step=0.0784, val_loss_epoch=5.78e+4, val_acc_epoch=0.0299, train_loss_epoch=3.42e+4, train_acc_epoch=0.0723]

Metric val_loss improved by 624.707 >= min_delta = 0.0. New best score: 57808.691


Epoch 8: 100%|██████████| 14/14 [03:34<00:00,  0.07it/s, v_num=11, train_loss_step=2.39e+4, train_acc_step=0.141, val_loss_step=5.85e+4, val_acc_step=0.0588, val_loss_epoch=5.75e+4, val_acc_epoch=0.0253, train_loss_epoch=3.07e+4, train_acc_epoch=0.0913] 

Metric val_loss improved by 322.035 >= min_delta = 0.0. New best score: 57486.656


Epoch 9: 100%|██████████| 14/14 [03:28<00:00,  0.07it/s, v_num=11, train_loss_step=3.27e+4, train_acc_step=0.115, val_loss_step=5.77e+4, val_acc_step=0.0588, val_loss_epoch=5.67e+4, val_acc_epoch=0.023, train_loss_epoch=2.77e+4, train_acc_epoch=0.115]   

Metric val_loss improved by 813.547 >= min_delta = 0.0. New best score: 56673.109


Epoch 10: 100%|██████████| 14/14 [03:28<00:00,  0.07it/s, v_num=11, train_loss_step=2.39e+4, train_acc_step=0.0897, val_loss_step=5.82e+4, val_acc_step=0.0588, val_loss_epoch=5.64e+4, val_acc_epoch=0.0253, train_loss_epoch=2.49e+4, train_acc_epoch=0.130]

Metric val_loss improved by 232.277 >= min_delta = 0.0. New best score: 56440.832


Epoch 11: 100%|██████████| 14/14 [03:29<00:00,  0.07it/s, v_num=11, train_loss_step=2.71e+4, train_acc_step=0.128, val_loss_step=5.71e+4, val_acc_step=0.0588, val_loss_epoch=5.58e+4, val_acc_epoch=0.0276, train_loss_epoch=2.24e+4, train_acc_epoch=0.148] 

Metric val_loss improved by 602.281 >= min_delta = 0.0. New best score: 55838.551


Epoch 12: 100%|██████████| 14/14 [03:29<00:00,  0.07it/s, v_num=11, train_loss_step=2.07e+4, train_acc_step=0.154, val_loss_step=5.63e+4, val_acc_step=0.0588, val_loss_epoch=5.55e+4, val_acc_epoch=0.0253, train_loss_epoch=2.02e+4, train_acc_epoch=0.177]

Metric val_loss improved by 314.137 >= min_delta = 0.0. New best score: 55524.414


Epoch 13: 100%|██████████| 14/14 [03:30<00:00,  0.07it/s, v_num=11, train_loss_step=1.52e+4, train_acc_step=0.218, val_loss_step=5.61e+4, val_acc_step=0.0588, val_loss_epoch=5.49e+4, val_acc_epoch=0.0253, train_loss_epoch=1.82e+4, train_acc_epoch=0.207]

Metric val_loss improved by 626.926 >= min_delta = 0.0. New best score: 54897.488


Epoch 14: 100%|██████████| 14/14 [03:28<00:00,  0.07it/s, v_num=11, train_loss_step=2.13e+4, train_acc_step=0.0897, val_loss_step=5.58e+4, val_acc_step=0.0588, val_loss_epoch=5.48e+4, val_acc_epoch=0.0299, train_loss_epoch=1.66e+4, train_acc_epoch=0.222]

Metric val_loss improved by 93.867 >= min_delta = 0.0. New best score: 54803.621


Epoch 15: 100%|██████████| 14/14 [03:29<00:00,  0.07it/s, v_num=11, train_loss_step=1.61e+4, train_acc_step=0.244, val_loss_step=5.59e+4, val_acc_step=0.0588, val_loss_epoch=5.44e+4, val_acc_epoch=0.0253, train_loss_epoch=1.51e+4, train_acc_epoch=0.251] 

Metric val_loss improved by 437.746 >= min_delta = 0.0. New best score: 54365.875


Epoch 16: 100%|██████████| 14/14 [03:26<00:00,  0.07it/s, v_num=11, train_loss_step=1.43e+4, train_acc_step=0.295, val_loss_step=5.54e+4, val_acc_step=0.0588, val_loss_epoch=5.41e+4, val_acc_epoch=0.0276, train_loss_epoch=1.37e+4, train_acc_epoch=0.270]

Metric val_loss improved by 253.598 >= min_delta = 0.0. New best score: 54112.277


Epoch 17: 100%|██████████| 14/14 [03:25<00:00,  0.07it/s, v_num=11, train_loss_step=9.95e+3, train_acc_step=0.295, val_loss_step=5.52e+4, val_acc_step=0.0588, val_loss_epoch=5.36e+4, val_acc_epoch=0.0276, train_loss_epoch=1.25e+4, train_acc_epoch=0.298]

Metric val_loss improved by 478.336 >= min_delta = 0.0. New best score: 53633.941


Epoch 18: 100%|██████████| 14/14 [03:23<00:00,  0.07it/s, v_num=11, train_loss_step=9.88e+3, train_acc_step=0.346, val_loss_step=5.48e+4, val_acc_step=0.0588, val_loss_epoch=5.34e+4, val_acc_epoch=0.0299, train_loss_epoch=1.15e+4, train_acc_epoch=0.321]

Metric val_loss improved by 270.883 >= min_delta = 0.0. New best score: 53363.059


Epoch 20: 100%|██████████| 14/14 [03:24<00:00,  0.07it/s, v_num=11, train_loss_step=1.05e+4, train_acc_step=0.385, val_loss_step=5.47e+4, val_acc_step=0.0588, val_loss_epoch=5.28e+4, val_acc_epoch=0.0299, train_loss_epoch=9.71e+3, train_acc_epoch=0.362]

Metric val_loss improved by 542.020 >= min_delta = 0.0. New best score: 52821.039


Epoch 21: 100%|██████████| 14/14 [03:25<00:00,  0.07it/s, v_num=11, train_loss_step=1.27e+4, train_acc_step=0.321, val_loss_step=5.47e+4, val_acc_step=0.0588, val_loss_epoch=5.28e+4, val_acc_epoch=0.0345, train_loss_epoch=8.94e+3, train_acc_epoch=0.374]

Metric val_loss improved by 33.473 >= min_delta = 0.0. New best score: 52787.566


Epoch 22: 100%|██████████| 14/14 [03:26<00:00,  0.07it/s, v_num=11, train_loss_step=7.06e+3, train_acc_step=0.423, val_loss_step=5.43e+4, val_acc_step=0.0588, val_loss_epoch=5.25e+4, val_acc_epoch=0.0276, train_loss_epoch=8.2e+3, train_acc_epoch=0.400] 

Metric val_loss improved by 282.832 >= min_delta = 0.0. New best score: 52504.734


Epoch 23: 100%|██████████| 14/14 [03:25<00:00,  0.07it/s, v_num=11, train_loss_step=1.01e+4, train_acc_step=0.321, val_loss_step=5.52e+4, val_acc_step=0.0588, val_loss_epoch=5.25e+4, val_acc_epoch=0.0322, train_loss_epoch=7.64e+3, train_acc_epoch=0.409]

Metric val_loss improved by 8.082 >= min_delta = 0.0. New best score: 52496.652


Epoch 24: 100%|██████████| 14/14 [03:27<00:00,  0.07it/s, v_num=11, train_loss_step=6.56e+3, train_acc_step=0.487, val_loss_step=5.56e+4, val_acc_step=0.0588, val_loss_epoch=5.23e+4, val_acc_epoch=0.0345, train_loss_epoch=7.04e+3, train_acc_epoch=0.427]

Metric val_loss improved by 212.453 >= min_delta = 0.0. New best score: 52284.199


Epoch 25: 100%|██████████| 14/14 [03:27<00:00,  0.07it/s, v_num=11, train_loss_step=6.09e+3, train_acc_step=0.423, val_loss_step=5.47e+4, val_acc_step=0.0588, val_loss_epoch=5.23e+4, val_acc_epoch=0.0322, train_loss_epoch=6.45e+3, train_acc_epoch=0.445]

Metric val_loss improved by 3.973 >= min_delta = 0.0. New best score: 52280.227


Epoch 26: 100%|██████████| 14/14 [03:26<00:00,  0.07it/s, v_num=11, train_loss_step=4.64e+3, train_acc_step=0.538, val_loss_step=5.48e+4, val_acc_step=0.0588, val_loss_epoch=5.21e+4, val_acc_epoch=0.0299, train_loss_epoch=5.96e+3, train_acc_epoch=0.448]

Metric val_loss improved by 164.180 >= min_delta = 0.0. New best score: 52116.047


Epoch 27: 100%|██████████| 14/14 [03:29<00:00,  0.07it/s, v_num=11, train_loss_step=3.89e+3, train_acc_step=0.500, val_loss_step=5.48e+4, val_acc_step=0.0784, val_loss_epoch=5.2e+4, val_acc_epoch=0.0345, train_loss_epoch=5.5e+3, train_acc_epoch=0.473]  

Metric val_loss improved by 120.590 >= min_delta = 0.0. New best score: 51995.457


Epoch 29: 100%|██████████| 14/14 [03:28<00:00,  0.07it/s, v_num=11, train_loss_step=3.65e+3, train_acc_step=0.526, val_loss_step=5.5e+4, val_acc_step=0.0784, val_loss_epoch=5.21e+4, val_acc_epoch=0.0322, train_loss_epoch=4.7e+3, train_acc_epoch=0.513]  

`Trainer.fit` stopped: `max_epochs=30` reached.


Epoch 29: 100%|██████████| 14/14 [03:28<00:00,  0.07it/s, v_num=11, train_loss_step=3.65e+3, train_acc_step=0.526, val_loss_step=5.5e+4, val_acc_step=0.0784, val_loss_epoch=5.21e+4, val_acc_epoch=0.0322, train_loss_epoch=4.7e+3, train_acc_epoch=0.513]


In [31]:
# You can iterate over all parameters of the model and print their names and requires_grad status. This will help you verify whether the freezing and updating status are set correctly.

for name, param in model.named_parameters():
    print(f"{name} -> requires_grad: {param.requires_grad}")


conv_stem.weight -> requires_grad: False
bn1.weight -> requires_grad: False
bn1.bias -> requires_grad: False
blocks.0.0.conv.weight -> requires_grad: False
blocks.0.0.bn1.weight -> requires_grad: False
blocks.0.0.bn1.bias -> requires_grad: False
blocks.0.1.conv.weight -> requires_grad: False
blocks.0.1.bn1.weight -> requires_grad: False
blocks.0.1.bn1.bias -> requires_grad: False
blocks.1.0.conv_exp.weight -> requires_grad: False
blocks.1.0.bn1.weight -> requires_grad: False
blocks.1.0.bn1.bias -> requires_grad: False
blocks.1.0.conv_pwl.weight -> requires_grad: False
blocks.1.0.bn2.weight -> requires_grad: False
blocks.1.0.bn2.bias -> requires_grad: False
blocks.1.1.conv_exp.weight -> requires_grad: False
blocks.1.1.bn1.weight -> requires_grad: False
blocks.1.1.bn1.bias -> requires_grad: False
blocks.1.1.conv_pwl.weight -> requires_grad: False
blocks.1.1.bn2.weight -> requires_grad: False
blocks.1.1.bn2.bias -> requires_grad: False
blocks.1.2.conv_exp.weight -> requires_grad: False
bl

In [32]:
# If you are only interested in certain layers (such as classification layers), you can check the parameter status of these layers specifically:

# Assuming the classifier layer is named 'classifier'
for name, param in model.classifier.named_parameters():
    print(f"{name} in classifier -> requires_grad: {param.requires_grad}")


weight in classifier -> requires_grad: True
bias in classifier -> requires_grad: True


In [33]:
summary(model,input_size=(128,1,40,251))

Layer (type:depth-idx)                        Output Shape              Param #
EfficientNet                                  [128, 160]                --
├─Conv2d: 1-1                                 [128, 24, 19, 125]        (216)
├─BatchNormAct2d: 1-2                         [128, 24, 19, 125]        48
│    └─Identity: 2-1                          [128, 24, 19, 125]        --
│    └─SiLU: 2-2                              [128, 24, 19, 125]        --
├─Sequential: 1-3                             [128, 256, 2, 8]          --
│    └─Sequential: 2-3                        [128, 24, 19, 125]        --
│    │    └─ConvBnAct: 3-1                    [128, 24, 19, 125]        (5,232)
│    │    └─ConvBnAct: 3-2                    [128, 24, 19, 125]        (5,232)
│    └─Sequential: 2-4                        [128, 48, 10, 63]         --
│    │    └─EdgeResidual: 3-3                 [128, 48, 10, 63]         (25,632)
│    │    └─EdgeResidual: 3-4                 [128, 48, 10, 63]         (92,