corresponded information in 11.1-organize-in-torchDataset.ipynb

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
import random
from typing import List
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB

import torchaudio

import torch

from torch.utils.data import Dataset, DataLoader,TensorDataset,random_split

import lightning as L

from sklearn.preprocessing import LabelEncoder

import torch.nn as nn
import torch
import torch.nn.functional as F


import torchmetrics

from lightning.pytorch.callbacks  import ModelCheckpoint, EarlyStopping

In [2]:
has_mps = torch.backends.mps.is_built()
device = "mps" if has_mps else "cuda" if torch.cuda.is_available() else "cpu"
print(device)

mps


In [3]:
labels_path='../../data/train_metadata_new_tiny.csv'

In [4]:
# initialize label encoder

encoder=LabelEncoder()

In [5]:
raw_df=pd.read_csv(labels_path,header=0)

labels_all=raw_df.primary_label.unique().tolist()

print(labels_all)
print(len(labels_all))

labels_encoded=encoder.fit_transform(labels_all)

print(labels_encoded)

# If needed, you can view the mapping of original labels to encodings
label_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print(label_mapping)

['lblwar1', 'blrwar1', 'bkskit1', 'comros', 'houcro1', 'houspa', 'graher1', 'blhori1', 'mawthr1', 'grewar3', 'hoopoe', 'indpit1', 'litspi1', 'wemhar1', 'laudov1', 'litgre1', 'rocpig', 'grecou1', 'whbwoo2', 'barswa', 'gyhcaf1', 'purher1', 'litegr', 'commyn', 'lirplo', 'putbab1', 'cregos1', 'bkwsti', 'gybpri1', 'commoo3', 'categr', 'asbfly', 'brwowl1', 'marsan', 'maghor2', 'zitcis1', 'bcnher', 'woosan', 'greegr', 'grtdro1', 'comtai1', 'eaywag1', 'grejun2', 'placuc3', 'grnsan', 'eucdov', 'comkin1', 'junowl1', 'ingori1', 'emedov2', 'sttwoo1', 'rorpar', 'thbwar1', 'comsan', 'goflea1', 'indrol2', 'cohcuc1', 'blakit1', 'comgre', 'eurcoo', 'whbbul2', 'rewlap1', 'inbrob1', 'brnshr', 'rerswa1', 'plapri1', 'whiter2', 'whbwat1', 'labcro1', 'plaflo1', 'grywag', 'spodov', 'redspu1', 'spepic1', 'yebbul3', 'gargan', 'spoowl1', 'aspswi1', 'eurbla2', 'brodro1', 'rewbul', 'stbkin1', 'ashdro1', 'lobsun2', 'rossta2', 'tilwar1', 'grefla1', 'compea', 'sbeowl1', 'barfly1', 'crseag1', 'comior1', 'grenig1', 'ru

In [6]:
del raw_df,labels_all,labels_encoded,label_mapping

In [7]:
def read_audio(path: str):
    """
    Read an OGG file using torchaudio and return the waveform tensor and sample rate.

    Parameters:
        path: Path to the .ogg file

    Returns:
        waveform: Tensor representing the waveform
        sample_rate: Sample rate of the audio file
    """
    audio, sample_rate = torchaudio.load(path)
    return audio, sample_rate

In [8]:
# Regarding the data of a single audio, some audio information needs to be paid attention to, such as audio duration and number of channels.


def audio_info(audio: torch.Tensor, sample_rate: int):
    """
    Grab all information of the input audio loaded by torchaudio.

    Parameters:
        audio: Tensor representing the waveform
        sample_rate: Sample rate of the audio file

    Return:
        duration_seconds: Duration of the audio in seconds
        num_channels: Number of audio channels
    """
    # The audio duration time (seconds)
    duration_seconds = audio.shape[1] / sample_rate

    # The number of channels
    num_channels = audio.shape[0]


    return duration_seconds, num_channels

In [9]:
## Convert audio data into mel spectrogram


def mel_transform(sample_rate:float,audio:torch.Tensor,window_size: float=0.04,hop_size:float=0.02,n_mels:int=40)->torch.Tensor:
    """
    transform audio data into mel sepctrogram
    """
    # Determine window size and frame shift
    # window_size = 0.04 # 40 milliseconds
    # hop_size = 0.02 # 20 milliseconds, usually half the window size
    n_fft = int(window_size * sample_rate)  # Convert the window size to the number of sampling points
    hop_length = int(hop_size * sample_rate)  # Convert frame shift to sampling point number

    # Calculate Mel Spectrogram
    # n_mels = 40 # Number of Mel filters

    # Set up Mel Spectrogram converter
    mel_transformer = MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        f_min=0,
        f_max=16000
    )

    melspec=mel_transformer(audio)

    return melspec



In [10]:
class BirdclefDataset(Dataset):
    def __init__(self,
                 encoder:LabelEncoder,
                 audio_dir:str='../../data/train_audio',
                 labels_path:str=None,
                 ):
        """
        Parameters:
            encoder: label encoder
            audio_dir: the parent path where all audio files stored
            labels_path: the file including all corresponding labels
        """

        super().__init__()
        self.encoder=encoder
        self.audio_dir=audio_dir
        # read data into dataframe
        self.labels_df=pd.read_csv(labels_path,header=0)


    def get_audio_path(self,index) -> str:
        '''
        Get the audio path of the corresponding index through the provided train metadata csv file. 
        Since there is only one index, only one path will be returned.

        Parameters:
            index: the index of labels metadata file

        Return:
            the single audio path string
        '''
        # Get the child path of audio from labels_df
        audio_child_path=self.labels_df['filename'].iloc[index]

        # concatenate parent path and child path
        return os.path.join(self.audio_dir,audio_child_path)
    

    def get_audio_label(self,index)->str:
        '''
        According to the provided index, get the corresponding label from the train metadata file

        Parameters:
            index: the index of labels metadata file
        '''

        return self.labels_df['primary_label'].iloc[index]
    

    def target_clip(self,index:int,audio:torch.Tensor,sample_rate:int, duration_seconds:float)->torch.Tensor:
        """
        calculate the index corresponding audio clip 

        information from the train metadata csv

        Parameters:
            audio: the raw audio in tensor [num_channels,length]
            sample_rate: audio sampling rate
            duration_seconds: audio duration in seconds
        """
        # Get the audio start time corresponding to index
        clip_start_time=self.labels_df['clip_start_time'].iloc[index]

        #define clip length
        segment_duration = 5 * sample_rate

        # Total number of samples in the waveform
        total_samples = audio.shape[1]

        if clip_start_time<=duration_seconds:
            clip_start_point=clip_start_time*sample_rate
            # For the last clip, the original audio may not be long enough, so we need to use a mask to fill the sequence
            # The first step is to confirm whether the length is sufficient
            # The length is sufficient, no mask is needed
            if clip_start_point+segment_duration<=total_samples:
                clip=audio[:, clip_start_point:clip_start_point + segment_duration]

            # Not long enough, a mask is needed
            else:
                padding_length = clip_start_point+segment_duration - total_samples
                silence = torch.zeros(audio.shape[0], padding_length)
                # concat the last segment of raw audio with silence
                clip=torch.cat((audio[:,clip_start_point:],silence),dim=1)

            # Calculate mean and standard deviation
            mean_vals = clip.mean(dim=1, keepdim=True)
            std_vals = clip.std(dim=1, keepdim=True)

            # Standardization
            standardized_clip = (clip - mean_vals) / std_vals

                
        else:
            raise ValueError('The clip start time is out of raw audio length')
        


        return standardized_clip


    def __len__(self):
        #  return the size of the dataset by many Sampler implementations and the default options of DataLoader.
    
        return len(self.labels_df)

    def __getitem__(self, index):
        # a_list[1] -> a_list.__getitems__(1)
        # Get the path of a single audio file
        single_audio_dir=self.get_audio_path(index)
        # Get the corresponding label value
        audio_label=self.encoder.transform([self.get_audio_label(index)])[0]

        # Read the audio array according to the path
        audio, sr=read_audio(single_audio_dir)
        
        # Read the duration and number of channels corresponding to the audio
        duration_seconds, num_channels=audio_info(audio,sample_rate=sr)

        # Get the audio clip corresponding to index
        clip=self.target_clip(index,audio,sample_rate=sr, duration_seconds=duration_seconds)

        # mel spectrogram transformation
        mel_spec=mel_transform(sample_rate=sr,audio=clip)

        return audio_label, mel_spec

In [11]:
BD=BirdclefDataset(encoder=encoder,labels_path=labels_path)

dataloader = DataLoader(dataset=BD, batch_size=128, shuffle=True, num_workers=0)

batch = next(iter(dataloader))
labels, mel_specs = batch
print(labels)
print(mel_specs)
print(mel_specs.shape)
print(len(dataloader))

tensor([ 47, 134,  72,  45,  52,   1,  30,  31,  55, 125, 119,   7,  70,   0,
        126,   1, 127,  16,  92,  49, 149, 126,   7,  80,   7,  72,  21,  70,
         51,  73,  92,  34,  14,  13, 159, 126, 147, 104, 124,   0,  87, 122,
         52, 126,  41,   7,  15,   8,   8,  94,  16,  34,  46,  38,  60,  16,
         16,  72, 154, 119,  34, 158,  72,  10,  38,  10, 148,  36,  61,  94,
        125,  27,  34,  47, 126,  32,  67,  47,  45,   7,  11,  60,   8,  66,
         61,  96,  11,  13,  10, 137,  67,  20,  70,  61,  72,  78, 146, 123,
         34,  16,   2,  16,  36,  72,  45, 128,  70, 137,  47,   7,  30,  15,
         43,  16,  45,  73,   7,  12,  16,  92,  36,  36, 100,  89,  20,  36,
        150, 113])
tensor([[[[2.2131e+01, 3.2052e+00, 2.1518e+00,  ..., 6.9497e-01,
           2.4243e+00, 7.2464e+00],
          [3.6364e+01, 2.7217e+01, 5.2286e+01,  ..., 5.4511e+00,
           9.3516e+00, 1.8424e+01],
          [7.4918e+01, 5.9747e+01, 1.2876e+02,  ..., 1.5926e+01,
           1

In [12]:
class ConvBlock1(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1=nn.Conv2d(in_channels=1,out_channels=32,kernel_size=(2,2),stride=2,padding=0)
        self.conv2=nn.Conv2d(in_channels=1,out_channels=32,kernel_size=(4,4),stride=2,padding=1)
        self.conv3=nn.Conv2d(in_channels=1,out_channels=32,kernel_size=(8,8),stride=2,padding=3)
    def forward(self,x):
        x1=self.conv1(x)
        x2=self.conv2(x)
        x3=self.conv3(x)
        # The length of the input data shape is 32000, and the stride is 2, so after conv1d, the length becomes 16000
        # The number of output channels of each conv1d layer is 32, so for the shape of the entire output, regardless of batchsize, it is 32*16000
        # Because of the chrononet architecture, we need to connect the outputs of the three layers to become 96*16000 output data.
        x=torch.cat((x1,x2,x3),dim=1)

        return x

In [13]:
model =ConvBlock1()

output_test_1=model(mel_specs)

print(output_test_1.shape)
print(output_test_1)

torch.Size([128, 96, 20, 125])
tensor([[[[ 9.5496e+00,  1.1246e+01,  5.3699e+00,  ...,  4.1395e+00,
            2.5180e+00,  1.5766e+00],
          [ 6.6051e+01,  7.6504e+00,  3.6524e+01,  ...,  2.9806e+01,
            4.5469e+01,  1.3541e+01],
          [ 2.8718e+01, -8.4494e+00, -6.9029e+01,  ...,  6.6813e+01,
           -4.9247e+00, -3.7171e+01],
          ...,
          [-2.4250e+00, -2.0166e+00, -6.0259e+00,  ..., -2.1248e-01,
           -1.1884e+01, -7.1522e+00],
          [-2.7115e+00, -1.3597e+01, -1.2595e+00,  ..., -1.1967e+01,
           -1.8934e+01, -6.2785e+00],
          [-2.8084e+01, -1.3915e+01, -1.2219e+01,  ..., -1.0935e+01,
           -9.3906e+00, -1.3192e+00]],

         [[-2.7752e+01, -2.8979e+01, -1.3609e+01,  ..., -1.0738e+01,
           -6.8813e+00, -6.5438e+00],
          [-2.2020e+02, -8.3683e+01, -1.1905e+02,  ..., -1.1551e+02,
           -1.2815e+02, -1.9631e+02],
          [-3.2874e+02, -3.0995e+02, -3.2455e+02,  ..., -3.6275e+02,
           -3.7750e+02, -1.

In [14]:
class ConvBlock2(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1=nn.Conv2d(in_channels=96,out_channels=32,kernel_size=(2,2),stride=2,padding=0)
        self.conv2=nn.Conv2d(in_channels=96,out_channels=32,kernel_size=(4,4),stride=2,padding=1)
        self.conv3=nn.Conv2d(in_channels=96,out_channels=32,kernel_size=(8,8),stride=2,padding=3)

    def forward(self,x):
        x1=self.conv1(x)
        x2=self.conv2(x)
        x3=self.conv3(x)
        # From the output of ConvBlock1, we know that the input shape of convBlock2 is 96*16000
        # After the calculation of this block, the output will become 96*8000
        x=torch.cat((x1,x2,x3),dim=1)

        return x

In [15]:
## check output shape

model =ConvBlock2()

output_test_2=model(output_test_1)

print(output_test_2.shape)
print(output_test_2)

torch.Size([128, 96, 10, 62])
tensor([[[[ 3.9639e+01,  7.8152e+00,  3.3213e+01,  ...,  8.4826e+01,
            5.7910e+01,  4.0655e+01],
          [ 5.3729e+02,  7.6089e+02,  1.0988e+03,  ...,  4.6021e+02,
            4.0823e+02,  5.1080e+02],
          [-4.6800e+02, -6.8036e+01,  2.7977e+02,  ...,  1.4788e+02,
           -1.9590e+02,  1.7623e+02],
          ...,
          [ 7.8117e+00,  1.3898e+01,  1.8326e+01,  ...,  1.3463e+01,
            3.5881e+00,  2.2058e+00],
          [ 2.8225e+01,  1.5488e+01,  2.2156e+01,  ...,  2.1408e+01,
            5.3735e+00,  1.3720e+01],
          [ 2.2280e+01,  2.2439e+01,  1.5581e+01,  ...,  1.6920e+01,
            1.4210e+01,  1.8780e+01]],

         [[ 4.3399e+01, -6.4069e+00,  5.5218e+01,  ...,  1.7421e+01,
            2.6955e+00, -1.5655e+01],
          [ 8.1665e+02,  5.1402e+02,  3.7353e+02,  ...,  6.0700e+02,
            4.5032e+02,  3.2020e+02],
          [ 1.0027e+02,  6.3871e+01, -3.7434e+02,  ..., -1.4297e+02,
           -1.1242e+02,  2.2

In [16]:
class ChronoNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.block1=ConvBlock1()
        self.block2=ConvBlock2()
        # input_size=ch_num*height
        self.gru1=nn.GRU(input_size=96*10,hidden_size=32,num_layers=1,batch_first=True)

    def forward(self,x):
        x=self.block1(x)
        x=self.block2(x)
        # Because the input shape required by gru is (batch_size, sequence length, feature_size)
        # But the result of the previous conversion calculation is (batchsize, channel_num, width, length)
        # I need to change the shape
        # (batch_size,width,ch_num*height)
        batch_size,ch_num,height,width=x.size()
        # x=x.permute(0,2,1,3).reshape(batch_size,width,ch_num*height)
        x=x.reshape(batch_size,width,ch_num*height)
        gru_out1,_=self.gru1(x)

        return gru_out1,_
        
        
# The above is used to view the output after adding a gru
# Next I need to add more layers

In [17]:
model=ChronoNet()
output_test_4,_=model(mel_specs)

print(output_test_4.shape)

print(_.shape)

torch.Size([128, 62, 32])
torch.Size([1, 128, 32])


In [18]:
class ChronoNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.block1=ConvBlock1()
        self.block2=ConvBlock2()
        # input_size=ch_num*height
        self.gru1=nn.GRU(input_size=96*10,hidden_size=32,num_layers=1,batch_first=True)
        self.gru2=nn.GRU(input_size=32,hidden_size=32,num_layers=1,batch_first=True)


    def forward(self,x):
        x=self.block1(x)
        x=self.block2(x)
        # Because the input shape required by gru is (batch_size, sequence length, feature_size)
        # But the result of the previous conversion calculation is (batchsize, channel_num, width, length)
        # I need to change the shape
        # (batch_size, width, ch_num*height)
        batch_size,ch_num,height,width=x.size()
        # x=x.permute(0,2,1,3).reshape(batch_size,width,ch_num*height)
        x=x.reshape(batch_size,width,ch_num*height)
        gru_out1,_=self.gru1(x)
        gru_out2,_=self.gru2(gru_out1)
        # According to the chrononet architecture, we need to connect the calculations of the two layers of GRU according to the feature-size dimension
        x=torch.cat((gru_out1,gru_out2),dim=2)

        return x

In [19]:
model=ChronoNet()
output_test_5=model(mel_specs)

print(output_test_5.shape)

torch.Size([128, 62, 64])


In [20]:
class ChronoNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.block1=ConvBlock1()
        self.block2=ConvBlock2()
        # input_size=ch_num*height
        self.gru1=nn.GRU(input_size=96*10,hidden_size=32,num_layers=1,batch_first=True)
        self.gru2=nn.GRU(input_size=32,hidden_size=32,num_layers=1,batch_first=True)
        self.gru3=nn.GRU(input_size=64,hidden_size=32,num_layers=1,batch_first=True)


    def forward(self,x):
        x=self.block1(x)
        x=self.block2(x)
        # Because the input shape required by gru is (batch_size, sequence length, feature_size)
        # But the result of the previous conversion calculation is (batchsize, channel_num, width, length)
        # I need to change the shape
        # (batch_size,width,ch_num*height)
        batch_size,ch_num,height,width=x.size()
        # x=x.permute(0,2,1,3).reshape(batch_size,width,ch_num*height)
        x=x.reshape(batch_size,width,ch_num*height)
        gru_out1,_=self.gru1(x)
        gru_out2,_=self.gru2(gru_out1)
        # According to the chrononet architecture, we need to connect the calculations of the two layers of GRU according to the feature-size dimension
        x=torch.cat((gru_out1,gru_out2),dim=2)
        gru_out3,_=self.gru3(x)
        x=torch.cat((gru_out1,gru_out2,gru_out3),dim=2)

        return x

In [21]:
model=ChronoNet()
output_test_6=model(mel_specs)

print(output_test_6.shape)

torch.Size([128, 62, 96])


In [22]:
class ChronoNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.block1=ConvBlock1()
        self.block2=ConvBlock2()
        # input_size=ch_num*height
        self.gru1=nn.GRU(input_size=96*10,hidden_size=32,num_layers=1,batch_first=True)
        self.gru2=nn.GRU(input_size=32,hidden_size=32,num_layers=1,batch_first=True)
        self.gru3=nn.GRU(input_size=64,hidden_size=32,num_layers=1,batch_first=True)
        self.gru4=nn.GRU(input_size=96,hidden_size=32,num_layers=1,batch_first=True)


    def forward(self,x):
        x=self.block1(x)
        x=self.block2(x)
        # Because the input shape required by gru is (batch_size, sequence length, feature_size)
        # But the result of the previous conversion calculation is (batchsize, channel_num, width, length)
        # I need to change the shape
        # (batch_size,width,ch_num*height)
        batch_size,ch_num,height,width=x.size()
        # x=x.permute(0,2,1,3).reshape(batch_size,width,ch_num*height)
        x=x.reshape(batch_size,width,ch_num*height)
        gru_out1,_=self.gru1(x)
        gru_out2,_=self.gru2(gru_out1)
        # According to the chrononet architecture, we need to connect the calculations of the two layers of GRU according to the feature-size dimension
        x=torch.cat((gru_out1,gru_out2),dim=2)
        gru_out3,_=self.gru3(x)
        x=torch.cat((gru_out1,gru_out2,gru_out3),dim=2)
        gru_out4,_=self.gru4(x)

        return gru_out4

In [23]:
model=ChronoNet()
output_test_7=model(mel_specs)

print(output_test_7.shape)

torch.Size([128, 62, 32])


In [24]:
# Next, integrate the steps of building the model above, and add activation functions and regularization layers

# Because we need to train the model first, we need to calculate the loss
# For multi-classification problems, if you choose to use nn.crossentropylss, you need to remove F.softmax(),
# Because this loss function already combines Log-Softmax and NLL Loss (Negative Log Likelihood Loss).

class ChronoNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.block1=ConvBlock1()
        self.block2=ConvBlock2()
        self.bn1 = nn.BatchNorm1d(num_features=62)
        # input_size=ch_num*height
        self.gru1=nn.GRU(input_size=96*10,hidden_size=32,num_layers=1,batch_first=True)
        self.bn2 = nn.BatchNorm1d(num_features=62)
        self.gru2=nn.GRU(input_size=32,hidden_size=32,num_layers=1,batch_first=True)
        self.gru3=nn.GRU(input_size=64,hidden_size=32,num_layers=1,batch_first=True)
        self.gru4=nn.GRU(input_size=96,hidden_size=32,num_layers=1,batch_first=True)
        self.fc1=nn.Linear(in_features=32,out_features=64)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(64, 160)  



    def forward(self,x):
        x=self.block1(x)
        x=self.block2(x)
        # Because the input shape required by gru is (batch_size, sequence length, feature_size)
        # But the result of the previous conversion calculation is (batchsize, channel_num, width, length)
        # I need to change the shape
        # (batch_size,width,ch_num*height)
        batch_size,ch_num,height,width=x.size()
        # x=x.permute(0,2,1,3).reshape(batch_size,width,ch_num*height)
        x=x.reshape(batch_size,width,ch_num*height)

        # add batch normalization
        x=self.bn1(x)
        # add relu activation fucntion
        x = F.relu(x) 

        gru_out1,_=self.gru1(x)

        x=self.bn2(gru_out1)
        x=F.relu(x)

        gru_out2,_=self.gru2(gru_out1)

        x=F.relu(x)

        # According to the chrononet architecture, we need to connect the calculations of the two layers of GRU according to the feature-size dimension
        x=torch.cat((gru_out1,gru_out2),dim=2)
        gru_out3,_=self.gru3(x)

        x=F.relu(x)

        x=torch.cat((gru_out1,gru_out2,gru_out3),dim=2)
        gru_out4,_=self.gru4(x)

        x=F.relu(x)

        x = self.fc1(gru_out4[:, -1, :])  #Usually take the final output of GRU
        x = self.dropout(x)
        x = self.fc2(x)

        return x

In [25]:
class ChronoNetModule(L.LightningModule):
    def __init__(self,model,learning_rate):
        super().__init__()
        self.model=model
        self.lr=learning_rate
        self.train_acc=torchmetrics.Accuracy(task='multiclass',num_classes=160)
        self.val_acc=torchmetrics.Accuracy(task='multiclass',num_classes=160)



    def forward(self,x):
        '''
        x: feature data for training 

        This is the part of the neural model that is used to read or build
        define the computation performed at every call define the computation performed at every call

        return:
            model's output
        '''
        return self.model(x)
    
    def training_step(self,batch,batch_idx):
        '''
        we need to train the model right here
        including provide the loss step, acc calculation step

        This function will perform the following operations:
        1. Calculate the loss value for each training batch
        2. Perform optimization and gradient descent (automatically performed by lightningModule)
        3. Update parameters (automatically performed by lightningModule)
        https://lightning.ai/docs/pytorch/stable/common/lightning_module.html#training
        '''
        # Read batch data
        labels,features=batch

        # Send data to GPU for training
        features=features.to(self.device)
        labels=labels.to(self.device)
        
        # feeding feature to the model
        # Only self() is used here because the forward() function is automatically called
        # forward propagation
        out=self(features)

        # After getting the output of the model, you need to calculate the loss function
        loss=F.cross_entropy(out, labels)

        # logs metrics for each training_step,
        # and the average across the epoch, to the progress bar and logger
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        # after adding self.train_acc=torchmetrics.Accuracy(task='multiclass',num_classes=6)
        # You can add the step of calculating accuracy below
        # Because we use cross_entropy() as the loss function
        # So we need to use argmax to convert to normal values ​​for accuracy calculation
        # predicted_labels=torch.argmax(out)
        # But torchmetrics.Accuracy is already configured to handle logits suitable for multi-category classification problems. 
        # It will apply softmax (or log_softmax) and calculate argmax internally to determine the most likely category.
        acc=self.train_acc(out,labels)
        self.log("train_acc", acc, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        

        # In training_step(), we only calculate and return the loss. 
        # The optimization part does not belong to this part, and the optimization method will be defined in configure_optimizers.
        return loss # this is passed to the optimizer for training
    
    def validation_step(self,batch,batch_idx):
        '''
        val step is not used in traning, only in validation
        '''
        labels,features=batch

        # Send data to GPU for training
        features=features.to(self.device)
        labels=labels.to(self.device)
        
        out=self(features)
        loss=F.cross_entropy(out, labels)

        self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        acc=self.val_acc(out,labels)
        self.log("val_acc", acc, on_step=True, on_epoch=True, prog_bar=True, logger=True)


    def configure_optimizers(self):
        '''
        Choose what optimizers and learning-rate schedulers to use in your optimization.

        The optimizer defined here will be automatically called by lightningModule
        Used in the training step
        '''
        optimizer=torch.optim.Adam(self.parameters(), lr=self.lr)

        return optimizer


    def on_train_epoch_end(self):
        pass


    def on_validation_epoch_end(self):
        pass


    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        # If you have only one tensor (feature) in your TensorDataset, batch will be a tuple containing a tensor and an empty tuple (since there are no labels)
        features= batch
        features=features.to(self.device)
        predictions = self(features)
        # Because what our model ultimately wants is the probability of an object corresponding to all categories, so add the softmax function here
        probabilities = torch.softmax(predictions, dim=1)

        return probabilities




In [26]:
class ChronoNetDataModule(L.LightningDataModule):
    def __init__(self,dataset:Dataset,pred=None,batch_size:int=128):
        super().__init__()
        self.dataset=dataset
        self.batch_size=batch_size

        self.pred=pred

    def setup(self,stage:str):
        # assign train/val splits for use in dataloaders
        if stage=='fit':
            self.train_dataset,self.val_dataset=random_split(self.dataset,[0.8,0.2],generator=torch.Generator().manual_seed(42))
        
        # if stage=='predict':
            

    def train_dataloader(self):
        loader= DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

        return loader
    
    def val_dataloader(self):
        loader= DataLoader(self.val_dataset, batch_size=self.batch_size,shuffle=False)

        return loader

    def predict_dataloader(self):
        loader=DataLoader(self.dataset,batch_size=self.batch_size,shuffle=False)

        return loader


In [27]:
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss', 
    dirpath='/Users/yiding/personal_projects/ML/github_repo/birdcief/code/model-training/checkpoints/',
    filename='chrononet-{epoch:02d}-{val_loss:.2f}',
    save_top_k=1, 
    mode='min', 
    auto_insert_metric_name=False 
)

early_stop_callback = EarlyStopping(
    monitor='val_loss', 
    min_delta=0.00,
    patience=3, 
    verbose=True,
    mode='min'  
)


In [28]:
#initialize Dataset first

BD=BirdclefDataset(encoder=encoder,labels_path=labels_path)
# Previously we used a separate dataloader to feed the model
# Here we encapsulate the dataloader and use this class to read data for training

dm=ChronoNetDataModule(dataset=BD,batch_size=128)
print(dm)




model=ChronoNet()
ChronoNetModule=ChronoNetModule(model=model,learning_rate=0.01)

trainer=L.Trainer(
    max_epochs=10,
    accelerator="gpu", # set to 'auto' or 'gpu' to use gpu if possible
    devices=1, # use all gpus if applicable like value=1 or "auto"
    default_root_dir='/Users/yiding/personal_projects/ML/github_repo/birdcief/code/model-training/',
    # logger=CSVLogger(save_dir='/Users/yiding/personal_projects/ML/github_repo/birdcief/code/model-training/log/',name='chrononet')
    callbacks=[checkpoint_callback, early_stop_callback],  # Add callback to trainer
)

# train the model
trainer.fit(
    model=ChronoNetModule,
    datamodule=dm 
)


### load checkpoint for prediction

In [29]:
## Preparing forecast data

## reference for 12.1-predict-data-transform-clean-version.ipynb


from datasets import Dataset

from pathlib import Path

pred_dir = Path("../../data/predict")
pred_files = pred_dir.glob("*.ogg")


def read_audio(path: str):
    """
    Read an OGG file using torchaudio and return the waveform tensor and sample rate.

    Parameters:
        path: Path to the .ogg file

    Returns:
        waveform: Tensor representing the waveform
        sample_rate: Sample rate of the audio file
    """
    audio, sample_rate = torchaudio.load(path)
    return audio, sample_rate


# Regarding the data of a single audio, some audio information needs to be paid attention to, such as audio duration and number of channels.


def audio_info(audio: torch.Tensor, sample_rate: int):
    """
    Grab all information of the input audio loaded by torchaudio.

    Parameters:
        audio: Tensor representing the waveform
        sample_rate: Sample rate of the audio file

    Return:
        duration_seconds: Duration of the audio in seconds
        num_channels: Number of audio channels
    """
    # The audio duration time (seconds)
    duration_seconds = audio.shape[1] / sample_rate

    # The number of channels
    num_channels = audio.shape[0]


    return duration_seconds, num_channels


def split_audio(audio: torch.Tensor, segment_length:int):

    '''
    split raw audio tensor into multiple clips with 5 seconds long.

    Parameters:
        audio: the raw audio tensor
        segment_length: the audio length of each 5 seconds

    return:
        parts: list includes all clips
        end_time_list: the list of all clips' end time in seconds
    '''

    length_audio = audio.shape[1]
    parts = []
    # For example, if this is the first 5 seconds of audio, then the end time is 5. If it is 5-10 seconds, the end time is 10
    end_time_list=[]
    end_time=5
    for i in range(0, length_audio, segment_length):
        part = audio[0][i:i + segment_length]
        # if len(part) == segment_length:  # Ensure the fragment lengths are consistent
        parts.append(part)  #Store the raw bytes of audio data
        end_time_list.append(end_time)
        end_time+=5

        

    return parts,end_time_list



audio_clips_list=[]
clip_names_list=[]

for path in pred_files:
    # read audio as tensor
    audio,sr=read_audio(path=path)

    # get audio corresponding informatino
    duration_seconds,num_channels=audio_info(audio=audio,sample_rate=sr)

    # split audio into multi clips with 5 seconds
    audio_clips,end_time_list=split_audio(audio=audio,segment_length=5*sr)

    # generate each label name for each clip
    soundscape_id=path.stem
    clip_name=[f'soundscape_{soundscape_id}_{end_time}' for end_time in end_time_list]

    audio_clips_list.extend(audio_clips)
    
    clip_names_list.extend(clip_name)

    

# create Dataset
dataset = Dataset.from_dict({'audio_clip': audio_clips_list})


# We need to modify melspec so that it can accept batch as a function and use the map function


## convert audio to mel spectrogram


def mel_transform(batch):
    """
    transform audio data into mel sepctrogram
    """
    n_fft=int(0.04*32000)
    # hop_length = int(hop_size * sample_rate)  
    hop_length=int(0.02*32000)

    n_mels = 40  

    mel_transformer = MelSpectrogram(
        sample_rate=32000,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        f_min=0,
        f_max=16000
    )

    audio_clip_batch=batch['audio_clip']

    melspec_list=[]

    for audio_clip in audio_clip_batch:
        
        audio_clip=torch.tensor(audio_clip).unsqueeze(0)

        melspec=mel_transformer(audio_clip)

        melspec_list.append(melspec)


    return {'audio_mel':melspec_list}



dataset_mel=dataset.map(mel_transform, batched=True)


del dataset


dataset_mel_single=dataset_mel.remove_columns('audio_clip')


from torch.utils.data import Dataset


class PredDataset(Dataset):
    def __init__(self,dataset):
        super().__init__()
        self.dataset=dataset

    def __len__(self):

        return len(self.dataset)
    
    def __getitem__(self, index):
        audio_melspec=self.dataset['audio_mel'][index]

        audio_tensor=torch.tensor(audio_melspec)

        return audio_tensor

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 96/96 [00:04<00:00, 22.68 examples/s]


In [30]:
# 1. load checkpoint
model=ChronoNet()
model = ChronoNetModule.load_from_checkpoint(
    checkpoint_path="./checkpoints/chrononet-02-4.41.ckpt",
    model=model,  # Pass additional parameters needed for model initialization if they are not saved in the checkpoint
    learning_rate=0.01  # Any other required parameters
)




In [31]:
# 2. prepare predicted data
PD=PredDataset(dataset=dataset_mel)
predict_dataloader = DataLoader(dataset=PD, batch_size=32, shuffle=False, num_workers=0)



In [33]:
# 3. model prediction
trainer = L.Trainer(
    accelerator="gpu",  
    devices=1
)

# predict using trainer
predictions = trainer.predict(model, dataloaders=predict_dataloader)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/opt/homebrew/Caskroom/miniforge/base/envs/birdclef/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 3/3 [00:11<00:00,  0.26it/s]


In [34]:
predictions

[tensor([[0.0024, 0.0029, 0.0026,  ..., 0.0006, 0.0007, 0.0061],
         [0.0024, 0.0029, 0.0026,  ..., 0.0006, 0.0007, 0.0061],
         [0.0024, 0.0029, 0.0026,  ..., 0.0006, 0.0007, 0.0061],
         ...,
         [0.0024, 0.0029, 0.0026,  ..., 0.0006, 0.0007, 0.0061],
         [0.0024, 0.0029, 0.0026,  ..., 0.0006, 0.0007, 0.0061],
         [0.0024, 0.0029, 0.0026,  ..., 0.0006, 0.0007, 0.0061]]),
 tensor([[0.0024, 0.0029, 0.0026,  ..., 0.0006, 0.0007, 0.0061],
         [0.0024, 0.0029, 0.0026,  ..., 0.0006, 0.0007, 0.0061],
         [0.0024, 0.0029, 0.0026,  ..., 0.0006, 0.0007, 0.0061],
         ...,
         [0.0024, 0.0029, 0.0026,  ..., 0.0006, 0.0007, 0.0061],
         [0.0024, 0.0029, 0.0026,  ..., 0.0006, 0.0007, 0.0061],
         [0.0024, 0.0029, 0.0026,  ..., 0.0006, 0.0007, 0.0061]]),
 tensor([[0.0024, 0.0029, 0.0026,  ..., 0.0006, 0.0007, 0.0061],
         [0.0024, 0.0029, 0.0026,  ..., 0.0006, 0.0007, 0.0061],
         [0.0024, 0.0029, 0.0026,  ..., 0.0006, 0.0007, 0.

In [42]:
submission=pd.DataFrame({
    'row_id':clip_names_list
})

In [43]:
submission

Unnamed: 0,row_id
0,soundscape_1000170626_5
1,soundscape_1000170626_10
2,soundscape_1000170626_15
3,soundscape_1000170626_20
4,soundscape_1000170626_25
...,...
91,soundscape_1000389428_220
92,soundscape_1000389428_225
93,soundscape_1000389428_230
94,soundscape_1000389428_235


In [51]:
# Convert each tensor to a NumPy array and use them as rows of the DataFrame
data_frames = [pd.DataFrame(tensor.numpy()) for tensor in predictions]


In [53]:
# Merge all DataFrames into one big DataFrame
# Each tensor forms a block of the DataFrame
df = pd.concat(data_frames, ignore_index=True)

In [54]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,150,151,152,153,154,155,156,157,158,159
0,0.002427,0.002863,0.002577,0.000327,0.002349,0.00107,0.001924,0.028795,0.012024,0.000084,...,0.000466,0.000712,0.01239,0.001746,0.002529,0.010009,0.000388,0.000552,0.000695,0.006122
1,0.002427,0.002863,0.002577,0.000327,0.002349,0.00107,0.001924,0.028795,0.012024,0.000084,...,0.000466,0.000712,0.01239,0.001746,0.002529,0.010009,0.000388,0.000552,0.000695,0.006122
2,0.002427,0.002863,0.002577,0.000327,0.002349,0.00107,0.001924,0.028795,0.012024,0.000084,...,0.000466,0.000712,0.01239,0.001746,0.002529,0.010009,0.000388,0.000552,0.000695,0.006121
3,0.002427,0.002863,0.002577,0.000327,0.002349,0.00107,0.001924,0.028795,0.012024,0.000084,...,0.000466,0.000712,0.01239,0.001746,0.002529,0.010009,0.000388,0.000552,0.000695,0.006121
4,0.002427,0.002863,0.002577,0.000327,0.002349,0.00107,0.001924,0.028795,0.012024,0.000084,...,0.000466,0.000712,0.01239,0.001746,0.002529,0.010009,0.000388,0.000552,0.000695,0.006121
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,0.002427,0.002863,0.002577,0.000327,0.002349,0.00107,0.001924,0.028795,0.012024,0.000084,...,0.000466,0.000712,0.01239,0.001746,0.002529,0.010009,0.000388,0.000552,0.000695,0.006121
92,0.002427,0.002863,0.002577,0.000327,0.002349,0.00107,0.001924,0.028795,0.012024,0.000084,...,0.000466,0.000712,0.01239,0.001746,0.002529,0.010009,0.000388,0.000552,0.000695,0.006122
93,0.002427,0.002863,0.002577,0.000327,0.002349,0.00107,0.001924,0.028795,0.012024,0.000084,...,0.000466,0.000712,0.01239,0.001746,0.002529,0.010009,0.000388,0.000552,0.000695,0.006122
94,0.002427,0.002863,0.002577,0.000327,0.002349,0.00107,0.001924,0.028795,0.012024,0.000084,...,0.000466,0.000712,0.01239,0.001746,0.002529,0.010009,0.000388,0.000552,0.000695,0.006122


In [55]:
final = pd.concat([submission, df], axis=1)

In [58]:
final

Unnamed: 0,row_id,0,1,2,3,4,5,6,7,8,...,150,151,152,153,154,155,156,157,158,159
0,soundscape_1000170626_5,0.002427,0.002863,0.002577,0.000327,0.002349,0.00107,0.001924,0.028795,0.012024,...,0.000466,0.000712,0.01239,0.001746,0.002529,0.010009,0.000388,0.000552,0.000695,0.006122
1,soundscape_1000170626_10,0.002427,0.002863,0.002577,0.000327,0.002349,0.00107,0.001924,0.028795,0.012024,...,0.000466,0.000712,0.01239,0.001746,0.002529,0.010009,0.000388,0.000552,0.000695,0.006122
2,soundscape_1000170626_15,0.002427,0.002863,0.002577,0.000327,0.002349,0.00107,0.001924,0.028795,0.012024,...,0.000466,0.000712,0.01239,0.001746,0.002529,0.010009,0.000388,0.000552,0.000695,0.006121
3,soundscape_1000170626_20,0.002427,0.002863,0.002577,0.000327,0.002349,0.00107,0.001924,0.028795,0.012024,...,0.000466,0.000712,0.01239,0.001746,0.002529,0.010009,0.000388,0.000552,0.000695,0.006121
4,soundscape_1000170626_25,0.002427,0.002863,0.002577,0.000327,0.002349,0.00107,0.001924,0.028795,0.012024,...,0.000466,0.000712,0.01239,0.001746,0.002529,0.010009,0.000388,0.000552,0.000695,0.006121
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,soundscape_1000389428_220,0.002427,0.002863,0.002577,0.000327,0.002349,0.00107,0.001924,0.028795,0.012024,...,0.000466,0.000712,0.01239,0.001746,0.002529,0.010009,0.000388,0.000552,0.000695,0.006121
92,soundscape_1000389428_225,0.002427,0.002863,0.002577,0.000327,0.002349,0.00107,0.001924,0.028795,0.012024,...,0.000466,0.000712,0.01239,0.001746,0.002529,0.010009,0.000388,0.000552,0.000695,0.006122
93,soundscape_1000389428_230,0.002427,0.002863,0.002577,0.000327,0.002349,0.00107,0.001924,0.028795,0.012024,...,0.000466,0.000712,0.01239,0.001746,0.002529,0.010009,0.000388,0.000552,0.000695,0.006122
94,soundscape_1000389428_235,0.002427,0.002863,0.002577,0.000327,0.002349,0.00107,0.001924,0.028795,0.012024,...,0.000466,0.000712,0.01239,0.001746,0.002529,0.010009,0.000388,0.000552,0.000695,0.006122
