In [1]:
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
import random
from typing import List
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB

import torchaudio

import torch

from torch.utils.data import Dataset, DataLoader,TensorDataset

import lightning as L

In [2]:
# Location of the original training metadata CSV, relative to this notebook
labels_path='../../data/train_metadata.csv'

## build up the customized torch Dataset

Dataset is a way to encapsulate and process datasets. It is used to load and prepare data in PyTorch so that it can be used with `DataLoader` to achieve efficient data iteration.

### build up `__len__()` function

In [4]:
class BirdclefDataset(Dataset):
    """First draft of the dataset: only the size lookup is implemented so far."""

    def __init__(self,audio_dir=None,labels_path=None):
        """
        Parameters:
            audio_dir: directory that holds the audio files (stored, not used yet)
            labels_path: CSV file with one metadata row per sample
        """
        super().__init__()
        self.labels_path=labels_path
        self.audio_dir=audio_dir

    def __len__(self):
        # Called by len(), by Sampler implementations and by DataLoader defaults.
        # The marker print makes every invocation visible in the cell output.
        print('marked_label')

        # The CSV is parsed on each call; the row count is the dataset size
        return pd.read_csv(self.labels_path,header=0).shape[0]

    def __getitem__(self, index):
        # dataset[i] dispatches to dataset.__getitem__(i); placeholder for now
        return None

In [5]:
# Test __len__(): the built-in len() and the explicit dunder call must agree.
# Each call prints the 'marked_label' marker because __len__ is what runs in both cases.

dataset_length=len(BirdclefDataset(labels_path=labels_path))

print(dataset_length)

dataset_length=BirdclefDataset(labels_path=labels_path).__len__()

print(dataset_length)

marked_label
24459
marked_label
24459


### add path generation function into Dataset

In [6]:
class BirdclefDataset(Dataset):
    """Dataset over the training metadata that yields (audio file path, primary label) pairs."""

    def __init__(self,audio_dir:str='../../data/train_audio',
                 labels_path:str=None):
        """
        Parameters:
            audio_dir: root directory under which every audio file is stored
            labels_path: CSV file with the 'filename' and 'primary_label' columns
        """
        super().__init__()
        # Parse the metadata once; every lookup below is served from this frame
        self.labels_df=pd.read_csv(labels_path,header=0)
        self.audio_dir=audio_dir


    def get_audio_path(self,index) -> str:
        '''
        Build the full path of the audio file listed at the given metadata row.

        Parameters:
            index: positional row index into the metadata frame

        Return:
            audio_dir joined with the row's 'filename' entry
        '''
        filenames=self.labels_df['filename']
        return os.path.join(self.audio_dir,filenames.iloc[index])


    def get_audio_label(self,index)->str:
        '''
        Look up the 'primary_label' entry of the given metadata row.

        Parameters:
            index: positional row index into the metadata frame
        '''
        primary_labels=self.labels_df['primary_label']
        return primary_labels.iloc[index]


    def __len__(self):
        # Dataset size = number of metadata rows (used by samplers and DataLoader)
        return len(self.labels_df.index)

    def __getitem__(self, index):
        # dataset[i] dispatches here: path first, then the matching label
        path_for_index=self.get_audio_path(index)
        label_for_index=self.get_audio_label(index)
        return path_for_index, label_for_index

In [7]:
# test
# Find the filename of the corresponding index from the metadata file

metafile=pd.read_csv(labels_path,header=0)
filename=metafile['filename'].iloc[0]
print(filename)
# Positional column 11 prints the same value below, i.e. it is the 'filename' column of this CSV
print(metafile.iloc[:,11][0])


asbfly/XC134896.ogg
asbfly/XC134896.ogg


In [8]:
# test
# Fetch (audio path, label) pairs from the Dataset via plain indexing,
# which dispatches to __getitem__

audio_path,true_label=BirdclefDataset(labels_path=labels_path)[0]

print(audio_path)
print(true_label)


audio_path,true_label=BirdclefDataset(labels_path=labels_path)[24458]

print(audio_path)
print(true_label)

../../data/train_audio/asbfly/XC134896.ogg
asbfly
../../data/train_audio/zitcis1/XC858550.ogg
zitcis1


### add an audio read function so that the Dataset returns the `audio array` instead of the `audio path`

In [9]:
def read_audio(path: str):
    """
    Load an audio file from disk with torchaudio.

    Parameters:
        path: location of the audio file (this notebook feeds it .ogg recordings)

    Returns:
        The (waveform, sample_rate) pair exactly as produced by torchaudio.load
    """
    return torchaudio.load(path)

In [10]:
class BirdclefDataset(Dataset):
    """Dataset over the training metadata that yields (waveform, primary label) pairs."""

    def __init__(self,audio_dir:str='../../data/train_audio',
                 labels_path:str=None):
        """
        Parameters:
            audio_dir: root directory under which every audio file is stored
            labels_path: CSV file with the 'filename' and 'primary_label' columns
        """
        super().__init__()
        # Parse the metadata once; every lookup below is served from this frame
        self.labels_df=pd.read_csv(labels_path,header=0)
        self.audio_dir=audio_dir


    def get_audio_path(self,index) -> str:
        '''
        Build the full path of the audio file listed at the given metadata row.

        Parameters:
            index: positional row index into the metadata frame

        Return:
            audio_dir joined with the row's 'filename' entry
        '''
        filenames=self.labels_df['filename']
        return os.path.join(self.audio_dir,filenames.iloc[index])


    def get_audio_label(self,index)->str:
        '''
        Look up the 'primary_label' entry of the given metadata row.

        Parameters:
            index: positional row index into the metadata frame
        '''
        primary_labels=self.labels_df['primary_label']
        return primary_labels.iloc[index]


    def __len__(self):
        # Dataset size = number of metadata rows (used by samplers and DataLoader)
        return len(self.labels_df.index)

    def __getitem__(self, index):
        # dataset[i] dispatches here
        path_for_index=self.get_audio_path(index)
        label_for_index=self.get_audio_label(index)

        # Decode the whole file; the sample rate is not needed at this stage
        waveform, _sample_rate=read_audio(path_for_index)

        return waveform, label_for_index

In [11]:
# test
# Get a set of corresponding audio array and label from the Dataset
# NOTE: `audio_path` is a leftover name from the earlier cell; it now holds the
# waveform tensor returned by the Dataset, which is why `.shape` is printed.

audio_path,true_label=BirdclefDataset(labels_path=labels_path).__getitem__(0)

print(audio_path.shape)
print(true_label)


# Same lookup through plain indexing on a reusable dataset instance
BD=BirdclefDataset(labels_path=labels_path)

audio_path,true_label=BD[0]

print(audio_path.shape)
print(true_label)

torch.Size([1, 875207])
asbfly
torch.Size([1, 875207])
asbfly


### I need to split each signal into 5-second signals, and now I need to add this part to the torch Dataset

Because I need to treat each 5-second audio as a whole, and not lose other information of the entire audio

If we used the traditional method — the Dataset gets an index, reads the audio corresponding to that index, and then cuts it into multiple segments — we would have to use a custom collate_fn to define how the tensors are stacked when loading data with the dataloader. That makes the batch size inaccurate and therefore loses data during training.

So I used 4.1-create-new-trainmatadata-csv.ipynb to rebuild a csv file to correspond to the index.

According to this logic,

1. I need to create a function, target_clip() to get the corresponding audio clip in the train-metadata-csv file.

In [12]:
def read_audio(path: str):
    """
    Load an audio file from disk with torchaudio.

    Parameters:
        path: location of the audio file (this notebook feeds it .ogg recordings)

    Returns:
        The (waveform, sample_rate) pair exactly as produced by torchaudio.load
    """
    return torchaudio.load(path)

In [13]:
def audio_info(audio: torch.Tensor, sample_rate: int):
    """
    Derive basic facts about a loaded waveform from its shape.

    Parameters:
        audio: waveform tensor; dimension 0 is read as channels, dimension 1 as samples
        sample_rate: number of samples per second

    Return:
        duration_seconds: number of samples divided by the sample rate
        num_channels: size of dimension 0
    """
    num_channels, num_samples = audio.shape[0], audio.shape[1]

    # samples / (samples per second) = seconds
    return num_samples / sample_rate, num_channels

In [14]:
class BirdclefDataset(Dataset):
    """Dataset over the clip-level metadata CSV that yields (waveform, label, clip) triples."""

    def __init__(self,audio_dir:str='../../data/train_audio',
                 labels_path:str=None):
        """
        Parameters:
            audio_dir: the parent path where all audio files stored
            labels_path: the file including all corresponding labels; it must provide
                the 'filename', 'primary_label' and 'clip_start_time' columns
        """

        super().__init__()
        self.audio_dir=audio_dir
        # read data into dataframe
        self.labels_df=pd.read_csv(labels_path,header=0)


    def get_audio_path(self,index) -> str:
        '''
        Get the audio path of the corresponding index through the provided train metadata csv file.
        Since there is only one index, only one path will be returned.

        Parameters:
            index: the index of labels metadata file

        Return:
            the single audio path string
        '''
        # Get the child path of audio from labels_df
        audio_child_path=self.labels_df['filename'].iloc[index]

        # concatenate parent path and child path
        return os.path.join(self.audio_dir,audio_child_path)


    def get_audio_label(self,index)->str:
        '''
        According to the provided index, get the corresponding label from the train metadata file

        Parameters:
            index: the index of labels metadata file
        '''

        return self.labels_df['primary_label'].iloc[index]


    def target_clip(self,index:int,audio:torch.Tensor,sample_rate:int, duration_seconds:float,
                    clip_seconds:float=5)->torch.Tensor:
        """
        Cut the fixed-length clip that the metadata row `index` refers to out of `audio`.

        The clip starts at the row's 'clip_start_time' (seconds). When the recording ends
        before the clip is complete, the remainder is filled with zeros (silence), so the
        result always has clip_seconds * sample_rate samples.

        Parameters:
            index: the index of labels metadata file
            audio: the raw audio in tensor [num_channels,length]
            sample_rate: audio sampling rate
            duration_seconds: audio duration in seconds
            clip_seconds: clip length in seconds (defaults to the original 5 seconds)

        Return:
            clip tensor of shape [num_channels, clip_seconds * sample_rate]

        Raises:
            ValueError: the start time is negative, NaN or beyond the end of the audio
        """
        # Get the audio start time corresponding to index
        clip_start_time=self.labels_df['clip_start_time'].iloc[index]

        # A negative start would silently index from the end of the tensor and NaN
        # cannot be converted to a sample index, so both are rejected together with
        # start times past the end of the recording.
        if not (0 <= clip_start_time <= duration_seconds):
            raise ValueError('The clip start time is out of raw audio length')

        # Slice bounds must be plain ints; the CSV column may be parsed as float
        segment_duration=int(round(clip_seconds*sample_rate))
        clip_start_point=int(round(clip_start_time*sample_rate))

        # Slicing clamps at the end of the tensor, so this is short for the last clip
        clip=audio[:, clip_start_point:clip_start_point+segment_duration]

        padding_length=segment_duration-clip.shape[1]
        if padding_length>0:
            # Silence with the same dtype/device as the audio keeps torch.cat consistent
            silence=audio.new_zeros((audio.shape[0], padding_length))
            clip=torch.cat((clip,silence),dim=1)

        return clip


    def __len__(self):
        #  return the size of the dataset by many Sampler implementations and the default options of DataLoader.

        return len(self.labels_df)

    def __getitem__(self, index):
        # a_list[1] -> a_list.__getitem__(1)
        # Get the path to a single audio file
        single_audio_dir=self.get_audio_path(index)
        # Get the corresponding label value
        audio_label=self.get_audio_label(index)

        # Read audio array according to path
        audio, sr=read_audio(single_audio_dir)

        # Only the duration is needed here; the channel count is ignored
        duration_seconds, _num_channels=audio_info(audio,sample_rate=sr)

        # Get the audio clip corresponding to index
        clip=self.target_clip(index,audio,sample_rate=sr, duration_seconds=duration_seconds)

        return audio, audio_label, clip

In [15]:
# Rebind labels_path to the rebuilt metadata CSV; target_clip reads its 'clip_start_time' column
labels_path='../../data/train_metadata_new.csv'

In [16]:
BD=BirdclefDataset(labels_path=labels_path)

# Inspect the first three items: full waveform shape, label, clip samples and clip shape.
# NOTE: `audio_path` is a leftover name; it holds the full waveform tensor here.
for clip_index in range(3):
    audio_path,true_label,clip=BD[clip_index]
    print(audio_path.shape)
    print(true_label)
    print(clip)
    print(clip.shape)

torch.Size([1, 425482])
malpar1
tensor([[ 4.2087e-07, -5.0013e-07, -6.1193e-06,  ...,  3.1517e-03,
          5.3921e-03,  7.1919e-03]])
torch.Size([1, 160000])
torch.Size([1, 425482])
malpar1
tensor([[ 0.0051,  0.0061, -0.0214,  ...,  0.0072,  0.0065,  0.0038]])
torch.Size([1, 160000])
torch.Size([1, 425482])
malpar1
tensor([[-0.0018, -0.0085, -0.0017,  ...,  0.0000,  0.0000,  0.0000]])
torch.Size([1, 160000])


In [17]:
# Duration in seconds of the recording inspected above: 425482 samples at 32000 samples per second
425482/32000

13.2963125

### This step requires converting the audio into a mel spectrogram representation

In [18]:
def read_audio(path: str):
    """
    Load an audio file from disk with torchaudio.

    Parameters:
        path: location of the audio file (this notebook feeds it .ogg recordings)

    Returns:
        The (waveform, sample_rate) pair exactly as produced by torchaudio.load
    """
    return torchaudio.load(path)

In [19]:
def audio_info(audio: torch.Tensor, sample_rate: int):
    """
    Derive basic facts about a loaded waveform from its shape.

    Parameters:
        audio: waveform tensor; dimension 0 is read as channels, dimension 1 as samples
        sample_rate: number of samples per second

    Return:
        duration_seconds: number of samples divided by the sample rate
        num_channels: size of dimension 0
    """
    num_channels, num_samples = audio.shape[0], audio.shape[1]

    # samples / (samples per second) = seconds
    return num_samples / sample_rate, num_channels

In [20]:
## Convert audio data into mel spectrogram


def mel_transform(sample_rate:float,audio:torch.Tensor,window_size: float=0.04,hop_size:float=0.02,n_mels:int=40)->torch.Tensor:
    """
    Transform audio data into a mel spectrogram.

    Parameters:
        sample_rate: sampling rate of `audio` in Hz
        audio: waveform tensor, passed unchanged to the MelSpectrogram transform
        window_size: FFT window length in seconds
        hop_size: hop between consecutive frames in seconds
        n_mels: number of mel filter banks

    Returns:
        the tensor produced by torchaudio's MelSpectrogram for `audio`
    """
    # Window and hop are given in seconds; the transform expects sample counts
    n_fft = int(window_size * sample_rate)
    hop_length = int(hop_size * sample_rate)

    # The upper band edge must not exceed the Nyquist frequency, otherwise the
    # highest mel filters cover no FFT bin and stay empty. For the 32 kHz audio
    # used in this notebook the value is still 16000.
    f_max = min(16000, sample_rate / 2)

    mel_transformer = MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        f_min=0,
        f_max=f_max
    )

    return mel_transformer(audio)



In [21]:
class BirdclefDataset(Dataset):
    """Dataset over the clip-level metadata CSV that yields (label, mel spectrogram) pairs."""

    def __init__(self,audio_dir:str='../../data/train_audio',
                 labels_path:str=None):
        """
        Parameters:
            audio_dir: the parent path where all audio files stored
            labels_path: the file including all corresponding labels; it must provide
                the 'filename', 'primary_label' and 'clip_start_time' columns
        """

        super().__init__()
        self.audio_dir=audio_dir
        # read data into dataframe
        self.labels_df=pd.read_csv(labels_path,header=0)


    def get_audio_path(self,index) -> str:
        '''
        Get the audio path of the corresponding index through the provided train metadata csv file.
        Since there is only one index, only one path will be returned.

        Parameters:
            index: the index of labels metadata file

        Return:
            the single audio path string
        '''
        # Get the child path of audio from labels_df
        audio_child_path=self.labels_df['filename'].iloc[index]

        # concatenate parent path and child path
        return os.path.join(self.audio_dir,audio_child_path)


    def get_audio_label(self,index)->str:
        '''
        According to the provided index, get the corresponding label from the train metadata file

        Parameters:
            index: the index of labels metadata file
        '''

        return self.labels_df['primary_label'].iloc[index]


    def target_clip(self,index:int,audio:torch.Tensor,sample_rate:int, duration_seconds:float,
                    clip_seconds:float=5)->torch.Tensor:
        """
        Cut the fixed-length clip that the metadata row `index` refers to out of `audio`.

        The clip starts at the row's 'clip_start_time' (seconds). When the recording ends
        before the clip is complete, the remainder is filled with zeros (silence), so the
        result always has clip_seconds * sample_rate samples.

        Parameters:
            index: the index of labels metadata file
            audio: the raw audio in tensor [num_channels,length]
            sample_rate: audio sampling rate
            duration_seconds: audio duration in seconds
            clip_seconds: clip length in seconds (defaults to the original 5 seconds)

        Return:
            clip tensor of shape [num_channels, clip_seconds * sample_rate]

        Raises:
            ValueError: the start time is negative, NaN or beyond the end of the audio
        """
        # Get the audio start time corresponding to index
        clip_start_time=self.labels_df['clip_start_time'].iloc[index]

        # A negative start would silently index from the end of the tensor and NaN
        # cannot be converted to a sample index, so both are rejected together with
        # start times past the end of the recording.
        if not (0 <= clip_start_time <= duration_seconds):
            raise ValueError('The clip start time is out of raw audio length')

        # Slice bounds must be plain ints; the CSV column may be parsed as float
        segment_duration=int(round(clip_seconds*sample_rate))
        clip_start_point=int(round(clip_start_time*sample_rate))

        # Slicing clamps at the end of the tensor, so this is short for the last clip
        clip=audio[:, clip_start_point:clip_start_point+segment_duration]

        padding_length=segment_duration-clip.shape[1]
        if padding_length>0:
            # Silence with the same dtype/device as the audio keeps torch.cat consistent
            silence=audio.new_zeros((audio.shape[0], padding_length))
            clip=torch.cat((clip,silence),dim=1)

        return clip


    def __len__(self):
        #  return the size of the dataset by many Sampler implementations and the default options of DataLoader.

        return len(self.labels_df)

    def __getitem__(self, index):
        # a_list[1] -> a_list.__getitem__(1)
        # Get the path to a single audio file
        single_audio_dir=self.get_audio_path(index)
        # Get the corresponding label value
        audio_label=self.get_audio_label(index)

        # Read audio array according to path
        audio, sr=read_audio(single_audio_dir)

        # Only the duration is needed here; the channel count is ignored
        duration_seconds, _num_channels=audio_info(audio,sample_rate=sr)

        # Get the audio clip corresponding to index
        clip=self.target_clip(index,audio,sample_rate=sr, duration_seconds=duration_seconds)

        # mel spectrogram transformation
        mel_spec=mel_transform(sample_rate=sr,audio=clip)

        return audio_label, mel_spec

In [22]:
# Build the dataset and inspect its first (label, mel spectrogram) item
BD=BirdclefDataset(labels_path=labels_path)

audio_label,mel_spec=BD[0]

print(audio_label)
print(mel_spec)
print(mel_spec.shape)
print(type(mel_spec))

malpar1
tensor([[[1.0330e-08, 2.2865e-06, 1.0572e-05,  ..., 2.3917e-08,
          1.0114e-08, 4.6815e-03],
         [4.9843e-08, 4.1700e-06, 4.2066e-05,  ..., 3.5288e-06,
          1.3134e-05, 5.3317e-03],
         [7.5681e-08, 4.7742e-05, 3.2700e-04,  ..., 1.1220e-04,
          7.3640e-05, 4.6690e-03],
         ...,
         [6.5626e-06, 5.8203e-02, 3.5607e-01,  ..., 4.0643e-01,
          4.1383e-01, 2.8917e-01],
         [6.2657e-06, 3.6519e-02, 2.4755e-01,  ..., 2.7120e-01,
          3.3269e-01, 2.1432e-01],
         [8.5976e-06, 3.2936e-02, 1.7693e-01,  ..., 1.9959e-01,
          1.7707e-01, 1.9223e-01]]])
torch.Size([1, 40, 251])
<class 'torch.Tensor'>


### Finally, use dataloader to read the data

In [25]:
# Batches of 128 shuffled items; num_workers=0 means the data is loaded in the main process
dataloader = DataLoader(dataset=BD, batch_size=128, shuffle=True, num_workers=0)

In [27]:
# Iterate over the DataLoader; only the first batch is inspected (see the break below)
for batch in dataloader:
    labels, mel_specs = batch
    # Here you can process your batch data
    print(labels)
    print(mel_specs)
    print(mel_specs.shape)
    break

# mel_specs.shape -> [batch_size,channel_num,height,width]


('houspa', 'grywag', 'grewar3', 'blrwar1', 'whiter2', 'grejun2', 'gybpri1', 'bkwsti', 'litegr', 'bcnher', 'hoopoe', 'maltro1', 'hoopoe', 'labcro1', 'pursun3', 'commyn', 'ruftre2', 'gybpri1', 'kenplo1', 'houspa', 'rocpig', 'comior1', 'zitcis1', 'eaywag1', 'putbab1', 'blrwar1', 'graher1', 'zitcis1', 'comior1', 'ingori1', 'whbwoo2', 'houspa', 'commoo3', 'bkwsti', 'stbkin1', 'ashpri1', 'whtkin2', 'pursun3', 'commyn', 'litgre1', 'gargan', 'grehor1', 'goflea1', 'whiter2', 'grtdro1', 'grywag', 'wemhar1', 'blrwar1', 'eaywag1', 'whbsho3', 'stbkin1', 'junbab2', 'whbtre1', 'barswa', 'blrwar1', 'whbwag1', 'graher1', 'rerswa1', 'plapri1', 'lirplo', 'whtkin2', 'gyhcaf1', 'comkin1', 'comsan', 'purher1', 'woosan', 'blakit1', 'ashpri1', 'eaywag1', 'blakit1', 'gyhcaf1', 'woosan', 'cregos1', 'blrwar1', 'sohmyn1', 'bkwsti', 'commoo3', 'comsan', 'greegr', 'comsan', 'eurcoo', 'whcbar1', 'plapri1', 'inbrob1', 'grywag', 'putbab1', 'rorpar', 'grewar3', 'comtai1', 'grnsan', 'commyn', 'bladro1', 'blnmon1', 'ruft

In [28]:
# Pull a single batch straight from a fresh iterator instead of looping
batch = next(iter(dataloader))
labels, mel_specs = batch
print(labels)
print(mel_specs)
print(mel_specs.shape)

('grywag', 'comsan', 'eucdov', 'grewar3', 'litswi1', 'blrwar1', 'grewar3', 'blrwar1', 'bladro1', 'rorpar', 'ingori1', 'putbab1', 'comgre', 'bkwsti', 'whbtre1', 'gyhcaf1', 'placuc3', 'houspa', 'houspa', 'woosan', 'litswi1', 'gloibi', 'sbeowl1', 'rufwoo2', 'woosan', 'wemhar1', 'grefla1', 'brnhao1', 'putbab1', 'commyn', 'blakit1', 'crseag1', 'houspa', 'wemhar1', 'asbfly', 'copbar1', 'blrwar1', 'comtai1', 'rorpar', 'zitcis1', 'gryfra', 'labcro1', 'stbkin1', 'graher1', 'ashdro1', 'bkwsti', 'zitcis1', 'cregos1', 'blrwar1', 'graher1', 'litspi1', 'litegr', 'wemhar1', 'inbrob1', 'gloibi', 'bkskit1', 'asbfly', 'blrwar1', 'eucdov', 'asikoe2', 'eurcoo', 'lirplo', 'insowl1', 'comsan', 'whcbar1', 'houspa', 'blakit1', 'placuc3', 'blrwar1', 'grtdro1', 'blrwar1', 'houspa', 'emedov2', 'comgre', 'litegr', 'whiter2', 'commyn', 'spepic1', 'lewduc1', 'blrwar1', 'gyhcaf1', 'piekin1', 'comros', 'piekin1', 'compea', 'houspa', 'commoo3', 'comros', 'eucdov', 'whbwat1', 'zitcis1', 'labcro1', 'grywag', 'blrwar1', 

In [45]:
# Release: drop these names so the batch tensors and the loader can be garbage-collected

del batch, dataloader,labels,mel_specs

### add label encoding

Note that in the batch output, the labels are still in string format. As the input to the model, I need to encode them.

In [51]:
from sklearn.preprocessing import LabelEncoder
import sklearn

In [38]:
# Initialize the label encoder; it is not fitted yet at this point

encoder=LabelEncoder()



In [40]:
# Collect every distinct primary label from the metadata CSV
raw_df=pd.read_csv(labels_path,header=0)

labels_all=raw_df.primary_label.unique().tolist()

print(labels_all)
print(len(labels_all))

# Fit the encoder on the distinct labels and encode them in one step
labels_encoded=encoder.fit_transform(labels_all)

print(labels_encoded)

# If needed, you can view the mapping of original labels to encodings
label_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print(label_mapping)

['malpar1', 'litgre1', 'houspa', 'indrob1', 'comtai1', 'grynig2', 'rufwoo2', 'yebbul3', 'indpit1', 'gyhcaf1', 'ruftre2', 'wynlau1', 'inpher1', 'comkin1', 'comior1', 'tibfly3', 'pomgrp2', 'oripip1', 'indtit1', 'nutman', 'junmyn1', 'rutfly6', 'goflea1', 'litegr', 'lesyel1', 'lewduc1', 'inbrob1', 'crfbar1', 'scamin3', 'shikra1', 'gryfra', 'commoo3', 'grewar3', 'brodro1', 'rocpig', 'categr', 'ingori1', 'plhpar1', 'sbeowl1', 'bwfshr1', 'junowl1', 'orihob2', 'greegr', 'barswa', 'paisto1', 'moipig1', 'plapri1', 'forwag1', 'maghor2', 'brasta1', 'lirplo', 'grecou1', 'kenplo1', 'bkcbul1', 'grbeat1', 'junbab2', 'comsan', 'whbtre1', 'brnhao1', 'brcful1', 'whcbar1', 'hoopoe', 'plaflo1', 'maltro1', 'piekin1', 'brnshr', 'whiter2', 'brfowl1', 'pursun4', 'grehor1', 'pursun3', 'purswa3', 'yebbab1', 'lblwar1', 'malwoo1', 'laudov1', 'grenig1', 'tilwar1', 'heswoo1', 'compea', 'putbab1', 'smamin1', 'rorpar', 'graher1', 'ashpri1', 'piebus1', 'grnwar1', 'eurbla2', 'asikoe2', 'whbwat1', 'sqtbul1', 'brwowl1', '

In [50]:
# Show the concrete class of the encoder object
print(type(encoder))

<class 'sklearn.preprocessing._label.LabelEncoder'>


In [41]:
# Release the temporary frames and lists; the fitted `encoder` is kept for the Dataset below

del raw_df,labels_all,labels_encoded,label_mapping

In [46]:
def read_audio(path: str):
    """
    Load an audio file from disk with torchaudio.

    Parameters:
        path: location of the audio file (this notebook feeds it .ogg recordings)

    Returns:
        The (waveform, sample_rate) pair exactly as produced by torchaudio.load
    """
    return torchaudio.load(path)

In [47]:
def audio_info(audio: torch.Tensor, sample_rate: int):
    """
    Derive basic facts about a loaded waveform from its shape.

    Parameters:
        audio: waveform tensor; dimension 0 is read as channels, dimension 1 as samples
        sample_rate: number of samples per second

    Return:
        duration_seconds: number of samples divided by the sample rate
        num_channels: size of dimension 0
    """
    num_channels, num_samples = audio.shape[0], audio.shape[1]

    # samples / (samples per second) = seconds
    return num_samples / sample_rate, num_channels

In [48]:
## Convert audio data into mel spectrogram


def mel_transform(sample_rate:float,audio:torch.Tensor,window_size: float=0.04,hop_size:float=0.02,n_mels:int=40)->torch.Tensor:
    """
    Transform audio data into a mel spectrogram.

    Parameters:
        sample_rate: sampling rate of `audio` in Hz
        audio: waveform tensor, passed unchanged to the MelSpectrogram transform
        window_size: FFT window length in seconds
        hop_size: hop between consecutive frames in seconds
        n_mels: number of mel filter banks

    Returns:
        the tensor produced by torchaudio's MelSpectrogram for `audio`
    """
    # Window and hop are given in seconds; the transform expects sample counts
    n_fft = int(window_size * sample_rate)
    hop_length = int(hop_size * sample_rate)

    # The upper band edge must not exceed the Nyquist frequency, otherwise the
    # highest mel filters cover no FFT bin and stay empty. For the 32 kHz audio
    # used in this notebook the value is still 16000.
    f_max = min(16000, sample_rate / 2)

    mel_transformer = MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        f_min=0,
        f_max=f_max
    )

    return mel_transformer(audio)



In [62]:
class BirdclefDataset(Dataset):
    """Dataset over the clip-level metadata CSV that yields (encoded label, mel spectrogram) pairs."""

    def __init__(self,
                 encoder:LabelEncoder,
                 audio_dir:str='../../data/train_audio',
                 labels_path:str=None,
                 ):
        """
        Parameters:
            encoder: label encoder; its transform() is called for every item, so it
                must already be fitted when items are requested
            audio_dir: the parent path where all audio files stored
            labels_path: the file including all corresponding labels; it must provide
                the 'filename', 'primary_label' and 'clip_start_time' columns
        """

        super().__init__()
        self.encoder=encoder
        self.audio_dir=audio_dir
        # read data into dataframe
        self.labels_df=pd.read_csv(labels_path,header=0)


    def get_audio_path(self,index) -> str:
        '''
        Get the audio path of the corresponding index through the provided train metadata csv file.
        Since there is only one index, only one path will be returned.

        Parameters:
            index: the index of labels metadata file

        Return:
            the single audio path string
        '''
        # Get the child path of audio from labels_df
        audio_child_path=self.labels_df['filename'].iloc[index]

        # concatenate parent path and child path
        return os.path.join(self.audio_dir,audio_child_path)


    def get_audio_label(self,index)->str:
        '''
        According to the provided index, get the corresponding label from the train metadata file

        Parameters:
            index: the index of labels metadata file
        '''

        return self.labels_df['primary_label'].iloc[index]


    def target_clip(self,index:int,audio:torch.Tensor,sample_rate:int, duration_seconds:float,
                    clip_seconds:float=5)->torch.Tensor:
        """
        Cut the fixed-length clip that the metadata row `index` refers to out of `audio`.

        The clip starts at the row's 'clip_start_time' (seconds). When the recording ends
        before the clip is complete, the remainder is filled with zeros (silence), so the
        result always has clip_seconds * sample_rate samples.

        Parameters:
            index: the index of labels metadata file
            audio: the raw audio in tensor [num_channels,length]
            sample_rate: audio sampling rate
            duration_seconds: audio duration in seconds
            clip_seconds: clip length in seconds (defaults to the original 5 seconds)

        Return:
            clip tensor of shape [num_channels, clip_seconds * sample_rate]

        Raises:
            ValueError: the start time is negative, NaN or beyond the end of the audio
        """
        # Get the audio start time corresponding to index
        clip_start_time=self.labels_df['clip_start_time'].iloc[index]

        # A negative start would silently index from the end of the tensor and NaN
        # cannot be converted to a sample index, so both are rejected together with
        # start times past the end of the recording.
        if not (0 <= clip_start_time <= duration_seconds):
            raise ValueError('The clip start time is out of raw audio length')

        # Slice bounds must be plain ints; the CSV column may be parsed as float
        segment_duration=int(round(clip_seconds*sample_rate))
        clip_start_point=int(round(clip_start_time*sample_rate))

        # Slicing clamps at the end of the tensor, so this is short for the last clip
        clip=audio[:, clip_start_point:clip_start_point+segment_duration]

        padding_length=segment_duration-clip.shape[1]
        if padding_length>0:
            # Silence with the same dtype/device as the audio keeps torch.cat consistent
            silence=audio.new_zeros((audio.shape[0], padding_length))
            clip=torch.cat((clip,silence),dim=1)

        return clip


    def __len__(self):
        #  return the size of the dataset by many Sampler implementations and the default options of DataLoader.

        return len(self.labels_df)

    def __getitem__(self, index):
        # a_list[1] -> a_list.__getitem__(1)
        # Get the path of a single audio file
        single_audio_dir=self.get_audio_path(index)
        # Encode the string label; transform() works on sequences, hence the
        # one-element list and the [0]
        audio_label=self.encoder.transform([self.get_audio_label(index)])[0]

        # Read audio array according to path
        audio, sr=read_audio(single_audio_dir)

        # Only the duration is needed here; the channel count is ignored
        duration_seconds, _num_channels=audio_info(audio,sample_rate=sr)

        # Get the audio clip corresponding to index
        clip=self.target_clip(index,audio,sample_rate=sr, duration_seconds=duration_seconds)

        # mel spectrogram transformation
        mel_spec=mel_transform(sample_rate=sr,audio=clip)

        return audio_label, mel_spec

In [63]:
# Rebuild the dataset with the fitted encoder so that labels come back as integer codes
BD=BirdclefDataset(encoder=encoder,labels_path=labels_path)

dataloader = DataLoader(dataset=BD, batch_size=128, shuffle=True, num_workers=0)

# Fetch one batch and inspect the collated labels and spectrograms
batch = next(iter(dataloader))
labels, mel_specs = batch
print(labels)
print(mel_specs)
print(mel_specs.shape)

tensor([ 62,  55,  62, 139,  36,  35,  57,  41,  45, 153,  80,  15,  74, 164,
        100, 105, 127,  82,  43,  43, 105,  64,  14,  20, 139, 181,  20, 136,
         44, 166,  57, 122, 138,  41,  80,  18,  41,  41,  20, 109,  58,  38,
        181,  10,  62,  70,  80,  20,  62,  13,  70,  45, 105, 143,  43,  20,
         55,  90,  20,  80,  41,  55,  15, 161,  38, 115,  10, 171,  20,  40,
        105,  32,  80, 137, 136,  78,  98,  44,  72,  57, 132, 167,  70,  55,
         20,  20, 168,  69,  10, 129,  53,  78, 106,  62,  10, 129,   9,  10,
        181, 100,  57,  20,  44,  40,  73, 107,  53,  62,  20, 166,  33,  18,
         44, 101, 138,  70,  98, 158,  57,  70,  82, 141,  20, 161, 177,   9,
         40, 138])
tensor([[[[3.4798e-02, 7.2094e-07, 1.3548e-06,  ..., 4.9665e-07,
           1.7406e-07, 6.0493e-05],
          [3.7928e-02, 7.6341e-06, 9.7611e-06,  ..., 5.5148e-06,
           2.5833e-06, 9.8878e-05],
          [4.1640e-02, 3.9300e-05, 1.4848e-04,  ..., 8.8981e-05,
           1

In [64]:
# Release the encoder and the batch objects; the encoder is re-created and re-fitted in the next section
del encoder, batch, dataloader,labels,mel_specs

### Then I need to normalize a single clip

In [65]:
# Initialize a fresh label encoder (the previous one was deleted above); it is not fitted yet

encoder=LabelEncoder()

In [66]:
# Collect every distinct primary label from the metadata CSV
raw_df=pd.read_csv(labels_path,header=0)

labels_all=raw_df.primary_label.unique().tolist()

print(labels_all)
print(len(labels_all))

# Fit the encoder on the distinct labels and encode them in one step
labels_encoded=encoder.fit_transform(labels_all)

print(labels_encoded)

# Mapping of original labels to their integer encodings, for inspection
label_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print(label_mapping)

['malpar1', 'litgre1', 'houspa', 'indrob1', 'comtai1', 'grynig2', 'rufwoo2', 'yebbul3', 'indpit1', 'gyhcaf1', 'ruftre2', 'wynlau1', 'inpher1', 'comkin1', 'comior1', 'tibfly3', 'pomgrp2', 'oripip1', 'indtit1', 'nutman', 'junmyn1', 'rutfly6', 'goflea1', 'litegr', 'lesyel1', 'lewduc1', 'inbrob1', 'crfbar1', 'scamin3', 'shikra1', 'gryfra', 'commoo3', 'grewar3', 'brodro1', 'rocpig', 'categr', 'ingori1', 'plhpar1', 'sbeowl1', 'bwfshr1', 'junowl1', 'orihob2', 'greegr', 'barswa', 'paisto1', 'moipig1', 'plapri1', 'forwag1', 'maghor2', 'brasta1', 'lirplo', 'grecou1', 'kenplo1', 'bkcbul1', 'grbeat1', 'junbab2', 'comsan', 'whbtre1', 'brnhao1', 'brcful1', 'whcbar1', 'hoopoe', 'plaflo1', 'maltro1', 'piekin1', 'brnshr', 'whiter2', 'brfowl1', 'pursun4', 'grehor1', 'pursun3', 'purswa3', 'yebbab1', 'lblwar1', 'malwoo1', 'laudov1', 'grenig1', 'tilwar1', 'heswoo1', 'compea', 'putbab1', 'smamin1', 'rorpar', 'graher1', 'ashpri1', 'piebus1', 'grnwar1', 'eurbla2', 'asikoe2', 'whbwat1', 'sqtbul1', 'brwowl1', '

In [67]:
def read_audio(path: str):
    """
    Load an audio file (e.g. .ogg) from disk with torchaudio.

    Parameters:
        path: Location of the audio file to load

    Returns:
        A `(waveform, sample_rate)` pair exactly as produced by
        `torchaudio.load`: the waveform tensor and its sample rate.
    """
    waveform, rate = torchaudio.load(path)
    return waveform, rate

In [68]:
def audio_info(audio: torch.Tensor, sample_rate: int):
    """
    Summarize a waveform loaded by torchaudio.

    Parameters:
        audio: Waveform tensor; dimension 0 is read as the channel count and
            dimension 1 as the number of samples
        sample_rate: Sample rate of the audio file

    Return:
        duration_seconds: Number of samples divided by the sample rate
        num_channels: Size of the first dimension of `audio`
    """
    shape = audio.shape

    # Channel count comes from the leading dimension
    channels = shape[0]

    # Length in seconds = samples along dimension 1 / samples per second
    seconds = shape[1] / sample_rate

    return seconds, channels

In [69]:
## Convert audio data into mel spectrogram


def mel_transform(sample_rate:float,audio:torch.Tensor,window_size: float=0.04,hop_size:float=0.02,n_mels:int=40)->torch.Tensor:
    """
    Transform audio data into a mel spectrogram.

    Parameters:
        sample_rate: sample rate of `audio` in Hz
        audio: waveform tensor handed to torchaudio's MelSpectrogram
        window_size: FFT window length in seconds (converted to samples)
        hop_size: hop between consecutive frames in seconds (converted to samples)
        n_mels: number of mel filterbanks

    Returns:
        melspec: the output of the MelSpectrogram transform applied to `audio`
    """
    # Convert the window / hop durations from seconds to whole samples
    n_fft = int(window_size * sample_rate)
    hop_length = int(hop_size * sample_rate)

    # Frequencies above sample_rate / 2 (Nyquist) do not exist in the STFT, so
    # mel filters placed there would be empty. Cap the upper edge at Nyquist;
    # for sample rates of 32 kHz and above this is still 16 kHz as before.
    f_max = min(16000.0, sample_rate / 2)

    mel_transformer = MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        f_min=0,
        f_max=f_max
    )

    melspec=mel_transformer(audio)

    return melspec



In [73]:
class BirdclefDataset(Dataset):
    """
    Torch Dataset over the train metadata csv.

    Each item is the clip referenced by one metadata row: the audio file is
    loaded, the fixed-length clip starting at the row's `clip_start_time` is cut
    out (zero-padded when the file ends early), standardized per channel and
    converted to a mel spectrogram. The item also carries the encoded
    `primary_label` of the row.
    """

    # Lower bound applied to the per-channel std so that a constant clip
    # (e.g. pure silence or pure padding) standardizes to zeros, not NaN/inf.
    _STD_EPS = 1e-8

    def __init__(self,
                 encoder:"LabelEncoder",
                 audio_dir:str='../../data/train_audio',
                 labels_path:str=None,
                 clip_seconds:float=5,
                 ):
        """
        Parameters:
            encoder: label encoder; only its `transform` method is used
            audio_dir: the parent path where all audio files stored
            labels_path: the file including all corresponding labels
            clip_seconds: length in seconds of every clip cut by `target_clip`

        Raises:
            ValueError: if `labels_path` is not provided
        """

        super().__init__()
        if labels_path is None:
            raise ValueError('labels_path must be provided')

        self.encoder=encoder
        self.audio_dir=audio_dir
        self.clip_seconds=clip_seconds
        # read data into dataframe
        self.labels_df=pd.read_csv(labels_path,header=0)


    def get_audio_path(self,index) -> str:
        '''
        Get the audio path of the corresponding index through the provided train metadata csv file.
        Since there is only one index, only one path will be returned.

        Parameters:
            index: the index of labels metadata file

        Return:
            the single audio path string
        '''
        # Get the child path of audio from labels_df
        audio_child_path=self.labels_df['filename'].iloc[index]

        # concatenate parent path and child path
        return os.path.join(self.audio_dir,audio_child_path)


    def get_audio_label(self,index)->str:
        '''
        According to the provided index, get the corresponding label from the train metadata file

        Parameters:
            index: the index of labels metadata file

        Return:
            the value of the `primary_label` column at that row
        '''

        return self.labels_df['primary_label'].iloc[index]


    def target_clip(self,index:int,audio:torch.Tensor,sample_rate:int, duration_seconds:float)->torch.Tensor:
        """
        Cut the clip that belongs to `index` out of the raw audio and standardize it.

        The start of the clip is read from the `clip_start_time` column of the
        train metadata csv. When the raw audio ends before the clip does, the
        clip is right-padded with zeros; the padding is part of the clip when
        the mean and std are computed.

        Parameters:
            index: the index of labels metadata file
            audio: the raw audio in tensor [num_channels,length]
            sample_rate: audio sampling rate
            duration_seconds: audio duration in seconds

        Return:
            the clip with `clip_seconds * sample_rate` samples per channel,
            standardized per channel (zero mean, unit std)

        Raises:
            ValueError: if the clip start time is negative, NaN or after the
                end of the raw audio
        """
        # Get the audio start time corresponding to index
        clip_start_time=self.labels_df['clip_start_time'].iloc[index]

        # Written as a negated range check so that NaN is rejected as well
        if not (0 <= clip_start_time <= duration_seconds):
            raise ValueError('The clip start time is out of raw audio length')

        # Clip length and start position in samples; forced to int because the
        # metadata column may be stored as float and slices need integers
        segment_duration = int(round(self.clip_seconds * sample_rate))
        clip_start_point = int(round(clip_start_time * sample_rate))

        # Slicing past the end simply returns the samples that exist
        clip = audio[:, clip_start_point:clip_start_point + segment_duration]

        # For the last clip the raw audio may not be long enough:
        # fill the missing tail with silence of the same dtype / device
        padding_length = segment_duration - clip.shape[1]
        if padding_length > 0:
            silence = audio.new_zeros(audio.shape[0], padding_length)
            clip = torch.cat((clip, silence), dim=1)

        # mean and std per channel
        mean_vals = clip.mean(dim=1, keepdim=True)
        std_vals = clip.std(dim=1, keepdim=True).clamp_min(self._STD_EPS)

        # normalization
        return (clip - mean_vals) / std_vals


    def __len__(self):
        #  return the size of the dataset by many Sampler implementations and the default options of DataLoader.

        return len(self.labels_df)

    def __getitem__(self, index):
        """
        Build the training item for one metadata row.

        Return:
            audio_label: the encoded primary label of the row
            mel_spec: the mel spectrogram of the standardized clip
        """
        # a_list[1] -> a_list.__getitem__(1)

        single_audio_dir=self.get_audio_path(index)

        # transform works on sequences, so wrap the single label and unwrap the result
        audio_label=self.encoder.transform([self.get_audio_label(index)])[0]

        audio, sr=read_audio(single_audio_dir)

        # only the duration is needed here; the channel count is ignored
        duration_seconds, _=audio_info(audio,sample_rate=sr)

        clip=self.target_clip(index,audio,sample_rate=sr, duration_seconds=duration_seconds)

        mel_spec=mel_transform(sample_rate=sr,audio=clip)

        return audio_label, mel_spec

In [74]:
# Build the dataset from the label encoder and the train metadata file
BD = BirdclefDataset(labels_path=labels_path, encoder=encoder)

# Single-process loader that draws shuffled batches of 128 items
dataloader = DataLoader(
    BD,
    batch_size=128,
    shuffle=True,
    num_workers=0,
)

# Pull one batch and inspect the encoded labels and the mel spectrograms
batch = next(iter(dataloader))
labels, mel_specs = batch

print(labels)
print(mel_specs)
print(mel_specs.shape)

tensor([ 66, 171,   2,  70,  65, 100,  82,  82, 116, 129, 136,  41, 127,  61,
         50,  20,  65,  73,  82, 129,  78,  39,  37,  73,  66, 146, 122,  20,
        107,  20,  73,  55,  82,  71,  57,  47, 174, 136, 156,  43,  41,  53,
         86, 105, 134,  71,  76,  97,  39,  19,  20,  20,  37, 143, 126,  76,
         20,  82,  69,  44,  71,  41,  20,  20,  55,  20,  28,  22,  59,  41,
         45,   0,  70, 129,  35, 146, 139, 135,  70, 161,  70,  38,  20,  82,
         56,  38,  70,  64, 155,   0,  57,  80,  81,  71,  60,  73,  28,  20,
         62, 107,  14,  43,   9,  62, 172,  62, 181,  39,  50,  43,  44,  20,
         70, 168,  10,  21,  37,  73,  45, 160,  82, 136,  14, 105, 100,  82,
         55,  70])
tensor([[[[5.5261e+02, 2.1294e+03, 2.0573e+03,  ..., 2.6892e-03,
           2.6892e-03, 2.6892e-03],
          [1.9958e+03, 1.0570e+03, 4.1430e+02,  ..., 9.7440e-19,
           9.7440e-19, 9.7440e-19],
          [2.3729e+03, 4.7244e+02, 8.3954e+02,  ..., 4.7130e-19,
           4

In [75]:
# Remove the encoder and the batch-inspection objects from the notebook namespace
del encoder, batch, dataloader, labels, mel_specs