In [1]:
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
import random
from typing import List
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB

import torchaudio

import torch

from torch.utils.data import DataLoader,TensorDataset

import lightning as L

from datasets import Dataset

from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory holding the soundscape recordings to run prediction on.
pred_dir = Path("../../data/predict")

# Path.glob returns a one-shot generator: after the first loop over it, any
# re-run of a cell that iterates `pred_files` would silently see no files.
# Materialising it as a sorted list makes it re-iterable and keeps the clip
# order deterministic across runs and platforms.
pred_files = sorted(pred_dir.glob("*.ogg"))

In [3]:
def read_audio(path: str):
    """
    Load an audio file from disk with torchaudio.

    Parameters:
        path: Location of the .ogg file to load

    Returns:
        waveform: Tensor holding the decoded waveform
        sample_rate: Sample rate reported by torchaudio for that file
    """
    waveform, rate = torchaudio.load(path)
    return waveform, rate

In [4]:
# For each audio file we need some basic metadata, such as its duration and its number of channels.


def audio_info(audio: torch.Tensor, sample_rate: int):
    """
    Summarise a waveform loaded by torchaudio.

    The tensor is read as (channels, samples): dimension 0 is reported as the
    channel count and dimension 1 is divided by the sample rate to obtain the
    duration.

    Parameters:
        audio: Tensor representing the waveform
        sample_rate: Sample rate of the audio file

    Return:
        duration_seconds: Duration of the audio in seconds
        num_channels: Number of audio channels
    """
    channel_count, sample_count = audio.shape[0], audio.shape[1]
    return sample_count / sample_rate, channel_count

In [5]:
def split_audio(audio: torch.Tensor, segment_length: int, clip_seconds: int = 5):
    '''
    Split a raw audio tensor into consecutive fixed-length clips.

    Only the first channel (``audio[0]``) is used. The final clip is kept even
    when fewer than ``segment_length`` samples remain, so it may be shorter
    than the others.

    Parameters:
        audio: the raw audio tensor, indexed as (channels, samples)
        segment_length: number of samples per clip (e.g. 5 * sample_rate for
            5-second clips); must be positive
        clip_seconds: duration in seconds that one clip represents; used only
            to label the clips with their end times. Defaults to 5.

    return:
        parts: list of 1-D tensors, one per clip, in chronological order
        end_time_list: end time in seconds of each clip, i.e.
            clip_seconds, 2 * clip_seconds, ...

    Raises:
        ValueError: if segment_length is not positive
    '''
    # A negative step would make range() empty and silently drop the whole
    # recording; zero would fail with a less helpful message.
    if segment_length <= 0:
        raise ValueError(
            f"segment_length must be a positive number of samples, got {segment_length}"
        )

    length_audio = audio.shape[1]
    parts = []
    end_time_list = []

    # Clips are numbered from 1 so that the first clip ends at `clip_seconds`,
    # the second at 2 * clip_seconds, and so on.
    for clip_number, start in enumerate(range(0, length_audio, segment_length), start=1):
        parts.append(audio[0][start:start + segment_length])
        end_time_list.append(clip_number * clip_seconds)

    return parts, end_time_list

In [6]:
# Accumulators shared across all soundscape files
audio_clips_list = []
clip_names_list = []

for path in pred_files:
    # Load the waveform together with its sample rate
    audio, sr = read_audio(path=path)

    # Basic metadata of the recording (length in seconds, channel count)
    duration_seconds, num_channels = audio_info(audio=audio, sample_rate=sr)

    # Cut the recording into consecutive clips of 5 seconds (5 * sr samples)
    audio_clips, end_time_list = split_audio(audio=audio, segment_length=5 * sr)

    # One name per clip: soundscape_<file stem>_<clip end time in seconds>
    soundscape_id = path.stem
    clip_name = [f'soundscape_{soundscape_id}_{end_time}' for end_time in end_time_list]

    audio_clips_list.extend(audio_clips)
    clip_names_list.extend(clip_name)

    

In [7]:
# Build a Dataset with a single 'audio_clip' column holding every clip
dataset = Dataset.from_dict(
    {'audio_clip': audio_clips_list}
)

In [13]:
# The mel-spectrogram function must accept a whole batch so that it can be applied with the dataset's map function (batched=True).


## Convert audio data into mel spectrogram


def mel_transform(batch):
    """
    Compute a mel spectrogram for every audio clip in a batch.

    Written for use with a batched ``map`` call: ``batch`` is indexed by
    column name and ``batch['audio_clip']`` is iterated one clip at a time.

    Parameters:
        batch: mapping whose 'audio_clip' entry is a sequence of clips; each
            clip is converted with ``torch.tensor`` before the transform

    Returns:
        dict with the single key 'audio_mel', holding one spectrogram per
        clip in the same order as the input
    """
    sample_rate = 32000

    # 0.04 s analysis window and 0.02 s hop, expressed in samples
    window_samples = int(0.04 * sample_rate)
    hop_samples = int(0.02 * sample_rate)

    # Mel filterbank: 40 bands spanning 0 Hz to 16 kHz
    to_mel = MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=window_samples,
        hop_length=hop_samples,
        n_mels=40,
        f_min=0,
        f_max=16000,
    )

    # Each clip gets a leading axis of size 1 before being transformed
    return {
        'audio_mel': [
            to_mel(torch.tensor(clip).unsqueeze(0))
            for clip in batch['audio_clip']
        ]
    }



In [14]:
# Apply mel_transform batch-wise; its 'audio_mel' output is stored alongside 'audio_clip'
dataset_mel = dataset.map(mel_transform, batched=True)

Map: 100%|██████████| 96/96 [00:04<00:00, 23.02 examples/s]


In [16]:
# Drop the name bound to the raw-clip dataset; the cells below work from dataset_mel.
# NOTE(review): after this, re-running the map cell needs the dataset-creation cell to be run again first.
del dataset

In [17]:
# Drop the raw 'audio_clip' column, leaving only the mel spectrograms
dataset_mel_single = dataset_mel.remove_columns('audio_clip')

In [18]:
from torch.utils.data import Dataset

In [19]:
class PredDataset(torch.utils.data.Dataset):
    """
    Map-style PyTorch dataset that serves mel spectrograms for prediction.

    Wraps an indexable dataset whose rows expose an 'audio_mel' entry and
    returns that entry as a tensor.

    The base class is spelled out as ``torch.utils.data.Dataset`` because the
    bare name ``Dataset`` is bound to two different classes in this notebook
    (``datasets.Dataset`` and ``torch.utils.data.Dataset``), depending on
    which cells have been executed.
    """

    def __init__(self, dataset):
        """
        Parameters:
            dataset: object supporting ``len(dataset)`` and
                ``dataset[i]['audio_mel']``. Passing a dataset that only
                contains the columns actually needed avoids decoding unused
                ones on every row access.
        """
        super().__init__()
        self.dataset = dataset

    def __len__(self):
        """Number of rows in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, index):
        """
        Return the mel spectrogram stored at ``index`` as a tensor.
        """
        # Fetch the row first, then the field: indexing the column first
        # (dataset['audio_mel'][index]) can load the entire column on every
        # call, which makes one pass over the data quadratic.
        audio_melspec = self.dataset[index]['audio_mel']
        return torch.tensor(audio_melspec)

In [20]:
# Smoke test: pull a single batch through a DataLoader and inspect it

PD = PredDataset(dataset=dataset_mel)

dataloader = DataLoader(
    dataset=PD,
    batch_size=32,
    shuffle=False,
    num_workers=0,
)

# The first batch is kept under both names
mel_specs = batch = next(iter(dataloader))

print(mel_specs)
print(mel_specs.shape)
print(type(mel_specs))

tensor([[[[1.4445e-02, 3.2618e-02, 7.9513e-02,  ..., 3.6232e-02,
           3.2288e-02, 2.0945e-02],
          [1.7908e-03, 1.9852e-02, 7.0106e-02,  ..., 7.7466e-03,
           2.4540e-02, 3.0045e-02],
          [1.4791e-02, 8.8226e-03, 3.0246e-02,  ..., 2.5429e-02,
           2.7441e-02, 1.4711e-02],
          ...,
          [6.2000e-02, 6.8676e-02, 6.3356e-02,  ..., 7.7837e-02,
           5.4044e-02, 6.0893e-02],
          [1.0289e-01, 1.1478e-01, 1.0321e-01,  ..., 1.4589e-01,
           9.4384e-02, 8.3586e-02],
          [1.9869e-01, 1.9208e-01, 1.3534e-01,  ..., 1.5763e-01,
           1.7445e-01, 1.2904e-01]]],


        [[[8.0196e-03, 7.4538e-03, 3.1931e-02,  ..., 4.7921e-03,
           3.5222e-02, 2.3430e-02],
          [1.7595e-02, 2.3124e-02, 3.1656e-02,  ..., 1.5919e-02,
           1.3917e-02, 9.6076e-02],
          [1.2621e-02, 1.5387e-02, 1.7095e-02,  ..., 2.6534e-02,
           1.0663e-02, 3.0125e-02],
          ...,
          [3.7748e-02, 6.0891e-02, 4.9411e-02,  ..., 6.96