In [1]:
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
import random
from typing import List
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB

import torchaudio

import torch

from torch.utils.data import DataLoader,TensorDataset

import lightning as L

from datasets import Dataset

from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pred_dir = Path("../../data/predict")
pred_files = pred_dir.glob("*.ogg")

In [3]:
pred_files

<generator object Path.glob at 0x29c7a5770>

In [4]:
# for file in pred_files:
#     print(file)

# pred_files is a generator. If I run this step, the contents of the generator will be exhausted if I run the following step
# ds = Dataset.from_dict({"audio_path": list(map(str, list(pred_files)))})
# it will represents:
# Dataset({
#    features: ['audio_path'],
#    num_rows: 0
#})
# rows is 0 because the contents of the generator have been exhausted

In [5]:
ds = Dataset.from_dict({"audio_path": list(map(str, list(pred_files)))})

ds

Dataset({
    features: ['audio_path'],
    num_rows: 2
})

In [6]:
print(ds['audio_path'][0])

print(type(ds['audio_path'][0]))

../../data/predict/1000170626.ogg
<class 'str'>


In [7]:
def read_audio(batch):
    """
    Read an OGG file using torchaudio and return the waveform tensor and sample rate.

    Parameters:
        path: Path to the .ogg file

    Returns:
        waveform: Tensor representing the waveform
        sample_rate: Sample rate of the audio file

    Because this function will be used in datasets.Dataset.map()
    So each received data is a batch
    For each batch
    I need to add a loop to process each data
    """
    audio_paths=batch['audio_path']

    audio_tensor=[]

    for audio_path in audio_paths:

        audio, sample_rate = torchaudio.load(audio_path)
        audio_tensor.append(audio)
    
    return {"audio": audio_tensor,"sr":[sample_rate]}

In [8]:
ds1 = ds.map(read_audio, batched=True, num_proc=8)

num_proc must be <= 2. Reducing num_proc to 2 for dataset of size 2.
Map (num_proc=2): 100%|██████████| 2/2 [00:02<00:00,  1.06s/ examples]


In [9]:
print(ds1)


Dataset({
    features: ['audio_path', 'audio', 'sr'],
    num_rows: 2
})


In [10]:
print(ds1['audio_path'][0])

../../data/predict/1000170626.ogg


### The above content is for demonstration. Since datasets.Dataset cannot expand the number of rows, we perform the following operations to slice all the data

In [11]:
pred_data='../../data/predict/1000170626.ogg'

In [12]:
def read_audio(path: str):
    """
    Read an OGG file using torchaudio and return the waveform tensor and sample rate.

    Parameters:
        path: Path to the .ogg file

    Returns:
        waveform: Tensor representing the waveform
        sample_rate: Sample rate of the audio file
    """
    audio, sample_rate = torchaudio.load(path)
    return audio, sample_rate

In [13]:
audio,sr=read_audio(path=pred_data)

In [14]:
print(audio)
print(audio.shape)
print(sr)
print('duration_sec:',audio.shape[1] / sr)
print('duration_min:',audio.shape[1] / sr/60)

tensor([[-0.0049, -0.0014, -0.0067,  ...,  0.0003,  0.0040, -0.0059]])
torch.Size([1, 7680000])
32000
duration_sec: 240.0
duration_min: 4.0


In [15]:
# Regarding the data of a single audio, some audio information needs to be paid attention to, such as audio duration and number of channels.


def audio_info(audio: torch.Tensor, sample_rate: int):
    """
    Grab all information of the input audio loaded by torchaudio.

    Parameters:
        audio: Tensor representing the waveform
        sample_rate: Sample rate of the audio file

    Return:
        duration_seconds: Duration of the audio in seconds
        num_channels: Number of audio channels
    """
    # The audio duration time (seconds)
    duration_seconds = audio.shape[1] / sample_rate

    # The number of channels
    num_channels = audio.shape[0]


    return duration_seconds, num_channels

In [16]:
duration_seconds,num_channels=audio_info(audio=audio,sample_rate=sr)

print(duration_seconds)
print(num_channels)

240.0
1


In [17]:
audio[0][:10]

tensor([-0.0049, -0.0014, -0.0067,  0.0002,  0.0012, -0.0006,  0.0008, -0.0017,
        -0.0006, -0.0019])

In [18]:
def split_audio(audio: torch.Tensor, segment_length:int):

    '''
    split raw audio tensor into multiple clips with 5 seconds long.

    Parameters:
        audio: the raw audio tensor
        segment_length: the audio length of each 5 seconds

    return:
        parts: list includes all clips
        end_time_list: the list of all clips' end time in seconds
    '''

    length_audio = audio.shape[1]
    parts = []
    # For example, if this is the first 5 seconds of audio, then the end time is 5. If it is 5-10 seconds, the end time is 10
    end_time_list=[]
    end_time=5
    for i in range(0, length_audio, segment_length):
        part = audio[0][i:i + segment_length]
        # if len(part) == segment_length:  # Ensure the fragment lengths are consistent
        parts.append(part)  #Store the raw bytes of audio data
        end_time_list.append(end_time)
        end_time+=5

        

    return parts,end_time_list

In [19]:
audio_clips,end_time_list=split_audio(audio=audio,segment_length=5*sr)

In [20]:
type(audio_clips)

list

In [21]:
audio_clips

[tensor([-0.0049, -0.0014, -0.0067,  ..., -0.0016,  0.0016,  0.0013]),
 tensor([ 0.0002,  0.0024,  0.0023,  ..., -0.0038, -0.0036, -0.0077]),
 tensor([-0.0013, -0.0011,  0.0041,  ..., -0.0044, -0.0034,  0.0003]),
 tensor([ 0.0014,  0.0066,  0.0043,  ...,  0.0010, -0.0002,  0.0038]),
 tensor([ 0.0033, -0.0029, -0.0031,  ...,  0.0055, -0.0036,  0.0076]),
 tensor([-0.0071,  0.0060, -0.0041,  ...,  0.0088,  0.0049,  0.0009]),
 tensor([-0.0033, -0.0064, -0.0062,  ..., -0.0013, -0.0036, -0.0024]),
 tensor([-0.0028, -0.0013,  0.0022,  ..., -0.0006,  0.0009,  0.0022]),
 tensor([ 0.0040,  0.0010, -0.0015,  ...,  0.0026, -0.0023, -0.0014]),
 tensor([-0.0016, -0.0017, -0.0025,  ...,  0.0119, -0.0084,  0.0077]),
 tensor([-0.0070, -0.0009,  0.0023,  ...,  0.0006,  0.0003, -0.0017]),
 tensor([ 4.5747e-04, -2.4891e-03,  1.6573e-03,  ..., -4.5676e-05,
          2.1456e-03, -1.3286e-03]),
 tensor([ 0.0033, -0.0034,  0.0033,  ..., -0.0002, -0.0105,  0.0027]),
 tensor([ 8.1047e-03,  2.1192e-03,  9.7564e-

In [22]:
len(audio_clips) # represent how many 5 second clips in the audio

48

In [23]:
print(audio_clips[0].shape)

torch.Size([160000])


In [24]:
print(end_time_list)

[5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150, 155, 160, 165, 170, 175, 180, 185, 190, 195, 200, 205, 210, 215, 220, 225, 230, 235, 240]


In [25]:
pred_dir = Path("../../data/predict")
pred_files = pred_dir.glob("*.ogg")

In [26]:
audio_clips_list=[]
clip_names_list=[]

for path in pred_files:
    # read audio as tensor
    audio,sr=read_audio(path=path)

    # get audio corresponding informatino
    duration_seconds,num_channels=audio_info(audio=audio,sample_rate=sr)

    # split audio into multi clips with 5 seconds
    audio_clips,end_time_list=split_audio(audio=audio,segment_length=5*sr)

    # generate each label name for each clip
    soundscape_id=path.stem
    clip_name=[f'soundscape_{soundscape_id}_{end_time}' for end_time in end_time_list]

    audio_clips_list.extend(audio_clips)
    
    clip_names_list.extend(clip_name)

    

In [27]:
len(audio_clips_list)

96

In [28]:
audio_clips_list

[tensor([-0.0049, -0.0014, -0.0067,  ..., -0.0016,  0.0016,  0.0013]),
 tensor([ 0.0002,  0.0024,  0.0023,  ..., -0.0038, -0.0036, -0.0077]),
 tensor([-0.0013, -0.0011,  0.0041,  ..., -0.0044, -0.0034,  0.0003]),
 tensor([ 0.0014,  0.0066,  0.0043,  ...,  0.0010, -0.0002,  0.0038]),
 tensor([ 0.0033, -0.0029, -0.0031,  ...,  0.0055, -0.0036,  0.0076]),
 tensor([-0.0071,  0.0060, -0.0041,  ...,  0.0088,  0.0049,  0.0009]),
 tensor([-0.0033, -0.0064, -0.0062,  ..., -0.0013, -0.0036, -0.0024]),
 tensor([-0.0028, -0.0013,  0.0022,  ..., -0.0006,  0.0009,  0.0022]),
 tensor([ 0.0040,  0.0010, -0.0015,  ...,  0.0026, -0.0023, -0.0014]),
 tensor([-0.0016, -0.0017, -0.0025,  ...,  0.0119, -0.0084,  0.0077]),
 tensor([-0.0070, -0.0009,  0.0023,  ...,  0.0006,  0.0003, -0.0017]),
 tensor([ 4.5747e-04, -2.4891e-03,  1.6573e-03,  ..., -4.5676e-05,
          2.1456e-03, -1.3286e-03]),
 tensor([ 0.0033, -0.0034,  0.0033,  ..., -0.0002, -0.0105,  0.0027]),
 tensor([ 8.1047e-03,  2.1192e-03,  9.7564e-

In [29]:
len(clip_names_list)

96

In [30]:
clip_names_list

['soundscape_1000170626_5',
 'soundscape_1000170626_10',
 'soundscape_1000170626_15',
 'soundscape_1000170626_20',
 'soundscape_1000170626_25',
 'soundscape_1000170626_30',
 'soundscape_1000170626_35',
 'soundscape_1000170626_40',
 'soundscape_1000170626_45',
 'soundscape_1000170626_50',
 'soundscape_1000170626_55',
 'soundscape_1000170626_60',
 'soundscape_1000170626_65',
 'soundscape_1000170626_70',
 'soundscape_1000170626_75',
 'soundscape_1000170626_80',
 'soundscape_1000170626_85',
 'soundscape_1000170626_90',
 'soundscape_1000170626_95',
 'soundscape_1000170626_100',
 'soundscape_1000170626_105',
 'soundscape_1000170626_110',
 'soundscape_1000170626_115',
 'soundscape_1000170626_120',
 'soundscape_1000170626_125',
 'soundscape_1000170626_130',
 'soundscape_1000170626_135',
 'soundscape_1000170626_140',
 'soundscape_1000170626_145',
 'soundscape_1000170626_150',
 'soundscape_1000170626_155',
 'soundscape_1000170626_160',
 'soundscape_1000170626_165',
 'soundscape_1000170626_170',


In [31]:
# create Dataset
dataset = Dataset.from_dict({'audio_clip': audio_clips_list})

In [32]:
dataset

Dataset({
    features: ['audio_clip'],
    num_rows: 96
})

In [33]:
dataset['audio_clip'][0]

[-0.00486694136634469,
 -0.001434282516129315,
 -0.006679195910692215,
 0.00018624569929670542,
 0.001159784384071827,
 -0.0006362766143865883,
 0.0007990088197402656,
 -0.0016613916959613562,
 -0.0005808754940517247,
 -0.0019451251719146967,
 -0.004689020570367575,
 0.00022642739349976182,
 0.0016787637723609805,
 -0.0009094381239265203,
 0.00279269739985466,
 -0.0004847948148380965,
 0.00037320313276723027,
 -0.001978378975763917,
 -0.0006876777624711394,
 -0.0004574874183163047,
 0.0014841874362900853,
 0.0008858710643835366,
 2.8857426514150575e-05,
 6.0924608987988904e-05,
 0.0006317903753370047,
 -0.0007048454717732966,
 0.0003596975584514439,
 0.00119151605758816,
 0.00016394871636293828,
 0.0009970649844035506,
 7.448722317349166e-05,
 0.0009197275503538549,
 -0.001915594795718789,
 7.07045619492419e-05,
 -0.00025813718093559146,
 -0.0012907410273328424,
 -0.00017707380175124854,
 0.0016095865285024047,
 0.0009580257465131581,
 0.0009247900452464819,
 -0.0015112601686269045,
 -

In [34]:
type(dataset['audio_clip'][0])

list

In [35]:
len(dataset['audio_clip'][0])

160000

In [36]:
## convert audio to mel spectrogram


def mel_transform(sample_rate:float,audio:torch.Tensor,window_size: float=0.04,hop_size:float=0.02,n_mels:int=40)->torch.Tensor:
    """
    transform audio data into mel sepctrogram
    """
    n_fft = int(window_size * sample_rate)  
    hop_length = int(hop_size * sample_rate)  

    mel_transformer = MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        f_min=0,
        f_max=16000
    )

    melspec=mel_transformer(audio)

    return melspec



In [37]:
mel_spec=mel_transform(sample_rate=sr,audio=audio_clips[0])

In [38]:
print(mel_spec.shape)
print(mel_spec)

torch.Size([40, 251])
tensor([[1.6236e+00, 9.6578e-01, 1.4815e+00,  ..., 1.0720e-01, 7.6143e-01,
         1.9525e-01],
        [1.1255e-01, 4.0416e-01, 4.7889e-01,  ..., 1.1456e-01, 3.0281e-01,
         3.2574e-02],
        [1.7619e-01, 6.7140e-02, 6.0950e-02,  ..., 8.4696e-02, 6.1022e-02,
         2.0605e-03],
        ...,
        [2.0022e+00, 3.3749e+00, 3.1411e+00,  ..., 3.1161e+00, 3.5394e+00,
         4.0921e+00],
        [2.5722e+00, 3.9314e+00, 4.9333e+00,  ..., 4.5915e+00, 4.2611e+00,
         3.3965e+00],
        [2.5862e+00, 3.9724e+00, 3.7711e+00,  ..., 4.2322e+00, 2.5727e+00,
         2.5983e+00]])


In [56]:
# We need to modify melspec so that it can accept batch as a function and use the map function

## Convert audio data into mel spectrogram


def mel_transform(batch):
    """
    transform audio data into mel sepctrogram
    """
    n_fft=int(0.04*32000)

    hop_length=int(0.02*32000)

    print('3333')

    # Calculate Mel spectrum
    n_mels = 40  # Number of Mel filters

    print('4444')

    # Set up the Mel Spectrogram converter
    mel_transformer = MelSpectrogram(
        sample_rate=32000,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        f_min=0,
        f_max=16000
    )

    audio_clip_batch=batch['audio_clip']

    melspec_list=[]

    for audio_clip in audio_clip_batch:
        print('1111')
        
        audio_clip=torch.tensor(audio_clip).unsqueeze(0)
        print(audio_clip.shape)
        print(type(audio_clip))

        melspec=mel_transformer(audio_clip)

        print('444')

        melspec_list.append(melspec)


    return {'audio_mel':melspec_list}



In [58]:
dataset_mel=dataset.map(mel_transform, batched=True)

Map:   0%|          | 0/96 [00:00<?, ? examples/s]

3333
4444
1111
torch.Size([1, 160000])
<class 'torch.Tensor'>
444
1111
torch.Size([1, 160000])
<class 'torch.Tensor'>
444
1111
torch.Size([1, 160000])
<class 'torch.Tensor'>
444
1111
torch.Size([1, 160000])
<class 'torch.Tensor'>
444
1111
torch.Size([1, 160000])
<class 'torch.Tensor'>
444
1111
torch.Size([1, 160000])
<class 'torch.Tensor'>
444
1111
torch.Size([1, 160000])
<class 'torch.Tensor'>
444
1111
torch.Size([1, 160000])
<class 'torch.Tensor'>
444
1111
torch.Size([1, 160000])
<class 'torch.Tensor'>
444
1111
torch.Size([1, 160000])
<class 'torch.Tensor'>
444
1111
torch.Size([1, 160000])
<class 'torch.Tensor'>
444
1111
torch.Size([1, 160000])
<class 'torch.Tensor'>
444
1111
torch.Size([1, 160000])
<class 'torch.Tensor'>
444
1111
torch.Size([1, 160000])
<class 'torch.Tensor'>
444
1111
torch.Size([1, 160000])
<class 'torch.Tensor'>
444
1111
torch.Size([1, 160000])
<class 'torch.Tensor'>
444
1111
torch.Size([1, 160000])
<class 'torch.Tensor'>
444
1111
torch.Size([1, 160000])
<class 't

Map: 100%|██████████| 96/96 [00:04<00:00, 22.30 examples/s]


In [None]:
audio_clip=torch.tensor(dataset['audio_clip'][0])

In [None]:
a=audio_clip.unsqueeze(0)


In [None]:
a.shape

torch.Size([1, 160000])

In [51]:
# Test that the function works properly
test_dataset = Dataset.from_dict({'audio_clip': [dataset['audio_clip'][0]]})  # Take a sample for testing
test_result = test_dataset.map(mel_transform, batched=True)
print(test_result)

Map: 100%|██████████| 1/1 [00:00<00:00, 21.21 examples/s]

3333
4444
1111
torch.Size([1, 160000])
<class 'torch.Tensor'>
444
Dataset({
    features: ['audio_clip', 'audio_mel'],
    num_rows: 1
})





In [59]:
dataset_mel

Dataset({
    features: ['audio_clip', 'audio_mel'],
    num_rows: 96
})

In [60]:
dataset_mel['audio_mel'][0]

[[[0.014445433393120766,
   0.03261750563979149,
   0.0795130729675293,
   0.014880393631756306,
   0.034302111715078354,
   0.0756247267127037,
   0.04478197917342186,
   0.008939237333834171,
   0.030673140659928322,
   0.022805506363511086,
   0.030401233583688736,
   0.04520334303379059,
   0.005300435237586498,
   0.0031909102108329535,
   0.018131444230675697,
   0.026072535663843155,
   0.03324717655777931,
   0.018786918371915817,
   0.018036698922514915,
   0.011405395343899727,
   0.02585269697010517,
   0.032170895487070084,
   0.0751618891954422,
   0.008046779781579971,
   0.00562533363699913,
   0.024004152044653893,
   0.004208015277981758,
   0.03738512098789215,
   0.059720542281866074,
   0.015256804414093494,
   0.020740851759910583,
   0.043094974011182785,
   0.04861636832356453,
   0.007634046487510204,
   0.017268121242523193,
   0.013295130804181099,
   0.012863477692008018,
   0.016939377412199974,
   0.06180068105459213,
   0.031573086977005005,
   0.024078454

In [62]:
torch.tensor(dataset_mel['audio_mel'][0]).shape

torch.Size([1, 40, 251])

In [63]:
len(dataset_mel)

96

In [67]:
dataset_mel_single=dataset_mel.remove_columns('audio_clip')

In [68]:
dataset_mel_single

Dataset({
    features: ['audio_mel'],
    num_rows: 96
})

In [69]:
from torch.utils.data import Dataset

In [71]:
class PredDataset(Dataset):
    def __init__(self,dataset):
        super().__init__()
        self.dataset=dataset

    def __len__(self):

        return len(self.dataset)
    
    def __getitem__(self, index):
        audio_melspec=self.dataset['audio_mel'][index]

        audio_tensor=torch.tensor(audio_melspec)

        return audio_tensor

In [72]:
# test

PD=PredDataset(dataset=dataset_mel)

dataloader = DataLoader(dataset=PD, batch_size=32, shuffle=False, num_workers=0)


batch = next(iter(dataloader))
mel_specs = batch
print(mel_specs)
print(mel_specs.shape)
print(type(mel_specs))

tensor([[[[1.4445e-02, 3.2618e-02, 7.9513e-02,  ..., 3.6232e-02,
           3.2288e-02, 2.0945e-02],
          [1.7908e-03, 1.9852e-02, 7.0106e-02,  ..., 7.7466e-03,
           2.4540e-02, 3.0045e-02],
          [1.4791e-02, 8.8226e-03, 3.0246e-02,  ..., 2.5429e-02,
           2.7441e-02, 1.4711e-02],
          ...,
          [6.2000e-02, 6.8676e-02, 6.3356e-02,  ..., 7.7837e-02,
           5.4044e-02, 6.0893e-02],
          [1.0289e-01, 1.1478e-01, 1.0321e-01,  ..., 1.4589e-01,
           9.4384e-02, 8.3586e-02],
          [1.9869e-01, 1.9208e-01, 1.3534e-01,  ..., 1.5763e-01,
           1.7445e-01, 1.2904e-01]]],


        [[[8.0196e-03, 7.4538e-03, 3.1931e-02,  ..., 4.7921e-03,
           3.5222e-02, 2.3430e-02],
          [1.7595e-02, 2.3124e-02, 3.1656e-02,  ..., 1.5919e-02,
           1.3917e-02, 9.6076e-02],
          [1.2621e-02, 1.5387e-02, 1.7095e-02,  ..., 2.6534e-02,
           1.0663e-02, 3.0125e-02],
          ...,
          [3.7748e-02, 6.0891e-02, 4.9411e-02,  ..., 6.96