# Custom audio PyTorch dataset with PyTorch & torchaudio

**UrbanSound8K dataset (USD):**
https://urbansounddataset.weebly.com/urbansound8k.html



**torchaudio:**
https://pytorch.org/audio/stable/backend.html

**torchaudio.transforms:**
https://pytorch.org/audio/stable/transforms.html

In [None]:
# Download the UrbanSound8K archive (~5.6 GB) from Zenodo, extract it into the
# current working directory, then delete the archive.
!wget https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz -O urban8k.tgz
!tar -xzf urban8k.tgz
!rm urban8k.tgz

--2022-03-19 20:19:55--  https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz
Resolving zenodo.org (zenodo.org)... 137.138.76.77
Connecting to zenodo.org (zenodo.org)|137.138.76.77|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6023741708 (5.6G) [application/octet-stream]
Saving to: ‘urban8k.tgz’


2022-03-19 20:31:03 (8.61 MB/s) - ‘urban8k.tgz’ saved [6023741708/6023741708]



In [None]:
# Move the extracted dataset folder to the location the later cells read from.
# Assumes /content/drive/MyDrive/Afshari/Part2 already exists, so the folder
# ends up at .../Part2/UrbanSound8K.
!mv "/content/UrbanSound8K" "/content/drive/MyDrive/Afshari/Part2"

In [1]:
import os

import pandas as pd
import torch
import torchaudio
from torch.utils.data import Dataset, dataset

In [6]:
class UrbanSoundDataset(Dataset):
  """Map-style PyTorch dataset over UrbanSound8K-style audio clips.

  Indexing the dataset loads one clip with ``torchaudio.load``, resamples it
  to ``target_sample_rate`` when its native rate differs, averages multiple
  channels down to a single one, applies ``transformation`` (if any) and
  returns ``(signal, label)``.

  The annotations CSV is read by column position, not by column name; see the
  ``_*_COLUMN`` constants below.
  """

  # Column positions in the annotations CSV. In the UrbanSound8K metadata file
  # these are ``slice_file_name``, ``fold`` and ``classID`` respectively.
  _FILE_NAME_COLUMN = 0
  _FOLD_COLUMN = 5
  _LABEL_COLUMN = 6

  def __init__(self,annotations_file,audio_dir,transformation, target_sample_rate):
    """Read the annotations table and remember the processing settings.

    Args:
      annotations_file: CSV path (or anything ``pandas.read_csv`` accepts)
        with one row per audio clip.
      audio_dir: Directory containing the ``fold<N>`` sub-directories.
      transformation: Callable applied to the mono, resampled signal, or
        ``None`` to return the waveform itself.
      target_sample_rate: Sample rate every clip is resampled to.
    """
    self.annotations = pd.read_csv(annotations_file)
    self.audio_dir = audio_dir
    self.transformation = transformation
    self.target_sample_rate = target_sample_rate

  def __len__(self):
    """Return the number of rows in the annotations table."""
    return len(self.annotations)

  def __getitem__(self,index):
    """Load, normalise and transform the clip at row ``index``.

    Returns:
      A ``(signal, label)`` tuple, where ``signal`` is the output of
      ``transformation`` (or the mono waveform when it is ``None``) and
      ``label`` is the value stored in the label column of that row.
    """
    audio_sample_path = self._get_audio_sample_path(index)
    label = self._get_audio_sample_label(index)
    # torchaudio.load yields (num_channels, num_frames) plus the native rate.
    signal, sr = torchaudio.load(audio_sample_path)
    signal = self._resample_if_necessary(signal,sr)
    signal = self._mix_down_if_necessary(signal)
    # Allow a dataset without a transform so raw waveforms can be inspected.
    if self.transformation is not None:
      signal = self.transformation(signal)
    return signal,label

  def _resample_if_necessary(self,signal,sr):
    """Resample ``signal`` from ``sr`` to the target rate; no-op if equal."""
    if sr != self.target_sample_rate:
      resampler = torchaudio.transforms.Resample(sr,self.target_sample_rate)
      signal = resampler(signal)
    return signal

  def _mix_down_if_necessary(self,signal):
    """Average all channels into one when ``signal`` has more than one."""
    if signal.shape[0]>1:
      # keepdim keeps the channel axis, e.g. (2, 16000) -> (1, 16000).
      signal = torch.mean(signal, dim=0, keepdim = True)
    return signal

  def _get_audio_sample_path(self,index):
    """Build ``<audio_dir>/fold<N>/<file name>`` for row ``index``."""
    fold = f"fold{self.annotations.iloc[index,self._FOLD_COLUMN]}"
    file_name = self.annotations.iloc[index,self._FILE_NAME_COLUMN]
    return os.path.join(self.audio_dir,fold,file_name)

  def _get_audio_sample_label(self,index):
    """Return the label stored for row ``index``."""
    return self.annotations.iloc[index,self._LABEL_COLUMN]

if __name__ == "__main__":
  # Smoke test: build the dataset with a mel-spectrogram transform and inspect
  # the first item.

  DATASET_ROOT = "/content/drive/MyDrive/Afshari/Part2/UrbanSound8K"
  ANNOTATIONS_FILE = os.path.join(DATASET_ROOT, "metadata", "UrbanSound8K.csv")
  AUDIO_DIR = os.path.join(DATASET_ROOT, "audio")
  SAMPLE_RATE = 16000

  # The transform is told the same rate the dataset resamples every clip to.
  mel_spectrogram = torchaudio.transforms.MelSpectrogram(
      sample_rate=SAMPLE_RATE,
      n_fft=1024,
      hop_length=512,
      n_mels=64
  )
  usd = UrbanSoundDataset(ANNOTATIONS_FILE, AUDIO_DIR, mel_spectrogram, SAMPLE_RATE)

  print(f"There are {len(usd)} samples in the dataset.")

  signal, label = usd[0]

  print(f"label = {label}")
  # This line reports the tensor shape, so it must not be labelled as the label.
  print(f"signal shape = {signal.size()}")


There are 8732 samples in the dataset.
lable = 3
lable = torch.Size([1, 64, 10])


list