In [1]:
import numpy as np
import pandas as pd 
import os
import matplotlib.pyplot as plt
import librosa
import time
import torchaudio
import torch
from scipy import special
from scipy.io import wavfile
import torchaudio.transforms as T
import soundfile as sf
import torch.nn.functional as F



In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('../data/UrbanSound8K/metadata/UrbanSound8K.csv')
df.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [4]:
df['file_location'] = ['../data/UrbanSound8K/audio/' + '/original/fold' + str(df['fold'][i]) + '/' + df['slice_file_name'][i] for i in range(0, len(df['slice_file_name']))]

In [5]:
df.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class,file_location
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark,../data/UrbanSound8K/audio//original/fold5/100...
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing,../data/UrbanSound8K/audio//original/fold5/100...
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing,../data/UrbanSound8K/audio//original/fold5/100...
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing,../data/UrbanSound8K/audio//original/fold5/100...
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing,../data/UrbanSound8K/audio//original/fold5/100...


In [7]:
df[df['classID']!=3]

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class,file_location
1,100263-2-0-117.wav,100263,58.500000,62.500000,1,5,2,children_playing,../data/UrbanSound8K/audio//original/fold5/100...
2,100263-2-0-121.wav,100263,60.500000,64.500000,1,5,2,children_playing,../data/UrbanSound8K/audio//original/fold5/100...
3,100263-2-0-126.wav,100263,63.000000,67.000000,1,5,2,children_playing,../data/UrbanSound8K/audio//original/fold5/100...
4,100263-2-0-137.wav,100263,68.500000,72.500000,1,5,2,children_playing,../data/UrbanSound8K/audio//original/fold5/100...
5,100263-2-0-143.wav,100263,71.500000,75.500000,1,5,2,children_playing,../data/UrbanSound8K/audio//original/fold5/100...
...,...,...,...,...,...,...,...,...,...
8727,99812-1-2-0.wav,99812,159.522205,163.522205,2,7,1,car_horn,../data/UrbanSound8K/audio//original/fold7/998...
8728,99812-1-3-0.wav,99812,181.142431,183.284976,2,7,1,car_horn,../data/UrbanSound8K/audio//original/fold7/998...
8729,99812-1-4-0.wav,99812,242.691902,246.197885,2,7,1,car_horn,../data/UrbanSound8K/audio//original/fold7/998...
8730,99812-1-5-0.wav,99812,253.209850,255.741948,2,7,1,car_horn,../data/UrbanSound8K/audio//original/fold7/998...


In [6]:
y, sr = librosa.load(df['file_location'][0])

In [7]:
y.shape[0]/sr

0.31755102040816324

In [8]:
a, b = wavfile.read(df['file_location'][0])

In [9]:
a

44100

In [10]:
b.shape

(14004, 2)

In [11]:
def print_metadata(metadata, src=None):
    if src:
        print("-" * 10)
        print("Source:", src)
        print("-" * 10)
    print(" - sample_rate:", metadata.sample_rate)
    print(" - num_channels:", metadata.num_channels)
    print(" - num_frames:", metadata.num_frames)
    print(" - bits_per_sample:", metadata.bits_per_sample)
    print(" - encoding:", metadata.encoding)
    print()

In [12]:
metadata = torchaudio.info(df['file_location'][0])
print_metadata(metadata, src=df['file_location'][0])

----------
Source: ../data/UrbanSound8K/audio//original/fold5/100032-3-0-0.wav
----------
 - sample_rate: 44100
 - num_channels: 2
 - num_frames: 14004
 - bits_per_sample: 16
 - encoding: PCM_S



In [18]:
waveform, sample_rate = torchaudio.load(df['file_location'][0])
waveform.shape

torch.Size([2, 14004])

In [22]:
mel_specgram = T.MelSpectrogram(sample_rate)(waveform)  # (channel, n_mels, time)

In [23]:
mel_specgram.shape

torch.Size([2, 40, 71])

In [26]:
mfcc = T.MFCC(sample_rate, n_mfcc=40)(waveform)
mfcc.shape

torch.Size([2, 40, 71])

In [47]:
def get_feature(filepath):
    def postprocess(feats, sample_rate):
        if feats.ndim == 2:
            feats = feats.mean(-1)

        #assert feats.ndim() == 1, feats.ndim()

        with torch.no_grad():
            feats = F.layer_norm(feats, feats.shape)
        return feats

    wav, sample_rate = sf.read(filepath)
    feats = torch.from_numpy(wav).float()
    feats = postprocess(feats, sample_rate)
    return feats, sample_rate

In [59]:
features = get_feature(df['file_location'][0])
features.unsqueeze(0).shape

torch.Size([1, 14004])

In [67]:
def parse_audio(audio_path):
    
        """ Computes and returns the spectrogram of the given audio file."""
        y, sample_rate = get_feature(audio_path)
        D = librosa.stft(y, n_fft=1012, hop_length=512,
                         win_length=0.025)
        spect, phase = librosa.magphase(D)
        spect = np.log1p(spect)
        spect = torch.FloatTensor(spect)
        
    # Global normalization
        mean = spect.mean() 
        std = spect.std()
        spect.add_(-mean)
        spect.div_(std)

        return spect

In [68]:
parse_audio(df['file_location'][0])

ValueError: too many values to unpack (expected 2)