In [None]:
'''
This notebook explains:
A. How to extract audio features as MFCC / mel-spectrogram / ZCR (zero-crossing rate) values with the librosa library and save them either as a CSV file or as a NumPy array.
B. How to iterate over the UrbanSound8K audio dataset and convert each clip into MFCC values.
C. How to mix the UrbanSound8K audio clips with a noise signal and convert the result into MFCC values.
D. How to use the pyAudioAnalysis library to extract audio feature values.

Note: This notebook shows three different ways to get the MFCC values for a given audio clip:
a. Base MFCC features
b. Scaled MFCC features
c. MFCC features with padding

'''

In [228]:
%matplotlib inline
import glob
import math
import os
import random

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import ShortTermFeatures
from sklearn.utils import shuffle

In [6]:
#Extract MFCC
def extract_mfcc_features(audio, sample_rate):
    """Return the time-averaged MFCC vector of an audio signal.

    Computes 40 MFCCs with ``librosa.feature.mfcc`` and averages them over
    the frame axis, leaving one value per coefficient.

    Parameters
    ----------
    audio : np.ndarray
        Audio time series, passed to librosa as ``y``.
    sample_rate : int
        Sampling rate of ``audio``, passed to librosa as ``sr``.

    Returns
    -------
    np.ndarray or None
        Mean MFCC values, or ``None`` if extraction failed for any reason.
    """
    try:
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        # Transpose to (frames, coefficients) and average over frames.
        mfccsscaled = np.mean(mfccs.T, axis=0)
    except Exception as e:
        # Best-effort contract: report the failure and return None so the
        # caller can skip this clip. Only names that exist in this scope may
        # be used here, otherwise the handler itself raises NameError.
        print("Error encountered while extracting MFCC features: ", e)
        return None
    return mfccsscaled

In [None]:
def extract_mfcc_features_scaled(file_name):
    """Load an audio file and return its time-averaged MFCC vector.

    The file is loaded with ``librosa.load`` (``res_type='kaiser_fast'``),
    40 MFCCs are computed, and they are averaged over the frame axis.

    Parameters
    ----------
    file_name : str
        Path of the audio file to load.

    Returns
    -------
    np.ndarray or None
        Mean MFCC values, or ``None`` if the file could not be loaded or
        processed.
    """
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        # Transpose to (frames, coefficients) and average over frames.
        mfccsscaled = np.mean(mfccs.T, axis=0)
    except Exception as e:
        # Report the offending path (``file_name`` is the only file-related
        # name in scope) together with the underlying error, then signal
        # failure with None so the caller can skip the file.
        print("Error encountered while parsing file: ", file_name, "-", e)
        return None
    return mfccsscaled

In [311]:
# Fixed number of MFCC frames (columns) every returned feature matrix has.
max_pad_len = 174
def extract_mfcc_features_with_padding(audio, sample_rate):
    """Return a fixed-width MFCC matrix for an audio signal.

    Computes 40 MFCCs and forces the frame axis (axis 1) to exactly
    ``max_pad_len`` columns: shorter results are zero-padded on the right,
    longer results are cropped. A fixed width lets the per-clip matrices be
    stacked into one array.

    Parameters
    ----------
    audio : np.ndarray
        Audio time series, passed to librosa as ``y``.
        Assumes a mono signal so the MFCC result is 2-D -- TODO confirm.
    sample_rate : int
        Sampling rate of ``audio``, passed to librosa as ``sr``.

    Returns
    -------
    np.ndarray or None
        Matrix with ``max_pad_len`` columns, or ``None`` if extraction failed.
    """
    try:
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        n_frames = mfccs.shape[1]
        if n_frames > max_pad_len:
            # np.pad rejects negative widths, so over-long clips are cropped
            # instead of being dropped with an opaque error.
            mfccs = mfccs[:, :max_pad_len]
        else:
            pad_width = max_pad_len - n_frames
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
    except Exception as e:
        # Best-effort contract: report the real cause and return None.
        print("Error encountered while extracting padded MFCC features: ", e)
        return None

    return mfccs

In [275]:
def get_white_noise(signal, SNR):
    """Generate additive white Gaussian noise for a target SNR.

    The signal-to-noise ratio in decibels is
    ``SNR = 10*log10(P_signal / P_noise) = 20*log10(RMS_signal / RMS_noise)``,
    so the required noise RMS is ``RMS_signal / 10**(SNR / 20)``.

    Parameters
    ----------
    signal : np.ndarray
        Clean signal; the noise has ``signal.shape[0]`` samples.
    SNR : float
        Desired signal-to-noise ratio in dB.

    Returns
    -------
    np.ndarray
        Zero-mean Gaussian noise whose standard deviation equals the
        required noise RMS.
    """
    # RMS value of the signal.
    rms_signal = math.sqrt(np.mean(signal**2))
    # Required RMS of the noise (amplitude ratio, hence the /20 with no
    # extra square root).
    rms_noise = rms_signal / (10 ** (SNR / 20))
    # For zero-mean noise the standard deviation equals the RMS; with many
    # samples the population formula is an adequate approximation.
    std_noise = rms_noise
    noise = np.random.normal(0, std_noise, signal.shape[0])
    return noise

def get_noise_from_sound(signal, noise, SNR):
    """Rescale a noise clip so that mixing it with ``signal`` gives ``SNR`` dB.

    With ``SNR = 20*log10(RMS_signal / RMS_noise)``, the target noise RMS is
    ``RMS_signal / 10**(SNR / 20)``; the clip is multiplied by the ratio of
    the target RMS to its current RMS.

    Parameters
    ----------
    signal : np.ndarray
        Clean signal the noise will be added to.
    noise : np.ndarray
        Noise samples to rescale (not modified in place).
    SNR : float
        Desired signal-to-noise ratio in dB.

    Returns
    -------
    np.ndarray or None
        The rescaled noise, or ``None`` if it could not be computed (for
        example when the noise clip is silent).
    """
    try:
        rms_signal = math.sqrt(np.mean(signal**2))
        # Required RMS of the noise for the requested SNR.
        rms_noise_target = rms_signal / (10 ** (SNR / 20))

        # Current RMS of the noise clip.
        rms_noise_current = math.sqrt(np.mean(noise**2))
        if rms_noise_current == 0:
            # A silent clip cannot be scaled to a non-zero level.
            raise ValueError("noise clip is silent (RMS is 0)")
        scaled_noise = noise * (rms_noise_target / rms_noise_current)

    except Exception as e:
        # Best-effort contract: report the cause and let the caller skip.
        print("Error encountered while scaling noise: ", e)
        return None

    return scaled_noise

In [None]:
# Extract librosa audio features (MFCC, mel spectrogram, zero-crossing rate)
# for one audio file and collect them in a one-row DataFrame.
features_df = pd.DataFrame()
audio_file = '101415-3-0-2-gun-shot.wav'
audio, sample_rate = librosa.load(audio_file, sr=22050, res_type='kaiser_fast')

# Each feature is averaged over the frame axis and stored as a one-row frame.
mfcc_df = pd.DataFrame([np.mean(librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40).T, axis=0)])
mel_df = pd.DataFrame([np.mean(librosa.feature.melspectrogram(y=audio, sr=sample_rate).T, axis=0)])
# Mean zero-crossing rate; kept as its own frame and not merged below.
zcr = pd.DataFrame([np.mean(librosa.feature.zero_crossing_rate(y= audio))])

# Combine the frames defined in this cell (mfcc_df / mel_df). Prefixing the
# integer column labels keeps them unique in the combined frame, since both
# frames otherwise start their columns at 0.
wrkng_df1 = pd.concat([mfcc_df.add_prefix('mfcc_'), mel_df.add_prefix('mel_')], axis=1)
features_df = pd.concat([features_df, wrkng_df1], ignore_index=True)

In [None]:
# Extract short-term audio features for one audio file with pyAudioAnalysis.
audio_file = '101415-3-0-2-gun-shot.wav'
audio, sample_rate = librosa.load(
    audio_file,
    sr=22050,
    res_type='kaiser_fast',
)

# The last two arguments are 0.050 s and 0.025 s expressed in samples
# (presumably the analysis window and step -- confirm against the
# pyAudioAnalysis signature).
aud_feature, f_name = ShortTermFeatures.feature_extraction(
    audio,
    sample_rate,
    0.050*sample_rate,
    0.025*sample_rate,
)
# Average over the first axis of the transposed feature matrix.
aud_feature_scaled = np.mean(np.transpose(aud_feature), axis=0)

In [None]:
# Convert every clip of the Urban audio dataset into a mean-MFCC feature
# vector, then save the result as CSV and as NumPy arrays.
parent_dir = '/UrbanSound8K.tar/UrbanSound8K/audio'
sub_dirs= ['fold1', 'fold2', 'fold3', 'fold4', 'fold5', 'fold6', 'fold7', 'fold8', 'fold9', 'fold10']
file_ext = "*.wav"

features= []
n = 0
for sub_dir in sub_dirs:
    print('Processing folder: ', sub_dir)
    # glob returns files in arbitrary order; sort for a reproducible row order.
    for fn in sorted(glob.glob(os.path.join(parent_dir, sub_dir, file_ext))):
        n = n+1
        if n%1000 == 0:
            print('processing reocrd: ', n)

        signal,sr = librosa.load(fn)
        mfcc = extract_mfcc_features(signal, sr)
        if mfcc is None:
            # Extraction failed (already reported by the helper). Skipping the
            # clip keeps every stored feature the same shape, so X below
            # stacks into a regular numeric array instead of a ragged one.
            continue
        class_label = "original"
        features.append([mfcc, class_label])


featuresdf = pd.DataFrame(features, columns=['feature','class_label'])

#Save into CSV file
# Each feature array is written as its text representation; the .npy files
# below are the lossless copy.
featuresdf.to_csv (r'D:\audio_dataframe.csv', index = False, header=True)

#Save into Numpy array
X = np.array(featuresdf.feature.tolist())
y = np.array(featuresdf.class_label.tolist())

np.save('result_np_array_Y1_SNR10' , y)
np.save('result_np_array_X1_SNR10' , X)

In [None]:
# Mix every Urban audio clip with a randomly chosen noise source (synthetic
# white noise or one of the recorded noise files) at 10 dB SNR, convert the
# mixture into a padded MFCC matrix, and save the result.

hop_length = 512
n_fft = 2048

#image_parent_dir = 'D:/Abhishek/Machine Learning Models/Audio Data Analysis/UrbanSound8K.tar/UrbanSound8K/images/'
parent_dir = 'D:/Abhishek/Machine Learning Models/Audio Data Analysis/UrbanSound8K.tar/UrbanSound8K/audio'
sub_dirs= ['fold1', 'fold2', 'fold3', 'fold4', 'fold5', 'fold6', 'fold7', 'fold8', 'fold9', 'fold10']
file_ext = "*.wav"

# 'white_noise' is a marker for generated noise; the other entries are files.
noise_files = ['white_noise', 'Noise_09.wav', 'Noise_Hit_01.wav', 'Tape_Noise_02.wav', 'Perc_Hit_06.wav']

noise_features= []
# Recorded noise clips keyed by (file, sample rate), so each file is read
# from disk once instead of once per audio clip.
noise_cache = {}
n = 0

for sub_dir in sub_dirs:
    print('Processing folder: ', sub_dir)
    # glob returns files in arbitrary order; sort for a stable processing order.
    for fn in sorted(glob.glob(os.path.join(parent_dir, sub_dir, file_ext))):
        n = n+1

        if n%1000 == 0:
            print('processing reocrd: ', n)

        signal,sr = librosa.load(fn, res_type='kaiser_fast')

        # Every clip is processed independently; whether a previous clip
        # failed must not decide if this one is handled.
        noise_file = random.choice(noise_files)

        if noise_file == 'white_noise':
            noise=get_white_noise(signal,SNR=10)
            signal_noise=signal+noise

            mfcc = extract_mfcc_features_with_padding(signal_noise, sr)
            class_label = "white_noise"
        else:
            cache_key = (noise_file, sr)
            if cache_key not in noise_cache:
                # Load at the clip's sample rate so signal and noise samples
                # line up in time when they are added.
                noise_cache[cache_key], noise_sr = librosa.load(noise_file, sr=sr)
            noise_sample = noise_cache[cache_key]

            # Cut both to their common length so they can be added sample by sample.
            common_len = min(len(noise_sample), len(signal))
            noise_sample = noise_sample[0:common_len]
            signal = signal[0:common_len]

            noise3=get_noise_from_sound(signal,noise_sample,SNR=10)
            if noise3 is None:
                # Scaling failed (already reported by the helper); skip the clip.
                continue

            signal_noise3=signal+noise3

            # Drop leading/trailing silence, then cap the clip at 4 seconds.
            trim_signal = librosa.effects.trim(signal_noise3)
            total_duration = 4*sr
            split_signal = trim_signal[0][0:total_duration]

            mfcc = extract_mfcc_features_with_padding(split_signal, sr)
            class_label = "real_noise"

        # Store only successful extractions so every feature has the same shape.
        if mfcc is not None:
            noise_features.append([mfcc, class_label])


noise_featuresdf = pd.DataFrame(noise_features, columns=['feature','class_label'])
noise_featuresdf.to_csv (r'D:\noise_audio_dataframe.csv', index = False, header=True)

X = np.array(noise_featuresdf.feature.tolist())
y = np.array(noise_featuresdf.class_label.tolist())

np.save('noise_result_np_array_Y1_SNR10' , y)
np.save('noise_result_np_array_X1_SNR10' , X)

In [235]:
# Merge the clean and the noise-mixed feature tables, shuffle the rows and
# write the combined table to CSV.
# NOTE(review): featuresdf is built with extract_mfcc_features (mean MFCC
# vector) while noise_featuresdf is built with the padded variant -- confirm
# that mixing the two feature shapes in one table is intended.
frames = [featuresdf, noise_featuresdf]
result = shuffle(pd.concat(frames))
result.to_csv(
    r'D:\final_audio_dataframe.csv',
    index=False,
    header=True,
)

In [251]:
# Load a previously saved NumPy array from disk.
# NOTE(review): the cells that save arrays in this notebook write
# 'result_np_array_Y1_SNR10' / 'noise_result_np_array_Y1_SNR10'; this file
# name matches neither -- confirm the file exists or update the name.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only use it on files from a trusted source.
result_y_numpy_data = np.load('result_np_array_Y.npy',allow_pickle=True)

In [256]:
# Iterate over the UrbanSound8K metadata CSV and convert each listed audio
# file into a mean-MFCC feature vector.

fulldatasetpath = '/UrbanSound8K.tar/UrbanSound8K/audio/'

metadata = pd.read_csv(fulldatasetpath + '../metadata/UrbanSound8K.csv')

features = []

# Walk the three needed columns in lockstep; "class" is a Python keyword, so
# plain column access is used instead of attribute-style row access.
for fold, slice_file_name, class_label in zip(
    metadata["fold"], metadata["slice_file_name"], metadata["class"]
):
    file_name = os.path.join(
        os.path.abspath(fulldatasetpath),
        'fold' + str(fold),
        str(slice_file_name),
    )

    data = extract_mfcc_features_scaled(file_name)
    if data is None:
        # Extraction failed (already reported by the helper); skip the file
        # so the table only holds usable feature vectors.
        continue

    features.append([data, class_label])

# Convert into a pandas DataFrame
featuresdf = pd.DataFrame(features, columns=['feature','class_label'])

print('Finished feature extraction from ', len(featuresdf), ' files')

Finished feature extraction from  8732  files
