# Preprocess Audio Files
This notebook contains the code for preprocessing the audio files before training.

10-second audio clips in WAV format with a sample rate of 48 kHz were chosen because that matches the audio used to train the L3 model, which I plan to use for feature embeddings. (Note: `SAMPLE_RATE` in the configuration cell below is currently set to 22050 Hz.)

The sample dataset only includes the training data, so we will split that into training and testing data.  If the model proves viable we will download more data from xeno-canto.org.

In [1]:
import multiprocessing
import os
import shutil
import warnings
from glob import glob

import librosa
import numpy as np
import pandas as pd
import soundfile as sf
import sox # must install sox locally if you want mp3 support
from joblib import Parallel, delayed
from tqdm import tqdm

In [2]:
# Input directories (original training audio and the extra xeno-canto downloads)
# and the directory that receives the processed wav clips.
ORIGINAL_DATA_DIR = 'data/train_audio'
EXTRA_DATA_DIR = 'data/xeno-canto/'
OUTPUT_DIR = 'data/audio_10sec'
# Target sample rate in Hz used by the conversion cells below.
# NOTE(review): the introductory markdown mentions 48 kHz, but the value here is 22050 — confirm which is intended.
SAMPLE_RATE = 22050

# create output dir if it does not exist (no error when it is already there)
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [17]:
# Build the list of every mp3 file to preprocess:
#   1. the original dataset, stored one directory level below ORIGINAL_DATA_DIR
#   2. the additional recordings downloaded from xeno-canto.org, stored flat in EXTRA_DATA_DIR
audio_files = [
    *glob(os.path.join(ORIGINAL_DATA_DIR, '*/*.mp3')),
    *glob(os.path.join(EXTRA_DATA_DIR, '*.mp3')),
]

# preview the first and last five paths
audio_files[:5], audio_files[-5:]

(['data/train_audio/olsfly/XC386256.mp3',
  'data/train_audio/olsfly/XC484154.mp3',
  'data/train_audio/olsfly/XC239498.mp3',
  'data/train_audio/olsfly/XC368006.mp3',
  'data/train_audio/olsfly/XC156193.mp3'],
 ['data/xeno-canto/XC76626.mp3',
  'data/xeno-canto/XC441588.mp3',
  'data/xeno-canto/XC368433.mp3',
  'data/xeno-canto/XC146762.mp3',
  'data/xeno-canto/XC627879.mp3'])

# Convert files to 10 second WAV / downsample to `SAMPLE_RATE`

The audio files have to be processed in two steps because adding padding to an mp3 and then saving it as wav results in imprecise final durations.  We need files to be exactly 10 seconds long with a sample rate of `SAMPLE_RATE` Hz, for a total of `SAMPLE_RATE` * 10 samples.

The first pass resamples the audio to `SAMPLE_RATE`, cuts the clips to 11 seconds, and saves as wav files.

The second pass pads or crops the files to exactly 10 seconds long.

In [10]:
# get the number of cpu cores available
# (printed so the worker count used for the parallel passes can be chosen from it)
num_cores = multiprocessing.cpu_count()
print(num_cores)

4


In [11]:
# Leave one core free for the rest of the system, but always use at least one worker.
# Derived from the detected core count instead of being hard-coded, so the notebook
# adapts to the machine it runs on (a 4-core machine still yields 3 workers).
num_cores = max(1, multiprocessing.cpu_count() - 1)

In [12]:
def convert_to_wav(af):
    """First-pass conversion of one audio file to a mono, 16-bit wav clip.

    The first 11 seconds of `af` are resampled to `SAMPLE_RATE` and written
    to `OUTPUT_DIR` under the input's base name with a `.wav` extension.
    The output directory is flat: only the base name of `af` is kept, so two
    inputs with the same base name map to the same output file.

    Inputs whose output file already exists are skipped, which makes the
    pass safe to re-run.

    Parameters
    ----------
    af : str
        Path of the source audio file.

    Returns
    -------
    None
    """
    wav_name = os.path.splitext(os.path.basename(af))[0] + '.wav'
    outfile = os.path.join(OUTPUT_DIR, wav_name)

    # nothing to do when a previous run already produced this clip
    if os.path.exists(outfile):
        return

    tfm = sox.Transformer()
    # keep 11 seconds here; the second pass pads/crops to exactly 10 seconds
    tfm.trim(start_time=0.0, end_time=11.0)
    tfm.convert(samplerate=SAMPLE_RATE, n_channels=1, bitdepth=16)
    tfm.build(input_filepath=af, output_filepath=outfile)

In [13]:
# Run the first pass (convert_to_wav) over every input file,
# spread across `num_cores` parallel workers to save time
print(f'Starting to process {len(audio_files)} files')
_ = Parallel(n_jobs=num_cores, verbose=1)(
    delayed(convert_to_wav)(mp3_path) for mp3_path in audio_files
)

Starting to process 49759 files


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done 355 tasks      | elapsed:    0.5s
[Parallel(n_jobs=3)]: Done 49759 out of 49759 | elapsed:  7.8min finished


In [43]:
# get a list of all the .wav files written by the first pass
# (OUTPUT_DIR is flat and the pattern contains no `**`, so no recursive search is needed)
wav_audio_files = glob(os.path.join(OUTPUT_DIR, '*.wav'))
wav_audio_files[:5]

['data/audio_10sec/XC174211.wav',
 'data/audio_10sec/XC454444.wav',
 'data/audio_10sec/XC334841.wav',
 'data/audio_10sec/XC172660.wav',
 'data/audio_10sec/XC134635.wav']

In [44]:
def crop_pad_audio(af):
    """Second pass: pad or trim one wav file so that it lasts 10 seconds.

    Files shorter than 10 seconds get silence appended; longer files are cut
    after 10 seconds. Files that are already exactly 10 seconds long are left
    untouched. The result is written to `OUTPUT_DIR` under the base name of
    `af`, and the input file is consumed.

    Parameters
    ----------
    af : str
        Path of the wav file produced by the first pass.

    Returns
    -------
    None
        When sox cannot determine the duration, a message is printed and the
        file is left as it is.
    """
    target_duration = 10.0

    duration = sox.file_info.duration(af)  # gets duration in seconds
    basename = os.path.basename(af)
    outfile = os.path.join(OUTPUT_DIR, basename)

    if duration is None:
        print('duration is None:', af)
        return
    if duration == target_duration:
        return

    tfm = sox.Transformer()
    if duration < target_duration:
        tfm.pad(start_duration=0.0, end_duration=(target_duration - duration))
    else:
        tfm.trim(start_time=0.0, end_time=target_duration)

    # The input and output of a sox build must be different files, so build into
    # a hidden sibling file (not matched by a later '*.wav' glob) and swap it in
    # afterwards. Unlike moving the input into the working directory, a failed
    # build leaves the original file where it was.
    tmp_outfile = os.path.join(OUTPUT_DIR, '.tmp_' + basename)
    try:
        tfm.build(input_filepath=af, output_filepath=tmp_outfile)
        os.replace(tmp_outfile, outfile)
    finally:
        # only present here when the build or the replace failed
        if os.path.exists(tmp_outfile):
            os.remove(tmp_outfile)

    # the input is consumed; only relevant when it lives outside OUTPUT_DIR
    if os.path.abspath(af) != os.path.abspath(outfile) and os.path.exists(af):
        os.remove(af)

In [45]:
# Run the second pass (crop_pad_audio) over every wav file using `num_cores` parallel workers
print(f'Starting to process {len(wav_audio_files)} files')
_ = Parallel(n_jobs=num_cores, verbose=1)(
    delayed(crop_pad_audio)(wav_path) for wav_path in wav_audio_files
)

Starting to process 49745 files


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done 145 tasks      | elapsed:    2.1s
[Parallel(n_jobs=3)]: Done 1345 tasks      | elapsed:   12.6s
[Parallel(n_jobs=3)]: Done 3345 tasks      | elapsed:   30.2s
[Parallel(n_jobs=3)]: Done 6145 tasks      | elapsed:   52.7s
[Parallel(n_jobs=3)]: Done 9745 tasks      | elapsed:  1.4min
[Parallel(n_jobs=3)]: Done 14145 tasks      | elapsed:  2.0min
[Parallel(n_jobs=3)]: Done 19345 tasks      | elapsed:  2.7min
[Parallel(n_jobs=3)]: Done 25345 tasks      | elapsed:  3.5min
[Parallel(n_jobs=3)]: Done 32145 tasks      | elapsed:  4.5min
[Parallel(n_jobs=3)]: Done 39745 tasks      | elapsed:  5.6min
[Parallel(n_jobs=3)]: Done 48145 tasks      | elapsed:  6.9min
[Parallel(n_jobs=3)]: Done 49745 out of 49745 | elapsed:  7.6min finished


In [47]:
# Sanity check
# Make sure all the files are exactly 10 seconds long

def check_length(pf):
    """Verify that one processed file has the expected duration, rate and channels.

    Parameters
    ----------
    pf : str
        Path of the processed wav file.

    Returns
    -------
    tuple or bool
        `(duration, sample_rate, bitrate, pf)` when the duration is not 10.0
        seconds or the sample rate differs from `SAMPLE_RATE`;
        `(channels, pf)` when the file is not mono;
        `False` when the file passes every check.
    """
    duration = sox.file_info.duration(pf)
    sr = sox.file_info.sample_rate(pf)
    if duration != 10.0 or sr != SAMPLE_RATE:
        # reuse the sample rate that was already read instead of querying sox again
        return (duration,
                sr,
                sox.file_info.bitrate(pf),
                pf)
    channels = sox.file_info.channels(pf)
    if channels != 1:
        return (channels, pf)
    return False

# Check every wav file in parallel; `errors` holds one result per file (False when the file is fine)
print(f'Starting to process {len(wav_audio_files)} files')
errors = Parallel(n_jobs=num_cores, verbose=1)(
    delayed(check_length)(wav_path) for wav_path in wav_audio_files
)

Starting to process 49745 files


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  67 tasks      | elapsed:    2.3s
[Parallel(n_jobs=3)]: Done 367 tasks      | elapsed:    9.5s
[Parallel(n_jobs=3)]: Done 867 tasks      | elapsed:   21.6s
[Parallel(n_jobs=3)]: Done 1567 tasks      | elapsed:   38.6s
[Parallel(n_jobs=3)]: Done 2467 tasks      | elapsed:  1.0min
[Parallel(n_jobs=3)]: Done 3567 tasks      | elapsed:  1.6min
[Parallel(n_jobs=3)]: Done 4867 tasks      | elapsed:  2.1min
[Parallel(n_jobs=3)]: Done 6367 tasks      | elapsed:  2.7min
[Parallel(n_jobs=3)]: Done 8067 tasks      | elapsed:  3.3min
[Parallel(n_jobs=3)]: Done 9967 tasks      | elapsed:  3.9min
[Parallel(n_jobs=3)]: Done 12067 tasks      | elapsed:  4.6min
[Parallel(n_jobs=3)]: Done 14367 tasks      | elapsed:  5.4min
[Parallel(n_jobs=3)]: Done 16867 tasks      | elapsed:  6.3min
[Parallel(n_jobs=3)]: Done 19567 tasks      | elapsed:  7.3min
[Parallel(n_jobs=3)]: Done 22467 tasks      | elapsed:  

In [48]:
# keep only the truthy results, i.e. the files that failed the sanity check
filtered_errors = list(filter(None, errors))

if not filtered_errors:
    print('All files were 10.0 seconds long.  Ready to move on.')
else:
    print('There were errors')
    print(filtered_errors)

There were errors
[(None, 22050.0, None, 'data/audio_10sec/XC109763.wav')]


# Save Mel-Spectrograms

Here we are saving the Mel-Spectrograms with the best parameters from our model testing so that I can try using mixins to augment the data.  Mixins work on the image representation of the audio file and are calculated by the datagenerator so the Mel-Spectrogram must be completed prior to feeding the data into the model.

In [19]:
# Sibling directory name for the mel spectrograms: OUTPUT_DIR with a '_mels' suffix.
# NOTE(review): the directory is only named here, not created — confirm it is created before anything is written to it.
melspec_dir = OUTPUT_DIR + '_mels'

In [20]:
def save_melspec(af, n_fft=2048, hop_length=512, n_mels=128):
    """Compute the dB-scaled mel spectrogram of one audio file.

    Parameters
    ----------
    af : str
        Path of the audio file to load.
    n_fft : int, optional
        FFT window size passed to librosa.
    hop_length : int, optional
        Hop length passed to librosa.
    n_mels : int, optional
        Number of mel bands passed to librosa.

    Returns
    -------
    numpy.ndarray
        Mel spectrogram converted to dB relative to its maximum.

    Notes
    -----
    NOTE(review): despite its name this function only computes and returns the
    spectrogram; nothing is written to disk yet. Confirm whether saving
    (e.g. into `melspec_dir`) should be added here or done by the caller.
    """
    # load at most the first 10 seconds, resampled to SAMPLE_RATE
    data, _ = librosa.load(af, sr=SAMPLE_RATE, duration=10.0)
    # the signal is passed by keyword: newer librosa releases no longer accept it positionally
    mel_spec = librosa.feature.melspectrogram(y=data, sr=SAMPLE_RATE, n_fft=n_fft,
                                              hop_length=hop_length, n_mels=n_mels)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    return mel_spec_db

In [None]:
# Plot a spectrogram on a mel-scaled frequency axis with a dB colour bar.
# NOTE(review): `S_DB`, `sample_rate` and `plt` are not defined or imported anywhere in the
# visible notebook, and this cell has no execution count — it looks like a leftover scratch
# cell. Confirm the intended inputs (and the matplotlib / librosa.display imports) before running.
librosa.display.specshow(S_DB, sr=sample_rate, hop_length=hop_length, x_axis='time', y_axis='mel');
plt.colorbar(format='%+2.0f dB');

In [42]:
# supress mp3 warnings
import warnings
warnings.filterwarnings('ignore', message='PySoundFile failed. Trying audioread instead.')

In [43]:
# see if we can load 10 second clips to make mel spectrograms
# NOTE(review): this iterates over `audio_files` (the original mp3 list), not `wav_audio_files`
# (the processed 10 second clips) — confirm which list is intended.
shapes = []

# set from best model
n_fft = 2048
hop_length = 512
n_mels = 128

for file in tqdm(audio_files):
    # load at most the first 10 seconds, resampled to SAMPLE_RATE
    data, sr = librosa.load(file, sr=SAMPLE_RATE, duration=10.0)
    # the signal is passed by keyword: newer librosa releases no longer accept it positionally
    mel_spec = librosa.feature.melspectrogram(y=data,
                                              sr=SAMPLE_RATE,
                                              n_fft=n_fft,
                                              hop_length=hop_length,
                                              n_mels=n_mels)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    # record the spectrogram shape so the shapes can be compared afterwards
    shapes.append(mel_spec_db.shape)

  0%|          | 7/37503 [00:09<14:03:21,  1.35s/it]


KeyboardInterrupt: 

In [44]:
# True only when every recorded spectrogram shape is identical.
# Note: an empty `shapes` list reports False, and a partially filled list
# (e.g. after an interrupted loop) only covers the files processed so far.
print('All outputs are the same shape:', len(set(shapes)) == 1)

All outputs are the same shape: True
