# Preprocess Audio Files
This notebook contains the code for preprocessing the audio files before training.

10-second audio in wav format with a sample rate of 48 kHz was chosen because it matches the audio used to train the l3 model, which I plan to use for feature embeddings.

The sample dataset only includes the training data, so we will split that into training and testing data.  If the model proves viable we will download more data from xeno-canto.org.

In [1]:
import sox # must install sox locally if you want mp3 support
import os
from glob import glob
import shutil
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm
import numpy as np
import pandas as pd
import soundfile as sf
import librosa

In [2]:
# Input locations: mp3s one sub-folder below ORIGINAL_DATA_DIR, plus a flat
# folder of additional mp3s in EXTRA_DATA_DIR.
ORIGINAL_DATA_DIR = 'data/train_audio'
EXTRA_DATA_DIR = 'data/xeno-canto/'
# Destination folder for the converted wav clips.
OUTPUT_DIR = 'data/audio_10sec'
# Target sample rate handed to the sox conversion.
# NOTE(review): the markdown mentions 48 kHz but this value is 22050 -- confirm
# which one is intended.
SAMPLE_RATE = 22050

# create output dir if it does not exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
# get a list of all mp3 files

# from original dataset (mp3s sit exactly one sub-directory below ORIGINAL_DATA_DIR)
audio_files = glob(os.path.join(ORIGINAL_DATA_DIR, '*/*.mp3'))

# from additional data downloaded from xeno-canto.org (flat directory)
audio_files += glob(os.path.join(EXTRA_DATA_DIR, '*.mp3'))

# display the first and last five paths as a quick check of both sources
audio_files[:5], audio_files[-5:]

(['data/train_audio/olsfly/XC386256.mp3',
  'data/train_audio/olsfly/XC484154.mp3',
  'data/train_audio/olsfly/XC239498.mp3',
  'data/train_audio/olsfly/XC368006.mp3',
  'data/train_audio/olsfly/XC156193.mp3'],
 ['data/xeno-canto/XC76626.mp3',
  'data/xeno-canto/XC441588.mp3',
  'data/xeno-canto/XC368433.mp3',
  'data/xeno-canto/XC146762.mp3',
  'data/xeno-canto/XC627879.mp3'])

# Convert files to 10 second WAV / downsample to `SAMPLE_RATE`

The audio files have to be processed in two steps because adding padding to an mp3 and then saving it as wav results in imprecise final durations.  We need files to be exactly 10 seconds long with a sample rate of `SAMPLE_RATE`, for a total of `SAMPLE_RATE` * 10 samples.

The first pass resamples the audio to `SAMPLE_RATE`, cuts the clips to 11 seconds, and saves as wav files.

The second pass pads or crops the files to exactly 10 seconds long.

In [4]:
# get the number of cpu cores available
# (used as n_jobs for the joblib Parallel calls in the cells below)
num_cores = multiprocessing.cpu_count()
print(num_cores)

4


In [5]:
def convert_to_wav(af):
    """Convert one audio file to a mono, 16-bit wav clip inside ``OUTPUT_DIR``.

    The clip is resampled to ``SAMPLE_RATE`` and trimmed to its first
    11 seconds.  Inputs whose output file already exists are skipped, so the
    conversion can be re-run without redoing finished work.

    Parameters
    ----------
    af : str
        Path of the source audio file.

    Returns
    -------
    None
        Conversion failures are reported with ``print`` instead of being
        raised, so one bad file does not abort a parallel run.
    """
    # generate filenames
    wav_name = os.path.splitext(os.path.basename(af))[0] + '.wav'
    outfile = os.path.join(OUTPUT_DIR, wav_name)

    # only convert if we haven't already
    if os.path.exists(outfile):
        return

    # use sox to convert
    tfm = sox.Transformer()

    # use 11 seconds because the conversion is imprecise
    tfm.trim(start_time=0.0, end_time=11.0)
    tfm.convert(samplerate=SAMPLE_RATE, n_channels=1, bitdepth=16)

    try:
        tfm.build(input_filepath=af, output_filepath=outfile)
    except Exception as err:
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through
        print(f'There was a Sox error on {af}.  The file was not processed: {err!r}')
        # A failed build may leave a partial file behind; remove it so the
        # exists-check above does not treat it as converted on the next run.
        if os.path.exists(outfile):
            os.remove(outfile)

In [6]:
# Process the conversion using all cores in parallel to save time
print(f'Starting to process {len(audio_files)} files')
# the per-file return values are not needed, hence the throwaway name
_ = Parallel(n_jobs=num_cores, verbose=1)(delayed(convert_to_wav)(i) for i in audio_files)

Starting to process 61283 files


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  88 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done 61283 out of 61283 | elapsed:    2.4s finished


In [7]:
# collect every .wav clip sitting directly inside OUTPUT_DIR
# (the pattern contains no '**', so a recursive flag would not change the result)
wav_pattern = os.path.join(OUTPUT_DIR, '*.wav')
wav_audio_files = glob(wav_pattern)
wav_audio_files[:5]

['data/audio_10sec/XC174211.wav',
 'data/audio_10sec/XC454444.wav',
 'data/audio_10sec/XC334841.wav',
 'data/audio_10sec/XC103099.wav',
 'data/audio_10sec/XC172660.wav']

In [8]:
def crop_pad_audio(af, target_duration=10.0):
    """Pad or trim an audio file so it lasts ``target_duration`` seconds.

    The result is written to ``OUTPUT_DIR`` under the input's basename.
    Files that already have the target duration are left untouched, and
    files for which sox reports no duration are only reported.

    Parameters
    ----------
    af : str
        Path of the audio file to adjust.
    target_duration : float, optional
        Desired clip length in seconds (default 10.0).

    Returns
    -------
    None

    Raises
    ------
    Exception
        Whatever ``Transformer.build`` raises is propagated; in that case the
        input file is left in place and the scratch file is removed.
    """
    # get file duration
    duration = sox.file_info.duration(af)

    # Check for no length error
    if duration is None:
        print('duration is None:', af)
        return
    if duration == target_duration:
        return

    # setup transformer
    tfm = sox.Transformer()
    if duration < target_duration:
        # pad short files with trailing silence
        tfm.pad(start_duration=0.0, end_duration=(target_duration - duration))
    else:
        # crop long files
        tfm.trim(start_time=0.0, end_time=target_duration)

    # generate filenames
    basename = os.path.basename(af)
    outfile = os.path.join(OUTPUT_DIR, basename)
    # The input cannot be overwritten while it is being read, so render into a
    # scratch file next to the destination.  The leading dot keeps a stray
    # scratch file out of later '*.wav' globs.
    scratch = os.path.join(OUTPUT_DIR, f'.tmp_{basename}')

    try:
        tfm.build(input_filepath=af, output_filepath=scratch)
        # swap the finished clip into place only after a successful build
        os.replace(scratch, outfile)
    finally:
        if os.path.exists(scratch):
            os.remove(scratch)

    # The input is consumed by this step; when it lives outside OUTPUT_DIR it
    # still has to be removed explicitly.
    if os.path.abspath(af) != os.path.abspath(outfile):
        os.remove(af)

In [9]:
# second pass: pad/crop every wav clip in parallel (return values are discarded)
print(f'Starting to process {len(wav_audio_files)} files')
_ = Parallel(n_jobs=num_cores, verbose=1)(delayed(crop_pad_audio)(i) for i in wav_audio_files)

Starting to process 61283 files


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 200 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done 1400 tasks      | elapsed:   13.6s
[Parallel(n_jobs=4)]: Done 3400 tasks      | elapsed:   32.4s
[Parallel(n_jobs=4)]: Done 6200 tasks      | elapsed:  1.0min
[Parallel(n_jobs=4)]: Done 9800 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 14200 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 19400 tasks      | elapsed:  2.6min
[Parallel(n_jobs=4)]: Done 25400 tasks      | elapsed:  3.4min
[Parallel(n_jobs=4)]: Done 32200 tasks      | elapsed:  4.2min
[Parallel(n_jobs=4)]: Done 39800 tasks      | elapsed:  5.1min
[Parallel(n_jobs=4)]: Done 48200 tasks      | elapsed:  6.2min
[Parallel(n_jobs=4)]: Done 57400 tasks      | elapsed:  7.3min
[Parallel(n_jobs=4)]: Done 61283 out of 61283 | elapsed:  7.8min finished


In [10]:
# Sanity check
# Make sure all the files are exactly 10 seconds long

def check_length(pf):
    """Check a processed clip's duration, sample rate and channel count.

    Parameters
    ----------
    pf : str
        Path of the audio file to inspect.

    Returns
    -------
    tuple or bool
        ``(duration, sample_rate, bitrate, path)`` when the duration is not
        10.0 seconds or the sample rate differs from ``SAMPLE_RATE``;
        ``(channels, path)`` when the file is not mono; ``False`` when every
        check passes.
    """
    duration = sox.file_info.duration(pf)
    sr = sox.file_info.sample_rate(pf)
    if duration != 10.0 or sr != SAMPLE_RATE:
        # reuse the sample rate already read instead of querying the file again
        return (duration, sr, sox.file_info.bitrate(pf), pf)

    channels = sox.file_info.channels(pf)
    if channels != 1:
        return (channels, pf)
    return False

# run the sanity check on every wav clip in parallel;
# `errors` receives one check_length result per file
print(f'Starting to process {len(wav_audio_files)} files')
errors = Parallel(n_jobs=num_cores, verbose=1)(delayed(check_length)(i) for i in wav_audio_files)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Starting to process 61283 files


[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    2.0s
[Parallel(n_jobs=4)]: Done 376 tasks      | elapsed:    8.9s
[Parallel(n_jobs=4)]: Done 876 tasks      | elapsed:   20.0s
[Parallel(n_jobs=4)]: Done 1576 tasks      | elapsed:   35.4s
[Parallel(n_jobs=4)]: Done 2476 tasks      | elapsed:   55.2s
[Parallel(n_jobs=4)]: Done 3576 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 4876 tasks      | elapsed:  1.8min
[Parallel(n_jobs=4)]: Done 6376 tasks      | elapsed:  2.4min
[Parallel(n_jobs=4)]: Done 8076 tasks      | elapsed:  3.0min
[Parallel(n_jobs=4)]: Done 9976 tasks      | elapsed:  3.7min
[Parallel(n_jobs=4)]: Done 12076 tasks      | elapsed:  4.3min
[Parallel(n_jobs=4)]: Done 14376 tasks      | elapsed:  4.9min
[Parallel(n_jobs=4)]: Done 16876 tasks      | elapsed:  5.6min
[Parallel(n_jobs=4)]: Done 19576 tasks      | elapsed:  6.4min
[Parallel(n_jobs=4)]: Done 22476 tasks      | elapsed:  7.2min
[Parallel(n_jobs=4)]: Done 25576 tasks      | elapsed:  8.0min
[Para

In [11]:
# count errors
def count_errors(errors):
    """Print a summary of the per-file check results.

    Falsy entries are treated as passed checks; every truthy entry is
    reported as an error.  Nothing is returned.
    """
    failures = list(filter(None, errors))

    if not failures:
        print('All files were 10.0 seconds long.  Ready to move on.')
        return

    print('There were errors')
    print(failures)
        
# summarise the sanity-check results collected in `errors`
count_errors(errors)

All files were 10.0 seconds long.  Ready to move on.


# Create Full Length npy files

In [12]:
# Set output directory for the .npy files (created if it does not exist)
OUTPUT_DIR_NPY = 'data/npy'
os.makedirs(OUTPUT_DIR_NPY, exist_ok=True)

In [13]:
def convert_to_npy(af):
    """Decode an audio file with librosa and save the samples as a ``.npy`` file.

    The audio is loaded as mono at ``SAMPLE_RATE`` and written to
    ``OUTPUT_DIR_NPY`` under the input's basename.  Inputs whose output file
    already exists are skipped.

    Parameters
    ----------
    af : str
        Path of the source audio file.

    Returns
    -------
    str or None
        ``af`` when loading or saving failed, otherwise ``None``.
    """
    # generate output filename
    npy_name = os.path.splitext(os.path.basename(af))[0] + '.npy'
    outfile = os.path.join(OUTPUT_DIR_NPY, npy_name)

    if os.path.exists(outfile):
        return None

    # convert the audio file to npy
    try:
        # SAMPLE_RATE keeps this in step with the wav conversion settings
        signal, _ = librosa.load(af, sr=SAMPLE_RATE, mono=True)
        np.save(outfile, signal)
    except Exception:
        # A partial file would be skipped forever by the exists-check above,
        # so remove it before reporting the failure.
        if os.path.exists(outfile):
            os.remove(outfile)
        # return filename if there is an error
        return af
    return None

In [15]:
# convert the original mp3 files in `audio_files` (not the 10 second wav clips)
print(f'Starting to process {len(audio_files)} files')
# NOTE: this rebinds `errors`, replacing the sanity-check results stored earlier;
# each entry is the path of a file that failed, or None
errors = Parallel(n_jobs=num_cores, verbose=1)(delayed(convert_to_npy)(af) for af in audio_files)

Starting to process 61283 files


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  88 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done 61283 out of 61283 | elapsed:    2.7s finished


In [16]:
# list errors: paths that convert_to_npy could not process
# (None entries mark files that were converted or skipped)
[e for e in errors if e is not None]

[]