# Splitting

### Example

Copy a few example files to split

In [1]:
%%bash
# Stop at the first failing command so a missing source file is reported
# instead of being masked by a later, successful cp
set -e
# cp does not create the destination directory; make sure it exists
mkdir -p ./example_files_to_split/
cp /Volumes/seagate4/xeno-canto/aba_code_1-2/acanthis-flammea/mp3s/1156*.mp3 ./example_files_to_split/
cp /Volumes/seagate4/xeno-canto/aba_code_1-2/bubo-scandiacus/mp3s/219348.mp3 ./example_files_to_split/
cp /Volumes/seagate4/xeno-canto/aba_code_1-2/catharus-fuscescens/mp3s/63336.mp3 ./example_files_to_split/

In [2]:
import math
import os
import shutil
import subprocess
import time

import librosa

In [3]:
# Locations used by the example: where the copied source audio lives,
# where the split files will be written, and the log file for the real run
base_dir = './example_files_to_split/'
new_dir = './example_splits/'
out_file = os.path.join(new_dir, 'split_out_test.txt')
# Start from a clean slate: remove any output directory left by a previous run
if os.path.exists(new_dir):
    shutil.rmtree(new_dir)
# out_file lives inside new_dir, so it is normally already gone after the
# rmtree above; this check only removes a log file that still exists
if os.path.exists(out_file):
    os.remove(out_file)

In [4]:
def dry_out(dry_run, output, out_file=None):
    '''
    Report a message: echo it on a dry run, log it to a file otherwise

    Args:
        dry_run (bool): selects the destination of the message
            if True: the message is printed
            if False: the message is appended to out_file,
                followed by a newline
        output (str): the message to print or log
        out_file (str): path of the log file to append to;
            must be provided (asserted) when dry_run is False,
            ignored when dry_run is True
    '''

    # Real run: append the message as one line of the log file
    if not dry_run:
        assert out_file
        with open(out_file, 'a') as log:
            log.write(output + '\n')
        return

    # Dry run: just show the message
    print(output)

In [5]:
def list_audios(base_dir):
    '''
    List the audio files directly inside a directory

    Args:
        base_dir (str): directory to look in (not searched recursively)

    Returns:
        list of str: `base_dir` joined with every entry whose name ends
            in '.wav' or '.mp3' (case-insensitive), in os.listdir order
    '''

    audio_suffixes = ('.wav', '.mp3')
    audio_paths = []
    for entry in os.listdir(base_dir):
        if entry.lower().endswith(audio_suffixes):
            audio_paths.append(os.path.join(base_dir, entry))
    return audio_paths

In [None]:
def sox_split():
    '''
    Split long audio files using sox

    Draft of a helper that cuts a single window, from `start` to
    `start + len_file`, out of `file_path` with sox and logs the
    outcome through `print_or_save`.

    NOTE(review): not runnable as written. The body reads `start`,
    `len_file`, `file_path`, `new_dir`, `duration`, `dry_run` and
    `print_or_save`, none of which are parameters or locals of this
    function, and it uses `continue` outside of any loop.
    '''

    stop = start + len_file

    # Create a filename indicating start and stop times
    # ([:-4] drops the last four characters, i.e. a '.mp3'/'.wav' extension)
    cat_num = file_path.split(os.sep)[-1][:-4]
    #split_filename = f'{cat_num}-split-{idx}.wav'
    split_filename = f'{cat_num}_split_{start}s-{stop}s.wav'
    split_path = str(os.path.join(new_dir, split_filename))

    # Fail if try to stop after duration of file
    # NOTE(review): the message prints "<" although the condition tested is ">"
    if stop > duration:
        print_or_save(
            output = f'FAIL (stop ({stop}) < duration ({duration})): {split_path}'
        )
        continue

    # Split file using call to sox in bash
    # NOTE(review): sox's `trim` appears to read a second position as a length
    # relative to the first unless it is written `=position`; if so, passing
    # `stop` here yields a clip of `stop` seconds rather than one ending at
    # `stop` - confirm against the SoX manual
    # NOTE(review): the captured `output` is never inspected, so SUCC is
    # logged even if sox fails
    if not dry_run:
        output = !sox $file_path $split_path trim $start $stop
    print_or_save(output = f'SUCC: {split_path}')

In [None]:
def librosa_split():
    '''
    Split long audio files using librosa

    Draft of a helper that cuts a single window, from `start` to
    `start + len_file`, out of an already-loaded `samples` array and
    writes it as a .wav file, logging the outcome through
    `print_or_save`.

    NOTE(review): not runnable as written. The body reads `start`,
    `len_file`, `duration`, `sample_rate`, `samples`, `file_path`,
    `new_dir`, `dry_run` and `print_or_save`, none of which are
    parameters or locals of this function, and it uses `continue`
    outside of any loop.
    '''
    
    stop = start + len_file

    # Fail if try to stop after duration of file
    # NOTE(review): `split_path` is only assigned further down, so it is not
    # yet defined when this message is built; the message also prints "<"
    # although the condition tested is ">"
    if stop > duration:
        print_or_save(output = f'FAIL (stop ({stop}) < duration ({duration})): {split_path}')
        continue

    # Convert the window bounds from seconds to sample indices
    start_samples = start * sample_rate
    stop_samples = stop * sample_rate

    # Create a filename indicating start and stop times
    # ([:-4] drops the last four characters, i.e. a '.mp3'/'.wav' extension)
    cat_num = file_path.split(os.sep)[-1][:-4]
    split_filename = f'{cat_num}_split_{start}s-{stop}s.wav'
    split_path = str(os.path.join(new_dir, split_filename))

    # Slice the window out of the loaded samples and write it with librosa
    if not dry_run:
        cut = samples[start_samples:stop_samples]
        librosa.output.write_wav(path = split_path, y = cut, sr = sample_rate)
    print_or_save(output = f'SUCC: {split_path}')


In [None]:
def split_files(
    method,
    new_dir,
    base_dir = None,
    files_to_split = None,
    out_file=None,
    len_file = 6,
    jump_size = 1,
    dry_run = False
):
    '''
    Split long audio files

    NOTE(review): draft version. The per-window work is delegated to
    `sox_split()` and `librosa_split()`, which are called without
    arguments. A later cell in this notebook defines `split_files`
    again with the splitting logic inline; that later definition
    replaces this one when the notebook is run top to bottom.

    Use sox or librosa to split .mp3 or .wav audio files into .wav
    files of length `len_file`. Files are split with
    a rolling window, which increments according
    to `jump_size`. For instance, with the default settings,
    an 8-second file would be broken up into three files:

        one from 0s to 6s
        one from 1s to 7s
        one from 2s to 8s

    These files are named starting with the filename of the
    source file, plus an indication of the times from which
    they were split. For instance, in the above example, if
    the original file were named 'test.wav' the output files
    would have the following filenames:

        test_split_0s-6s.wav
        test_split_1s-7s.wav
        test_split_2s-8s.wav

    The 'librosa' method loads the entire audio file into memory
    once, and then splits the 'samples' array. The 'sox' method
    loads the file once for every output file to be made.
    If jump_size is small or files are long, more output files
    will be made. The 'librosa' method is faster than sox if a
    file must be loaded many times (librosa is faster at ~75 loads);
    the 'fastest' method can be used to select 'sox' if fewer than
    75 files will be made; otherwise it selects 'librosa'.

    Files to split can be specified by the `base_dir` argument,
    in which case all `.wav` and `.mp3` files in `base_dir` will be
    split, or as a list of files in the `files_to_split` argument.

    Args:
        method (str, 'fastest', 'sox', 'librosa'): what method
            to use to split files. 'fastest' automatically
            selects which will be faster for each file
        new_dir (str): directory where split files will
            be saved
        base_dir (str): base directory containing all mp3
            or wav files to split. Only used if
            files_to_split is not provided.
        files_to_split (list): list of files to be split.
            Alternatively, can use `base_dir` (see above)
        out_file (str): path of the log file that messages are
            appended to; required unless dry_run is True
        len_file (float): length of each finished split file
        jump_size (int): size to jump between each split file
        dry_run (bool): whether to print example results
            instead of performing the actual actions.
            does not create directories or save files.

    Raises:
        ValueError: if `method` is not one of the accepted values,
            if `out_file` is missing on a real run, or if neither
            `base_dir` nor `files_to_split` is provided
    '''
    
    if method not in ('sox', 'librosa', 'fastest'):
         raise ValueError(f'method must be sox, librosa, or fastest. Got "{method}"')
    
    # Assert an out_file is provided
    if (not dry_run) and (not out_file):
        raise ValueError('out_file must be provided when not dry_running')

    # List all WAV or MP3 files
    if not files_to_split:
        if not base_dir:
            raise ValueError('one of base_dir or files_to_split must be provided')
        else:
            files_to_split = list_audios(base_dir)
    
    t0 = time.time()
    
    # Outputter function
    def print_or_save(output): 
        return dry_out(dry_run = dry_run, out_file = out_file, output = output)
    
    # Make new directory if it doesn't exist already
    # (on a dry run the directory is only reported, not created)
    if not os.path.exists(new_dir):
        if not dry_run: os.makedirs(new_dir)
        print_or_save(output = f'Made {new_dir}')

    # Split each file using either sox or librosa
    for file_path in files_to_split:
        # Get duration using sox
        # NOTE(review): only ValueError is handled; if `out` is empty,
        # out[0] raises IndexError, which is not caught here
        out = !soxi -D $file_path
        try: duration = float(out[0])
        except ValueError:
            print_or_save(output = f'FAIL (no duration): {file_path}')
            continue

        # Window start times: every jump_size seconds, as long as a full
        # len_file-second window still fits in the whole seconds of the file
        seconds_to_split = math.floor(duration) - (len_file - 1)
        starts = range(0, seconds_to_split, jump_size)
        
        # Select the fastest method based on the number of files
        # NOTE(review): this rebinds `method` itself, so after the first
        # file it is no longer 'fastest' and the first file's choice is
        # reused for every remaining file
        if method == 'fastest':
            if len(starts) < 75:
                method = 'sox'
            else:
                method = 'librosa'

        # Split any files that are longer than len_file
        # NOTE(review): both helpers are called with no arguments
        if method == 'sox':
            for idx, start in enumerate(starts):
                sox_split()
            
        elif method == 'librosa':
            samples, sample_rate = librosa.load(file_path)
            for idx, start in enumerate(starts):
                librosa_split()
                

    t1 = time.time()

    # Report the total elapsed time through the same print-or-log channel
    final_msg = f'DONE: {t1-t0} seconds elapsed for {len(files_to_split)} files'
    dry_out(
        dry_run = dry_run,
        out_file = out_file,
        output = final_msg
    )

In [6]:
def split_files(
    method,
    new_dir,
    base_dir = None,
    files_to_split = None,
    out_file=None,
    len_file = 6,
    jump_size = 1,
    dry_run = False
):
    '''
    Split long audio files

    Use sox or librosa to split .mp3 or .wav audio files into .wav
    files of length `len_file`. Files are split with
    a rolling window, which increments according
    to `jump_size`. For instance, with the default settings,
    an 8-second file would be broken up into three files:

        one from 0s to 6s
        one from 1s to 7s
        one from 2s to 8s

    These files are named starting with the filename of the
    source file, plus an indication of the times from which
    they were split. For instance, in the above example, if
    the original file were named 'test.wav' the output files
    would have the following filenames:

        test_split_0s-6s.wav
        test_split_1s-7s.wav
        test_split_2s-8s.wav

    The 'librosa' method loads the entire audio file into memory
    once, and then splits the 'samples' array. The 'sox' method
    loads the file once for every output file to be made.
    If jump_size is small or files are long, more output files
    will be made. The 'librosa' method is faster than sox if a
    file must be loaded many times (librosa is faster at ~75 loads);
    the 'fastest' method selects, separately for each file, 'sox'
    if fewer than 75 files will be made and 'librosa' otherwise.

    Files to split can be specified by the `base_dir` argument,
    in which case all `.wav` and `.mp3` files in `base_dir` will be
    split, or as a list of files in the `files_to_split` argument.

    One line is reported per event, printed on a dry run and
    appended to `out_file` otherwise:

        Made <new_dir>
        SUCC: <split file>
        FAIL (<reason>): <file>
        DONE: <seconds> seconds elapsed for <n> files

    Args:
        method (str, 'fastest', 'sox', 'librosa'): what method
            to use to split files. 'fastest' automatically
            selects which will be faster for each file
        new_dir (str): directory where split files will
            be saved
        base_dir (str): base directory containing all mp3
            or wav files to split. Only used if
            files_to_split is not provided.
        files_to_split (list): list of files to be split.
            Alternatively, can use `base_dir` (see above)
        out_file (str): path of the log file that messages are
            appended to; required unless dry_run is True
        len_file (float): length in seconds of each finished
            split file
        jump_size (int): seconds to jump between the starts of
            consecutive split files
        dry_run (bool): whether to print example results
            instead of performing the actual actions.
            does not create directories or save files.

    Raises:
        ValueError: if `method` is not one of the accepted values,
            if `out_file` is missing on a real run, or if neither
            `base_dir` nor `files_to_split` is provided
    '''

    if method not in ('sox', 'librosa', 'fastest'):
        raise ValueError(f'method must be sox, librosa, or fastest. Got "{method}"')

    # A real run logs to a file, so the file must be named
    if (not dry_run) and (not out_file):
        raise ValueError('out_file must be provided when not dry_running')

    # List all WAV or MP3 files
    if not files_to_split:
        if not base_dir:
            raise ValueError('one of base_dir or files_to_split must be provided')
        files_to_split = list_audios(base_dir)

    t0 = time.time()

    # Outputter function
    def print_or_save(output):
        return dry_out(dry_run = dry_run, out_file = out_file, output = output)

    # Run an external tool without a shell, so paths containing spaces or
    # shell metacharacters are passed through verbatim.
    # Returns (exit code, stdout); the exit code is None if the tool
    # could not be started at all.
    def run_tool(args):
        try:
            finished = subprocess.run(
                args,
                stdout = subprocess.PIPE,
                stderr = subprocess.PIPE,
                universal_newlines = True)
        except OSError:
            return None, ''
        return finished.returncode, finished.stdout

    # Make new directory if it doesn't exist already
    # (on a dry run the directory is only reported, not created)
    if not os.path.exists(new_dir):
        if not dry_run: os.makedirs(new_dir, exist_ok = True)
        print_or_save(output = f'Made {new_dir}')

    # Split each file using either sox or librosa
    for file_path in files_to_split:
        # Get duration in seconds using soxi; anything that does not parse
        # as a number (unreadable file, missing tool, no output) is a failure
        _, soxi_stdout = run_tool(['soxi', '-D', file_path])
        try:
            duration = float(soxi_stdout.strip().splitlines()[0])
        except (ValueError, IndexError):
            print_or_save(output = f'FAIL (no duration): {file_path}')
            continue

        # Window starts: every jump_size seconds for as long as a full
        # len_file-second window still fits. For whole-number len_file this
        # equals floor(duration) - (len_file - 1); this form also accepts
        # a fractional len_file.
        n_start_seconds = math.floor(duration - len_file) + 1
        starts = range(0, n_start_seconds, jump_size)

        # Choose the method for THIS file only; `method` itself is left
        # untouched so that 'fastest' is re-evaluated for every file
        file_method = method
        if method == 'fastest':
            file_method = 'sox' if len(starts) < 75 else 'librosa'

        # librosa decodes the whole file once; only worth doing when
        # there is something to write
        # NOTE(review): librosa.load is called with its default arguments,
        # as before - confirm the resulting sample rate/channels are intended
        if file_method == 'librosa' and not dry_run and len(starts) > 0:
            samples, sample_rate = librosa.load(file_path)

        # Source filename without its extension, used as the output prefix
        cat_num = os.path.splitext(os.path.basename(file_path))[0]

        for start in starts:
            stop = start + len_file

            # Create a filename indicating start and stop times
            split_filename = f'{cat_num}_split_{start}s-{stop}s.wav'
            split_path = os.path.join(new_dir, split_filename)

            # Fail if try to stop after duration of file
            if stop > duration:
                print_or_save(
                    output = f'FAIL (stop ({stop}) > duration ({duration})): {split_path}'
                )
                continue

            if not dry_run:
                if file_method == 'sox':
                    # sox `trim START LENGTH`: the second value is a length
                    # measured from START, so pass len_file, not stop
                    exit_code, _ = run_tool(
                        ['sox', file_path, split_path, 'trim', str(start), str(len_file)])
                    if exit_code != 0:
                        print_or_save(output = f'FAIL (sox error): {split_path}')
                        continue
                else:
                    # Seconds -> sample indices; slicing needs integers
                    start_samples = int(round(start * sample_rate))
                    stop_samples = int(round(stop * sample_rate))
                    cut = samples[start_samples:stop_samples]
                    # NOTE(review): librosa.output was removed in newer
                    # librosa releases - confirm the installed version
                    librosa.output.write_wav(path = split_path, y = cut, sr = sample_rate)
            print_or_save(output = f'SUCC: {split_path}')

    t1 = time.time()

    print_or_save(
        output = f'DONE: {t1-t0} seconds elapsed for {len(files_to_split)} files'
    )

In [7]:
# Dry-run check: the output directory must be absent both before and
# after the call, since a dry run only reports what it would do
assert not os.path.exists(new_dir)
split_files(
    'fastest',
    new_dir,
    base_dir = base_dir,
    dry_run = True)
assert not os.path.exists(new_dir)

Made ./example_splits/
SUCC: ./example_splits/115600_split_0s-6s.wav
SUCC: ./example_splits/115600_split_1s-7s.wav
SUCC: ./example_splits/115600_split_2s-8s.wav
SUCC: ./example_splits/115600_split_3s-9s.wav
SUCC: ./example_splits/115600_split_4s-10s.wav
SUCC: ./example_splits/115600_split_5s-11s.wav
SUCC: ./example_splits/115600_split_6s-12s.wav
SUCC: ./example_splits/115600_split_7s-13s.wav
SUCC: ./example_splits/115600_split_8s-14s.wav
SUCC: ./example_splits/115600_split_9s-15s.wav
SUCC: ./example_splits/115600_split_10s-16s.wav
SUCC: ./example_splits/115600_split_11s-17s.wav
SUCC: ./example_splits/115600_split_12s-18s.wav
SUCC: ./example_splits/115600_split_13s-19s.wav
SUCC: ./example_splits/115600_split_14s-20s.wav
SUCC: ./example_splits/115600_split_15s-21s.wav
SUCC: ./example_splits/115600_split_16s-22s.wav
SUCC: ./example_splits/115600_split_17s-23s.wav
SUCC: ./example_splits/115600_split_18s-24s.wav
SUCC: ./example_splits/115600_split_19s-25s.wav
SUCC: ./example_splits/115600_s

In [8]:
# Real-run check: this time the files are actually split (messages are
# appended to out_file), so the output directory must exist afterwards
split_files('fastest', new_dir, base_dir = base_dir, out_file = out_file, dry_run = False)
assert os.path.exists(new_dir)

### For real

In [9]:
# Root folder holding one sub-directory per species; the literal path is
# split on os.sep and re-joined from the filesystem root.
# NOTE: this rebinds `base_dir`, which the example section above pointed
# at ./example_files_to_split/
base_dir = os.path.join(os.sep, 
    *'/Volumes/seagate4/xeno-canto/aba_code_1-2'.split(os.sep))
# Output paths are derived from source paths by replacing old_str with new_str
old_str = 'aba_code_1-2'
new_str = 'aba_code_1-2_split'

Generate a dictionary mapping each source folder to its output folder:
* Keys: absolute path to the `mp3s/` folder of every species directory in `base_dir` whose `mp3s/` folder contains at least one mp3 file
* Values: the new path to be created for that folder's split files

In [10]:
# Sort the species directories under base_dir into those that have mp3
# files to split (to_split holds their mp3s/ folder) and those that do
# not (to_skip holds the species directory name)
to_skip = []
to_split = []
for sp_dir in os.listdir(base_dir):
    mp3s_dir = os.path.join(base_dir, sp_dir, 'mp3s')
    try:
        mp3s_entries = os.listdir(mp3s_dir)
    except NotADirectoryError:
        # sp_dir (or its mp3s entry) is a plain file, not a directory: ignore
        continue
    except FileNotFoundError:
        # Species directory without an mp3s/ folder: nothing to split
        to_skip.append(sp_dir)
        continue
    # Match on the extension, case-insensitively, the same way
    # list_audios does, rather than on 'mp3' anywhere in the name
    if any(entry.lower().endswith('.mp3') for entry in mp3s_entries):
        to_split.append(mp3s_dir)
    else:
        to_skip.append(sp_dir)

# Map every source mp3s/ folder to the folder its split files will go to
split_dict = {
    directory: directory.replace(old_str, new_str)
    for directory in to_split
}

In [11]:
# Split every species folder for real, logging to out_file and printing
# a progress line every print_freq species
out_file = './full_split_output.txt'
print_freq = 10
# Number of species folders (in sorted order) to skip before starting.
# NOTE(review): presumably these were already split in an earlier,
# interrupted run - confirm, and set to 0 to process every folder
n_species_to_skip = 20
t0 = time.time()
for idx, orig_dir in enumerate(sorted(split_dict.keys())[n_species_to_skip:]):
    new_dir = split_dict[orig_dir]
    split_files(
        method = 'fastest',
        base_dir = orig_dir,
        new_dir = new_dir,
        out_file = out_file,
        dry_run = False)
    # idx counts species handled in this run only (skipped ones excluded)
    if not (idx + 1) % print_freq:
        print(f'Finished {idx + 1} species in {time.time() - t0} seconds')
t1 = time.time()
print(f'Finished all species in {t1-t0} seconds')

Finished 10 species in 12403.57561993599 seconds


NameError: name 'librosa' is not defined