In [1]:
import os
import time
import pydub
from pydub.silence import split_on_silence
from pydub import AudioSegment

#### Establish Sound Processing Criteria

In [2]:
# A pause must last at least this long (ms) before the audio is split on it;
# passed to split_on_silence as min_silence_len.
silence_length_ms = 4 * 1_000
# Offset (dB) added to each recording's dBFS level to obtain the silence
# threshold handed to split_on_silence.
silence_threshold_delta_dBFS = -5

# Only chunks whose length in ms falls within [min, max] are exported.
min_chunk_length_ms = 2 * 1_000
max_chunk_length_ms = 60 * 1_000

#### Identify File Paths Specific to Your Machine

In [3]:
# File Path to Raw Audio
# NOTE: this path has no trailing separator, so file paths built from it need
# os.path.join (or an explicit "/") between the directory and the file name.
raw_path = "./Datasets/mp3_download/cambridge/LUKE"

# Verify Appropriate Files Exist in this Path
# (the listing may also contain non-mp3 entries such as sub-folders)
os.listdir(raw_path)

['31120-20200502-1738.mp3',
 '31120-20200502-1539.mp3',
 '31120-20200502-1808.mp3',
 '31120-20200502-1216.mp3',
 '31120-20200502-1422.mp3',
 '.ipynb_checkpoints',
 '31120-20200502-1452.mp3',
 '31120-20200502-1322.mp3',
 '31120-20200502-1252.mp3',
 'held',
 '31120-20200502-1522.mp3',
 '31120-20200502-1708.mp3']

In [4]:
# File Path for Structured Audio
output_path = "./Datasets/sample_audio/"

# Verify Outpath Path Exists
os.listdir(output_path)

['.DS_Store', 'Old-metro-bos', '.ipynb_checkpoints']

#### Split Archive Feeds into Chunks

In [5]:
# Per-file chunk lists, kept so later cells can inspect or re-export them
aggregate_chunks = []

for raw_file in os.listdir(raw_path):
    # Skip anything that is not an mp3 (checkpoint folders, sub-directories, ...)
    if not raw_file.endswith('.mp3'):
        continue

    print(f"Processing {raw_file}")
    # Start Timer
    start_time = time.time()

    # Pull in File. os.path.join supplies the separator that raw_path lacks;
    # plain string concatenation produced a non-existent path.
    audio_to_split = AudioSegment.from_mp3(os.path.join(raw_path, raw_file))

    # Split based on silence criteria: the threshold is relative to this
    # recording's own dBFS level
    chunks = split_on_silence(audio_to_split,
                              min_silence_len = silence_length_ms,
                              silence_thresh = audio_to_split.dBFS + silence_threshold_delta_dBFS)

    # Calculate Processing Time
    processing_time = round(time.time() - start_time, 0)

    # Return informative statement
    print(f'file {raw_file} required {processing_time} seconds to split into {len(chunks)} pieces \n')

    # Append chunks to master list
    aggregate_chunks.append(chunks)

    # Export only THIS file's chunks. Looping over aggregate_chunks here would
    # re-export every earlier file's chunks under the current file's name.
    start_time = time.time()

    # Number of chunks exported for this file; also numbers the files from 1
    counter = 0
    for chunk in chunks:
        # len(chunk) is compared against the configured ms bounds
        if min_chunk_length_ms <= len(chunk) <= max_chunk_length_ms:
            counter += 1
            chunk.export(os.path.join(output_path, f"{raw_file}sample{counter}.wav"),
                         format = "wav")
    print(f'Exporting {counter} chunks required {time.time()-start_time} seconds')

Processing 31120-20200502-1738.mp3
file 31120-20200502-1738.mp3 required 600.0 seconds to split into 1 pieces 

Processing 31120-20200502-1539.mp3


KeyboardInterrupt: 

#### Export Chunks that meet Criteria

In [None]:
# Inspect the `chunks` variable left over from the splitting cell above
chunks

In [None]:
# Export every chunk in aggregate_chunks whose length lies within the
# configured bounds, numbering the output files sample1.wav, sample2.wav, ...
start_time = time.time()

# Number of chunks exported so far. Starting at 0 and incrementing before each
# export keeps the file numbering at 1.. while making the final report exact
# (starting at 1 over-reported the total by one).
counter = 0
for file_chunks in aggregate_chunks:
    for chunk in file_chunks:
        # len(chunk) is compared against the configured ms bounds
        if min_chunk_length_ms <= len(chunk) <= max_chunk_length_ms:
            counter += 1
            file_name = "sample{}.wav".format(counter)
            chunk.export(os.path.join(output_path, file_name),
                         format = "wav")
print(f'Exporting {counter} chunks required {time.time()-start_time} seconds')

#### Confirm Files Exist

In [None]:
# List the output directory to confirm the exported .wav files are present
os.listdir(output_path)