In [1]:
import os
import time
import pydub
import seaborn as sns
from pydub.silence import split_on_silence
from pydub import AudioSegment

#### Establish Sound Processing Criteria

In [2]:
# --- Silence detection -------------------------------------------------
# Passed to split_on_silence as min_silence_len (milliseconds).
silence_length_ms = 4_000
# Offset (dB) added to each recording's own dBFS to derive silence_thresh.
silence_threshold_delta_dBFS = -5

# --- Chunk length filter -----------------------------------------------
# Only chunks whose length in milliseconds lies within [min, max] are exported.
min_chunk_length_ms = 2_000
max_chunk_length_ms = 60_000

#### Identify File Paths Specific to Your Machine

In [7]:
# File Path to Raw Audio
# Directory that is scanned for input recordings; only entries ending in
# '.mp3' are processed by the splitting cell. The trailing slash lets file
# names be appended directly to this string.
raw_path = "./Datasets/dolby_tests/enhanced/"

# Verify Appropriate Files Exist in this Path
# (displayed as the cell output; raises if the directory does not exist)
os.listdir(raw_path)

['enhanced-25818-20200501-1210.mp3']

In [8]:
# File Path for Structured Audio
# Directory that receives the exported .wav chunks. The trailing slash lets
# file names be appended directly to this string.
output_path = "./Datasets/dolby_tests/chunks/enhanced/"

# Verify Output Path Exists
# (os.listdir raises if the directory is missing, so a listing confirms it)
os.listdir(output_path)

[]

#### Split Archive Feeds into Chunks and Export Chunks that meet Criteria

In [9]:
# Split every .mp3 in raw_path on silence and export the chunks whose length
# falls inside [min_chunk_length_ms, max_chunk_length_ms] as .wav files in
# output_path. Prints per-file and overall progress/timing information.

# Start Timer (overall wall-clock time for the whole run)
start_time = time.time()

# Counter for file names (running total of exported chunks across all files)
counter = 0
# Calculate total length of exported audio, in milliseconds
total_length_to_process = 0

# os.listdir returns entries in arbitrary order; sorting keeps the sample
# numbering reproducible from one run to the next.
for raw_file in sorted(os.listdir(raw_path)):
    if raw_file.endswith('.mp3'):
        print(f"Examine {raw_file}")

        # Time each file on its own so the reported figure does not include
        # the time spent on previously processed files.
        file_start_time = time.time()

        # Pull in File
        audio_to_split = AudioSegment.from_mp3(os.path.join(raw_path, raw_file))

        # Split based on silence criteria; the threshold is relative to this
        # recording's own dBFS level.
        chunks = split_on_silence(audio_to_split,
                                  min_silence_len = silence_length_ms,
                                  silence_thresh = audio_to_split.dBFS + silence_threshold_delta_dBFS)

        # Calculate Processing Time (load + split) for this file only
        processing_time = round(time.time() - file_start_time, 0)

        # Return informative statement
        print(f'File {raw_file} required {processing_time} seconds to split into {len(chunks)} candidate chunks of audio')

        # File name without its extension; splitext keeps any additional
        # dots that are part of the name itself.
        base_name = os.path.splitext(raw_file)[0]

        # Export qualifying chunks
        for chunk in chunks:
            # len() of a chunk is its duration in milliseconds
            sound_length = len(chunk)
            if min_chunk_length_ms <= sound_length <= max_chunk_length_ms:
                total_length_to_process += sound_length
                print(f"this chunk is {sound_length/1_000} seconds long and our count is {counter}")
                counter += 1
                file_name = "sample{}-{}.wav".format(counter, base_name)
                print(f'write this file as {file_name}')
                chunk.export(os.path.join(output_path, file_name),
                             format = "wav")
        print('\n')

print(f'Exporting {counter} qualifying chunks required {time.time()-start_time} seconds')
# Milliseconds -> minutes: 1 minute = 60_000 ms
print(f'Total audio length of {total_length_to_process/60_000} minutes')

Examine enhanced-25818-20200501-1210.mp3
File enhanced-25818-20200501-1210.mp3 required 280.0 seconds to split into 72 candidate chunks of audio
this chunk is 14.592 seconds long and our count is 0
write this file as sample1-enhanced-25818-20200501-1210.wav
this chunk is 14.354 seconds long and our count is 1
write this file as sample2-enhanced-25818-20200501-1210.wav
this chunk is 18.79 seconds long and our count is 2
write this file as sample3-enhanced-25818-20200501-1210.wav
this chunk is 2.466 seconds long and our count is 3
write this file as sample4-enhanced-25818-20200501-1210.wav
this chunk is 2.136 seconds long and our count is 4
write this file as sample5-enhanced-25818-20200501-1210.wav
this chunk is 20.274 seconds long and our count is 5
write this file as sample6-enhanced-25818-20200501-1210.wav
this chunk is 4.567 seconds long and our count is 6
write this file as sample7-enhanced-25818-20200501-1210.wav
this chunk is 5.133 seconds long and our count is 7
write this file 

#### Confirm Files Exist

In [None]:
# List the output directory so the exported chunk files can be checked by eye
os.listdir(output_path)