In [1]:
import os
import time
import pydub
import seaborn as sns
from pydub.silence import split_on_silence
from pydub import AudioSegment

#### Establish Sound Processing Criteria

In [2]:
silence_length_ms = 4_000
silence_threshold_delta_dBFS = -5

min_chunk_length_ms = 2_000
max_chunk_length_ms = 60_000

#### Identify File Paths Specific to Your Machine

In [3]:
# File Path to Raw Audio
raw_path = "./Datasets/dolby_tests/enhanced/"

# Verify Appropriate Files Exist in this Path
os.listdir(raw_path)

['enhanced-25818-20200502-1531.mp3',
 'enhanced-25818-20200502-1332.mp3',
 '.DS_Store',
 'enhanced-25818-20200502-1431.mp3',
 'enhanced-25818-20200502-1222.mp3',
 'enhanced-25818-20200502-1401.mp3',
 'enhanced-25818-20200502-1501.mp3']

In [4]:
# File Path for Structured Audio
output_path = "./Datasets/dolby_tests/chunks/enhanced/"

# Verify Outpath Path Exists
os.listdir(output_path)

['sample27-enhanced-25818-20200501-1210.wav',
 'sample4-enhanced-25818-20200501-1210.wav',
 'sample10-enhanced-25818-20200501-1210.wav',
 'sample38-enhanced-25818-20200501-1210.wav',
 'sample49-enhanced-25818-20200501-1210.wav',
 'sample23-enhanced-25818-20200501-1210.wav',
 'sample14-enhanced-25818-20200501-1210.wav',
 'sample45-enhanced-25818-20200501-1210.wav',
 'sample34-enhanced-25818-20200501-1210.wav',
 'sample8-enhanced-25818-20200501-1210.wav',
 'sample18-enhanced-25818-20200501-1210.wav',
 'sample41-enhanced-25818-20200501-1210.wav',
 'sample30-enhanced-25818-20200501-1210.wav',
 'sample28-enhanced-25818-20200501-1210.wav',
 'sample46-enhanced-25818-20200501-1210.wav',
 'sample37-enhanced-25818-20200501-1210.wav',
 'sample42-enhanced-25818-20200501-1210.wav',
 'sample33-enhanced-25818-20200501-1210.wav',
 'sample24-enhanced-25818-20200501-1210.wav',
 'sample7-enhanced-25818-20200501-1210.wav',
 'sample13-enhanced-25818-20200501-1210.wav',
 'sample3-enhanced-25818-20200501-121

#### Split Archive Feeds into Chunks and Export Chunks that meet Criteria

In [5]:
# Start Timer
start_time = time.time()

# Counter for file names
counter = 0
# Calculat total length of audio
total_length_to_process = 0

for raw_file in os.listdir(raw_path): 
    if raw_file.endswith('.mp3'):
        print(f"Examine {raw_file}")

        # Pull in File
        audio_to_split = AudioSegment.from_mp3(raw_path + raw_file)

        # Split based on silence critiera
        chunks = split_on_silence(audio_to_split,
                                 min_silence_len = silence_length_ms,
                                 silence_thresh = audio_to_split.dBFS + silence_threshold_delta_dBFS)

        # Calculate Processing Time
        processing_time = round(time.time() - start_time,0)

        # Return informative statement
        print(f'File {raw_file} required {processing_time} seconds to split into {len(chunks)} candidate chunks of audio')

        # Export qualifying chunks
        for chunk in chunks:
            sound_length = len(chunk)
            if sound_length >= min_chunk_length_ms and sound_length <= max_chunk_length_ms:
                total_length_to_process += len(chunk)
                print(f"this chunk is {len(chunk)/1_000} seconds long and our count is {counter}")
                counter +=1
                file_name = "sample{}-{}.wav".format(counter,raw_file.split(".")[0])
                print(f'write this file as {file_name}')
                chunk.export(output_path + file_name,
                         format = "wav")
        print('\n')
    
print(f'Exporting {counter} qualifying chunks required {time.time()-start_time} seconds') 
print(f'Total audio length of {total_length_to_process/36_000} minutes')

Examine enhanced-25818-20200502-1531.mp3
File enhanced-25818-20200502-1531.mp3 required 311.0 seconds to split into 98 candidate chunks of audio
this chunk is 4.155 seconds long and our count is 0
write this file as sample1-enhanced-25818-20200502-1531.wav
this chunk is 6.606 seconds long and our count is 1
write this file as sample2-enhanced-25818-20200502-1531.wav
this chunk is 14.95 seconds long and our count is 2
write this file as sample3-enhanced-25818-20200502-1531.wav
this chunk is 2.478 seconds long and our count is 3
write this file as sample4-enhanced-25818-20200502-1531.wav
this chunk is 30.665 seconds long and our count is 4
write this file as sample5-enhanced-25818-20200502-1531.wav
this chunk is 22.227 seconds long and our count is 5
write this file as sample6-enhanced-25818-20200502-1531.wav
this chunk is 47.774 seconds long and our count is 6
write this file as sample7-enhanced-25818-20200502-1531.wav
this chunk is 6.103 seconds long and our count is 7
write this file 

File enhanced-25818-20200502-1431.mp3 required 922.0 seconds to split into 78 candidate chunks of audio
this chunk is 6.94 seconds long and our count is 128
write this file as sample129-enhanced-25818-20200502-1431.wav
this chunk is 4.532 seconds long and our count is 129
write this file as sample130-enhanced-25818-20200502-1431.wav
this chunk is 6.272 seconds long and our count is 130
write this file as sample131-enhanced-25818-20200502-1431.wav
this chunk is 14.793 seconds long and our count is 131
write this file as sample132-enhanced-25818-20200502-1431.wav
this chunk is 56.766 seconds long and our count is 132
write this file as sample133-enhanced-25818-20200502-1431.wav
this chunk is 25.611 seconds long and our count is 133
write this file as sample134-enhanced-25818-20200502-1431.wav
this chunk is 5.832 seconds long and our count is 134
write this file as sample135-enhanced-25818-20200502-1431.wav
this chunk is 6.475 seconds long and our count is 135
write this file as sample136

File enhanced-25818-20200502-1401.mp3 required 1382.0 seconds to split into 79 candidate chunks of audio
this chunk is 4.011 seconds long and our count is 248
write this file as sample249-enhanced-25818-20200502-1401.wav
this chunk is 2.935 seconds long and our count is 249
write this file as sample250-enhanced-25818-20200502-1401.wav
this chunk is 4.865 seconds long and our count is 250
write this file as sample251-enhanced-25818-20200502-1401.wav
this chunk is 2.452 seconds long and our count is 251
write this file as sample252-enhanced-25818-20200502-1401.wav
this chunk is 14.51 seconds long and our count is 252
write this file as sample253-enhanced-25818-20200502-1401.wav
this chunk is 5.412 seconds long and our count is 253
write this file as sample254-enhanced-25818-20200502-1401.wav
this chunk is 11.355 seconds long and our count is 254
write this file as sample255-enhanced-25818-20200502-1401.wav
this chunk is 9.97 seconds long and our count is 255
write this file as sample256-

#### Confirm Files Exist

In [None]:
os.listdir(output_path)