In [1]:
import os
import time
import pydub
import seaborn as sns
from pydub.silence import split_on_silence
from pydub import AudioSegment

#### Establish Sound Processing Criteria

In [2]:
silence_length_ms = 4_000
silence_threshold_delta_dBFS = -5

min_chunk_length_ms = 2_000
max_chunk_length_ms = 60_000

#### Identify File Paths Specific to Your Machine

In [3]:
# File Path to Raw Audio
raw_path = "./Datasets/mp3_download/Alex/"

# Verify Appropriate Files Exist in this Path
os.listdir(raw_path)

['25818-20200501-0413.mp3',
 '25818-20200501-1240.mp3',
 '25818-20200501-0941.mp3',
 '25818-20200501-1040.mp3',
 '25818-20200501-0612.mp3',
 '25818-20200501-0213.mp3',
 '25818-20200501-1509.mp3',
 '25818-20200501-0712.mp3',
 '25818-20200501-1140.mp3',
 '25818-20200501-0841.mp3',
 '25818-20200501-0512.mp3',
 '25818-20200501-0113.mp3',
 '25818-20200501-1409.mp3',
 '25818-20200501-0313.mp3',
 '25818-20200501-1439.mp3',
 '25818-20200501-0243.mp3',
 '25818-20200501-1210.mp3',
 '25818-20200501-0442.mp3',
 '25818-20200501-0911.mp3',
 '25818-20200501-0044.mp3',
 '25818-20200501-0642.mp3',
 '25818-20200501-1011.mp3',
 '25818-20200501-0143.mp3',
 '25818-20200501-0343.mp3',
 '25818-20200501-1110.mp3',
 '25818-20200501-0811.mp3',
 '25818-20200501-1339.mp3',
 '25818-20200501-0741.mp3',
 '25818-20200501-0542.mp3',
 '25818-20200501-1310.mp3']

In [4]:
# File Path for Structured Audio
output_path = "./Datasets/sample_audio/Alex/"

# Verify Outpath Path Exists
os.listdir(output_path)

['untitled copy.txt']

#### Split Archive Feeds into Chunks and Export Chunks that meet Criteria

In [6]:
# Start Timer
start_time = time.time()

# Counter for file names
counter = 0
# Calculat total length of audio
total_length_to_process = 0

for raw_file in os.listdir(raw_path): 
    if raw_file.endswith('.mp3'):
        print(f"Examine {raw_file}")

        # Pull in File
        audio_to_split = AudioSegment.from_mp3(raw_path + raw_file)

        # Split based on silence critiera
        chunks = split_on_silence(audio_to_split,
                                 min_silence_len = silence_length_ms,
                                 silence_thresh = audio_to_split.dBFS + silence_threshold_delta_dBFS)

        # Calculate Processing Time
        processing_time = round(time.time() - start_time,0)

        # Return informative statement
        print(f'File {raw_file} required {processing_time} seconds to split into {len(chunks)} candidate chunks of audio')

        # Export qualifying chunks
        for chunk in chunks:
            sound_length = len(chunk)
            if sound_length >= min_chunk_length_ms and sound_length <= max_chunk_length_ms:
                total_length_to_process += len(chunk)
                print(f"this chunk is {len(chunk)/1_000} seconds long and our count is {counter}")
                counter +=1
                file_name = "sample{}-{}.wav".format(counter,raw_file.split(".")[0])
                print(f'write this file as {file_name}')
                chunk.export(output_path + file_name,
                         format = "wav")
        print('\n')
    
print(f'Exporting {counter} qualifying chunks required {time.time()-start_time} seconds') 
print(f'Total audio length of {total_length_to_process/36_000} minutes')

Examine 25818-20200501-0413.mp3
File 25818-20200501-0413.mp3 required 443.0 seconds to split into 38 candidate chunks of audio
this chunk is 2.7 seconds long and our count is 0
write this file as sample1-25818-20200501-0413.wav
this chunk is 17.118 seconds long and our count is 1
write this file as sample2-25818-20200501-0413.wav
this chunk is 18.559 seconds long and our count is 2
write this file as sample3-25818-20200501-0413.wav
this chunk is 4.065 seconds long and our count is 3
write this file as sample4-25818-20200501-0413.wav
this chunk is 14.434 seconds long and our count is 4
write this file as sample5-25818-20200501-0413.wav
this chunk is 8.232 seconds long and our count is 5
write this file as sample6-25818-20200501-0413.wav
this chunk is 7.303 seconds long and our count is 6
write this file as sample7-25818-20200501-0413.wav
this chunk is 22.608 seconds long and our count is 7
write this file as sample8-25818-20200501-0413.wav
this chunk is 13.648 seconds long and our count

File 25818-20200501-0941.mp3 required 1251.0 seconds to split into 57 candidate chunks of audio
this chunk is 10.23 seconds long and our count is 90
write this file as sample91-25818-20200501-0941.wav
this chunk is 5.506 seconds long and our count is 91
write this file as sample92-25818-20200501-0941.wav
this chunk is 3.046 seconds long and our count is 92
write this file as sample93-25818-20200501-0941.wav
this chunk is 3.907 seconds long and our count is 93
write this file as sample94-25818-20200501-0941.wav
this chunk is 9.931 seconds long and our count is 94
write this file as sample95-25818-20200501-0941.wav
this chunk is 12.165 seconds long and our count is 95
write this file as sample96-25818-20200501-0941.wav
this chunk is 11.986 seconds long and our count is 96
write this file as sample97-25818-20200501-0941.wav
this chunk is 11.597 seconds long and our count is 97
write this file as sample98-25818-20200501-0941.wav
this chunk is 15.235 seconds long and our count is 98
write t

File 25818-20200501-0612.mp3 required 1911.0 seconds to split into 43 candidate chunks of audio
this chunk is 2.355 seconds long and our count is 194
write this file as sample195-25818-20200501-0612.wav
this chunk is 15.08 seconds long and our count is 195
write this file as sample196-25818-20200501-0612.wav
this chunk is 2.029 seconds long and our count is 196
write this file as sample197-25818-20200501-0612.wav
this chunk is 6.996 seconds long and our count is 197
write this file as sample198-25818-20200501-0612.wav
this chunk is 8.498 seconds long and our count is 198
write this file as sample199-25818-20200501-0612.wav
this chunk is 6.564 seconds long and our count is 199
write this file as sample200-25818-20200501-0612.wav
this chunk is 9.46 seconds long and our count is 200
write this file as sample201-25818-20200501-0612.wav
this chunk is 2.208 seconds long and our count is 201
write this file as sample202-25818-20200501-0612.wav
this chunk is 12.226 seconds long and our count i

File 25818-20200501-0712.mp3 required 2747.0 seconds to split into 35 candidate chunks of audio
this chunk is 12.627 seconds long and our count is 334
write this file as sample335-25818-20200501-0712.wav
this chunk is 7.42 seconds long and our count is 335
write this file as sample336-25818-20200501-0712.wav
this chunk is 13.21 seconds long and our count is 336
write this file as sample337-25818-20200501-0712.wav
this chunk is 3.864 seconds long and our count is 337
write this file as sample338-25818-20200501-0712.wav
this chunk is 14.34 seconds long and our count is 338
write this file as sample339-25818-20200501-0712.wav
this chunk is 9.157 seconds long and our count is 339
write this file as sample340-25818-20200501-0712.wav
this chunk is 6.624 seconds long and our count is 340
write this file as sample341-25818-20200501-0712.wav
this chunk is 3.763 seconds long and our count is 341
write this file as sample342-25818-20200501-0712.wav
this chunk is 11.887 seconds long and our count 

File 25818-20200501-0841.mp3 required 3302.0 seconds to split into 56 candidate chunks of audio
this chunk is 14.053 seconds long and our count is 420
write this file as sample421-25818-20200501-0841.wav
this chunk is 4.207 seconds long and our count is 421
write this file as sample422-25818-20200501-0841.wav
this chunk is 14.169 seconds long and our count is 422
write this file as sample423-25818-20200501-0841.wav
this chunk is 7.342 seconds long and our count is 423
write this file as sample424-25818-20200501-0841.wav
this chunk is 3.195 seconds long and our count is 424
write this file as sample425-25818-20200501-0841.wav
this chunk is 2.492 seconds long and our count is 425
write this file as sample426-25818-20200501-0841.wav
this chunk is 14.842 seconds long and our count is 426
write this file as sample427-25818-20200501-0841.wav
this chunk is 5.189 seconds long and our count is 427
write this file as sample428-25818-20200501-0841.wav
this chunk is 19.314 seconds long and our cou

File 25818-20200501-1409.mp3 required 4135.0 seconds to split into 71 candidate chunks of audio
this chunk is 7.882 seconds long and our count is 544
write this file as sample545-25818-20200501-1409.wav
this chunk is 8.155 seconds long and our count is 545
write this file as sample546-25818-20200501-1409.wav
this chunk is 7.197 seconds long and our count is 546
write this file as sample547-25818-20200501-1409.wav
this chunk is 7.073 seconds long and our count is 547
write this file as sample548-25818-20200501-1409.wav
this chunk is 24.937 seconds long and our count is 548
write this file as sample549-25818-20200501-1409.wav
this chunk is 24.759 seconds long and our count is 549
write this file as sample550-25818-20200501-1409.wav
this chunk is 24.975 seconds long and our count is 550
write this file as sample551-25818-20200501-1409.wav
this chunk is 15.835 seconds long and our count is 551
write this file as sample552-25818-20200501-1409.wav
this chunk is 2.881 seconds long and our cou

File 25818-20200501-1439.mp3 required 4653.0 seconds to split into 82 candidate chunks of audio
this chunk is 14.861 seconds long and our count is 627
write this file as sample628-25818-20200501-1439.wav
this chunk is 4.572 seconds long and our count is 628
write this file as sample629-25818-20200501-1439.wav
this chunk is 2.398 seconds long and our count is 629
write this file as sample630-25818-20200501-1439.wav
this chunk is 7.257 seconds long and our count is 630
write this file as sample631-25818-20200501-1439.wav
this chunk is 10.073 seconds long and our count is 631
write this file as sample632-25818-20200501-1439.wav
this chunk is 8.698 seconds long and our count is 632
write this file as sample633-25818-20200501-1439.wav
this chunk is 4.609 seconds long and our count is 633
write this file as sample634-25818-20200501-1439.wav
this chunk is 17.966 seconds long and our count is 634
write this file as sample635-25818-20200501-1439.wav
this chunk is 12.813 seconds long and our cou

File 25818-20200501-1210.mp3 required 5170.0 seconds to split into 70 candidate chunks of audio
this chunk is 32.733 seconds long and our count is 712
write this file as sample713-25818-20200501-1210.wav
this chunk is 20.278 seconds long and our count is 713
write this file as sample714-25818-20200501-1210.wav
this chunk is 4.148 seconds long and our count is 714
write this file as sample715-25818-20200501-1210.wav
this chunk is 2.158 seconds long and our count is 715
write this file as sample716-25818-20200501-1210.wav
this chunk is 20.207 seconds long and our count is 716
write this file as sample717-25818-20200501-1210.wav
this chunk is 4.306 seconds long and our count is 717
write this file as sample718-25818-20200501-1210.wav
this chunk is 6.037 seconds long and our count is 718
write this file as sample719-25818-20200501-1210.wav
this chunk is 5.224 seconds long and our count is 719
write this file as sample720-25818-20200501-1210.wav
this chunk is 10.149 seconds long and our cou

File 25818-20200501-0911.mp3 required 5685.0 seconds to split into 77 candidate chunks of audio
this chunk is 2.175 seconds long and our count is 788
write this file as sample789-25818-20200501-0911.wav
this chunk is 19.937 seconds long and our count is 789
write this file as sample790-25818-20200501-0911.wav
this chunk is 8.43 seconds long and our count is 790
write this file as sample791-25818-20200501-0911.wav
this chunk is 2.18 seconds long and our count is 791
write this file as sample792-25818-20200501-0911.wav
this chunk is 5.598 seconds long and our count is 792
write this file as sample793-25818-20200501-0911.wav
this chunk is 10.467 seconds long and our count is 793
write this file as sample794-25818-20200501-0911.wav
this chunk is 2.547 seconds long and our count is 794
write this file as sample795-25818-20200501-0911.wav
this chunk is 5.125 seconds long and our count is 795
write this file as sample796-25818-20200501-0911.wav
this chunk is 6.486 seconds long and our count i

File 25818-20200501-0642.mp3 required 6208.0 seconds to split into 39 candidate chunks of audio
this chunk is 2.486 seconds long and our count is 895
write this file as sample896-25818-20200501-0642.wav
this chunk is 3.085 seconds long and our count is 896
write this file as sample897-25818-20200501-0642.wav
this chunk is 16.024 seconds long and our count is 897
write this file as sample898-25818-20200501-0642.wav
this chunk is 8.423 seconds long and our count is 898
write this file as sample899-25818-20200501-0642.wav
this chunk is 24.807 seconds long and our count is 899
write this file as sample900-25818-20200501-0642.wav
this chunk is 4.078 seconds long and our count is 900
write this file as sample901-25818-20200501-0642.wav
this chunk is 11.032 seconds long and our count is 901
write this file as sample902-25818-20200501-0642.wav
this chunk is 21.861 seconds long and our count is 902
write this file as sample903-25818-20200501-0642.wav
this chunk is 2.167 seconds long and our cou

File 25818-20200501-0343.mp3 required 6986.0 seconds to split into 27 candidate chunks of audio
this chunk is 16.015 seconds long and our count is 1006
write this file as sample1007-25818-20200501-0343.wav
this chunk is 11.923 seconds long and our count is 1007
write this file as sample1008-25818-20200501-0343.wav
this chunk is 8.827 seconds long and our count is 1008
write this file as sample1009-25818-20200501-0343.wav
this chunk is 10.935 seconds long and our count is 1009
write this file as sample1010-25818-20200501-0343.wav
this chunk is 6.523 seconds long and our count is 1010
write this file as sample1011-25818-20200501-0343.wav
this chunk is 15.813 seconds long and our count is 1011
write this file as sample1012-25818-20200501-0343.wav
this chunk is 2.485 seconds long and our count is 1012
write this file as sample1013-25818-20200501-0343.wav
this chunk is 10.667 seconds long and our count is 1013
write this file as sample1014-25818-20200501-0343.wav
this chunk is 4.885 seconds

File 25818-20200501-1339.mp3 required 7757.0 seconds to split into 78 candidate chunks of audio
this chunk is 19.574 seconds long and our count is 1110
write this file as sample1111-25818-20200501-1339.wav
this chunk is 6.349 seconds long and our count is 1111
write this file as sample1112-25818-20200501-1339.wav
this chunk is 28.719 seconds long and our count is 1112
write this file as sample1113-25818-20200501-1339.wav
this chunk is 2.038 seconds long and our count is 1113
write this file as sample1114-25818-20200501-1339.wav
this chunk is 6.812 seconds long and our count is 1114
write this file as sample1115-25818-20200501-1339.wav
this chunk is 9.091 seconds long and our count is 1115
write this file as sample1116-25818-20200501-1339.wav
this chunk is 9.663 seconds long and our count is 1116
write this file as sample1117-25818-20200501-1339.wav
this chunk is 2.56 seconds long and our count is 1117
write this file as sample1118-25818-20200501-1339.wav
this chunk is 21.712 seconds lo

File 25818-20200501-0542.mp3 required 8272.0 seconds to split into 33 candidate chunks of audio
this chunk is 21.709 seconds long and our count is 1210
write this file as sample1211-25818-20200501-0542.wav
this chunk is 2.222 seconds long and our count is 1211
write this file as sample1212-25818-20200501-0542.wav
this chunk is 11.367 seconds long and our count is 1212
write this file as sample1213-25818-20200501-0542.wav
this chunk is 4.558 seconds long and our count is 1213
write this file as sample1214-25818-20200501-0542.wav
this chunk is 3.531 seconds long and our count is 1214
write this file as sample1215-25818-20200501-0542.wav
this chunk is 16.65 seconds long and our count is 1215
write this file as sample1216-25818-20200501-0542.wav
this chunk is 4.042 seconds long and our count is 1216
write this file as sample1217-25818-20200501-0542.wav
this chunk is 3.07 seconds long and our count is 1217
write this file as sample1218-25818-20200501-0542.wav
this chunk is 19.342 seconds lo

#### Confirm Files Exist

In [None]:
os.listdir(output_path)