In [1]:
import os
import time
import pydub
from pydub.silence import split_on_silence
from pydub import AudioSegment

#### Establish Sound Processing Criteria

In [2]:
silence_length_ms = 4_000
silence_threshold_delta_dBFS = -5

min_chunk_length_ms = 2_000
max_chunk_length_ms = 60_000

#### Identify File Paths Specific to Your Machine

In [4]:
# File Path to Raw Audio
raw_path = "./Datasets/mp3_download/cambridge/"

# Verify Appropriate Files Exist in this Path
os.listdir(raw_path)

['\u2068 13467-20200507-2331.mp3',
 '\u2068 13467-20200508-0100.mp3',
 '\u2068 13467-20200508-0001.mp3',
 '\u2068 13467-20200507-2231.mp3',
 '\u2068 13467-20200508-0200.mp3',
 '\u2068 13467-20200508-0030.mp3',
 '\u2068 13467-20200508-0130.mp3',
 '\u2068 13467-20200507-2301.mp3']

In [4]:
# File Path for Structured Audio
output_path = ""

# Verify Outpath Path Exists
os.listdir(output_path)

[]

#### Split Archive Feeds into Chunks

In [5]:
aggregate_chunks = []

for raw_file in os.listdir(raw_path):
    # Start Timer
    start_time = time.time()
    
    # Pull in File
    audio_to_split = AudioSegment.from_mp3(raw_path + raw_file)
    
    # Split based on silence critiera
    chunks = split_on_silence(audio_to_split,
                             min_silence_len = silence_length_ms,
                             silence_thresh = audio_to_split.dBFS + silence_threshold_delta_dBFS)
    
    # Calculate Processing Time
    processing_time = round(time.time() - start_time,0)
    
    # Return informative statement
    print(f'file {raw_file} required {processing_time} seconds to split into {len(chunks)} pieces \n')
    
    # Append chunks to master list
    aggregate_chunks.append(chunks)   

file 202005011636-603577-26120.mp3 required 243.0 seconds to split into 75 pieces 



#### Export Chunks that meet Criteria

In [6]:
start_time = time.time()

counter = 1
for chunk in chunks:
    file_name = "sample{}.wav".format(counter)
    if len(chunk) >= min_chunk_length_ms and len(chunk) <= max_chunk_length_ms:
        chunk.export(output_path + file_name,
                     format = "wav")
        counter+=1
print(f'Exporting {counter} chunks required {time.time()-start_time} seconds')

Exporting 49 chunks required 0.03287005424499512 seconds


#### Confirm Files Exist

In [7]:
os.listdir(output_path)

['sample1.wav',
 'sample10.wav',
 'sample11.wav',
 'sample12.wav',
 'sample13.wav',
 'sample14.wav',
 'sample15.wav',
 'sample16.wav',
 'sample17.wav',
 'sample18.wav',
 'sample19.wav',
 'sample2.wav',
 'sample20.wav',
 'sample21.wav',
 'sample22.wav',
 'sample23.wav',
 'sample24.wav',
 'sample25.wav',
 'sample26.wav',
 'sample27.wav',
 'sample28.wav',
 'sample29.wav',
 'sample3.wav',
 'sample30.wav',
 'sample31.wav',
 'sample32.wav',
 'sample33.wav',
 'sample34.wav',
 'sample35.wav',
 'sample36.wav',
 'sample37.wav',
 'sample38.wav',
 'sample39.wav',
 'sample4.wav',
 'sample40.wav',
 'sample41.wav',
 'sample42.wav',
 'sample43.wav',
 'sample44.wav',
 'sample45.wav',
 'sample46.wav',
 'sample47.wav',
 'sample48.wav',
 'sample5.wav',
 'sample6.wav',
 'sample7.wav',
 'sample8.wav',
 'sample9.wav']