In [1]:
# Import the AudioSegment class for processing audio and the
# split_on_silence function for splitting a track at its silent sections,
# plus NumPy and librosa for the sample array and MFCC extraction.
from pydub import AudioSegment
from pydub.silence import split_on_silence
import numpy as np
import librosa

In [4]:
# Define a function to normalize a chunk to a target amplitude.
def match_target_amplitude(aChunk, target_dBFS):
    '''Normalize an audio chunk to a target loudness.

    Args:
        aChunk: Audio chunk exposing a ``dBFS`` attribute and an
            ``apply_gain(dB)`` method (a pydub ``AudioSegment`` in this
            notebook).
        target_dBFS: Desired loudness of the returned chunk, in dBFS.

    Returns:
        The result of ``aChunk.apply_gain`` called with the gain needed to
        reach ``target_dBFS``, or ``aChunk`` itself when it is digitally
        silent.
    '''
    # A digitally silent chunk reports -inf dBFS; the required gain would
    # then be +inf, which cannot be applied, so hand the chunk back untouched.
    if aChunk.dBFS == float('-inf'):
        return aChunk
    change_in_dBFS = target_dBFS - aChunk.dBFS
    return aChunk.apply_gain(change_in_dBFS)

# Load your audio.
song = AudioSegment.from_file('test.m4a')

# Loudness (in dBFS) that every non-silent chunk is normalized to.
TARGET_DBFS = -20.0

# Split the track wherever silence lasts 3 seconds or more and collect the
# resulting chunks using the imported function.
chunks = split_on_silence(
    # Use the loaded audio.
    song,
    # A silent stretch must be at least 3 seconds (3000 ms) long to split on.
    min_silence_len=3000,
    # Consider audio silent if it's quieter than -30 dBFS.
    # (You may want to adjust this parameter.)
    silence_thresh=-30
)

# Start from an empty segment so re-running this cell does not append to a
# previous result, then join the normalized chunks in order.
final_chunk = AudioSegment.empty()
for chunk in chunks:
    # Normalize the entire chunk (once) before appending it.
    normalized_chunk = match_target_amplitude(chunk, TARGET_DBFS)
    final_chunk += normalized_chunk

final_chunk.export(
    ".//new_final_chunk.wav",
    # NOTE(review): bitrate presumably has no effect on uncompressed WAV
    # output - confirm, then drop it if so.
    bitrate="192k",
    format="wav"
)

# Trim the song, keeping only the first second (pydub slices in milliseconds).
final_song = final_chunk[:1000]

# Convert the clip into a NumPy array. get_array_of_samples() interleaves the
# channels of multi-channel audio (L, R, L, R, ...), which would corrupt any
# later feature extraction, so take the samples from a mono downmix.
song_array = np.array(final_song.set_channels(1).get_array_of_samples())

In [7]:
# Extract MFCCs from the one-second clip.
# librosa works on floating-point samples, so convert the integer PCM data.
song_array = song_array.astype(float)
# Pass the signal by keyword (`y` is keyword-only in current librosa) and give
# the clip's real sample rate; otherwise librosa assumes its 22050 Hz default
# and the mel filterbank is built for the wrong frequencies.
mfccs = librosa.feature.mfcc(y=song_array, sr=final_song.frame_rate)