In [2]:
import librosa
import numpy as np
import os


In [3]:
def process_audio_directory(directory_path, sample_rate=22050, segment_duration=3, n_mfcc=40):
    """
    Concatenate a directory's .wav files, cut fixed-length segments, and compute MFCCs.

    Every entry of ``directory_path`` whose name ends in ``.wav`` (any letter
    case) is loaded with ``librosa.load`` at ``sample_rate``. The waveforms are
    joined end to end in sorted filename order so the result is reproducible
    across runs and machines, and the joined signal is cut into back-to-back
    segments of ``segment_duration`` seconds. A trailing remainder shorter than
    one segment is dropped. Because segmentation happens after concatenation,
    a segment may span the boundary between two files.

    Nothing is written to disk; the caller is responsible for saving the result.

    Args:
        directory_path (str): Path to the directory containing .wav files.
        sample_rate (int): The target sample rate passed to ``librosa.load``.
        segment_duration (int | float): Duration in seconds of each segment.
        n_mfcc (int): Number of MFCC coefficients to extract.

    Returns:
        np.ndarray: A 3D NumPy array containing all extracted MFCC samples.
                    Shape is (num_samples, n_mfcc, num_frames). If the
                    directory has no .wav files, or the audio is shorter than
                    one segment, an empty array of shape (0, n_mfcc, 0) is
                    returned instead.

    Raises:
        ValueError: If ``segment_duration * sample_rate`` is less than one sample.
    """
    # Number of samples per segment; cast so fractional durations (e.g. 2.5 s) work.
    segment_length = int(round(segment_duration * sample_rate))
    if segment_length <= 0:
        raise ValueError(
            "segment_duration * sample_rate must be at least one sample, "
            f"got {segment_duration} * {sample_rate}"
        )

    # 1. Load and concatenate all audio files
    print("Loading and concatenating audio files...")
    # os.listdir returns entries in arbitrary order; sort so that segments
    # spanning file boundaries are identical from run to run.
    wav_names = sorted(
        name for name in os.listdir(directory_path)
        if name.lower().endswith('.wav')
    )
    # Passing sr=sample_rate makes librosa.load resample each file to that rate;
    # only the waveform is kept, the returned rate is redundant.
    full_audio = [
        librosa.load(os.path.join(directory_path, name), sr=sample_rate)[0]
        for name in wav_names
    ]

    if full_audio:
        concatenated_audio = np.concatenate(full_audio)
    else:
        # np.concatenate([]) raises; fall through with an empty signal instead.
        print(f"No .wav files found in {directory_path!r}.")
        concatenated_audio = np.zeros(0, dtype=np.float32)

    # 2. Extract small samples and calculate MFCCs
    print("Extracting segments and calculating MFCCs...")
    # Integer division drops the incomplete segment at the end.
    num_segments = len(concatenated_audio) // segment_length
    mfcc_samples = []
    for index in range(num_segments):
        start = index * segment_length
        segment = concatenated_audio[start:start + segment_length]
        mfcc_samples.append(
            librosa.feature.mfcc(y=segment, sr=sample_rate, n_mfcc=n_mfcc)
        )

    print(f"Total MFCC samples created: {len(mfcc_samples)}")

    if not mfcc_samples:
        # Keep the documented 3D contract even when there is nothing to return.
        return np.empty((0, n_mfcc, 0), dtype=np.float32)

    # Convert list of equally shaped 2D arrays to a single 3D NumPy array
    return np.array(mfcc_samples)



In [9]:
# Root directory: each immediate subdirectory holds the .wav files for one group
audio_directory = 'data/lstm_data'

# Process every real subdirectory in sorted order. Checking os.path.isdir
# (rather than looking for a "." in the name) skips plain files such as the
# generated *_data.npy outputs and does not trip over extension-less files;
# hidden directories (e.g. .ipynb_checkpoints) are skipped explicitly.
for subdir in sorted(os.listdir(audio_directory)):
    subdir_path = os.path.join(audio_directory, subdir)
    if subdir.startswith('.') or not os.path.isdir(subdir_path):
        continue

    print(subdir + "... ", end='')

    # Process the audio and get the training samples
    training_data = process_audio_directory(subdir_path)

    # The shape of the training data is:
    # (number_of_segments, n_mfcc, number_of_frames_per_segment)
    # NOTE(review): common LSTM layers expect (batch, time_steps, features);
    # if frames are the time axis, use training_data.transpose(0, 2, 1)
    # before feeding the network — confirm against the model code.
    print(f"Shape of the final training data: {training_data.shape}")

    # Save next to the source subdirectories, reusing audio_directory so the
    # output location follows the input root if it is changed.
    np.save(os.path.join(audio_directory, f'{subdir}_data.npy'), training_data)

6... Loading and concatenating audio files...
Extracting segments and calculating MFCCs...
Total MFCC samples created: 814
Shape of the final training data: (814, 40, 130)
10... Loading and concatenating audio files...
Extracting segments and calculating MFCCs...
Total MFCC samples created: 172
Shape of the final training data: (172, 40, 130)
8... Loading and concatenating audio files...
Extracting segments and calculating MFCCs...
Total MFCC samples created: 350
Shape of the final training data: (350, 40, 130)
4... Loading and concatenating audio files...
Extracting segments and calculating MFCCs...
Total MFCC samples created: 274
Shape of the final training data: (274, 40, 130)
12... Loading and concatenating audio files...
Extracting segments and calculating MFCCs...
Total MFCC samples created: 230
Shape of the final training data: (230, 40, 130)
