In [1]:
import os
import librosa
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Extracting MFCC features from an audio file
def extract_mfcc(file_path, n_mfcc=13):
    """Return the per-coefficient mean of the MFCCs of one audio file.

    Args:
        file_path: Path of the audio file, handed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients requested from librosa.

    Returns:
        The MFCC matrix averaged along its second axis, i.e. one mean
        value per row of what ``librosa.feature.mfcc`` returns
        (presumably one row per coefficient -- see the librosa docs).
    """
    # sr=None is forwarded to librosa.load; per the librosa docs this keeps
    # the file's native sampling rate instead of resampling.
    signal, rate = librosa.load(file_path, sr=None)
    coefficients = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=n_mfcc)

    # Transpose, then average over the leading axis.
    transposed = coefficients.T
    return np.mean(transposed, axis=0)

# Function to process the dataset and extract MFCC features for all audio files
def process_dataset(directory_path):
    """Extract MFCC features for every ``.wav`` file under a dataset folder.

    Each immediate sub-directory of ``directory_path`` is treated as one
    speaker; its name becomes the label of every ``.wav`` file it contains.
    Entries that are not directories, and directories whose name starts
    with ``'_'``, are skipped.

    Args:
        directory_path: Root folder holding one sub-directory per speaker.

    Returns:
        A ``pandas.DataFrame`` with one row per audio file: the values
        returned by ``extract_mfcc`` as columns, plus a ``'label'`` column
        holding the speaker directory name. With no matching files the
        frame has zero rows and only the ``'label'`` column.
    """
    mfcc_features = []
    labels = []

    # Sort speakers so the row order does not depend on the filesystem.
    for speaker in sorted(os.listdir(directory_path)):
        speaker_path = os.path.join(directory_path, speaker)
        if speaker.startswith('_') or not os.path.isdir(speaker_path):
            continue

        # os.listdir returns entries in arbitrary order; sort the files too
        # so the rows within each speaker are reproducible between runs.
        for file_name in sorted(os.listdir(speaker_path)):
            if not file_name.endswith('.wav'):
                continue
            file_path = os.path.join(speaker_path, file_name)
            mfcc_features.append(extract_mfcc(file_path))
            labels.append(speaker)

    df = pd.DataFrame(mfcc_features)
    df['label'] = labels
    return df

# Define dataset path
dataset_path = "16000_pcm_speeches"  # Change to actual dataset path

df = process_dataset(dataset_path)

# Get unique speakers and shuffle them with a fixed seed so the
# train/test speaker assignment is the same on every run.
unique_speakers = df['label'].unique()
np.random.seed(42)
np.random.shuffle(unique_speakers)  # Shuffle speakers randomly (in place)

# The split below indexes positions 0-4, so at least five speakers are needed.
# Fail with an explicit message instead of a bare out-of-bounds error.
if len(unique_speakers) < 5:
    raise IndexError(
        f"Need at least 5 speakers for the 4/1 split, found "
        f"{len(unique_speakers)} in {dataset_path!r}"
    )

# Assign 4 speakers for training and 1 speaker for testing.
# Any speakers beyond the first five after shuffling are not used.
train_speakers = unique_speakers[:4]
test_speaker = unique_speakers[4]

# Split data
# NOTE(review): the test speaker is disjoint from the training speakers, so
# the label in y_test never occurs in y_train -- confirm this is intended.
train_df = df[df['label'].isin(train_speakers)]
test_df = df[df['label'] == test_speaker]

# Separate features and labels
X_train = train_df.drop('label', axis=1).values
y_train = train_df['label'].values
X_test = test_df.drop('label', axis=1).values
y_test = test_df['label'].values

# Reshape for CNN input: (samples, features, 1).
# Take the feature count from the data rather than hard-coding 13, so the
# reshape stays valid if the number of extracted MFCCs changes.
n_features = X_train.shape[1]
X_train_cnn = X_train.reshape(X_train.shape[0], n_features, 1)
X_test_cnn = X_test.reshape(X_test.shape[0], n_features, 1)

print("Training data shape:", X_train_cnn.shape)
print("Test data shape:", X_test_cnn.shape)

def get_train_test_data():
    """Return the module-level split prepared above.

    Returns:
        The tuple ``(X_train_cnn, X_test_cnn, y_train, y_test)``.
    """
    split = (X_train_cnn, X_test_cnn, y_train, y_test)
    return split


Training data shape: (6001, 13, 1)
Test data shape: (1500, 13, 1)
