In [19]:
# Install required libraries
# pip install librosa scikit-learn soundfile

import os
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models, callbacks
from tensorflow.keras.utils import to_categorical

# Function to extract features from audio data
def extract_features(file_path):
    audio_data, sample_rate = librosa.load(file_path, res_type='kaiser_fast')
    mfccs = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=13)
    return np.mean(mfccs, axis=1)

# Function to collect audio samples and labels from a dataset folder
def collect_data(dataset_folder):
    data = []
    labels = []

    for speaker_folder in os.listdir(dataset_folder):
        speaker_path = os.path.join(dataset_folder, speaker_folder)

        if os.path.isdir(speaker_path):
            for filename in os.listdir(speaker_path):
                if filename.endswith(".wav"):
                    file_path = os.path.join(speaker_path, filename)
                    features = extract_features(file_path)
                    data.append(features)
                    labels.append(speaker_folder)

    return np.array(data), np.array(labels)

# Change the path to the folder where your dataset is stored
dataset_folder = 'audio'

# Collect data
X, y = collect_data(dataset_folder)

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = len(label_encoder.classes_)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Build the CNN model
model = models.Sequential([
    layers.Reshape((13, 1), input_shape=(13,)),
    layers.Conv1D(64, 3, activation='relu'),
    layers.MaxPooling1D(2),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test), callbacks=[callbacks.EarlyStopping(patience=3)])

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_accuracy}')

# Save the model
model.save('speaker_identification_model.h5')

# Save label encoder
np.save('label_encoder.npy', label_encoder.classes_)


ModuleNotFoundError: No module named 'resampy'

This error is lazily reported, having originally occured in
  File C:\Users\braje\anaconda3\Lib\site-packages\librosa\core\audio.py, line 32, in <module>

----> resampy = lazy.load("resampy")

In [14]:
# Install required libraries
# pip install librosa scikit-learn soundfile SpeechRecognition

import os
import numpy as np
import librosa
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import soundfile as sf
import joblib

# Function to extract features from audio data during training
def extract_features(audio_data, sample_rate):
    # Normalize the audio data to the range [0, 1]
    audio_data_normalized = (audio_data - np.min(audio_data)) / (np.max(audio_data) - np.min(audio_data))

    # Extract MFCC features
    # Example with adjusted parameters
    mfccs = librosa.feature.mfcc(y=audio_data_normalized, sr=sample_rate, n_mfcc=13, n_fft=1024, hop_length=512)

    # Flatten MFCC features
    features = mfccs.flatten()

    # Ensure a consistent shape by padding if needed
    max_feature_length = 12 * 13  # Adjust based on the actual feature dimensions
    if len(features) < max_feature_length:
        features = np.pad(features, (0, max_feature_length - len(features)), mode='constant', constant_values=(0, 0))
    else:
        features = features[:max_feature_length]

    return features

# Function to collect audio samples and labels from a dataset folder
def generate_data(dataset_folder, chunk_size=10):
    for speaker_folder in os.listdir(dataset_folder):
        speaker_path = os.path.join(dataset_folder, speaker_folder)

        if os.path.isdir(speaker_path):
            for filename in os.listdir(speaker_path):
                if filename.endswith(".wav"):
                    file_path = os.path.join(speaker_path, filename)

                    # Read the audio file using soundfile
                    audio_data, sample_rate = sf.read(file_path)

                    # Extract features in chunks
                    for i in range(0, len(audio_data), int(sample_rate * chunk_size)):
                        chunk = audio_data[i:i + int(sample_rate * chunk_size)]
                        features = extract_features(chunk, sample_rate)
                        
                        # Calculate padding values
                        pad_length = max(0, max_feature_length - len(features))
                        pad_before = pad_length // 2
                        pad_after = pad_length - pad_before

                        # Pad features
                        features = np.pad(features, (pad_before, pad_after), mode='constant', constant_values=(0, 0))

                        yield features, speaker_folder

                    # Clear variables to free up memory
                    del audio_data
                    del features

# ...



# Main program for training the model
if __name__ == "__main__":
    # Change the path to the folder where your dataset is stored
    dataset_folder = 'audio'

    # Initialize variables
    data = []
    labels = []
    max_feature_length = 0

    # Generate and collect data in chunks
    data_generator = generate_data(dataset_folder)
    for features, label in data_generator:
        data.append(features)
        labels.append(label)
        max_feature_length = max(max_feature_length, len(features))

    # Pad features to ensure a consistent shape
    data = [np.pad(features, (0, max_feature_length - len(features)), mode='constant', constant_values=(0, 0)) for features in data]

    # Convert data and labels to numpy arrays
    data = np.array(data)
    labels = np.array(labels)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

    # Train the model
    clf = RandomForestClassifier(n_estimators=100, random_state=40)
    clf.fit(X_train, y_train)

    # Save the trained model
    joblib.dump(clf, 'speaker_identification_model.joblib')

    # Evaluate the model
    accuracy = clf.score(X_test, y_test)
    print("Accuracy: {:.2f}%".format(accuracy * 100))


  util.MAX_MEM_BLOCK // (np.prod(y_frames.shape[:-1]) * y_frames.itemsize)


Accuracy: 62.50%


In [15]:
from sklearn.metrics import classification_report

# Test the model
y_pred = clf.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

        Ajay       1.00      1.00      1.00         1
      Hardik       0.00      0.00      0.00         3
     Kratika       1.00      1.00      1.00         1
      Sonali       1.00      1.00      1.00         2
     Vedansh       0.25      1.00      0.40         1

    accuracy                           0.62         8
   macro avg       0.65      0.80      0.68         8
weighted avg       0.53      0.62      0.55         8



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
