In [1]:
import tensorflow as tf
import numpy as np
import librosa
import random

# Simulate loading MFCC features from 3 speakers

In [2]:
def generate_speaker_mfccs(num_speakers=3, samples_per_speaker=50, max_len=100, n_mfcc=13):
    data, labels = [], []
    for speaker_id in range(num_speakers):
        for _ in range(samples_per_speaker):
            signal = np.sin(np.linspace(0, 2 * np.pi * (random.uniform(100, 300)), 16000))  # Simulated tone
            mfcc = librosa.feature.mfcc(y=signal.astype(np.float32), sr=16000, n_mfcc=n_mfcc)
            mfcc = mfcc.T[:max_len]  # Trim/pad to max_len
            if mfcc.shape[0] < max_len:
                pad_width = max_len - mfcc.shape[0]
                mfcc = np.pad(mfcc, ((0, pad_width), (0, 0)), mode='constant')
            data.append(mfcc)
            labels.append(speaker_id)
    return np.array(data), tf.keras.utils.to_categorical(labels, num_classes=num_speakers)

# Generate dataset

In [3]:
X, y = generate_speaker_mfccs()

# Split into training and testing sets

In [4]:
split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# Build LSTM-based speaker ID model

In [5]:
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X.shape[1], X.shape[2])),     # (max_len, n_mfcc)
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(y.shape[1], activation='softmax')    # Output: num_speakers
])

# Compile and train

In [6]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, validation_split=0.2)

Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 212ms/step - accuracy: 0.5456 - loss: 1.0958 - val_accuracy: 0.0000e+00 - val_loss: 1.1244
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - accuracy: 0.4987 - loss: 1.0781 - val_accuracy: 0.0000e+00 - val_loss: 1.1633
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - accuracy: 0.5729 - loss: 1.0516 - val_accuracy: 0.0000e+00 - val_loss: 1.2312
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - accuracy: 0.5339 - loss: 1.0108 - val_accuracy: 0.0000e+00 - val_loss: 1.3880
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - accuracy: 0.5299 - loss: 0.9278 - val_accuracy: 0.0000e+00 - val_loss: 2.1939
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - accuracy: 0.5417 - loss: 0.7632 - val_accuracy: 0.0000e+00 - val_loss: 4.9786
Epoch 7/10
[1m3/3[0

<keras.src.callbacks.history.History at 0x148edd3ba40>

# Evaluate on test set

In [7]:
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print(f"🎤 Speaker ID Accuracy: {acc:.2f}")

🎤 Speaker ID Accuracy: 0.00
