In [45]:
from pydub import AudioSegment
import wave
import os

# Input .wav file
input_file = "prakash.wav"

# Output folder to save the segments
output_folder = "samples/prakash"
os.makedirs(output_folder, exist_ok=True)

# Load the audio file
audio = AudioSegment.from_wav(input_file)

# Duration of each segment in milliseconds (0.01 seconds)
segment_duration = 2000

# Total duration of the audio in milliseconds
total_duration = len(audio)

# Calculate the number of segments
num_segments = total_duration // segment_duration

# Split the audio into segments
for i in range(num_segments):
    # Calculate the start and end time of the segment
    start_time = i * segment_duration
    end_time = (i + 1) * segment_duration
    
    # Extract the segment
    segment = audio[start_time:end_time]
    
    # Save the segment to a new .wav file
    output_file = os.path.join(output_folder, f"{i}.wav")
    segment.export(output_file, format="wav")

print(f"Split {input_file} into {num_segments} segments in {output_folder}.")


Split prakash.wav into 7 segments in samples/prakash.


In [46]:

import os
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models

# Function to extract spectrogram features from audio file
def extract_features(file_path, n_mels=128, n_fft=2048, hop_length=512):
    signal, sr = librosa.load(file_path, sr=None)
    mel_spec = librosa.feature.melspectrogram(y=signal, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
    return log_mel_spec


# Define a function to load data from the given path
def load_data(path):
    features = []
    labels = []
    for speaker_folder in os.listdir(path):
        speaker_path = os.path.join(path, speaker_folder)
        for audio_file in os.listdir(speaker_path):
            file_path = os.path.join(speaker_path, audio_file)
            features.append(extract_features(file_path))
            labels.append(speaker_folder)
    return np.array(features), np.array(labels)

# Load data
data_path = "samples"
X, y = load_data(data_path)

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Add channel dimension for CNN
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

# Build CNN model
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=X_train[0].shape),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(len(label_encoder.classes_), activation='softmax')
])

# Compile model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

for i in range(10):
    # Train model
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

    # Evaluate model
    test_loss, test_acc = model.evaluate(X_test, y_test)
    print(f"Test accuracy: {test_acc}")

# Save the model
model.save("speaker_identification_model.h5")

Epoch 1/10


  super().__init__(
2024-04-15 23:34:53.767450: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 30336768 exceeds 10% of free system memory.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.5455 - loss: 1.1730 - val_accuracy: 0.6667 - val_loss: 95.0443
Epoch 2/10


2024-04-15 23:34:53.980519: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 29162880 exceeds 10% of free system memory.
2024-04-15 23:34:54.016703: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 30336768 exceeds 10% of free system memory.
2024-04-15 23:34:54.178387: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 30336768 exceeds 10% of free system memory.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 207ms/step - accuracy: 0.4545 - loss: 153.5219 - val_accuracy: 0.6667 - val_loss: 24.6747
Epoch 3/10


2024-04-15 23:34:54.276179: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 29162880 exceeds 10% of free system memory.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 207ms/step - accuracy: 0.4545 - loss: 39.7213 - val_accuracy: 0.3333 - val_loss: 31.7319
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 201ms/step - accuracy: 0.5455 - loss: 21.3652 - val_accuracy: 0.3333 - val_loss: 22.0314
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 200ms/step - accuracy: 0.5455 - loss: 14.8992 - val_accuracy: 0.3333 - val_loss: 6.8983
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 293ms/step - accuracy: 0.5455 - loss: 4.5525 - val_accuracy: 0.6667 - val_loss: 3.1592
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 191ms/step - accuracy: 0.4545 - loss: 5.1281 - val_accuracy: 0.6667 - val_loss: 1.4713
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 205ms/step - accuracy: 0.4545 - loss: 2.4143 - val_accuracy: 0.3333 - val_loss: 2.3415
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━



Test accuracy: 1.0


In [49]:
from tensorflow.keras.models import load_model
import librosa
import numpy as np

# Load the saved model
model = load_model('speaker_identification_model.h5')

# Function to extract spectrogram features from audio file
def extract_features(file_path, n_mels=128, n_fft=2048, hop_length=512):
    signal, sr = librosa.load(file_path, sr=None)
    mel_spec = librosa.feature.melspectrogram(y=signal, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
    return log_mel_spec

# Example audio file for prediction
sample_file_path = "/media/asifr/work/VA-AI-Backend/samples/xomu/2.wav"
sample_features = extract_features(sample_file_path)
sample_features = np.expand_dims(sample_features, axis=-1)  # Add channel dimension for CNN
sample_features = np.expand_dims(sample_features, axis=0)   # Add batch dimension

# Make prediction
predictions = model.predict(sample_features)
predicted_speaker_index = np.argmax(predictions)
print(predictions)
predicted_speaker = label_encoder.inverse_transform([predicted_speaker_index])[0]
print("Predicted Speaker:", predicted_speaker)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[[5.2252930e-04 9.9947745e-01]]
Predicted Speaker: xomu


In [41]:
from tensorflow.keras.models import load_model
import librosa
import numpy as np

# Load the saved model
model = load_model('speaker_identification_model.h5')

# Function to extract spectrogram features from audio file
def extract_features(file_path, n_mels=128, n_fft=2048, hop_length=512):
    signal, sr = librosa.load(file_path, sr=None)
    mel_spec = librosa.feature.melspectrogram(y=signal, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
    return log_mel_spec

# Example audio file for prediction
sample_file_path = "/media/asifr/work/VA-AI-Backend/xomu.wav"
sample_features = extract_features(sample_file_path)
sample_features = np.expand_dims(sample_features, axis=-1)  # Add channel dimension for CNN
sample_features = np.expand_dims(sample_features, axis=0)   # Add batch dimension

# Make prediction
predictions = model.predict(sample_features)
predicted_speaker_index = np.argmax(predictions)
print(predictions)
predicted_speaker = label_encoder.inverse_transform([predicted_speaker_index])[0]
print("Predicted Speaker:", predicted_speaker)



ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "dense" is incompatible with the layer: expected axis -1 of input shape to have value 3584, but received input with shape (1, 284928)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(1, 128, 1292, 1), dtype=float32)
  • training=False
  • mask=None