In [8]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import os
import librosa
from sklearn.model_selection import train_test_split

class VoiceClassifier:
    def __init__(self, data_dir='dataset', test_size=0.2, desired_shape=(128, 128), sr=22050, hop_length=512, n_mels=128):
        self.data_dir = data_dir
        self.test_size = test_size
        self.desired_shape = desired_shape
        self.sr = sr
        self.hop_length = hop_length
        self.n_mels = n_mels
        self.labels = {}
        self.model = None
        self.input_shape = None

    def create_model(self, input_shape, num_classes):
        model = models.Sequential([
            layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
            layers.MaxPooling2D((2, 2)),
            layers.Conv2D(64, (3, 3), activation='relu'),
            layers.MaxPooling2D((2, 2)),
            layers.Flatten(),
            layers.Dense(64, activation='relu'),
            layers.Dense(num_classes, activation='softmax')
        ])
        return model

    def preprocess_data(self):
        X = []
        y = []
        for i, label in enumerate(os.listdir(self.data_dir)):
            label_dir = os.path.join(self.data_dir, label)
            self.labels[i] = label
            for file in os.listdir(label_dir):
                file_path = os.path.join(label_dir, file)
                y_, sr = librosa.load(file_path, sr=self.sr)
                n_fft = min(2048, len(y_))
                hop_length = n_fft // 4
                spectrogram = librosa.feature.melspectrogram(y=y_, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=self.n_mels)
                spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
                pad_width = self.desired_shape[1] - spectrogram.shape[1]
                if pad_width > 0:
                    spectrogram = np.pad(spectrogram, ((0, 0), (0, pad_width)), mode='constant')
                else:
                    spectrogram = spectrogram[:, :self.desired_shape[1]]
                X.append(spectrogram)
                y.append(i)
    
        X = np.array(X)
        y = np.array(y)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=42)
        self.input_shape = X_train.shape[1:]
        # Add batch size dimension
        X_train = np.expand_dims(X_train, axis=3)  # or -1
        X_test = np.expand_dims(X_test, axis=3)    # or -1
        return X_train, X_test, y_train, y_test

    def train_model(self, epochs=4, batch_size=32):
        num_classes = len(self.labels)
        X_train, X_test, y_train, y_test = self.preprocess_data()
        self.model = self.create_model(self.input_shape, num_classes)
        self.model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        self.model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))
        test_loss, test_acc = self.model.evaluate(X_test, y_test)
        print('Test accuracy:', test_acc)

    def preprocess_audio(self, audio_file):
        y_, sr = librosa.load(audio_file)
        spectrogram = librosa.feature.melspectrogram(y=y_, sr=sr)
        spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
        current_shape = spectrogram.shape
        if current_shape[1] > self.desired_shape[1]:
            spectrogram = spectrogram[:, :self.desired_shape[1]]
        elif current_shape[1] < self.desired_shape[1]:
            pad_width = self.desired_shape[1] - current_shape[1]
            spectrogram = np.pad(spectrogram, ((0, 0), (0, pad_width)), mode='constant')
        if current_shape[0] != self.desired_shape[0]:
            spectrogram = librosa.util.fix_length(spectrogram, self.desired_shape[0], axis=0)
        spectrogram = np.expand_dims(spectrogram, axis=-1)
        return spectrogram

    def predict_audio(self, audio_file):
        if not self.model:
            print("Error: Model not trained. Please train the model first.")
            return
        preprocessed_audio = self.preprocess_audio(audio_file)
        predictions = self.model.predict(np.expand_dims(preprocessed_audio, axis=0))
        predicted_class_index = np.argmax(predictions)
        predicted_class = self.labels[predicted_class_index]
        print("Predicted class:", predicted_class)

# Usage example
if __name__ == "__main__":
    voice_classifier = VoiceClassifier()
    voice_classifier.train_model()
    audio_file = 'woman_talking_cut.wav'
    voice_classifier.predict_audio(audio_file)


ValueError: Input 0 of layer "conv2d_8" is incompatible with the layer: expected min_ndim=4, found ndim=3. Full shape received: (None, 128, 128)