In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models

import os
import librosa
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import load_model
import random
import soundfile as sf

In [2]:
# # Create the model
# def create_model(input_shape, num_classes):
#     model = models.Sequential([
#         layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
#         layers.MaxPooling2D((2, 2)),
#         layers.Conv2D(64, (3, 3), activation='relu'),
#         layers.MaxPooling2D((2, 2)),
#         layers.Conv2D(64, (3, 3), activation='relu'),
#         layers.Flatten(),
#         layers.Dense(64, activation='relu'),
#         layers.Dense(num_classes, activation='softmax')
#     ])
#     return model

In [3]:
# Define create_model, which builds the CNN for a given input_shape and number of classes
def create_model(input_shape, num_classes):
    """Build an uncompiled convolutional classifier.

    The network stacks three 3x3 ReLU convolutions (32, 64 and 64 filters),
    with 2x2 max pooling after the first two, then flattens into a 64-unit
    ReLU dense layer and a softmax output layer.

    Args:
        input_shape: Shape of one input sample, forwarded to the first
            convolution's ``input_shape`` argument.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The assembled ``models.Sequential`` instance; compiling it is left
        to the caller.
    """
    net = models.Sequential()

    # Convolutional feature extractor.
    net.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    net.add(layers.MaxPooling2D((2, 2)))
    net.add(layers.Conv2D(64, (3, 3), activation='relu'))
    net.add(layers.MaxPooling2D((2, 2)))
    net.add(layers.Conv2D(64, (3, 3), activation='relu'))

    # Classification head.
    net.add(layers.Flatten())
    net.add(layers.Dense(64, activation='relu'))
    net.add(layers.Dense(num_classes, activation='softmax'))
    return net

In [4]:
# # Define a function to load and preprocess audio files
# def preprocess_data(data_dir, num_classes, test_size=0.2):
#     X = []
#     y = []
    
#     # Iterate through each subdirectory (each voice type)
#     for label in os.listdir(data_dir):
#         label_dir = os.path.join(data_dir, label)
        
#         # Iterate through each audio file in the subdirectory
#         for file in os.listdir(label_dir):
#             file_path = os.path.join(label_dir, file)
            
#             # Load the audio file and convert it to a spectrogram
#             y_, sr = librosa.load(file_path)
#             spectrogram = librosa.feature.melspectrogram(y=y_, sr=sr)
#             spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
            
#             # Resize the spectrogram to a fixed size (if necessary)
#             # spectrogram = resize(spectrogram, (desired_height, desired_width))
            
#             X.append(spectrogram)
#             y.append(int(label))  # Assuming label directories are named with integers
    
#     # Convert lists to numpy arrays
#     X = np.array(X)
#     y = np.array(y)
    
#     # Split the data into training and testing sets
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    
#     return X_train, X_test, y_train, y_test

In [5]:
# Define a function to load and preprocess audio files
def preprocess_data(data_dir, test_size=0.2, desired_shape=(128, 128)):
    """Load a folder-per-class audio dataset as dB mel spectrograms and split it.

    Every sub-directory of ``data_dir`` is treated as one class. Class
    indices are assigned in sorted directory-name order so the mapping (and
    the seeded train/test split) is identical on every machine;
    ``os.listdir`` alone returns entries in arbitrary order.

    Args:
        data_dir: Directory containing one sub-directory per class.
        test_size: Fraction of samples held out, forwarded to
            ``train_test_split``.
        desired_shape: ``(rows, columns)``. Only ``desired_shape[1]`` is used
            here: each spectrogram is zero-padded or truncated along the
            time axis to that many columns. The row count is whatever
            ``librosa.feature.melspectrogram`` produces and is not adjusted.

    Returns:
        ``(X_train, X_test, y_train, y_test, labels)`` where the ``X`` arrays
        carry a trailing channel axis of size 1, the ``y`` arrays hold
        integer class indices, and ``labels`` maps class index to
        directory name.
    """
    X = []
    y = []
    labels = {}

    # Only directories are classes; stray files next to them (README,
    # .DS_Store, ...) must not receive a label index.
    class_names = sorted(
        entry for entry in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, entry))
    )

    # Iterate through each subdirectory (each voice type)
    for i, label in enumerate(class_names):
        label_dir = os.path.join(data_dir, label)
        labels[i] = label  # Store the label for reference

        # Sorted so the sample order fed to the seeded split is reproducible.
        for file in sorted(os.listdir(label_dir)):
            file_path = os.path.join(label_dir, file)

            # Hidden files and nested directories are not audio samples.
            if file.startswith('.') or not os.path.isfile(file_path):
                continue

            # Load the audio file and convert it to a dB-scaled mel spectrogram
            y_, sr = librosa.load(file_path)
            spectrogram = librosa.feature.melspectrogram(y=y_, sr=sr)
            spectrogram = librosa.power_to_db(spectrogram, ref=np.max)

            # Force a fixed number of time frames (columns).
            # NOTE(review): the pad value 0 equals the peak level of a
            # ref=np.max dB spectrogram; kept unchanged so the features
            # match what preprocess_audio produces at inference time.
            pad_width = desired_shape[1] - spectrogram.shape[1]
            if pad_width > 0:
                spectrogram = np.pad(spectrogram, ((0, 0), (0, pad_width)), mode='constant')
            else:
                spectrogram = spectrogram[:, :desired_shape[1]]

            X.append(spectrogram)
            y.append(i)  # Use label index as the target value

    # Convert lists to numpy arrays
    X = np.array(X)
    y = np.array(y)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    # Add the trailing channel dimension expected by Conv2D
    X_train = np.expand_dims(X_train, axis=-1)
    X_test = np.expand_dims(X_test, axis=-1)

    return X_train, X_test, y_train, y_test, labels

In [6]:
# Set the directory containing your dataset
data_dir = 'dataset'

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and
# batch shuffling are repeatable after Restart & Run All (as far as the
# backend allows). The train/test split is already seeded inside
# preprocess_data.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Preprocess the data
X_train, X_test, y_train, y_test, labels = preprocess_data(data_dir)

# Per-sample input shape: everything after the leading sample axis
input_shape = X_train.shape[1:]

# Create and compile the model; targets are integer class indices, hence
# the sparse categorical cross-entropy loss
model = create_model(input_shape, len(labels))
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


# Train the model
# NOTE(review): the held-out test split doubles as validation data here, so
# the accuracy reported below is measured on data that was monitored during
# training.
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

# Print the labels
print("Labels:", labels)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.9642857313156128
Labels: {0: 'alto', 1: 'bass', 2: 'soprano', 3: 'tenor'}


In [7]:
def preprocess_audio(audio_file, desired_shape=(128, 128)):
    """Turn one audio file into a fixed-size dB mel spectrogram.

    Args:
        audio_file: Path (or other source accepted by ``librosa.load``) of
            the recording.
        desired_shape: ``(rows, columns)`` of the returned spectrogram before
            the channel axis is appended.

    Returns:
        Array of shape ``(desired_shape[0], desired_shape[1], 1)``.
    """
    # Load the audio file and convert it to a dB-scaled mel spectrogram
    y_, sr = librosa.load(audio_file)
    spectrogram = librosa.feature.melspectrogram(y=y_, sr=sr)
    spectrogram = librosa.power_to_db(spectrogram, ref=np.max)

    # Fix the number of time frames (columns): trim the tail or zero-pad it
    current_shape = spectrogram.shape
    if current_shape[1] > desired_shape[1]:
        spectrogram = spectrogram[:, :desired_shape[1]]
    elif current_shape[1] < desired_shape[1]:
        pad_width = desired_shape[1] - current_shape[1]
        spectrogram = np.pad(spectrogram, ((0, 0), (0, pad_width)), mode='constant')

    # Fix the number of rows. `size` must be passed by keyword: it is
    # keyword-only in librosa >= 0.10, and the keyword form is accepted by
    # older releases as well.
    if current_shape[0] != desired_shape[0]:
        spectrogram = librosa.util.fix_length(spectrogram, size=desired_shape[0], axis=0)

    # Add channel dimension
    spectrogram = np.expand_dims(spectrogram, axis=-1)

    return spectrogram

In [8]:
# Function to apply time stretching
def time_stretch(audio, rate):
    """Return ``audio`` time-stretched by ``rate``.

    Thin wrapper that forwards both arguments to
    ``librosa.effects.time_stretch``.
    """
    return librosa.effects.time_stretch(audio, rate=rate)

# Function to apply pitch shifting
def pitch_shift(audio, sr, n_steps):
    """Return ``audio`` pitch-shifted by ``n_steps``.

    Thin wrapper that forwards the signal, its sample rate ``sr`` and
    ``n_steps`` to ``librosa.effects.pitch_shift``.
    """
    return librosa.effects.pitch_shift(audio, sr=sr, n_steps=n_steps)

# Function to add background noise
def add_background_noise(audio, noise_factor, noise_audio=None,
                         noise_path='background/crowd_bg.wav'):
    """Mix a scaled background-noise clip into ``audio``.

    Args:
        audio: 1-D signal to augment.
        noise_factor: Multiplier applied to the noise before it is added.
        noise_audio: Optional pre-loaded 1-D noise signal. Passing it avoids
            re-reading the noise file on every call and lets the caller
            resample the noise to the rate of ``audio`` beforehand.
        noise_path: File loaded (at its native sample rate, ``sr=None``)
            when ``noise_audio`` is not given. The native rate is not
            reconciled with the rate of ``audio``.

    Returns:
        ``audio + noise_factor * noise``, with the noise tiled or truncated
        to ``len(audio)`` samples.

    Raises:
        ValueError: If the noise clip is empty while ``audio`` is not.
    """
    if noise_audio is None:
        noise_audio, _ = librosa.load(noise_path, sr=None)
    else:
        noise_audio = np.asarray(noise_audio)

    n_samples = len(audio)
    if len(noise_audio) < n_samples:
        if len(noise_audio) == 0:
            raise ValueError("background noise is empty; cannot mix it into the audio")
        # Repeat the noise until it covers the whole signal (ceil division).
        repeat_times = -(-n_samples // len(noise_audio))
        noise_audio = np.tile(noise_audio, repeat_times)

    # Truncate to the signal length and mix.
    return audio + noise_factor * noise_audio[:n_samples]

# Function to apply time shifting
def time_shift(audio, sr, max_shift_ms=100):
    """Circularly shift ``audio`` by a random number of samples.

    The offset is drawn with ``random.randint`` from the inclusive range
    ``[-limit, limit]``, where ``limit`` is ``max_shift_ms`` converted to
    samples at rate ``sr``. Samples pushed past one end wrap around to the
    other (``np.roll``).
    """
    limit = int(max_shift_ms * sr / 1000)
    offset = random.randint(-limit, limit)
    return np.roll(audio, offset)

# Function to change speed
def change_speed(audio, speed_factor):
    """Return ``audio`` time-stretched by ``speed_factor``.

    Forwards to ``librosa.effects.time_stretch`` with ``rate=speed_factor``;
    no resampling call is made here.
    """
    return librosa.effects.time_stretch(audio, rate=speed_factor)

# Function to apply a random gain (placeholder for real filters such as reverb, echo, or equalization)
def apply_audio_filter(audio):
    """Scale ``audio`` by one random gain.

    A single factor is drawn with ``np.random.uniform(0.5, 1.5)`` and
    multiplied into the whole signal. No reverb, echo or equalization is
    performed despite the function's name.
    """
    gain = np.random.uniform(0.5, 1.5)
    return audio * gain

# Function to randomly crop or pad the audio
def random_crop_or_pad(audio, target_length):
    """Return ``audio`` zero-padded or randomly cropped to ``target_length``.

    Args:
        audio: 1-D signal.
        target_length: Desired number of samples.

    Returns:
        The input padded with trailing zeros when it is too short, a
        contiguous random window of ``target_length`` samples when it is
        too long, or the input itself when the length already matches.
    """
    n_samples = len(audio)

    if n_samples < target_length:
        # Too short: append zeros at the end.
        return np.pad(audio, (0, target_length - n_samples), mode='constant')

    if n_samples > target_length:
        # Too long: pick a random window. randint's upper bound is
        # exclusive, so +1 is needed for the last valid start position
        # (the window that ends on the final sample) to be selectable.
        start_idx = np.random.randint(0, n_samples - target_length + 1)
        return audio[start_idx:start_idx + target_length]

    return audio

# Function to resample the audio
def resample_audio(audio, target_sr, orig_sr=None):
    """Resample ``audio`` from ``orig_sr`` to ``target_sr``.

    Args:
        audio: 1-D signal.
        target_sr: Sample rate to convert to.
        orig_sr: Sample rate ``audio`` is currently at. When omitted, the
            legacy fallback ``len(audio)`` is used (i.e. the clip is treated
            as exactly one second long) and a warning is emitted, because a
            sample count is not a sample rate.

    Returns:
        The resampled signal, or ``audio`` unchanged when the two rates are
        equal.
    """
    if orig_sr is None:
        import warnings

        warnings.warn(
            "resample_audio called without orig_sr; falling back to len(audio), "
            "which treats the clip as exactly one second long",
            stacklevel=2,
        )
        orig_sr = len(audio)

    # Nothing to do when the rates already agree.
    if orig_sr == target_sr:
        return audio

    return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)

In [9]:
# # Load an example audio file
# audio_file = 'bassvoice.wav'
# audio, sr = librosa.load(audio_file, sr=None)

# # Apply data augmentation techniques
# augmented_audios = []
# for i in range(10):  # Generate 10 augmented samples
#     augmented_audio = audio.copy()  # Make a copy of the original audio

#     # Apply random data augmentation techniques
#     rate = np.random.uniform(0.8, 1.2)
#     augmented_audio = time_stretch(augmented_audio, rate)
#     augmented_audio = pitch_shift(augmented_audio, sr, n_steps=np.random.randint(-3, 3))
#     augmented_audio = add_background_noise(augmented_audio, noise_factor=np.random.uniform(0.001, 0.01))
#     augmented_audio = time_shift(augmented_audio, sr, max_shift_ms=50)
#     augmented_audio = change_speed(augmented_audio, speed_factor=np.random.uniform(0.8, 1.2))
#     augmented_audio = apply_audio_filter(augmented_audio)
#     augmented_audio = random_crop_or_pad(augmented_audio, target_length=len(audio))
#     augmented_audio = resample_audio(augmented_audio, target_sr=sr)

#     augmented_audios.append(augmented_audio)

# # Save or use augmented audio samples as needed

In [19]:
# Preprocess the voice recording
audio_file = 'DannyBoy.wav'  # Change this to your voice recording file
preprocessed_audio = preprocess_audio(audio_file)

# Check that the sample matches the model's per-sample input shape.
# model.input_shape carries a leading batch dimension, which is dropped here.
# The result is kept in its own name so the training cell's `input_shape`
# is not overwritten.
expected_shape = tuple(model.input_shape[1:])
if preprocessed_audio.shape != expected_shape:
    raise ValueError(
        f"Preprocessed audio has shape {preprocessed_audio.shape}, "
        f"but the model expects {expected_shape}"
    )

# Make predictions using your existing model (batch of one sample)
predictions = model.predict(np.expand_dims(preprocessed_audio, axis=0))

# Get the predicted class: index of the highest score for the single sample
predicted_class_index = int(np.argmax(predictions[0]))
predicted_class = labels[predicted_class_index]

print("Predicted class:", predicted_class)

Predicted class: alto


In [109]:
# # Directory containing WAV files to augment
# data_dir = 'dataset/bass'

# # Iterate through each WAV file in the directory
# for file_name in os.listdir(data_dir):
#     if file_name.endswith('.wav'):
#         file_path = os.path.join(data_dir, file_name)
        
#         # Load the original WAV file
#         audio, sr = librosa.load(file_path, sr=None)
        
#         # Apply data augmentation techniques
#         rate = np.random.uniform(0.8, 1.2)
#         augmented_audio = time_stretch(audio, rate)
#         augmented_audio = pitch_shift(augmented_audio, sr, n_steps=np.random.randint(-3, 3))
#         augmented_audio = add_background_noise(augmented_audio, noise_factor=np.random.uniform(0.001, 0.01))
#         augmented_audio = time_shift(augmented_audio, sr, max_shift_ms=50)
#         augmented_audio = change_speed(augmented_audio, speed_factor=np.random.uniform(0.8, 1.2))
#         augmented_audio = apply_audio_filter(augmented_audio)
#         augmented_audio = random_crop_or_pad(augmented_audio, target_length=len(audio))
#         augmented_audio = resample_audio(augmented_audio, target_sr=sr)
        
#         # Save the augmented audio
#         output_file = os.path.join(data_dir, 'augmented_' + file_name)
#         sf.write(output_file, augmented_audio, sr)

In [115]:
# # Directory containing WAV files to augment

# classlist = ['bass', 'tenor', 'alto', 'soprano']



# # Iterate through each WAV file in the directory
# for class_voice in classlist:
#     data_dir = 'dataset/'+class_voice
#     for file_name in os.listdir(data_dir):
#         if file_name.endswith('.wav'):
#             file_path = os.path.join(data_dir, file_name)
            
#             # Load the original WAV file
#             audio, sr = librosa.load(file_path, sr=None)
            
#             # Apply data augmentation techniques
#             rate = np.random.uniform(0.8, 1.2)
            
#             # Load the original WAV file
#             audio, sr = librosa.load(file_path, sr=None)
            
#             # Apply time stretching
#             rate = np.random.uniform(0.8, 1.2)
#             augmented_audio = time_stretch(audio, rate)
            
#             # Save the augmented audio
#             output_file = os.path.join(data_dir, 'time_stretch_' + file_name)
#             sf.write(output_file, augmented_audio, sr)
    
#             # Apply pitch shifting
#             n_steps = np.random.randint(-3, 3)
#             augmented_audio = pitch_shift(audio, sr, n_steps=n_steps)
            
#             # Save the augmented audio
#             output_file = os.path.join(data_dir, 'pitch_shift_' + file_name)
#             sf.write(output_file, augmented_audio, sr)
    
#             # Apply background noise
#             noise_factor = np.random.uniform(0.001, 0.01)
#             augmented_audio = add_background_noise(audio, noise_factor=noise_factor)
            
#             # Save the augmented audio
#             output_file = os.path.join(data_dir, 'background_noise_' + file_name)
#             sf.write(output_file, augmented_audio, sr)
    
#             # Apply time shifting
#             augmented_audio = time_shift(audio, sr, max_shift_ms=50)
            
#             # Save the augmented audio
#             output_file = os.path.join(data_dir, 'time_shift_' + file_name)
#             sf.write(output_file, augmented_audio, sr)
    
#             # Apply speed change
#             speed_factor = np.random.uniform(0.8, 1.2)
#             augmented_audio = change_speed(audio, speed_factor=speed_factor)
            
#             # Save the augmented audio
#             output_file = os.path.join(data_dir, 'change_speed_' + file_name)
#             sf.write(output_file, augmented_audio, sr)
    
#             # Apply audio filter
#             augmented_audio = apply_audio_filter(audio)
            
#             # Save the augmented audio
#             output_file = os.path.join(data_dir, 'audio_filter_' + file_name)
#             sf.write(output_file, augmented_audio, sr)
    
#             # Randomly crop or pad the audio
#             augmented_audio = random_crop_or_pad(audio, target_length=len(audio))
            
#             # Save the augmented audio
#             output_file = os.path.join(data_dir, 'random_crop_pad_' + file_name)
#             sf.write(output_file, augmented_audio, sr)
    
#             # Resample the audio
#             augmented_audio = resample_audio(audio, target_sr=sr)
            
#             # Save the augmented audio
#             output_file = os.path.join(data_dir, 'resampled_' + file_name)
#             sf.write(output_file, augmented_audio, sr)
            
#             # Save the augmented audio
#             output_file = os.path.join(data_dir, 'augmented_' + file_name)
#             sf.write(output_file, augmented_audio, sr)