## Gender Recognition using ANN

In [7]:
# Install NumPy
%pip install numpy

# Install Pandas
%pip install pandas

# Install tqdm for progress bars
%pip install tqdm

# Install TensorFlow for deep learning models
%pip install tensorflow

# Install Librosa for audio analysis
%pip install librosa

# Install SciPy for scientific computing
%pip install scipy

# Install PyDub for audio file manipulation
%pip install pydub

# Install Matplotlib for plotting
%pip install matplotlib

# Install scikit-learn for machine learning tools
%pip install scikit-learn

# Install resampy for resampling purposes
%pip install resampy


^C
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/mac/anaconda3/envs/tf/lib/python3.11/site-packages/pip/__main__.py", line 24, in <module>
    sys.exit(_main())
             ^^^^^^^
  File "/Users/mac/anaconda3/envs/tf/lib/python3.11/site-packages/pip/_internal/cli/main.py", line 79, in main
    return command.main(cmd_args)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/mac/anaconda3/envs/tf/lib/python3.11/site-packages/pip/_internal/cli/base_command.py", line 101, in main
    return self._main(args)
           ^^^^^^^^^^^^^^^^
  File "/Users/mac/anaconda3/envs/tf/lib/python3.11/site-packages/pip/_internal/cli/base_command.py", line 236, in _main
    self.handle_pip_version_check(options)
  File "/Users/mac/anaconda3/envs/tf/lib/python3.11/site-packages/pip/_internal/cli/req_command.py", line 191, in handle_pip_version_check
    pip_self_version_check(session, options)
  File

In [6]:
# Path to the CSV file of balanced data (read with `pd.read_csv` in `load_data`).
balanced_data_path = "/Users/mac/Desktop/data/data/Balanced_data.csv"

# Path to the cv-other-train folder of the downloaded Common Voice database.
# The trailing "/" lets file names be appended by plain string concatenation.
training_data_path = "/Users/mac/Desktop/archive/cv-other-train" + "/"


# Path to the cv-other-dev folder of the downloaded Common Voice database
# (used to build the example file path in the prediction cell).
example_data_path = "/Users/mac/Desktop/archive/cv-other-dev" + "/"

In [2]:
import numpy as np 
import scipy
import pandas as pd 
import csv
import os
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from sklearn.model_selection import train_test_split
import os
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping
from tqdm import tqdm

import glob
import shutil
from pydub import AudioSegment
import librosa



## Second try: using Mel spectrogram, MFCC, Chroma, Contrast and Tonnetz features

## Using `pydub.AudioSegment`: it works

#### Fixing the sample rate issue

In [1]:
# Install resampy, which the `resample` helper defined in the next cell imports.
%pip install resampy

Collecting resampy
  Downloading resampy-0.4.2-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: resampy
Successfully installed resampy-0.4.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
import resampy

def resample(audio_file, target_sr):
    """Load an audio file and resample it to `target_sr`.

    Args:
        audio_file: Path of the audio file, passed to `librosa.load`.
        target_sr: Desired sampling rate.

    Returns:
        The audio samples at `target_sr`: the output of `resampy.resample`, or
        the samples exactly as loaded when the file already is at `target_sr`.
    """
    # sr=None keeps the file's native sampling rate; mono=False keeps all channels.
    samples, sr_orig = librosa.load(audio_file, sr=None, mono=False)
    if sr_orig == target_sr:
        # Nothing to convert.
        return samples
    # The loaded samples (not an undefined name) are what must be resampled,
    # and the result has to be handed back to the caller.
    return resampy.resample(samples, sr_orig, target_sr)

In [6]:
def extract_feature(file_name, **kwargs):
    """
    Extract a 1-D feature vector from audio file `file_name`.

    The file is decoded with pydub, down-mixed to mono by averaging its
    channels, and scaled by the maximum value of its integer sample type.
    Every enabled feature is averaged over time and the results are
    concatenated in the fixed order mfcc, chroma, mel, contrast, tonnetz.

        Features supported (enable with `<name>=True`):
            - MFCC (mfcc)
            - Chroma (chroma)
            - MEL Spectrogram Frequency (mel)
            - Contrast (contrast)
            - Tonnetz (tonnetz)

    Returns:
        np.ndarray: the concatenated features (an empty array when no feature
        is enabled).
    """
    mfcc = kwargs.get("mfcc")
    chroma = kwargs.get("chroma")
    mel = kwargs.get("mel")
    contrast = kwargs.get("contrast")
    tonnetz = kwargs.get("tonnetz")

    # Load the audio file with pydub
    audio = AudioSegment.from_file(file_name)
    sample_rate = audio.frame_rate

    # Convert to NumPy array
    raw_samples = np.array(audio.get_array_of_samples())
    # The full-scale value must be read from the integer sample dtype BEFORE the
    # down-mix: averaging channels yields a float array, and np.iinfo() raises
    # ValueError for float dtypes.
    full_scale = np.iinfo(raw_samples.dtype).max
    if audio.channels > 1:
        # One row per frame, one column per channel; average the channels to get mono.
        audio_data = raw_samples.reshape((-1, audio.channels)).mean(axis=1)
    else:
        audio_data = raw_samples
    audio_data = audio_data.astype(np.float32) / full_scale  # Normalize

    # Feature extraction
    result = np.array([])
    # Only chroma and contrast consume the magnitude spectrogram.
    if chroma or contrast:
        stft = np.abs(librosa.stft(audio_data))

    if mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=13).T, axis=0)
        result = np.hstack((result, mfccs))

    if chroma:
        chroma_feature = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, chroma_feature))

    if mel:
        mel_feature = np.mean(librosa.feature.melspectrogram(y=audio_data, sr=sample_rate).T, axis=0)
        result = np.hstack((result, mel_feature))

    if contrast:
        contrast_feature = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, contrast_feature))

    if tonnetz:
        # Tonnetz is computed from the harmonic component of the signal.
        tonnetz_feature = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(audio_data), sr=sample_rate).T, axis=0)
        result = np.hstack((result, tonnetz_feature))

    return result


### Once we have a good `Balanced_data.csv`, containing the paths to the `.mp3` audio files and their labels and saved in a folder named Data, we extract the features of those audio files

### Loading the data, splitting it into train, validation and test sets, and defining the model

In [7]:

def load_data(vector_length=128):
    """Load the gender recognition dataset as feature and label arrays.

    On the first run, every audio file listed in the CSV at `balanced_data_path`
    is converted to a mel feature vector with `extract_feature`, and the arrays
    are cached in results/features.npy and results/labels.npy. Later runs load
    those two files directly, as it is much faster. The cache is returned as
    stored, whatever `vector_length` is passed.

    Args:
        vector_length: Length of the feature vector of one sample.

    Returns:
        (X, y): X has shape (n_samples, vector_length); y has shape
        (n_samples, 1) and holds the `gender` column (1 for male, 0 for female).
    """
    features_file = "results/features.npy"
    labels_file = "results/labels.npy"
    # make sure results folder exists
    os.makedirs("results", exist_ok=True)
    # if features & labels already loaded individually and bundled, load them from there instead
    if os.path.isfile(features_file) and os.path.isfile(labels_file):
        return np.load(features_file), np.load(labels_file)
    # read dataframe
    df = pd.read_csv(balanced_data_path)
    # get total samples
    n_samples = len(df)
    # Label convention used by the rest of this notebook (output layer of
    # create_model, threshold in predict_gender): 1 = male, 0 = female.
    n_male_samples = len(df[df['gender'] == 1])
    n_female_samples = len(df[df['gender'] == 0])
    print("Total samples:", n_samples)
    print("Total male samples:", n_male_samples)
    print("Total female samples:", n_female_samples)
    # initialize an empty array for all audio features
    X = np.zeros((n_samples, vector_length))
    # initialize an empty array for all audio labels (1 for male and 0 for female)
    y = np.zeros((n_samples, 1))
    for i, (filename, gender) in tqdm(enumerate(zip(df['filename'], df['gender'])), "Loading data", total=n_samples):
        # the CSV stores file names relative to the training data folder
        X[i] = extract_feature(training_data_path + filename, mel=True)
        y[i] = gender

    # save the audio features and labels into files
    # so we won't load each one of them next run
    np.save(features_file, X)
    np.save(labels_file, y)
    return X, y


def split_data(X, y, test_size=0.1, valid_size=0.1):
    """Partition features and labels into train / validation / test subsets.

    The test subset is carved out of the full data first; the validation subset
    is then carved out of what is left, so `valid_size` is a fraction of the
    remaining data, not of the whole dataset. Both splits use random_state=7.

    Returns:
        dict with the keys X_train, X_valid, X_test, y_train, y_valid, y_test.
    """
    # first cut: hold out the test subset
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=test_size, random_state=7
    )
    # second cut: hold out the validation subset from the remainder
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_rest, y_rest, test_size=valid_size, random_state=7
    )
    return dict(
        X_train=X_train,
        X_valid=X_valid,
        X_test=X_test,
        y_train=y_train,
        y_valid=y_valid,
        y_test=y_test,
    )


def create_model(vector_length=128):
    """Build and compile the dense binary classifier.

    Five hidden Dense layers (256, 256, 128, 128, 64 units), each followed by
    Dropout(0.3). The first layer takes inputs of length `vector_length` and is
    created without an activation; the other four use ReLU. The model summary
    is printed before the compiled model is returned.
    """
    model = Sequential()
    # input layer: no activation argument is passed here
    model.add(Dense(256, input_shape=(vector_length,)))
    model.add(Dropout(0.3))
    # remaining hidden layers, all ReLU + dropout
    for units in (256, 128, 128, 64):
        model.add(Dense(units, activation="relu"))
        model.add(Dropout(0.3))
    # single sigmoid output: 0 means female, 1 means male
    model.add(Dense(1, activation="sigmoid"))
    # binary classification (male/female), hence binary crossentropy
    model.compile(loss="binary_crossentropy", metrics=["accuracy"], optimizer="adam")
    model.summary()
    return model

In [8]:

# load the dataset (load_data reuses the results/*.npy cache when both files exist)
X, y = load_data()
# NOTE(review): this prints the feature matrix itself; X.shape would be a more compact sanity check
print(X)
# split the data into training, validation and testing sets
# (valid_size is applied to what remains after the test split, see split_data)
data = split_data(X, y, test_size=0.1, valid_size=0.1)
# construct the model (default input length, matching load_data's default vector_length)
model = create_model()

# use tensorboard to view metrics (event files are written to the "logs" folder)
tensorboard = TensorBoard(log_dir="logs")
# define early stopping to stop training after 5 epochs of not improving;
# no `monitor` argument is given, so the Keras default applies, and
# restore_best_weights=True asks Keras to restore the best weights seen
early_stopping = EarlyStopping(mode="min", patience=5, restore_best_weights=True)

# training hyper-parameters; `epochs` is an upper bound because of early stopping
batch_size = 64
epochs = 100

# train the model using the training set and validating using validation set
model.fit(data["X_train"], data["y_train"], epochs=epochs, batch_size=batch_size, validation_data=(data["X_valid"], data["y_valid"]),
          callbacks=[tensorboard, early_stopping])


# save the model to a file (the results folder was created by load_data)
model.save("results/model.h5")

# evaluating the model using the testing set, which was not used during training
print(f"Evaluating the model using {len(data['X_test'])} samples...")
loss, accuracy = model.evaluate(data["X_test"], data["y_test"], verbose=0)
print(f"Loss: {loss:.4f}")
print(f"Accuracy: {accuracy*100:.2f}%")

[[7.14310864e-03 1.61338057e-02 2.30557639e-02 ... 2.54096366e-09
  2.48413712e-09 2.43777998e-09]
 [6.23296248e-03 2.46039266e-03 5.73755614e-03 ... 2.53370858e-09
  2.49579712e-09 2.53083354e-09]
 [9.16805002e-04 2.04855460e-03 1.92967569e-03 ... 2.49615750e-09
  2.46899279e-09 2.52861510e-09]
 ...
 [8.76070652e-03 1.24021536e-02 2.79637665e-01 ... 2.58005972e-09
  2.63226174e-09 2.61906941e-09]
 [1.70558915e-01 4.28008175e+00 1.27596388e+01 ... 2.52021515e-09
  2.64013122e-09 2.65393796e-09]
 [7.96847194e-02 1.80307794e+00 1.50008879e+01 ... 2.42274423e-09
  2.47013765e-09 2.49715715e-09]]


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               33024     
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 256)               65792     
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense_3 (Dense)             (None, 128)               1

## Prediction

In [12]:
from tensorflow.keras.models import load_model

def preprocess_audio(file_path):
    """Return the feature vector of `file_path`, using the mel feature only."""
    # Same extract_feature(..., mel=True) call as the one used to build the training data.
    return extract_feature(file_path, mel=True)

def predict_gender(file_path, model):
    """Classify the speaker of the audio file `file_path` with `model`.

    Returns:
        A `(label, prediction)` pair: `label` is "Female" when the prediction
        is at most 0.5 and "Male" otherwise; `prediction` is the first row of
        `model.predict` for this file.
    """
    # Features of the file, shaped as a batch holding a single sample.
    batch = np.reshape(preprocess_audio(file_path), (1, -1))
    prediction = model.predict(batch)[0]
    label = "Female" if prediction <= 0.5 else "Male"
    return label, prediction

# Load the model saved by the training cell
model = load_model("results/model.h5")

# Path to the new audio file
file_path = example_data_path + "cv-other-dev/sample-000001.mp3"

# Predict the gender
gender = predict_gender(file_path, model)
# predict_gender returns a (label, raw model output) pair; unpack it so the
# message shows the label and a plain number instead of the tuple's repr.
predicted_label, model_output = gender
print(f"The predicted gender is: {predicted_label} (model output: {float(np.ravel(model_output)[0]):.4f})")

The predicted gender is: ('Male', array([0.786178], dtype=float32))
