In [None]:
#Import required packages
import pandas as pd
import numpy as np
import librosa
import librosa.display
import pathlib
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
import warnings
import glob
from scipy import signal
import tqdm
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping

In [None]:
#Move the kernel's working directory to the shared project folder so the relative
#file paths used in the following cells resolve there (path looks like a Colab Drive mount - confirm)
%cd "/content/drive/Shareddrives/CIS_519_Final_Project"

/Users/Vera/Desktop/CIS519


In [None]:
#Read audio classification metadata file and clean

#The metadata file is tab-separated (header: VoxCeleb1 ID, VGGFace1 ID, Gender, Nationality, Set),
#so parse it with sep="\t" rather than loading every row as one comma-separated field and
#splitting it by hand against a hard-coded header string.
#dtype=str and keep_default_na=False keep each field as the literal text from the file
#(no numeric or NaN conversion), which is what the manual string split produced.
audio_class_df = pd.read_csv("audioclassification_meta.csv", sep="\t", dtype=str,
                             keep_default_na=False)

#Replace spaces in the column names with underscores ("VoxCeleb1 ID" -> "VoxCeleb1_ID")
c_names = [name.replace(" ", "_") for name in audio_class_df.columns]
audio_class_df.columns = c_names

#Set as dictionary: speaker ID -> [VGGFace1_ID, Gender, Nationality, Set]
audio_class_dict = audio_class_df.set_index("VoxCeleb1_ID").T.to_dict('list')

#View data
audio_class_df.head()


Unnamed: 0,VoxCeleb1_ID,VGGFace1_ID,Gender,Nationality,Set
0,id10001,A.J._Buckley,m,Ireland,dev
1,id10002,A.R._Rahman,m,India,dev
2,id10003,Aamir_Khan,m,India,dev
3,id10004,Aaron_Tveit,m,USA,dev
4,id10005,Aaron_Yoo,m,USA,dev


In [None]:
def pull_id_npz(file_name):
    '''
    Function: Load every array stored in one .npz archive
    Inputs:
        - file_name: path to the .npz file
    Outputs:
        - container_list: list holding a single inner list with the archive's arrays,
          in the archive's own key order (so the arrays are reached through index [0])
    '''
    #np.load on an .npz returns a lazy NpzFile that keeps the file open. Use it as a
    #context manager so the handle is closed once every array has been read into memory,
    #instead of leaking one open file per archive.
    with np.load(file_name) as container:
        arrays = [container[key] for key in container]
    #Keep the original nested shape: one outer list wrapping the list of arrays
    return [arrays]

#Load the per-speaker .npz archives of each group into a dictionary keyed by file name.
#Each `%cd` moves the kernel's working directory into the group's folder, so glob("*.npz")
#yields bare file names (no directory part) and those names become the dictionary keys.
#NOTE: the working directory stays at the last `%cd` target (non_eng) once this cell has run,
#so relative paths used in later cells resolve there.

#Get irish speakers
%cd "/content/drive/Shareddrives/CIS_519_Final_Project/irish_npz_files"
irish_npz_dict = {file_name: pull_id_npz(file_name) for file_name in glob.glob("*.npz")}

#Get US females
%cd "/content/drive/Shareddrives/CIS_519_Final_Project/usa_f_files"
usaf_npz_dict = {file_name: pull_id_npz(file_name) for file_name in glob.glob("*.npz")}

#Get US males
%cd "/content/drive/Shareddrives/CIS_519_Final_Project/usa_m_files"
usam_npz_dict = {file_name: pull_id_npz(file_name) for file_name in glob.glob("*.npz")}

#Get all other English speaking nationalities
%cd "/content/drive/Shareddrives/CIS_519_Final_Project/non_usa_eng"
nonusa_npz_dict = {file_name: pull_id_npz(file_name) for file_name in glob.glob("*.npz")}

#Get all non-English speaking nationalities (kept apart from the training data as the test set)
%cd "/content/drive/Shareddrives/CIS_519_Final_Project/non_eng"
test_npz_dict = {file_name: pull_id_npz(file_name) for file_name in glob.glob("*.npz")}

#Concatenate all training data (english speaking nationalities)
#If the same file name occurs in more than one folder, the entry from the later dictionary wins
train_npz_dict = {**irish_npz_dict, **usaf_npz_dict, **usam_npz_dict, **nonusa_npz_dict}


/Users/Vera/Desktop/CIS519/irish_npz_files
/Users/Vera/Desktop/CIS519/usa_f_files
/Users/Vera/Desktop/CIS519/usa_m_files
/Users/Vera/Desktop/CIS519/non_usa_eng
/Users/Vera/Desktop/CIS519/non_eng


In [None]:
#Pre-Processing: Filtering

#Sampling rate of audio data, in Hz
#The functions below do not read this global: they take an `fs` argument or hard-code 16000 themselves
fs = 16000

def butter_lowpass(data, lowcut = 5000, fs=16000, order = 4):
    '''
    Function: Run a signal through a Butterworth lowpass filter (single forward pass)
    Inputs:
        - data: numpy array of wave data
        - lowcut: cutoff frequency in Hz, content above it is attenuated (default 5000 Hz)
        - fs: sampling rate in Hz (default 16000 Hz, based on VoxCeleb data)
        - order: filter order (default 4). Larger order = sharper cutoff
    Output:
        - numpy array, filtered version of wave data
    '''
    #scipy takes the cutoff as a fraction of the Nyquist frequency (half the sampling rate)
    normalized_cutoff = lowcut / (0.5 * fs)
    numerator, denominator = signal.butter(order, normalized_cutoff, btype='low')
    return signal.lfilter(numerator, denominator, data)

def butter_highpass(data, highcut = 75, fs=16000, order = 4):
    '''
    Function: Run a signal through a Butterworth highpass filter (single forward pass)
    Inputs:
        - data: numpy array of wave data
        - highcut: cutoff frequency in Hz, content below it is attenuated
                   (default 75 Hz, below this is mostly noise)
        - fs: sampling rate in Hz (default 16000 Hz, based on VoxCeleb data)
        - order: filter order (default 4). Larger order = sharper cutoff
    Output:
        - numpy array, filtered version of wave data
    '''
    #scipy takes the cutoff as a fraction of the Nyquist frequency (half the sampling rate)
    normalized_cutoff = highcut / (0.5 * fs)
    numerator, denominator = signal.butter(order, normalized_cutoff, btype='high')
    return signal.lfilter(numerator, denominator, data)

def plot_signal(data, fs=16000):
    '''
    Function: plot time-series signal data
    Inputs: 
        - data: numpy array of wave data
        - fs: sampling rate in Hz used to scale the time axis, default is 16000 Hz
    '''
    #librosa replaced its waveform plotter `waveplot` with `waveshow` and later removed
    #`waveplot`. Use `waveshow` when the installed librosa has it and fall back to
    #`waveplot` on older versions, so the function runs on either.
    draw_wave = getattr(librosa.display, "waveshow", None)
    if draw_wave is None:
        draw_wave = librosa.display.waveplot
    draw_wave(data, sr=fs)
    plt.xlabel("Time")
    plt.ylabel("Amplitude")
    plt.show()

def plot_fft(data, fs=16000):
    '''
    Function: plot the FFT magnitude spectrum of a signal (first half of the bins only)
    Inputs: 
        - data: numpy array of wave data
        - fs: sampling frequency, default is 16000 Hz
    '''
    #Take FFT of data and its magnitude
    magnitude = np.abs(np.fft.fft(data))
    n_samples = len(magnitude)
    #Since the spectrum of a real signal is symmetric, only the first half is plotted
    half = n_samples // 2
    #Bin k of an N-point FFT sits at k * fs / N. (np.linspace(0, fs, N) would place it at
    #k * fs / (N - 1) and slightly stretch the frequency axis.)
    left_frequency = np.arange(half) * fs / n_samples
    left_magnitude = magnitude[:half]
    plt.plot(left_frequency, left_magnitude)
    plt.xlabel("Frequency (Hz)")
    plt.ylabel("Magnitude")
    plt.title("FFT")
    plt.show()

def get_freq_data(data, fs = 16000, plot = False):
    '''
    Function: get frequency data of signal (magnitude and corresponding frequencies)
    Inputs: 
        - data: numpy array of wave data
        - fs: sampling frequency, default is 16000 Hz
        - plot: Boolean for whether to plot or not. Default is false
    Outputs:
        left_frequency: numpy array of frequency values (Hz) for the first half of the FFT bins
        left_magnitude: numpy array of magnitudes corresponding to frequency values
    '''
    #Take FFT of data and calculate magnitudes
    magnitude = np.abs(np.fft.fft(data))
    n_samples = len(magnitude)
    #Only the first half of the bins is returned
    half = n_samples // 2
    #Bin k of an N-point FFT sits at k * fs / N. (np.linspace(0, fs, N) would place it at
    #k * fs / (N - 1) and slightly mis-scale the returned frequencies.)
    left_frequency = np.arange(half) * fs / n_samples
    left_magnitude = magnitude[:half]
    if plot:
        #Forward the caller's sampling rate instead of a hard-coded 16000
        plot_fft(data, fs=fs)
    return left_frequency, left_magnitude

def filter_signals(all_npz_dict, lowcut=5000, highcut=75, fs=16000, order=5):
    '''
    Function: Filter all signals in npz dictionaries with lowpass and highpass filters
    Inputs: 
        - all_npz_dict: dictionary keyed by folders, contains lists of numpy arrays representing wave files
        - lowcut: cutoff frequency of the lowpass filter, default 5000 Hz
        - highcut: cutoff frequency of the highpass filter, default 75 Hz
        - fs: sampling rate passed to both filters, default 16000 Hz
        - order: filter order passed to both filters, default 5
    Outputs:
        - filtered_npz: dictionary keyed by folders, contains list of numpy arrays representing filtered wav files
    '''
    filtered_npz = {}
    #Loop through all keys in dictionary
    for key in all_npz_dict:
        #all_npz_dict has two nested lists of data; the waves sit in the first inner list
        unfiltered_list = all_npz_dict[key][0]
        filtered_list = []
        #Loop through all wave files
        for wave in unfiltered_list:
            #Lowpass first, then highpass the lowpassed signal
            filtered_wave = butter_lowpass(wave, lowcut, fs, order)
            filtered_wave = butter_highpass(filtered_wave, highcut, fs, order)
            filtered_list.append(filtered_wave)
        filtered_npz[key] = filtered_list
    return filtered_npz

In [None]:
def get_labels(all_npz_dict, audio_class_dict):
    '''
    Function: Obtain gender and nationality labels for each folder
    Inputs:
        -all_npz_dict: dictionary keyed by folder file name (e.g. "id10001.npz"); only the keys are used
        -audio_class_dict: dictionary keyed by speaker ID, each value a list whose
                           element 1 is the gender and element 2 is the nationality
    Outputs:
        - label_dict: dictionary keyed by folder, contains list of gender and nationality
    Raises:
        - KeyError if a key's speaker ID is not present in audio_class_dict
    '''
    suffix = ".npz"
    label_dict = {}
    for key in all_npz_dict:
        #Remove only the trailing ".npz". str.strip(".npz") would strip any of the characters
        #'.', 'n', 'p', 'z' from BOTH ends and can eat part of the speaker ID itself.
        speaker_id = key[:-len(suffix)] if key.endswith(suffix) else key
        speaker_info = audio_class_dict[speaker_id]
        #Save gender and nationality in list
        label_dict[key] = [speaker_info[1], speaker_info[2]]
    return label_dict

from sklearn.model_selection import train_test_split

def split_train_validation(label_dict, feats):
    '''
    Function: Flatten the per-folder features into samples/labels and split them
              into training and validation sets
    Inputs:
        -label_dict: dictionary keyed by folder number, contains list with gender and nationality of that folder
        -feats: dictionary keyed by folder number of lists of melspectrogram feature arrays
    Outputs:
        - X_train: list of lists containing melspectrogram data 
        - y_train: list of 0 or 1 corresponding to female/male in X_train
        - X_dev: list of lists containing melspectrogram data
        - y_dev: list of 0 or 1 corresponding to female/male in X_dev
    '''
    samples = []
    targets = []
    for folder, folder_feats in feats.items():
        for melspec in folder_feats:
            #Store the feature array as a plain list
            samples.append(melspec.tolist())
            #Every wave of a folder gets that folder's gender: 1 if male, 0 otherwise
            targets.append(1 if label_dict[folder][0] == 'm' else 0)
    #30% of the samples go to validation; random_state=8 seeds the shuffle (any integer works)
    X_train, X_dev, y_train, y_dev = train_test_split(samples, targets, test_size=0.3, random_state=8)
    return X_train, y_train, X_dev, y_dev

def get_feats(filtered_npz, sr = 16000):
    '''
    Function: Compute one time-averaged melspectrogram feature vector per wav file
    Inputs: 
        - filtered_npz: dictionary keyed by folder number, contains list of arrays representing wav files
        - sr: sampling rate (set to 16000 Hz)
    Outputs:
        - feats: dictionary keyed by folder number, contains one 1-D feature array per wav
                 (128 elements with librosa's default number of mel bands)
    '''
    feats = {}
    #Loop through all folders
    for folder, waves in filtered_npz.items():
        feats[folder] = [
            #Transpose the melspectrogram so axis 0 is time, average over it to get one value
            #per mel band, then stack onto an empty array to obtain a flat 1-D vector
            np.hstack((np.array([]),
                       np.mean(librosa.feature.melspectrogram(y=wave, sr=sr).T, axis=0)))
            for wave in waves
        ]
    return feats

def prep_test(label_dict, feats):
    '''
    Function: flatten per-folder features into sample and label lists for evaluating the model
    Inputs:
        -label_dict: dictionary keyed by folder number, contains list with gender and nationality of that folder
        -feats: dictionary keyed by folder number of lists of melspectrogram feature arrays
    Outputs:
        - X: list of lists containing melspectrogram data
        - y: list of 0 or 1 corresponding to female/male in X
    '''
    #One (features, label) pair per wave, in folder order then wave order
    pairs = [
        (melspec.tolist(), 1 if label_dict[folder][0] == 'm' else 0)
        for folder, folder_feats in feats.items()
        for melspec in folder_feats
    ]
    X = [features for features, _ in pairs]
    y = [label for _, label in pairs]
    return X, y

In [None]:
#Training data (English speaking nationalities): labels -> filter -> melspectrograms -> train/validation split
label_dict_train = get_labels(train_npz_dict, audio_class_dict)
filtered_npz_train = filter_signals(train_npz_dict)
feats_train = get_feats(filtered_npz_train)
X_train, y_train, X_dev, y_dev = split_train_validation(label_dict_train, feats_train)

#Test data (non-English speaking nationalities): same steps, but kept whole instead of split
label_dict_test = get_labels(test_npz_dict, audio_class_dict)
filtered_npz_test = filter_signals(test_npz_dict)
feats_test = get_feats(filtered_npz_test)
X_test, y_test = prep_test(label_dict_test, feats_test)

In [None]:
#NN Model for Predicting Gender

def create_model():
    '''
    Function: build and compile the feed-forward network used to predict gender.
    Three hidden Dense layers with relu activation (256, 128 and 64 nodes, halving each time),
    each followed by 10% dropout, then a single sigmoid output neuron
    (0 or 1 to show female or male).
    Inputs: None
    Outputs:
        - NNmodel: compiled neural network, not yet trained
    '''
    NNmodel = tf.keras.models.Sequential()
    #Hidden stack: Dense(relu) followed by Dropout(10%) for each width
    for units in (256, 128, 64):
        NNmodel.add(tf.keras.layers.Dense(units, activation='relu'))
        NNmodel.add(tf.keras.layers.Dropout(0.1))
    #Single sigmoid neuron for the binary (M/F) output
    NNmodel.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    #Binary crossentropy loss for binary classification, accuracy as the metric,
    #adam optimizer referenced by name (so it runs with its default settings)
    NNmodel.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

    return NNmodel

In [None]:
gender_NNmodel = create_model()

#Tensorboard to view losses and accuracies (event files go to "logs", relative to the current working directory)
tensorboard = TensorBoard(log_dir="logs")
#Stop training if the validation loss has not improved for 5 epochs, and restore the weights
#of the best epoch. monitor="val_loss" is the callback default, spelled out here because
#mode="min" only makes sense for a loss, not for accuracy.
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=5, restore_best_weights=True)

#Run on maximum of 75 epochs (early stopping normally ends training sooner)
#Use batch size of 64 (common based on size of data)
#The features/labels are nested Python lists; convert them to NumPy arrays because
#nested lists are not reliably accepted as model input across Keras versions.
gender_NNmodel.fit(np.asarray(X_train), np.asarray(y_train), epochs=75, batch_size=64,
                   validation_data=(np.asarray(X_dev), np.asarray(y_dev)),
                   callbacks=[tensorboard, early_stopping])

#Get summary of model 
gender_NNmodel.summary()

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 256)               33024     
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
___________________________________________________________

In [None]:
#Calculate test accuracy and loss
#Convert the nested Python lists to NumPy arrays: nested lists are not reliably
#accepted as model input across Keras versions.
test_loss, test_acc = gender_NNmodel.evaluate(np.asarray(X_test), np.asarray(y_test))
print(test_loss)
print(test_acc)

0.2547049820423126
0.9240703582763672
