# Importing the libraries needed to process the audio files and extract the features

In [1]:
import os
import re
from glob import glob

import librosa as lr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import Audio
from librosa.core import piptrack
from librosa.feature import mfcc
from scipy import stats
from scipy.fftpack import fft, rfft
from scipy.stats.mstats import gmean

# Collecting the paths of the audio files in a list, to open them later.

In [2]:
# glob returns matches in arbitrary order; sort them so indexing into the list
# and the row order of the resulting feature table are reproducible across runs.
audio_files = sorted(glob('..//audios_to_test//to_test//*.wav'))

In [3]:
# Number of .wav files matched by the glob pattern.
len(audio_files)

50

In [4]:
# Inspect one matched path to check the file-name format.
audio_files[1]

'..//audios_to_test//to_test\\f0001_us_f0001_00014.wav'

## Opening the audio files using librosa, the outputs are:
- audio = audio time series
- sampling_rate = sampling rate of audio

In [5]:
# Load a single audio file and render a player to listen to it in the notebook.

sample_path = audio_files[5]
audio, sampling_rate = lr.load(sample_path)
Audio(data=audio, rate=sampling_rate)

## To plot each audio file in the time domain

In [6]:
# Optional: uncomment to plot every audio file's waveform against time in seconds.
# for file in range(0, len(audio_files), 1):
#     audio, sampling_rate = lr.load(audio_files[file])
#     time = np.arange(0, len(audio)) / sampling_rate
#     plt.plot(time, audio)
#     plt.xlabel('Time (secs)')
#     plt.show()

# Creating a function to extract the features needed out of an Audio file and then append the features into a dictionary

In [7]:
def _frequency_at_quantile(frequency: np.ndarray, cumulative_amplitude: np.ndarray, quantile: float) -> float:
    """Look up the frequency associated with a quantile of the cumulative amplitude.

    The number of entries ``<= quantile`` is the index of the first bin whose
    cumulative amplitude exceeds ``quantile``; the bin one past it is returned.
    The ``+ 1`` offset is kept deliberately so the values do not change for
    inputs that were already handled.
    """
    index = np.count_nonzero(cumulative_amplitude <= quantile) + 1
    # Clamp to the last bin: without this the lookup raises IndexError when the
    # final bin carries the remaining mass (e.g. energy at the Nyquist frequency
    # or a one-sample signal).
    return frequency[min(index, len(frequency) - 1)]


def feature_extractor(audio: np.ndarray, sampling_rate: int) -> dict:
    """Extract spectral, MFCC and RMS summary features from one audio signal.

    Parameters
    ----------
    audio : np.ndarray
        Audio time series (as returned by ``librosa.load``).
    sampling_rate : int
        Sampling rate of ``audio``.

    Returns
    -------
    dict
        Feature name -> scalar value. Frequency-based entries are divided by
        1000 (i.e. kHz when ``sampling_rate`` is in Hz). ``'Skewness'`` and
        ``'Kurtosis'`` describe the normalised amplitude values themselves.
        ``'Root_mean_square'`` and ``'Mean_RMS'`` hold the same value; both keys
        are kept so the output columns stay unchanged.
    """
    # Magnitude spectrum of the real-valued signal and the frequency of each bin.
    frequency_spectrum = np.abs(np.fft.rfft(audio))
    frequency = np.fft.rfftfreq(len(audio), d=1 / sampling_rate)

    # Normalise the magnitudes so they sum to 1 and act as weights over frequency.
    amplitude = frequency_spectrum / frequency_spectrum.sum()
    mean_frequency = (frequency * amplitude).sum()
    freq_standard_deviation = np.sqrt(np.sum(amplitude * ((frequency - mean_frequency) ** 2)))

    # Quantile-style statistics read off the cumulative normalised amplitude.
    amplitude_cumulative_sum = np.cumsum(amplitude)
    median_frequency = _frequency_at_quantile(frequency, amplitude_cumulative_sum, 0.5)
    quartile_25 = _frequency_at_quantile(frequency, amplitude_cumulative_sum, 0.25)
    quartile_75 = _frequency_at_quantile(frequency, amplitude_cumulative_sum, 0.75)
    interquartile_range = quartile_75 - quartile_25
    mode_frequency = frequency[amplitude.argmax()]

    # Third and fourth standardised moments of the amplitude values.
    deviation_from_mean = amplitude - amplitude.mean()
    amplitude_std = amplitude.std()
    moment_denominator = len(frequency_spectrum) - 1
    skewness = ((deviation_from_mean ** 3).sum() / moment_denominator) / amplitude_std ** 3
    kurtosis = ((deviation_from_mean ** 4).sum() / moment_denominator) / amplitude_std ** 4

    # Frame-based librosa features; each is reduced to summary scalars below.
    centroid_frequency = lr.feature.spectral_centroid(y=audio, sr=sampling_rate)
    spectral_flatness = lr.feature.spectral_flatness(y=audio)
    pitches, magnitudes = piptrack(y=audio, sr=sampling_rate, fmax=280)
    mfccs = mfcc(y=audio, sr=sampling_rate)
    # Pass the signal by keyword: librosa >= 0.10 rejects positional arguments here.
    root_mean_square = lr.feature.rms(y=audio)

    dictionary_of_features = {
        'Mean_freq': mean_frequency/1000,
        'Std': freq_standard_deviation/1000,
        'Median_freq': median_frequency/1000,
        'Mode_freq': mode_frequency/1000,
        'First_quartile': quartile_25/1000,
        'Third_quartile': quartile_75/1000,
        'Interquantile_range': interquartile_range/1000,
        'Skewness': skewness,
        'Kurtosis': kurtosis,
        'Centroid_freq': np.mean(centroid_frequency)/1000,
        'Spectral_flatness_measure': np.mean(spectral_flatness),
        # Currently disabled features:
        #'Mean_fundamental_freq': (pitches[np.nonzero(pitches)].mean())/1000,
        #'Min_fundamental_freq': (pitches[np.nonzero(pitches)].min())/1000,
        #'Max_fundamental_freq': (pitches[np.nonzero(pitches)].max())/1000,
        #'Mean_MFCCs' : mfccs.mean(),                            #Mel-frequency cepstral coefficients (MFCCs)
        'Std_MFCCs': mfccs.std(),
        'Root_mean_square': root_mean_square.mean(),
        'Mean_Magnitude' : magnitudes.mean(),
        'Max_MFCCs' : mfccs.max(),
        'Mean_RMS': root_mean_square.mean(),
        'Min_RMS': root_mean_square.min(),
        'Std_RMS': root_mean_square.std()
    }

    return dictionary_of_features

# To open all the audio files, plot the frequency domain of each file and apply the feature_extractor function to pull the features from the audio files, then append them to a list used to create a dataframe later.

### I added a condition to the loop that uses the file name to add a new key–value pair to the dictionary: if the file name starts with 'f', the Gender key is set to 1; if it starts with 'm', the Gender key is set to 0.

In [8]:
# Extract the features of every audio file and label each row with a gender flag
# taken from the first letter of the file name ('f' -> 1, 'm' -> 0).
list_of_dict = []
for audio_path in audio_files:
    audio, sampling_rate = lr.load(audio_path)
    # Optional: uncomment to plot the magnitude spectrum of the current file.
    #plt.plot(abs(rfft(audio)))
    #plt.show()
    #print(audio_path)
    dictionary_of_features = feature_extractor(audio, sampling_rate)
    # Test the file name only, not the full path: the directory separator in the
    # paths returned by glob differs between platforms, so matching a hard-coded
    # path prefix would silently leave 'Gender' unset outside Windows.
    file_name = os.path.basename(audio_path)
    if file_name.startswith('f'):
        dictionary_of_features['Gender'] = 1
    elif file_name.startswith('m'):
        dictionary_of_features['Gender'] = 0
    list_of_dict.append(dictionary_of_features)

## Printing the list of dictionaries to make sure that all the dictionaries are in it.

In [9]:
#list_of_dict

## Creating a dataframe for the features extracted from the audio files

In [10]:
# One row per feature dictionary; the dictionary keys become the columns.
voices_to_test = pd.DataFrame(list_of_dict)

In [11]:
# Display the resulting feature table.
voices_to_test

Unnamed: 0,Centroid_freq,First_quartile,Gender,Interquantile_range,Kurtosis,Max_MFCCs,Mean_Magnitude,Mean_RMS,Mean_freq,Median_freq,Min_RMS,Mode_freq,Root_mean_square,Skewness,Spectral_flatness_measure,Std,Std_MFCCs,Std_RMS,Third_quartile
0,2.152131,0.523264,1,5.25625,52.217993,190.980225,0.003693,0.010456,3.131546,2.463889,0.000146,0.171528,0.010456,5.636343,0.002239,2.643366,116.764236,0.008613,5.779514
1,2.056542,0.407639,1,2.703472,86.178423,187.353149,0.002873,0.006767,2.018625,0.973264,0.000109,0.169792,0.006767,7.238272,0.003245,2.094005,121.125092,0.006478,3.111111
2,1.869626,0.5125,1,5.110887,40.029582,197.986969,0.004227,0.012037,2.543995,0.989516,0.000179,0.200806,0.012037,5.233602,0.001114,2.644615,117.38842,0.009754,5.623387
3,1.897819,0.489655,1,3.374138,70.435179,191.927948,0.005069,0.012865,2.239866,0.859483,0.000153,0.172989,0.012865,6.531244,0.002018,2.417163,110.53511,0.011797,3.863793
4,2.588577,0.723387,1,5.796505,39.632941,175.44397,0.003629,0.011705,3.641216,3.561022,0.000135,0.202688,0.011705,4.407204,0.003072,2.853425,117.590744,0.011431,6.519892
5,2.534452,0.536688,1,5.517208,52.591077,165.976776,0.003014,0.01154,2.775327,1.02013,0.000206,0.524675,0.01154,5.954488,0.002971,2.75606,115.855392,0.012475,6.053896
6,2.292589,0.5485,1,3.43825,54.822608,169.079849,0.002528,0.012303,2.388986,1.00275,0.000151,0.4615,0.012303,6.363861,0.002942,2.392075,116.274353,0.01526,3.98675
7,2.33378,0.475368,1,4.289338,64.79963,151.39798,0.00293,0.013764,2.635799,1.223529,0.000189,0.540809,0.013764,6.889934,0.001765,2.616988,115.395187,0.013988,4.764706
8,2.461775,0.510197,1,5.552303,59.818616,164.688477,0.002008,0.006804,3.169806,2.778289,0.000127,0.223684,0.006804,6.383507,0.003494,2.672547,129.209183,0.008754,6.0625
9,2.560441,0.526261,1,5.603151,31.58465,206.786087,0.003205,0.012363,2.950385,1.492437,0.00017,0.203361,0.012363,4.391032,0.001953,2.748842,110.54203,0.009842,6.129412


In [12]:
# Save the feature table as CSV, leaving out the DataFrame index column.
voices_to_test.to_csv('..//datasets//voices_to_test.csv', index=False)