# Machine Learning project: gender recognition from speech

The following acoustic properties of each voice are measured:
- **duration**: length of signal <span style="color:red">**NOT USED!!**</span>
- **meanfreq**: mean frequency (in kHz)
- **sd**: standard deviation of frequency
- **median**: median frequency (in kHz)
- **Q25**: first quartile (in kHz)
- **Q75**: third quartile (in kHz)
- **IQR**: interquartile range (in kHz)
- **skew**: skewness (see note in specprop description)
- **kurt**: kurtosis (see note in specprop description)
- **sp.ent**: spectral entropy
- **sfm**: spectral flatness
- **mode**: mode frequency
- **centroid**: frequency centroid (see specprop)
- **peakf**: peak frequency (frequency with highest energy) <span style="color:red">**NOT USED!!**</span>
- **meanfun**: average of fundamental frequency measured across acoustic signal
- **minfun**: minimum fundamental frequency measured across acoustic signal
- **maxfun**: maximum fundamental frequency measured across acoustic signal
- **meandom**: average of dominant frequency measured across acoustic signal
- **mindom**: minimum of dominant frequency measured across acoustic signal
- **maxdom**: maximum of dominant frequency measured across acoustic signal
- **dfrange**: range of dominant frequency measured across acoustic signal
- **modindx**: modulation index. Calculated as the accumulated absolute difference between adjacent measurements of fundamental frequencies divided by the frequency range

In [None]:
# pandas: data loading and tabular manipulation
import pandas as pd
# show every column when a data frame is displayed (no truncation)
pd.options.display.max_columns = None

# sklearn: the baseline classifier plus splitting / cross-validation helpers
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# pyplot: figure creation and titles
import matplotlib.pyplot as plt

# seaborn: statistical plots (heatmap, pairplot, KDE, bar plots)
import seaborn as sns

# numpy: numeric arrays and aggregation
import numpy as np

# silence all warnings for the whole session
# NOTE(review): this filter is global, so deprecation and convergence warnings
# from the libraries above are hidden as well
import warnings
warnings.filterwarnings('ignore')

In [None]:
# read the raw CSV and wrap it in the data frame used by the rest of the notebook
data = pd.read_csv(filepath_or_buffer='dataSet.csv')
df = pd.DataFrame(data=data)

# Dataset analysis

In [None]:
# encode the textual class names as numbers, in place:
#     - male   -> 0
#     - female -> 1
label_codes = {'male': 0, 'female': 1}
df.replace(label_codes, inplace=True)

In [None]:
# display data frame info
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only append a stray "None" line to the output
df.info()

In [None]:
# preview the first five rows of the data frame
df.head(n=5)

In [None]:
# prepare training values:
#     - x: the feature columns (what we know)
#     - y: the 'label' column (what we want to know)
x = df.drop(columns='label')
y = df['label']

# hold out a third of the rows for testing
#     - random_state fixes the shuffle so results are reproducible across runs
#     - stratify=y keeps the same class ratio in the train and test parts
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42, stratify=y
)

In [None]:
# create the random forest classifier (an ensemble of 100 trees)
rfc = RandomForestClassifier(
    n_estimators=100,
)

In [None]:
# fit the forest on the training split; this fitted model is the one used
# for the single-sample prediction further down
rfc.fit(X=x_train, y=y_train)

# 10-fold cross-validation over the complete data set
scores = cross_val_score(estimator=rfc, X=x, y=y, cv=10)
mean_accuracy_percent = scores.mean() * 100
print('Accuracy:', round(mean_accuracy_percent, 2), '%')

In [None]:
# classify one hand-written sample (values taken from a female voice);
# the 20 values are given in the same order as the feature columns
female_sample = [
    0.2022728, 0.04060666, 0.2129694, 0.1821243, 0.227241,
    0.04511674, 3.040879, 17.07277, 0.8827420, 0.2635666,
    0.1200658, 0.2022728, 0.1497998, 0.04319295, 0.2791139,
    0.3374789, 0, 1.593457, 1.593457, 0.11383929,
]
# predict() expects a 2-D input, hence the extra list around the sample
prediction = rfc.predict([female_sample])

# class 0 is male, anything else is reported as female
if prediction[0] == 0:
    print("Male")
else:
    print("Female")

In [None]:
# heatmap of the pairwise correlations between all columns

correlations = df.corr()
plt.figure(figsize=(14, 12))
plt.title('Correlation Matrix')
sns.heatmap(data=correlations, annot=True, linewidths=0.1)

In [None]:
# grid of pairwise scatter plots for every pair of columns

sns.pairplot(data=df)

In [None]:
# KDE plot: joint density of meanfun and IQR, drawn separately for each class

sns.set(style="darkgrid")
# 'label' was encoded as the integers 0 (male) / 1 (female) earlier in the
# notebook, so the rows must be selected with integer comparisons; comparing
# against the strings '0' / '1' matches nothing and leaves both frames empty
male = df[df["label"] == 0]
female = df[df["label"] == 1]

# Set up the figure
f, ax = plt.subplots(figsize=(8, 8))
ax.set_aspect("equal")

# Draw the two density plots (male in red, female in blue)
ax = sns.kdeplot(male.meanfun, male.IQR,
                 cmap="Reds", shade=True, shade_lowest=False)
ax = sns.kdeplot(female.meanfun, female.IQR,
                 cmap="Blues", shade=True, shade_lowest=False)

# Add labels to the plot, coloured like the density they describe
red = sns.color_palette("Reds")[-2]
blue = sns.color_palette("Blues")[-2]
ax.text(0.2,0.07, "Female", size=16, color=blue)
ax.text(0.05, 0.15,"Male", size=16, color=red)

# Are the other classifiers more accurate than RandomForest?

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler

# candidate models, all with (near-)default settings
classifiers = [
    KNeighborsClassifier(3),
    SVC(probability=True, gamma='scale'),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=100),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression(solver='lbfgs')]

# Train each model on the training split and measure its accuracy on the
# held-out test split.
# The previous version called cross_validate(clf, x_test, y_test) after the
# fit: cross_validate clones and refits the estimator on folds of the data it
# is given, so the model fitted on x_train was never evaluated and the scores
# came from models trained on only a fraction of the test split.
scores = []
for clf in classifiers:
    clf.fit(x_train, y_train)
    # for classifiers, score() is the mean accuracy on the given data
    scores.append(clf.score(x_test, y_test))

# one horizontal bar per classifier, named after its class
plt.figure(figsize=(10,8))
plt.title('Scores Plot')
sns.barplot(y=[type(clf).__name__ for clf in classifiers], x=scores)

As we can see, no classifier is significantly more accurate than RandomForest. Although we will stick with RandomForest, we will try to improve the SVC because of its really poor performance.

# Modification of the SVC Parameter C

In [None]:
# values of the SVC regularisation parameter C to compare
# (1.0 is the scikit-learn default, i.e. the untuned SVC)
c_values = [1.0, 1.5, 200, 500, 1000, 10000, 100000, 500000]
svc_class = [
    SVC(probability=True, C=c_value, gamma='scale') for c_value in c_values
]

# Train each SVC on the training split and measure accuracy on the held-out
# test split. (cross_validate on x_test would clone and refit the estimator on
# folds of the test data, discarding the model fitted here.)
svc_scores = []
for clf in svc_class:
    clf.fit(x_train, y_train)
    svc_scores.append(clf.score(x_test, y_test))

# label every bar with the C value it belongs to instead of its list index
sns.barplot(x=[str(c_value) for c_value in c_values], y=svc_scores).set(xlabel="C Parameter Changed", ylabel="Accuracy")

# PCA

In [None]:
# PCA: how does the random forest accuracy change with the number of
# principal components kept?
from sklearn.decomposition import PCA

from sklearn.model_selection import cross_val_score
import time

scores = []  # mean 5-fold CV accuracy for each number of components
times = []   # seconds spent on PCA + cross-validation for each run

# try every possible number of components, from 1 up to the number of
# feature columns in x (the label column is not part of x)
n_features = x.shape[1]
for n in range(1, n_features + 1):
    start = time.perf_counter()
    pca = PCA(n_components=n)
    x_pca = pca.fit_transform(x)
    y_variance = pca.explained_variance_ratio_
    # use a separate name so the forest trained earlier (`rfc`) is not
    # replaced by an unfitted one
    pca_rfc = RandomForestClassifier(n_estimators=100)
    scores.append(cross_val_score(pca_rfc, x_pca, y, cv=5).mean())
    times.append(time.perf_counter() - start)

plt.figure(figsize=(8,6))
plt.title("Accuracy according to n_components")
# the x axis is derived from the number of scores collected, so it always
# matches the loop above
sns.barplot(
    x=list(range(1, len(scores) + 1)),
    y=[round(score*100, 2) for score in scores],
).set(xlabel="n_components", ylabel="Accuracy")

# Neural Network

In [None]:
# Neural Network

import keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras import optimizers

# reload the data and encode the target: male -> 0, female -> 1
df = pd.read_csv("dataSet.csv")
df.replace('male',0, inplace=True)
df.replace('female',1, inplace=True)
X = df.drop('label', axis=1).values
# cast explicitly so the targets are a numeric array whatever dtype
# replace() left the column in
c = df['label'].values.astype(int)

# Keep a third of the rows aside for validation. The network used to be
# trained on the complete data set while being validated on x_test, which is
# a subset of that data, so the reported validation accuracy was not an
# out-of-sample estimate.
X_train, X_val, c_train, c_val = train_test_split(
    X, c, test_size=0.33, stratify=c
)

# We added 3 layers of 32 neurons each with a SoftSign activation;
# the input size follows the number of feature columns
model = Sequential()
model.add(Dense(32, activation='softsign', input_dim=X.shape[1]))
model.add(Dense(32, activation='softsign'))
model.add(Dense(32, activation='softsign'))
# single sigmoid unit: output is the predicted probability of class 1 (female)
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, c_train, epochs=100, batch_size=10, validation_data=(X_val, c_val))

Now let's make a prediction with the trained neural network:

In [None]:
# We insert a male data array (20 feature values, same order as the training columns)
Xnew = np.array([[0.1984445, 0.06684052, 0.2157356, 0.1375283, 0.264536, 0.1270077, 3.38914, 20.50335, 0.8929154, 0.3376926, 0.1200362, 0.1984445, 0.1396227, 0.04349112, 0.2791139, 0.4190832, 0, 5.081836, 5.081836, 0.07727807]])
# Sequential.predict_classes() has been removed from current Keras/TensorFlow;
# for a single sigmoid output the equivalent is to threshold the predicted
# probability at 0.5
ynew = (model.predict(Xnew) > 0.5).astype("int32")

# ynew has shape (1, 1): one sample, one output unit
print("Male" if ynew[0][0]==0 else "Female")

# Clustering (KMEANS)

In [None]:
from sklearn.cluster import KMeans

# cluster the voices into two groups using only meanfun and IQR
kmeans = KMeans(n_clusters=2)
kmeans.fit(df[['meanfun','IQR']])

# a bare expression in the middle of a cell displays nothing, so print it
print('Inertia:', kmeans.inertia_)

# K-means numbers its clusters arbitrarily, so cluster 0 is not necessarily
# the "male" group. If fewer than half of the rows agree with the true label,
# swap the two ids so the comparison plot below is meaningful.
cluster_ids = kmeans.labels_
if (cluster_ids == df['label']).mean() < 0.5:
    cluster_ids = 1 - cluster_ids
print('Agreement with label:', round((cluster_ids == df['label']).mean()*100, 2), '%')

df['KMEANS'] = cluster_ids

# KMeans vs Results
# x positions follow the actual number of rows instead of a hard-coded count
row_positions = np.arange(len(df))
plt.figure(figsize=(50,8))
plt.grid(False)
plt.style.use("dark_background")
plt.title("KMEANS vs Label", fontsize=28)
plt.step(row_positions, df['KMEANS'])
plt.step(row_positions, df['label'], lw=10, c='r')