In [13]:
import os
import time
import tqdm
from tqdm import tqdm
import scipy 
import numpy as np
import pandas as pd
import audb
import audiofile
from scipy.io.wavfile import read as read_wav
from matplotlib import pyplot as plt
import opensmile

import sklearn 
from sklearn.model_selection import train_test_split #train/test split 
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC #Model
from sklearn.metrics import confusion_matrix #Metrics 
from sklearn.metrics import f1_score 
from sklearn.model_selection import cross_val_score #Cross-validation
from sklearn.model_selection import KFold


import random #for shuffling values 
from scipy import stats

#List of prosodic features we will observe
features_prosody= ['F0semitoneFrom27.5Hz_sma3nz_amean','loudness_sma3_amean',
'jitterLocal_sma3nz_amean','shimmerLocaldB_sma3nz_amean',
'hammarbergIndexV_sma3nz_amean','HNRdBACF_sma3nz_amean']


#Gets functional features of all audios 
def functional_features(list_with_files, path):
    path= path + '/'
    
    #  Define feature extractor
    smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals,)

    df_data= []

    for i in tqdm((list_with_files)):
        f= path + i
        sampling_rate, data = read_wav(f) #gets sampling rate and dimensions of data
        #print(f"Sampling rate of {f}:{sampling_rate}")
        
        db = audb.load('emodb',
        version='1.1.1',
        format='wav',
        mixdown=True,
        sampling_rate=sampling_rate,
        media='wav/03a01.*',  # load subset
        full_path=False,
        verbose=False,)

        #extract features
        signal, sampling_rate = audiofile.read(f, duration=120, always_2d=True)
        data= smile.process_signal(signal, sampling_rate)
        df_data.append(data.loc[:,features_prosody])

    df= pd.concat(df_data, ignore_index=True)
    return df
        
def t_test(dataframe):
    for i in features_prosody:
        class_0= dataframe[i][dataframe['label']==0].values
        class_1= dataframe[i][dataframe['label']== 1].values 
        t_statistic, p_value = stats.ttest_ind(class_0, class_1)
        print(f"T-statistic {i}:", t_statistic)
        print(f"P-value {i}:", p_value)
    

In [14]:
labels= open('labels_extension.txt', 'r').read().split('\n')
labels= [int(label) for label in labels]

In [15]:
#Process wav files 
path='audios/'
dir_list = os.listdir(path)
feature_data= functional_features(dir_list, path)


100%|█████████████████████████████████████████████████████████████████████████████████| 541/541 [07:22<00:00,  1.22it/s]


In [16]:
#Add labels 

df= feature_data
df['label']=labels

In [17]:
# Select the column you want to normalize
column_to_normalize = ['F0semitoneFrom27.5Hz_sma3nz_amean']

for i in column_to_normalize:
    #Column as a numpy array
    column_data= df[i].values.reshape(-1, 1)
    # Normalize the column
    scaler = StandardScaler()
    normalized_column = scaler.fit_transform(column_data)
    # Replace the original column with the normalized values
    df[i] = normalized_column

df


Unnamed: 0,F0semitoneFrom27.5Hz_sma3nz_amean,loudness_sma3_amean,jitterLocal_sma3nz_amean,shimmerLocaldB_sma3nz_amean,hammarbergIndexV_sma3nz_amean,HNRdBACF_sma3nz_amean,label
0,-2.118186,1.619823,0.044742,1.496739,24.104216,3.076883,0
1,1.319590,4.397794,0.041089,1.232881,12.559448,2.495436,1
2,-0.318944,2.001439,0.032084,1.032401,24.854361,6.320629,0
3,-0.915292,0.762132,0.049578,1.650914,18.634897,1.913031,1
4,-1.321453,0.666024,0.044419,1.481411,23.117395,3.364432,1
...,...,...,...,...,...,...,...
536,-1.430690,0.998517,0.022291,1.077695,27.654383,6.666949,1
537,0.293895,0.661941,0.018036,1.011724,12.586220,6.244031,1
538,0.902010,2.230912,0.045722,1.285447,15.154710,4.899701,1
539,0.615633,0.596897,0.041760,1.511945,20.778763,4.671585,1


In [18]:
#Dividing samples for the two collection phases 

first_sampling= df[:100]
full_sampling= df


In [19]:
#First T-Test

t_test(first_sampling)

T-statistic F0semitoneFrom27.5Hz_sma3nz_amean: -2.0346614297256576
P-value F0semitoneFrom27.5Hz_sma3nz_amean: 0.04458600830738519
T-statistic loudness_sma3_amean: -0.26209509148512905
P-value loudness_sma3_amean: 0.7937982532299028
T-statistic jitterLocal_sma3nz_amean: -3.0338757475746183
P-value jitterLocal_sma3nz_amean: 0.003090915483098893
T-statistic shimmerLocaldB_sma3nz_amean: -1.1244490499632347
P-value shimmerLocaldB_sma3nz_amean: 0.26356923602308757
T-statistic hammarbergIndexV_sma3nz_amean: 0.7560549719213243
P-value hammarbergIndexV_sma3nz_amean: 0.45142980238883945
T-statistic HNRdBACF_sma3nz_amean: 0.9321172431846726
P-value HNRdBACF_sma3nz_amean: 0.3535666652922296


In [20]:
#Full samples 

t_test(full_sampling)

T-statistic F0semitoneFrom27.5Hz_sma3nz_amean: -2.079535621258287
P-value F0semitoneFrom27.5Hz_sma3nz_amean: 0.03804024622940207
T-statistic loudness_sma3_amean: 0.3794378086616753
P-value loudness_sma3_amean: 0.7045122201146741
T-statistic jitterLocal_sma3nz_amean: -1.3966407501204499
P-value jitterLocal_sma3nz_amean: 0.1630964386990832
T-statistic shimmerLocaldB_sma3nz_amean: -0.9451559303267155
P-value shimmerLocaldB_sma3nz_amean: 0.34500276462535817
T-statistic hammarbergIndexV_sma3nz_amean: 0.4775674379224918
P-value hammarbergIndexV_sma3nz_amean: 0.6331517099794851
T-statistic HNRdBACF_sma3nz_amean: 0.14005468620376813
P-value HNRdBACF_sma3nz_amean: 0.888669102289307


## First approach to SVM classification 

In [21]:
#Train/Test split 
y_axis= first_sampling.iloc[:,[-1]]
y= y_axis.to_numpy().flatten() #Labels (class 0 and 1)
X_axis= first_sampling.drop(y_axis.columns,axis = 1)
X= X_axis.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

In [27]:
#Cross-validation (Overfitting)

C_list= np.linspace(0.1, 10, 20)
gamma_list= np.logspace(-4, -2, 20)
precision = np.zeros((len(C_list), len(gamma_list)))



for i in range(len(C_list)):
     for j in range(len(gamma_list)):
        svc= SVC(C=C_list[i], gamma=gamma_list[j], kernel='rbf')
        svc.fit(X_train, y_train)
        y_pred= svc.predict(X_train)
        tn, fp, fn, tp = confusion_matrix(y_pred, y_train).ravel()
        precision[i, j] = float(tp + tn) / (tp + fp + fn + tn)

index_max = np.unravel_index(np.argmax(precision), precision.shape)

print('Max. Precision:{}'.format(precision[index_max]))
C_opt = C_list[index_max[0]]
gamma_opt = gamma_list[index_max[1]]
print('Optimal C: {}'.format(C_opt))
print('Optimal gamma:{}'.format(gamma_opt))

Max. accuracy:0.671875
Optimal C: 7.915789473684211
Optimal gamma:0.01


In [28]:
#Overfitting 

svc= SVC(kernel='rbf', class_weight='balanced', C=C_opt, gamma=gamma_opt)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_train)
cm= confusion_matrix(y_train, y_pred)
tn, fp, fn, tp = confusion_matrix(y_train, y_pred).ravel()
recall = float(tp) / (tp + fn)
precision= float(tp + tn) / (tp + fn + fp + tn)
print(cm)
print('Recall: {}'.format(recall)) 
print('Precision {}'.format(precision))

[[28  6]
 [12 18]]
Recall: 0.6
Precision 0.71875
