In [1]:
import librosa
import soundfile
import os, glob
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import pandas as pd
from sklearn import preprocessing 
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier

In [2]:
# Lookup from the two-digit emotion code (the third dash-separated field of a
# recording's file name, as read in load_data) to its human-readable label.
# Codes are consecutive, so they are generated from the label order.
emotions = {
    f"{code:02d}": label
    for code, label in enumerate(
        (
            'neutral',
            'calm',
            'happy',
            'sad',
            'angry',
            'fearful',
            'disgust',
            'surprised',
        ),
        start=1,
    )
}

In [3]:
def extract_feature(file_name, mfcc, chroma,spectral_centroid,spectral_bandwidth,
                    spectral_rolloff,spectral_contrast,rms,spectral_flatness):
    """Build one flat feature vector describing an audio file.

    Each boolean flag switches one feature group on or off. Enabled groups are
    concatenated in this fixed order:

    1. ``mfcc``               - per-coefficient time average of 40 MFCCs
    2. ``chroma``             - per-bin time average of the chromagram of |STFT|
    3. ``spectral_centroid``  - single global mean
    4. ``spectral_bandwidth`` - single global mean
    5. ``spectral_rolloff``   - single global mean
    6. ``spectral_contrast``  - single global mean (all bands collapsed)
    7. ``rms``                - time-averaged RMS energy (one value)
    8. ``spectral_flatness``  - single global mean

    Parameters
    ----------
    file_name : str
        Path of the audio file, opened with ``soundfile.SoundFile``.
    mfcc, chroma, spectral_centroid, spectral_bandwidth, spectral_rolloff,
    spectral_contrast, rms, spectral_flatness : bool
        Whether to include the corresponding feature group.

    Returns
    -------
    numpy.ndarray
        1-D float64 array of the enabled features; empty when every flag is
        false.
    """
    with soundfile.SoundFile(file_name) as sound_file:
        signal = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

    # soundfile returns multi-channel audio as (frames, channels), whereas
    # librosa expects time on the last axis. Downmix to mono so stereo files
    # yield the same feature layout as mono ones.
    if signal.ndim > 1:
        signal = signal.mean(axis=1)

    # Feature groups are collected under fresh names so the boolean flag
    # arguments are never overwritten by arrays.
    parts = []

    if mfcc:
        mfcc_mean = np.mean(librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40).T, axis=0)
        parts.append(mfcc_mean)

    if chroma:
        # The magnitude spectrogram is only needed for the chroma features.
        # NOTE(review): chroma_stft receives the magnitude (not power)
        # spectrogram, exactly as before - confirm this is intended.
        stft = np.abs(librosa.stft(signal))
        chroma_mean = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
        parts.append(chroma_mean)

    if spectral_centroid:
        parts.append(np.mean(librosa.feature.spectral_centroid(y=signal, sr=sample_rate)))

    if spectral_bandwidth:
        parts.append(np.mean(librosa.feature.spectral_bandwidth(y=signal, sr=sample_rate).T))

    if spectral_rolloff:
        parts.append(np.mean(librosa.feature.spectral_rolloff(y=signal, sr=sample_rate).T))

    if spectral_contrast:
        # Mean over every band and frame: contributes exactly one value, which
        # keeps the overall vector length unchanged.
        parts.append(np.mean(librosa.feature.spectral_contrast(y=signal, sr=sample_rate)))

    if rms:
        parts.append(np.mean(librosa.feature.rms(y=signal).T, axis=0))

    if spectral_flatness:
        parts.append(np.mean(librosa.feature.spectral_flatness(y=signal)))

    # Leading empty float64 array preserves the original result dtype and the
    # empty-array return when no feature group is enabled.
    return np.hstack([np.array([]), *parts])


In [6]:
def load_data(data_glob=r"D:\DBDA\PROJECT\speech-emotion-recognition-ravdess-data\Actor_*\*.wav"):
    """Extract features and emotion labels for every matching recording.

    Parameters
    ----------
    data_glob : str, optional
        Glob pattern selecting the ``.wav`` files. Defaults to the location
        used originally; pass another pattern to run on a different machine.
        The default is a raw string so the backslashes are taken literally
        (the earlier literal relied on the invalid escape ``\\P``).

    Returns
    -------
    tuple[list, list]
        ``(x, y)`` where ``x[i]`` is the vector returned by ``extract_feature``
        for the i-th file and ``y[i]`` is its emotion label, looked up in
        ``emotions`` from the third dash-separated field of the file name.
        Both lists are empty when the pattern matches nothing.

    Raises
    ------
    KeyError
        If a file name carries an emotion code missing from ``emotions``.
    IndexError
        If a file name has fewer than three dash-separated fields.
    """
    x, y = [], []
    # glob yields files in arbitrary order; sorting fixes the sample order so
    # the seeded train/test split downstream is reproducible across machines.
    for file in sorted(glob.glob(data_glob)):
        file_name = os.path.basename(file)
        emotion = emotions[file_name.split("-")[2]]
        feature = extract_feature(file, mfcc=True, chroma=True, spectral_centroid=True,
                                  spectral_bandwidth=True, spectral_rolloff=True,
                                  spectral_contrast=True, rms=True, spectral_flatness=True)
        x.append(feature)
        y.append(emotion)

    return x, y

In [7]:
# Extract one feature vector and one emotion label per recording.
x_data, y_data = load_data()
if not x_data:
    # Fail here with a clear message instead of later inside train_test_split.
    raise ValueError("load_data() returned no samples; check the dataset path")

# Column names feat1..featN, sized from the extracted vectors instead of a
# hard-coded 58 so the cell keeps working if the feature set changes.
features = [f"feat{i}" for i in range(1, len(x_data[0]) + 1)]

# Encode the string labels as integers; `le` is kept so predictions can be
# mapped back with le.inverse_transform.
le = preprocessing.LabelEncoder()
y = le.fit_transform(y_data)
y = pd.DataFrame(y, columns=["emotions"])

In [8]:
# Feature table: one row per recording, one named column per feature.
X = pd.DataFrame(data=x_data, columns=features)

# Hold out 5% of the rows for testing, keeping class proportions equal in
# both parts (stratify) and the split repeatable (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=42,
    stratify=y,
)

In [9]:
# Scale every feature with min/max learned from the TRAINING split only.
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
# transform (not fit_transform): re-fitting on the test split would scale it
# with different min/max values than the models were trained on and leak
# test-set statistics into preprocessing.
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)
print(f'Features extracted: {X_train.shape[1]}')

Features extracted: 58


In [10]:
# Seeded, shuffled 5-fold splitter; passed as cv= to the MLP and XGBoost
# grid searches further down.
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [11]:
############ Model-1: RBF-kernel support vector classifier

model_svc = SVC(kernel='rbf', random_state=42)
param_svc = {'gamma': ['scale'], 'decision_function_shape': ['ovo']}
# NOTE(review): no cv= is given, so this search uses GridSearchCV's default
# 5-fold splitting rather than `kfold` like the MLP/XGBoost searches - confirm
# that the difference is intended.
grid_svc = GridSearchCV(model_svc, param_grid=param_svc, verbose=3, n_jobs=-1)
# np.ravel flattens the single-column label frame to 1-D, which avoids the
# column_or_1d DataConversionWarning raised for column-vector targets.
grid_svc.fit(X_train_scaled, np.ravel(y_train))

svc = grid_svc.best_estimator_
print("SVC ", grid_svc.best_score_)
print(grid_svc.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.6s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.6s finished
  y = column_or_1d(y, warn=True)


SVC  0.5306970401860912
{'decision_function_shape': 'ovo', 'gamma': 'scale'}


In [None]:
############ Model-2: random forest

model_rf = RandomForestClassifier(random_state=42)
param_rf = {'n_estimators': [6000, 7000, 8000],
            'max_depth': [29, 23, 25, 27],
            'min_samples_split': [2, 4],
            'oob_score': [True],
            # 'auto' was dropped: for classifiers it meant the same as 'sqrt'
            # (so it only duplicated fits) and newer scikit-learn releases
            # reject it outright.
            'max_features': ['sqrt', 'log2']}

# NOTE(review): no cv= is given, so this search uses GridSearchCV's default
# splitting rather than `kfold` - confirm that the difference is intended.
grid_rf = GridSearchCV(model_rf, param_grid=param_rf, verbose=4, n_jobs=-1)
# np.ravel flattens the single-column label frame to 1-D, which avoids the
# DataConversionWarning raised for column-vector targets.
grid_rf.fit(X_train_scaled, np.ravel(y_train))
rf = grid_rf.best_estimator_
print("RF ", grid_rf.best_score_)
print(grid_rf.best_params_)

In [None]:
############ Model-3: multi-layer perceptron

model_mlp = MLPClassifier(random_state=42)
#(350,410,330)
param_mlp = {'learning_rate_init': np.linspace(0.01, 0.3, 20),
             'hidden_layer_sizes': [(124, 123, 132), (210, 230, 260)],
             'activation': ['logistic', 'tanh', 'relu'],
             # The learning_rate schedule is only consulted when solver='sgd';
             # the model uses the default solver, so 'invscaling' and
             # 'adaptive' merely repeated identical fits and tripled the grid.
             # 'constant' is kept so best_params_ still reports the key.
             'learning_rate': ['constant']}

grid_mlp = GridSearchCV(model_mlp, param_grid=param_mlp, verbose=3, cv=kfold, n_jobs=-1)
# np.ravel flattens the single-column label frame to 1-D, which avoids the
# DataConversionWarning raised for column-vector targets.
grid_mlp.fit(X_train_scaled, np.ravel(y_train))
mlp = grid_mlp.best_estimator_

print("MLP ", grid_mlp.best_score_)
print(grid_mlp.best_params_)

In [None]:
###### XGBoost classifier, tuned by grid search ###########
# Originally labelled the "level 2" model; the tuned estimator is kept in
# `xgb` once the search finishes.
clf = XGBClassifier(random_state=2019)

# Candidate values for each hyper-parameter.
lr_range = [0.01, 0.1, 0.2, 0.3, 0.5]
n_est_range = [90, 50, 100, 200]
md_range = [2, 4, 6, 8, 10]

parameters = dict(
    learning_rate=lr_range,
    n_estimators=n_est_range,
    max_depth=md_range,
)

# Exhaustive search over every combination, scored on the shared `kfold`
# splitter using all available cores.
grid_xgb = GridSearchCV(
    clf,
    param_grid=parameters,
    cv=kfold,
    n_jobs=-1,
    verbose=3,
)
grid_xgb.fit(X_train_scaled, y_train)
xgb = grid_xgb.best_estimator_
print("xgb ", grid_xgb.best_score_)
print(grid_xgb.best_params_)


In [None]:
#
#ourEstimators = [
#    ('SVC :',SVC(random_state=42, kernel='rbf', decision_function_shape='ovo',gamma='scale')),
#    ('RF :',RandomForestClassifier(random_state=42, max_depth=25,max_features='auto', 
                                        #min_samples_split=2,n_estimators=6000,oob_score=True)),
#    ('MLP :', MLPClassifier(random_state=42,activation='tanh', hidden_layer_sizes=(124,123,132),
                                    #learning_rate='constant',learning_rate_init=0.01))
#]

In [None]:
#reg = StackingClassifier(
#    estimators=ourEstimators,cv=kfold,
#    final_estimator=XGBClassifier(random_state=42,learning_rate=0.2,max_depth=4, n_estimators=175),
#    passthrough=True,verbose=3,n_jobs=-1
#)
#    
#reg.fit(X_train_scaled, y_train)
#y_pred_stack=reg.predict(X_test_scaled)
#print("Accuracy=",accuracy_score(y_test, y_pred_stack))

In [None]:
# (name, estimator) pairs for the ensemble: each tuned model from the grid
# searches above, paired positionally with its label.
ourEstimators1 = list(zip(
    ('SVC :', 'RF :', 'MLP :', 'xgb:'),
    (svc, rf, mlp, xgb),
))

In [None]:
# Weighted hard-voting ensemble: every estimator casts a class vote, counted
# with the weights below (same order as ourEstimators1).
Voting = VotingClassifier(estimators=ourEstimators1, voting='hard', weights=[0.5, 1, 3, 2], n_jobs=-1)
# np.ravel flattens the single-column label frames to 1-D, which avoids the
# DataConversionWarning raised for column-vector targets.
Voting.fit(X_train_scaled, np.ravel(y_train))
y_pred_vot = Voting.predict(X_test_scaled)
print("Accuracy=", accuracy_score(np.ravel(y_test), y_pred_vot))