################ IMPORTING THE REQUIRED LIBRARIES

In [90]:
import librosa
import soundfile
import os, glob
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn import preprocessing 
from sklearn.preprocessing import MinMaxScaler
from keras.utils import np_utils
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout,Flatten,BatchNormalization
from keras.optimizers import SGD,RMSprop
from keras.layers.convolutional import Conv1D, MaxPooling1D

############## EMOTIONS INCLUDED IN THE DATASET

In [2]:
emotions={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}

############## EXTRACTING FEATURES FROM THE AUDIO SIGNAL USING LIBROSA

In [3]:
def extract_feature(file_name, mfcc, chroma,spectral_centroid,spectral_bandwidth,
                    spectral_rolloff,spectral_contrast,rms,spectral_flatness):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate=sound_file.samplerate
        if chroma:
            stft=np.abs(librosa.stft(X))
        result=np.array([])
        if mfcc:
            mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result=np.hstack((result, mfccs))
            
            
        if chroma:
            chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
            result=np.hstack((result, chroma))             
           
            
        if spectral_centroid:
            spectral_centroid=np.mean(librosa.feature.spectral_centroid(y=X, sr=sample_rate))
            result=np.hstack((result, spectral_centroid)) 
        
        if spectral_bandwidth:
           spectral_bandwidth=np.mean(librosa.feature.spectral_bandwidth(y=X, sr=sample_rate).T)
#           print(spectral_bandwidth)
           result=np.hstack((result, spectral_bandwidth)) 
           
        if spectral_rolloff:
           spectral_rolloff=np.mean(librosa.feature.spectral_rolloff(y=X, sr=sample_rate).T)
#           print(spectral_rolloff)
           result=np.hstack((result, spectral_rolloff))
        
        if spectral_contrast:
           spectral_contrast=np.mean(librosa.feature.spectral_contrast(y=X, sr=sample_rate))
           result=np.hstack((result, spectral_contrast))
           
        if rms:
           rms=np.mean(librosa.feature.rms(y=X).T,axis=0)
           result=np.hstack((result, rms))
           
        if spectral_flatness:
           spectral_flatness=np.mean(librosa.feature.spectral_flatness(y=X))
           result=np.hstack((result, spectral_flatness))
        
        return result


############## LOADING THE DATASET AND EXTRACTING ALL THE FEATURES FROM IT 

In [9]:
def load_data():
    x,y=[],[]
    for file in glob.glob(r"F:\speech_project\speech-emotion-recognition-ravdess-data\Actor_*\*.wav"):
        file_name=os.path.basename(file)
        emotion=emotions[file_name.split("-")[2]]
        feature=extract_feature(file, mfcc=True, chroma=True,spectral_centroid=True,spectral_bandwidth=True,
                                spectral_rolloff=True,spectral_contrast=True,rms=True,spectral_flatness=True)
        x.append(feature)
        y.append(emotion)
    return x,y


############## SHUFFLING OF OBS AND RESETTING THE OBS INDEXES

In [18]:
X,y=load_data()
X=pd.DataFrame(X)
y=pd.DataFrame(y)
data=pd.concat([X,y],axis="columns")
data=data.sample(frac=1).reset_index(drop=True)
X=data.iloc[:,0:58]
y=data.iloc[:,58]

############## LABEL ENCODING THE RESPONSE VARIABLES

In [19]:
le = preprocessing.LabelEncoder()
y=le.fit_transform(y)

############## SPLITTING THE DATA INTO TRAIN(95%) AND TEST(5%)

In [20]:
x_train,x_test,y_train,y_test=train_test_split(X, y, test_size=0.05, random_state=42,stratify=y)
print((x_train.shape[0], x_test.shape[0]))
print(f'Features extracted: {x_train.shape[1]}')

(1345, 71)
Features extracted: 58


############## RESHAPING PREDICTORS

In [16]:
x_traincnn = np.expand_dims(x_train, axis=2)
x_testcnn = np.expand_dims(x_test, axis=2)
print(x_traincnn.shape)
print(x_testcnn.shape)

(1345, 58, 1)
(71, 58, 1)
Features extracted: 58


############## LABEL ENCODING THE RESPONSE VARIABLES AND CREATING DUMMIES

In [21]:
lb = LabelEncoder()
y_train = np_utils.to_categorical(y_train)
y_test = np_utils.to_categorical(y_test)
print(y_train.shape)
print(y_test.shape)

(1345, 8)
(71, 8)


############ 1D CNN #############

In [92]:
model = Sequential()
model.add(Conv1D(256, 13, padding='same',input_shape=(x_traincnn.shape[1],1)))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Conv1D(256, 13, padding='same'))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=(8)))
model.add(Conv1D(256, 7, padding='same'))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Conv1D(64, 7, padding='same'))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(8))
model.add(Activation('softmax'))
opt =SGD(lr=0.01, momentum=0.0, decay=0.0, nesterov=False)
opt1=RMSprop()

#################### TRAINING THE MODEL

In [93]:
model.compile(optimizer=opt1, loss='categorical_crossentropy',metrics=['accuracy'])
model.fit(x_traincnn,y_train,validation_data=(x_testcnn,y_test),verbose=2,epochs=100)

Train on 1345 samples, validate on 71 samples
Epoch 1/100
 - 8s - loss: 3.9878 - acc: 0.1264 - val_loss: 2.3697 - val_acc: 0.1549
Epoch 2/100
 - 7s - loss: 2.1433 - acc: 0.1286 - val_loss: 2.2764 - val_acc: 0.1549
Epoch 3/100
 - 6s - loss: 2.1125 - acc: 0.1338 - val_loss: 2.1020 - val_acc: 0.0986
Epoch 4/100
 - 6s - loss: 2.0953 - acc: 0.1234 - val_loss: 2.0864 - val_acc: 0.1408
Epoch 5/100
 - 6s - loss: 2.0816 - acc: 0.1502 - val_loss: 2.1010 - val_acc: 0.1268
Epoch 6/100
 - 6s - loss: 2.0769 - acc: 0.1509 - val_loss: 2.1566 - val_acc: 0.1268
Epoch 7/100
 - 6s - loss: 2.0855 - acc: 0.1242 - val_loss: 2.0600 - val_acc: 0.1690
Epoch 8/100
 - 7s - loss: 2.0752 - acc: 0.1442 - val_loss: 2.0492 - val_acc: 0.1690
Epoch 9/100
 - 6s - loss: 2.0735 - acc: 0.1323 - val_loss: 2.0728 - val_acc: 0.1408
Epoch 10/100
 - 6s - loss: 2.0737 - acc: 0.1145 - val_loss: 2.0785 - val_acc: 0.1268
Epoch 11/100
 - 6s - loss: 2.0736 - acc: 0.1227 - val_loss: 2.0917 - val_acc: 0.1268
Epoch 12/100
 - 6s - loss: 2

KeyboardInterrupt: 

#################### PREDICTION ON TEST SET

In [36]:
y_pred_prob = model.predict(x_testcnn)
y_pred_prob.shape

(71, 8)

In [37]:
loss, acc = model.evaluate(x_testcnn, y_test,verbose=0)

print('Test loss = {:.4f} '.format(loss))
print('Test acc = {:.4f} '.format(acc))

Test loss = 2.0563 
Test acc = 0.1408 


############# SAVING THE MODEL

In [39]:
import pickle
pickle.dump(model, open("C:\\Users\\dbda\\Desktop\\project\\CNN.pkl", 'wb'))
print("Model Saved!!")

Model Saved!!


#################### Prediction on unseen data

In [94]:

def load_test_data():
    x_test,y_test=[],[]
    
    file="F:\\speech_project\\UnSeenSet\\03-01-02-01-02-01-18.wav"
    emotion=emotions[file.split("-")[2]]
    feature=extract_feature(file, mfcc=True, chroma=True,spectral_centroid=True,spectral_bandwidth=True,
                            spectral_rolloff=True,spectral_contrast=True,rms=True,spectral_flatness=True)
    
    x_test.append(feature)
    y_test.append(emotion)
    print(x_test)
    return x_test,y_test

In [95]:
X_test_data,y_test_data=load_test_data()
X_test_data=np.asarray(X_test_data)
print(y_test_data)

print(X_test_data.shape)

[array([-6.77559326e+02,  4.86538773e+01,  4.00112343e+00,  6.29271889e+00,
       -2.63242054e+00,  4.15418506e-01, -1.20165043e+01, -8.39356899e+00,
       -6.19712973e+00, -4.59199858e+00, -6.12935257e+00, -5.18731499e+00,
       -2.05171275e+00, -6.01581001e+00, -5.04369974e+00, -5.99067354e+00,
       -7.66018677e+00, -5.44005013e+00, -6.23430157e+00, -6.40192604e+00,
       -6.71265650e+00, -4.87261820e+00, -6.49819517e+00, -5.79686689e+00,
       -3.54625845e+00, -2.15704918e+00,  2.97151184e+00,  2.96098375e+00,
        3.67459607e+00,  1.74638510e+00, -6.77530944e-01,  6.43206596e-01,
        3.11514688e+00,  5.58218002e+00,  3.97386026e+00,  3.56227517e+00,
        2.52387166e-01,  1.62207997e+00,  3.05497026e+00,  1.44830656e+00,
        3.69383901e-01,  3.87627304e-01,  4.08708304e-01,  4.29015309e-01,
        4.54207987e-01,  5.29577851e-01,  5.19585311e-01,  4.74213362e-01,
        4.88786906e-01,  5.42983055e-01,  5.24420142e-01,  4.18756694e-01,
        1.83156139e+03, 

############## RESHAPING PREDICTORS

In [96]:
X_test_data = np.expand_dims(X_test_data, axis=2)
print(x_traincnn.shape)

(1345, 58, 1)


########### LOADING SAVED MODEL

In [97]:

import pickle
loaded_model = pickle.load(open("C:\\Users\\dbda\\Desktop\\project\\CNN.pkl", 'rb'))
result = loaded_model.predict_classes(X_test_data)
print(result)

[6]


In [98]:
print(le.inverse_transform(result))

['sad']
