################ IMPORTING THE REQUIRED LIBRARIES

In [1]:
import librosa
import soundfile
import os, glob
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing 
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, LSTM
from keras.layers import Convolution2D, MaxPooling2D
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping,ReduceLROnPlateau,ModelCheckpoint,TensorBoard,ProgbarLogger
from keras.utils import np_utils
from sklearn import metrics 
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

Using TensorFlow backend.


############## EMOTIONS INCLUDED IN THE DATASET

In [2]:

# Emotion names in RAVDESS code order; the third hyphen-separated field of each
# filename is the two-digit, 1-based code ("01" .. "08") used as the key below.
_EMOTION_NAMES = (
    'neutral',
    'calm',
    'happy',
    'sad',
    'angry',
    'fearful',
    'disgust',
    'surprised',
)

emotions = {f"{code:02d}": name for code, name in enumerate(_EMOTION_NAMES, start=1)}


############## EXTRACTING FEATURES FROM THE AUDIO SIGNAL USING LIBROSA

In [3]:
def extract_feature(file_name, mfcc, chroma,spectral_centroid,spectral_bandwidth,spectral_rolloff,
                    spectral_contrast,rms,spectral_flatness):
    """Build a flat feature vector for one audio file.

    Each boolean flag switches one feature group on or off. Enabled groups are
    concatenated in the order of the parameters below.

    Parameters
    ----------
    file_name : str
        Path of the audio file, opened with ``soundfile``.
    mfcc : bool
        Append 40 MFCCs, each averaged over time frames.
    chroma : bool
        Append the chroma vector of the magnitude STFT, averaged over frames.
    spectral_centroid, spectral_bandwidth, spectral_rolloff : bool
        Append the mean of the corresponding librosa spectral feature (one value each).
    spectral_contrast : bool
        Append the mean over all bands and frames of the spectral contrast (one value).
    rms : bool
        Append the frame-averaged RMS energy (one value).
    spectral_flatness : bool
        Append the mean spectral flatness (one value).

    Returns
    -------
    numpy.ndarray
        1-D float array holding the enabled features; empty when every flag is false.
    """
    # Read the samples up front so the file is closed before the slow feature extraction.
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

    # soundfile returns multi-channel audio as (frames, channels), whereas librosa
    # expects the sample axis last. Average the channels so stereo recordings yield
    # the same feature layout as mono ones; mono input is left untouched.
    if X.ndim > 1:
        X = X.mean(axis=1)

    # Start from an empty float array, as before, so the result dtype is unchanged.
    parts = [np.array([])]

    if mfcc:
        mfccs = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40)
        parts.append(np.mean(mfccs.T, axis=0))

    if chroma:
        # The magnitude spectrogram is only needed for the chroma features.
        stft = np.abs(librosa.stft(X))
        chroma_frames = librosa.feature.chroma_stft(S=stft, sr=sample_rate)
        parts.append(np.mean(chroma_frames.T, axis=0))

    if spectral_centroid:
        parts.append(np.mean(librosa.feature.spectral_centroid(y=X, sr=sample_rate)))

    if spectral_bandwidth:
        parts.append(np.mean(librosa.feature.spectral_bandwidth(y=X, sr=sample_rate).T))

    if spectral_rolloff:
        parts.append(np.mean(librosa.feature.spectral_rolloff(y=X, sr=sample_rate).T))

    if spectral_contrast:
        # Averaged over bands as well as frames, so this contributes a single value.
        parts.append(np.mean(librosa.feature.spectral_contrast(y=X, sr=sample_rate)))

    if rms:
        parts.append(np.mean(librosa.feature.rms(y=X).T, axis=0))

    if spectral_flatness:
        parts.append(np.mean(librosa.feature.spectral_flatness(y=X)))

    return np.hstack(parts)


############## LOADING THE DATASET AND EXTRACTING ALL THE FEATURES FROM IT

In [12]:
def load_data(data_glob=r"F:\speech_project\speech-emotion-recognition-ravdess-data\Actor_*\*.wav",
              return_scaler=False):
    """Extract features and emotion labels for every recording matching ``data_glob``.

    Parameters
    ----------
    data_glob : str
        Glob pattern selecting the ``.wav`` files. The emotion code is read from the
        third hyphen-separated field of each file's basename and looked up in
        ``emotions``. Defaults to the original dataset location.
    return_scaler : bool
        When true, also return the fitted ``MinMaxScaler`` so the same scaling can be
        applied to recordings that were not part of this call.

    Returns
    -------
    (X, y) or (X, y, scaler)
        ``X`` is a DataFrame of min-max scaled features (one row per file) and ``y``
        is the list of emotion names in the same order.

    Raises
    ------
    FileNotFoundError
        If no file matches ``data_glob``.
    """
    # Sorted so the row order does not depend on the filesystem's listing order.
    paths = sorted(glob.glob(data_glob))
    if not paths:
        raise FileNotFoundError(f"No audio files match pattern: {data_glob}")

    x, y = [], []
    for file in paths:
        file_name = os.path.basename(file)
        emotion = emotions[file_name.split("-")[2]]
        feature = extract_feature(file, mfcc=True, chroma=True, spectral_centroid=True,
                                  spectral_bandwidth=True, spectral_rolloff=True,
                                  spectral_contrast=True, rms=True, spectral_flatness=True)
        x.append(feature)
        y.append(emotion)

    # NOTE(review): the scaler is fitted on every recording, i.e. before the
    # train/test split, so held-out rows influence the scaling ranges.
    scaler = MinMaxScaler()
    X = pd.DataFrame(scaler.fit_transform(x))
    if return_scaler:
        return X, y, scaler
    return X, y

############## SHUFFLING OF OBS AND RESETTING THE OBS INDEXES

In [13]:
# Load the features/labels and join them so both are shuffled by the same permutation.
X, y = load_data()
X = pd.DataFrame(X)
# The label column sits immediately after the feature columns in the joined frame,
# so derive its position from the data instead of hard-coding the feature count.
n_features = X.shape[1]
data = pd.concat([X, pd.DataFrame(y)], axis="columns")
# Seeded (same value as the train/test split) so a fresh run reproduces the same order.
data = data.sample(frac=1, random_state=9).reset_index(drop=True)
X = data.iloc[:, :n_features]
y = data.iloc[:, n_features]

############## LABEL ENCODING THE RESPONSE VARIABLES

In [14]:
# Turn the emotion names into integer class ids; `le` is reused at the end of the
# notebook to map predictions back to names.
le = LabelEncoder()
y = le.fit(y).transform(y)

############## SPLITTING THE DATA INTO TRAIN(95%) AND TEST(5%)

In [15]:
# Stratified 95/5 split so every emotion keeps its share in both partitions.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    stratify=y,
    random_state=9,
)

# Report (train rows, test rows) and the width of the feature vector.
print((len(x_train), len(x_test)))
print(f'Features extracted: {x_train.shape[1]}')

# Downstream cells index and reshape plain arrays, not DataFrames.
x_train, x_test = x_train.to_numpy(), x_test.to_numpy()


(1345, 71)
Features extracted: 58


############ Creating a data structure with 58 timesteps (one per extracted feature) and 1 value per timestep

In [16]:
# Give every sample a trailing axis of length 1 so it becomes an
# (n_features, 1) sequence for the LSTM. Sizes are taken from the arrays
# themselves, so this works for any number of samples or features.
X_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)
Y_train = np.array(y_train)

X_test = x_test.reshape(x_test.shape[0], x_test.shape[1], 1)
Y_test = np.array(y_test)

In [17]:
# Reshaping: make sure both inputs are (samples, timesteps, 1).
X_train = np.reshape(np.array(X_train), (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(np.array(X_test), (X_test.shape[0], X_test.shape[1], 1))

# Fit the encoder on the training labels only and reuse it for the test labels,
# so both splits share one class -> index mapping.
lb = LabelEncoder()
lb.fit(Y_train)
# Pin the one-hot width to the number of training classes; otherwise a split that
# lacks the highest class id would get a narrower matrix than the other one.
num_classes = len(lb.classes_)
Y_train = np_utils.to_categorical(lb.transform(Y_train), num_classes=num_classes)
Y_test = np_utils.to_categorical(lb.transform(Y_test), num_classes=num_classes)

#################### LSTM RNN  ###########################

In [18]:
print('Build LSTM RNN model ...')

# Dropout settings shared by every recurrent layer.
lstm_dropout = dict(dropout=0.05, recurrent_dropout=0.20)
sequence_length = X_train.shape[1]

model = Sequential()
# Stacked LSTMs: all but the last return the full sequence for the next layer.
model.add(LSTM(units=452, return_sequences=True, input_shape=(sequence_length, 1), **lstm_dropout))
model.add(LSTM(units=250, return_sequences=True, **lstm_dropout))
model.add(LSTM(units=250, return_sequences=True, **lstm_dropout))
model.add(LSTM(units=120, return_sequences=False, **lstm_dropout))
# Classification head: a hidden dense layer, then one softmax unit per emotion.
model.add(Dense(150, activation='relu'))
model.add(Dense(8, activation='softmax'))

model.compile(optimizer='Adamax', loss='categorical_crossentropy', metrics=['acc'])
model.summary()


Build LSTM RNN model ...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 58, 452)           820832    
_________________________________________________________________
lstm_5 (LSTM)                (None, 58, 250)           703000    
_________________________________________________________________
lstm_6 (LSTM)                (None, 58, 250)           501000    
_________________________________________________________________
lstm_7 (LSTM)                (None, 120)               178080    
_________________________________________________________________
dense_2 (Dense)              (None, 150)               18150     
_________________________________________________________________
dense_3 (Dense)              (None, 8)                 1208      
Total params: 2,222,270
Trainable params: 2,222,270
Non-trainable params: 0
_________________________________________

####### Training LSTM ###########

In [None]:
# Destination of the checkpoint holding the best weights seen so far.
best_model_file=r"F:\speech_project\Dipali & Sharmila Project\best_model_trained.hdf5"

# Training hyper-parameters.
MAX_PATIENT, MAX_EPOCHS, MAX_BATCH = 12, 60, 32

# Callbacks. EarlyStopping(patience=MAX_PATIENT) was removed, so training always
# runs for MAX_EPOCHS; the learning rate is reduced after MAX_PATIENT stagnant
# epochs and only the weights with the best validation accuracy are kept on disk.
lr_on_plateau = ReduceLROnPlateau(patience=MAX_PATIENT, verbose=2)
keep_best = ModelCheckpoint(
    filepath=best_model_file,
    monitor='val_acc',
    verbose=2,
    save_best_only=True,
)
callback = [lr_on_plateau, keep_best]

print("training started..... please wait.")
# Note: the held-out test split doubles as the validation set here.
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    epochs=MAX_EPOCHS,
    batch_size=MAX_BATCH,
    callbacks=callback,
    verbose=2,
)
print("training finised!")

training started..... please wait.
Train on 1345 samples, validate on 71 samples
Epoch 1/60
 - 29s - loss: 2.0765 - acc: 0.1115 - val_loss: 2.0639 - val_acc: 0.1549

Epoch 00001: val_acc improved from -inf to 0.15493, saving model to F:\speech_project\Dipali & Sharmila Project\best_model_trained.hdf5
Epoch 2/60
 - 23s - loss: 2.0708 - acc: 0.1286 - val_loss: 2.0654 - val_acc: 0.1408

Epoch 00002: val_acc did not improve from 0.15493
Epoch 3/60
 - 23s - loss: 2.0639 - acc: 0.1227 - val_loss: 2.0640 - val_acc: 0.1268

Epoch 00003: val_acc did not improve from 0.15493
Epoch 4/60
 - 22s - loss: 2.0628 - acc: 0.1309 - val_loss: 2.0652 - val_acc: 0.1408

Epoch 00004: val_acc did not improve from 0.15493
Epoch 5/60
 - 22s - loss: 2.0628 - acc: 0.1428 - val_loss: 2.0615 - val_acc: 0.1690

Epoch 00005: val_acc improved from 0.15493 to 0.16901, saving model to F:\speech_project\Dipali & Sharmila Project\best_model_trained.hdf5
Epoch 6/60
 - 24s - loss: 2.0621 - acc: 0.1331 - val_loss: 2.0601 - v

In [None]:
# Arg-max over the softmax outputs gives the predicted class ids. This matches what
# Sequential.predict_classes returned, without relying on that method, which newer
# Keras/TensorFlow releases no longer provide.
y_pred_vot1 = np.argmax(model.predict(X_test), axis=-1)
print("Accuracy=",accuracy_score(y_test, y_pred_vot1))


In [None]:
############# SAVING THE MODEL
import pickle

# NOTE(review): pickling a Keras model is not supported by every Keras version;
# model.save(...) is the documented way to persist one. Pickle is kept here because
# the loading cell reads this file with pickle.load.
# The context manager makes sure the file is flushed and closed after writing.
with open("C:\\Users\\dbda\\Desktop\\project\\LSTM.pkl", 'wb') as model_file:
    pickle.dump(model, model_file)
print("Saved model to disk")


In [None]:
#################### Prediction on unseen data
def load_test_data(file="F:\\speech_project\\UnSeenSet\\03-01-02-01-02-01-18.wav"):
    """Extract the features and filename-derived emotion of one unseen recording.

    Parameters
    ----------
    file : str
        Path of the recording. Its basename must carry the emotion code in the third
        hyphen-separated field. Defaults to the original sample file.

    Returns
    -------
    (x_test, y_test)
        Two one-element lists: the raw (unscaled) feature vector returned by
        ``extract_feature`` and the emotion name looked up in ``emotions``.
    """
    x_test,y_test=[],[]

    # Parse the code from the bare filename, as load_data does; splitting the full
    # path would pick the wrong field whenever a directory name contains a hyphen.
    file_name = os.path.basename(file)
    emotion = emotions[file_name.split("-")[2]]
    feature = extract_feature(file, mfcc=True, chroma=True, spectral_centroid=True,
                              spectral_bandwidth=True, spectral_rolloff=True,
                              spectral_contrast=True, rms=True, spectral_flatness=True)

    x_test.append(feature)
    y_test.append(emotion)
    print(x_test)
    return x_test,y_test

In [None]:
# Features and expected emotion of the single unseen recording.
unseen_sample = load_test_data()
X_test_data, y_test_data = unseen_sample
print(y_test_data)

########### LOADING SAVED MODEL

In [None]:
import pickle

# NOTE(security): pickle.load can execute arbitrary code stored in the file; only
# load a pickle that this notebook wrote itself.
with open("C:\\Users\\dbda\\Desktop\\project\\LSTM.pkl", 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# The network was trained on inputs shaped (samples, n_features, 1), while
# load_test_data returns a list of flat feature vectors: add the missing axes.
unseen_batch = np.asarray(X_test_data, dtype="float32")
unseen_batch = unseen_batch.reshape(unseen_batch.shape[0], -1, 1)
# NOTE(review): training features were min-max scaled inside load_data but these are
# not; apply the same fitted scaler before predicting for the output to be meaningful.
result = loaded_model.predict(unseen_batch)
print(result)

In [None]:
# `result` holds one row of class probabilities per sample; inverse_transform needs
# integer class ids, so take the most probable class before decoding it to a name.
predicted_ids = np.argmax(result, axis=-1)
print(le.inverse_transform(predicted_ids)[0])