################ IMPORTING THE REQUIRED LIBRARIES

In [2]:
import librosa
import soundfile
import os, glob
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing 
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
from keras.utils import np_utils
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout,Flatten,BatchNormalization
from keras.optimizers import SGD
from keras.layers.convolutional import Conv1D, MaxPooling1D

Using TensorFlow backend.


############## EMOTIONS INCLUDED IN THE DATASET

In [3]:
# Lookup from the two-digit emotion code (the third dash-separated field of an
# audio file name) to the emotion label used as the prediction target.
emotions = {
    f'{code:02d}': label
    for code, label in enumerate(
        ('neutral', 'calm', 'happy', 'sad',
         'angry', 'fearful', 'disgust', 'surprised'),
        start=1,
    )
}


############## EXTRACTING FEATURES FROM THE AUDIO SIGNAL USING LIBROSA

In [4]:

def extract_feature(file_name, mfcc, chroma,spectral_centroid,spectral_bandwidth,
                    spectral_rolloff,spectral_contrast,rms,spectral_flatness):
    """Build a 1-D vector of time-averaged audio descriptors for one file.

    Every boolean flag switches one feature family on or off. Enabled families
    are concatenated in the order the flags appear in the signature.

    Parameters
    ----------
    file_name : str
        Path of an audio file that ``soundfile`` can open.
    mfcc : bool
        Append the per-coefficient mean of 40 MFCCs.
    chroma : bool
        Append the per-bin mean of the chromagram computed from the magnitude
        STFT of the signal.
    spectral_centroid, spectral_bandwidth, spectral_rolloff, spectral_contrast, spectral_flatness : bool
        Append the overall mean of the corresponding librosa feature
        (one value per family).
    rms : bool
        Append the mean of the RMS energy taken along the frame axis.

    Returns
    -------
    numpy.ndarray
        1-D float array; empty when every flag is false.
    """
    # Read the samples and release the file handle immediately; nothing below
    # needs the open file.
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

    # Multi-channel files are read as a 2-D (frames, channels) array. librosa
    # treats the last axis as time, so such an array would be misread; average
    # the channels to obtain a mono signal. Mono files are left untouched.
    if X.ndim > 1:
        X = X.mean(axis=1)

    # Seed with an empty float array so the function still returns an (empty)
    # array when all flags are off. Flags are never reassigned, so they keep
    # their boolean meaning throughout.
    parts = [np.array([])]

    if mfcc:
        parts.append(np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0))

    if chroma:
        # The magnitude spectrogram is only needed for the chroma features.
        stft = np.abs(librosa.stft(X))
        parts.append(np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0))

    if spectral_centroid:
        parts.append(np.mean(librosa.feature.spectral_centroid(y=X, sr=sample_rate)))

    if spectral_bandwidth:
        parts.append(np.mean(librosa.feature.spectral_bandwidth(y=X, sr=sample_rate).T))

    if spectral_rolloff:
        parts.append(np.mean(librosa.feature.spectral_rolloff(y=X, sr=sample_rate).T))

    if spectral_contrast:
        parts.append(np.mean(librosa.feature.spectral_contrast(y=X, sr=sample_rate)))

    if rms:
        parts.append(np.mean(librosa.feature.rms(y=X).T, axis=0))

    if spectral_flatness:
        parts.append(np.mean(librosa.feature.spectral_flatness(y=X)))

    # hstack promotes scalars to length-1 arrays, giving one flat vector.
    return np.hstack(parts)


############## LOADING THE DATASET AND EXTRACTING ALL THE FEATURES FROM IT 

In [5]:

def load_data(test_size=0.2, data_glob="F:\\speech_project\\Actor_*\\*.wav"):
    """Extract features for every matching audio file and embed them with t-SNE.

    Parameters
    ----------
    test_size : float, optional
        Unused by this function; kept only so existing callers that pass it
        keep working.
    data_glob : str, optional
        Glob pattern selecting the audio files. Defaults to the location the
        notebook was originally written against.

    Returns
    -------
    tuple
        ``(tsne_features, y)`` where ``tsne_features`` is the t-SNE embedding
        of the min-max scaled feature vectors (one row per file) and ``y`` is
        the list of emotion labels in the same order.

    Raises
    ------
    ValueError
        If ``data_glob`` matches no files.
    KeyError
        If a file name carries an emotion code missing from ``emotions``.
    """
    x, y = [], []
    # Sort the matches: glob order is filesystem dependent, and a stable order
    # keeps the seeded t-SNE embedding reproducible between runs.
    for file in sorted(glob.glob(data_glob)):
        file_name = os.path.basename(file)
        # The emotion code is the third dash-separated field of the file name.
        emotion = emotions[file_name.split("-")[2]]
        feature = extract_feature(file, mfcc=True, chroma=True, spectral_centroid=True,
                                  spectral_bandwidth=True, spectral_rolloff=True,
                                  spectral_contrast=True, rms=True, spectral_flatness=True)
        x.append(feature)
        y.append(emotion)

    if not x:
        # Fail with an actionable message instead of an obscure scaler error.
        raise ValueError(f"No audio files matched {data_glob!r}")

    # Rescale every feature to [0, 1] so that no single feature dominates the
    # distances t-SNE works with.
    X = MinMaxScaler().fit_transform(x)

    tsne = TSNE(learning_rate=200, random_state=2019)
    tsne_features = tsne.fit_transform(X)
    return tsne_features, y

############## SHUFFLING THE OBSERVATIONS AND RESETTING THEIR INDEXES

In [None]:
X, y = load_data()
X = pd.DataFrame(X)
y = pd.DataFrame(y)
# Put features and label side by side so that shuffling keeps each row's label
# attached to its features.
data = pd.concat([X, y], axis="columns")
data = data.sample(frac=1).reset_index(drop=True)
# Split relative to the end of the frame rather than at a fixed column number:
# load_data() returns the t-SNE embedding, not the raw feature vector, so a
# hard-coded index of 58 would leak the label into X and then fail when
# selecting a column 58 that does not exist.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

############## LABEL ENCODING THE RESPONSE VARIABLES

In [None]:
# Learn the label vocabulary first, then swap the labels for integer class ids.
# The fitted encoder stays available as `le` for mapping ids back to labels.
le = preprocessing.LabelEncoder().fit(y)
y = le.transform(y)

############## SPLITTING THE DATA INTO TRAIN(95%) AND TEST(5%)

In [None]:
# Hold out 5% of the observations for testing; the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.05,
)
print((len(x_train), len(x_test)))
print(f'Features extracted: {x_train.shape[1]}')

############## RESHAPING PREDICTORS

In [None]:
# Conv1D input is declared as (steps, 1), so give each observation a trailing
# length-1 axis: (n_samples, n_features) -> (n_samples, n_features, 1).
x_traincnn = np.asarray(x_train)[:, :, np.newaxis]
x_testcnn = np.asarray(x_test)[:, :, np.newaxis]
print(x_traincnn.shape, x_testcnn.shape, sep="\n")

############## LABEL ENCODING THE RESPONSE VARIABLES AND CREATING DUMMIES

In [12]:
# Fit the encoder once on the complete label vector `y` instead of refitting it
# on each split. Refitting on the small test split renumbers the classes when a
# class is absent there, so train and test targets would disagree on which
# column means which emotion (and could even differ in width).
lb = LabelEncoder().fit(y)
# Pass the class count explicitly so both one-hot matrices have one column per
# known class, whatever happens to be present in a given split.
y_train = to_categorical(lb.transform(y_train), num_classes=len(lb.classes_))
y_test = to_categorical(lb.transform(y_test), num_classes=len(lb.classes_))
print(y_train.shape)
print(y_test.shape)

Features extracted: 2
(1311, 8)
(69, 8)


############ 1D CNN #############

In [13]:
# 1-D convolutional classifier: six 'same'-padded Conv1D + ReLU stages with
# batch normalisation, pooling and dropout in between, flattened into an
# 8-way softmax output. Layers are listed in the order they are applied.
model = Sequential([
    Conv1D(332, 7, padding='same', input_shape=(x_traincnn.shape[1], 1)),
    Activation('relu'),
    Conv1D(280, 7, padding='same'),
    Activation('relu'),
    BatchNormalization(),
    MaxPooling1D(pool_size=(2), strides=2),
    Conv1D(305, 5, padding='same'),
    Activation('relu'),
    Dropout(0.1),
    Conv1D(325, 5, padding='same'),
    Activation('relu'),
    MaxPooling1D(pool_size=(1), strides=1),
    Conv1D(201, 5, padding='same'),
    Activation('relu'),
    Conv1D(300, 5, padding='same'),
    Activation('relu'),
    Flatten(),
    Dense(8),
    Activation('softmax'),
])
# Unused alternative optimizer, kept for reference:
# SGD(lr=0.001, momentum=0.0, decay=0.0, nesterov=False)







Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


#################### TRAINING THE MODEL

In [14]:
# Targets are one-hot encoded, hence categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer="adam",
    metrics=['accuracy'],
)
# The held-out test split doubles as the validation set during training.
model.fit(
    x_traincnn,
    y_train,
    epochs=150,
    verbose=2,
    validation_data=(x_testcnn, y_test),
)




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 1311 samples, validate on 69 samples
Epoch 1/150





 - 4s - loss: 1.9937 - acc: 0.2006 - val_loss: 2.0693 - val_acc: 0.1594
Epoch 2/150
 - 3s - loss: 1.9468 - acc: 0.2159 - val_loss: 2.0081 - val_acc: 0.1449
Epoch 3/150
 - 3s - loss: 1.9207 - acc: 0.2182 - val_loss: 2.1065 - val_acc: 0.1304
Epoch 4/150
 - 3s - loss: 1.9125 - acc: 0.2365 - val_loss: 2.1528 - val_acc: 0.1739
Epoch 5/150
 - 3s - loss: 1.8863 - acc: 0.2517 - val_loss: 2.0415 - val_acc: 0.2029
Epoch 6/150
 - 3s - loss: 1.8756 - acc: 0.2639 - val_loss: 2.0559 - val_acc: 0.1594
Epoch 7/150
 - 3s - loss: 1.8713 - acc: 0.2685 - val_loss: 1.9770 - val_acc: 0.1304
Epoch 8/150
 - 3s - loss: 1.8711 - acc: 0.2693 - val_loss: 2.0194 - val_acc: 0.1739
Epoch 9/150
 - 3s - loss: 1.8368 - acc: 0.2784 - val_loss: 2.0733 - val_acc: 0.1739
Epoch 10/150
 - 3s - loss: 1.8212 - acc: 0.2838 - val_loss: 2.1466 - val_acc: 0.1594
Epoch 11/

Epoch 71/150
 - 3s - loss: 1.4206 - acc: 0.4424 - val_loss: 2.1613 - val_acc: 0.2754
Epoch 72/150
 - 3s - loss: 1.4132 - acc: 0.4287 - val_loss: 2.0416 - val_acc: 0.2754
Epoch 73/150
 - 3s - loss: 1.4050 - acc: 0.4310 - val_loss: 1.9253 - val_acc: 0.3768
Epoch 74/150
 - 3s - loss: 1.4122 - acc: 0.4500 - val_loss: 2.0414 - val_acc: 0.3043
Epoch 75/150
 - 3s - loss: 1.3898 - acc: 0.4592 - val_loss: 2.0836 - val_acc: 0.3188
Epoch 76/150
 - 3s - loss: 1.3932 - acc: 0.4333 - val_loss: 2.1376 - val_acc: 0.2899
Epoch 77/150
 - 3s - loss: 1.4026 - acc: 0.4302 - val_loss: 1.9808 - val_acc: 0.3478
Epoch 78/150
 - 3s - loss: 1.3831 - acc: 0.4500 - val_loss: 2.1053 - val_acc: 0.2899
Epoch 79/150
 - 3s - loss: 1.3811 - acc: 0.4630 - val_loss: 1.8913 - val_acc: 0.3768
Epoch 80/150
 - 3s - loss: 1.3699 - acc: 0.4607 - val_loss: 1.8980 - val_acc: 0.3768
Epoch 81/150
 - 3s - loss: 1.3464 - acc: 0.4622 - val_loss: 1.9354 - val_acc: 0.4058
Epoch 82/150
 - 3s - loss: 1.3747 - acc: 0.4676 - val_loss: 1.927

<keras.callbacks.History at 0x52676c8>

#################### PREDICTION ON TEST SET

In [9]:
# Score the trained network on the held-out split, and keep the per-class
# probabilities for every held-out observation in `y_pred_prob`.
loss, acc = model.evaluate(x=x_testcnn, y=y_test, verbose=0)
y_pred_prob = model.predict(x=x_testcnn)

print('Test loss = {:.4f} '.format(loss))
print('Test acc = {:.4f} '.format(acc))

Test loss = 2.2054 
Test acc = 0.3913 


############# SAVING THE MODEL

In [None]:
import pickle

# Write through a context manager so the file is flushed and closed even when
# pickling raises; a bare open() leaves the handle to the garbage collector.
# NOTE(review): whether a Keras model can be pickled depends on the installed
# Keras version - confirm, or switch to the model's own save/load API.
with open("C:\\Users\\dbda\\Desktop\\project\\TSNE_CNN.pkl", 'wb') as model_file:
    pickle.dump(model, model_file)
print("Model Saved!!")

#################### Prediction on unseen data

In [None]:
def load_test_data(file="F:\\speech_project\\UnSeenSet\\03-01-02-01-02-01-18.wav"):
    """Extract the feature vector and emotion label of one unseen audio file.

    Parameters
    ----------
    file : str, optional
        Path of the audio file. Defaults to the sample the notebook was
        originally written against.

    Returns
    -------
    tuple
        ``(x_test, y_test)``: a one-element list holding the raw feature vector
        and a one-element list holding the emotion label parsed from the file
        name.

    Raises
    ------
    KeyError
        If the file name carries an emotion code missing from ``emotions``.
    """
    # Parse the code from the base name, as load_data() does, so a dash in any
    # directory name cannot shift the field index.
    emotion = emotions[os.path.basename(file).split("-")[2]]
    feature = extract_feature(file, mfcc=True, chroma=True, spectral_centroid=True,
                              spectral_bandwidth=True, spectral_rolloff=True,
                              spectral_contrast=True, rms=True, spectral_flatness=True)

    # The previous body went on to min-max scale and t-SNE-embed an undefined
    # name `x` (a NameError) and then discarded the result, so that step is
    # removed; the returned value is unchanged (raw features plus label).
    # NOTE(review): the saved network was trained on the t-SNE embedding of the
    # training set. A scaler/t-SNE fitted on this single sample would not
    # reproduce that mapping, so these raw features do not match the model's
    # input - the projection step needs a redesign before prediction can work.
    x_test, y_test = [feature], [emotion]
    return x_test, y_test

In [None]:
# Load the unseen sample and print the emotion label parsed from its file name.
X_test_data,y_test_data=load_test_data()
print(y_test_data)

############## RESHAPING PREDICTORS

In [None]:
# Give the unseen sample the same trailing length-1 axis as the training data.
X_test_data = np.expand_dims(X_test_data, axis=2)
# Report the shape of the array that was just reshaped (the training array's
# shape was printed here before, which says nothing about this sample).
print(X_test_data.shape)

########### LOADING SAVED MODEL

In [None]:
import pickle

# SECURITY: unpickling runs arbitrary code stored in the file - only load a
# pickle that this notebook wrote itself, never one from an untrusted source.
# The context manager closes the handle as soon as the model is loaded.
with open("C:\\Users\\dbda\\Desktop\\project\\TSNE_CNN.pkl", 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.predict(X_test_data)
print(result)

In [None]:
# `result` holds one row of softmax class probabilities per sample, while the
# label encoder maps integer class ids back to names - so reduce each row to
# the index of its most probable class before decoding.
print(le.inverse_transform(np.argmax(result, axis=1)))