In [1]:
import numpy as np
import pandas as pd
import numpy as np
import os
import sys
import librosa.display
import matplotlib.pyplot as plt
from IPython.display import Audio
import wave
from scipy.io.wavfile import write
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
#import tensorflow as tf
#from tf.keras.models import Sequential
#from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
#from keras.utils import np_utils, to_categorical
#from keras.callbacks import ModelCheckpoint

In [2]:
data = pd.read_csv("../speech_emotion_reco/data/merged_dataset.csv")

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,gender,emotion,path,duration
0,0,male,sad,../speech_emotion_reco/data/savee/JK_sa01.wav,4.511837
1,1,male,sad,../speech_emotion_reco/data/savee/JK_sa15.wav,6.05873
2,2,male,neutral,../speech_emotion_reco/data/savee/DC_n13.wav,2.788889
3,3,male,surprise,../speech_emotion_reco/data/savee/DC_su09.wav,3.433968
4,4,male,neutral,../speech_emotion_reco/data/savee/DC_n07.wav,4.051791


In [4]:
data["emotion"].value_counts()

happy       1924
sad         1923
fear        1923
disgust     1923
angry       1923
neutral     1895
surprise     452
unknown      200
Name: emotion, dtype: int64

In [5]:
def create_waveplot(data, sr, e):
    plt.figure(figsize=(10, 3))
    plt.title('Waveplot for audio with {} emotion'.format(e), size=15)
    librosa.display.waveplot(data, sr=sr)
    plt.show()

In [6]:
def create_spectrogram(data, sr, e):
    # stft function converts the data into short term fourier transform
    X = librosa.stft(data)
    Xdb = librosa.amplitude_to_db(abs(X))
    plt.figure(figsize=(12, 3))
    plt.title('Spectrogram for audio with {} emotion'.format(e), size=15)
    librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='hz')   
    #librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar()

In [1]:
data

NameError: name 'data' is not defined

In [8]:
def noise(data):
    noise_amp = 0.035*np.random.uniform()*np.amax(data)
    data = data + noise_amp*np.random.normal(size=data.shape[0])
    return data

def stretch(data, rate=0.8):
    return librosa.effects.time_stretch(data, rate)

def shift(data):
    shift_range = int(np.random.uniform(low=-5, high = 5)*1000)
    return np.roll(data, shift_range)

def pitch(data, sampling_rate, pitch_factor=0.7):
    return librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)

In [9]:
len(data)

12163

In [6]:
def extract_features(data):
    # ZCR
    result = np.array([])
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    result=np.hstack((result, zcr)) # stacking horizontally

    # Chroma_stft
    stft = np.abs(librosa.stft(data))
    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    result = np.hstack((result, chroma_stft)) # stacking horizontally

    # MFCC
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mfcc)) # stacking horizontally
    
    # Root Mean Square Value
    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
    result = np.hstack((result, rms)) # stacking horizontally

    # MelSpectogram
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mel)) # stacking horizontally
    
    return result

In [7]:
def get_features(path):
    # duration and offset are used to take care of the no audio in start and the ending of each audio files as seen above.
    data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)
    
    # without augmentation
    res1 = extract_features(data)
    result = np.array(res1)
    
    # data with noise
    noise_data = noise(data)
    res2 = extract_features(noise_data)
    result = np.vstack((result, res2)) # stacking vertically
    
    # data with stretching 
    stretch_data = stretch(data)
    res3= extract_features(stretch_data)
    result= np.vstack((result, res3))
    
    
    #pitch
    pitch_data= pitch(data, sample_rate)
    res4= extract_features(pitch_data)
    result= np.vstack((result, res4))
    
    #shift
    shift_data= shift(data)
    res5= extract_features(shift_data)
    result= np.vstack((result, res5))   
    
    
    return result

In [16]:
data_sample = data[0:100]

In [17]:
data_sample

Unnamed: 0.1,Unnamed: 0,gender,emotion,path,duration
0,0,male,sad,../speech_emotion_reco/data/savee/JK_sa01.wav,4.511837
1,1,male,sad,../speech_emotion_reco/data/savee/JK_sa15.wav,6.058730
2,2,male,neutral,../speech_emotion_reco/data/savee/DC_n13.wav,2.788889
3,3,male,surprise,../speech_emotion_reco/data/savee/DC_su09.wav,3.433968
4,4,male,neutral,../speech_emotion_reco/data/savee/DC_n07.wav,4.051791
5,5,male,neutral,../speech_emotion_reco/data/savee/JK_n20.wav,3.328934
6,6,male,neutral,../speech_emotion_reco/data/savee/JK_n08.wav,3.544354
7,7,male,sad,../speech_emotion_reco/data/savee/JE_sa08.wav,4.133288
8,8,male,fear,../speech_emotion_reco/data/savee/JK_f15.wav,5.911973
9,9,male,fear,../speech_emotion_reco/data/savee/JK_f01.wav,4.140454


In [24]:
X, Y = [], []
for path, emotion in zip(data["path"], data["emotion"]):
    feature = get_features(path)
    for ele in feature:
        X.append(ele)
        # appending emotion 3 times as we have made 3 augmentation techniques on each audio file.
        Y.append(emotion)



In [25]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,152,153,154,155,156,157,158,159,160,161
0,0.022190,0.574264,0.483570,0.460849,0.481321,0.555484,0.593383,0.527021,0.487356,0.539324,...,4.877737e-05,1.731863e-05,5.487894e-06,1.681743e-06,1.302462e-06,1.284306e-06,1.304934e-06,1.355323e-06,1.416036e-06,1.307451e-06
1,0.024034,0.600490,0.566040,0.508995,0.517747,0.564668,0.628690,0.570408,0.504077,0.523364,...,7.905381e-04,6.941725e-04,7.065819e-04,7.598123e-04,7.191862e-04,7.346970e-04,7.477590e-04,7.097683e-04,7.210168e-04,7.125463e-04
2,0.026794,0.555865,0.481998,0.469905,0.485150,0.561153,0.599452,0.515703,0.467281,0.533736,...,2.526547e-05,8.615414e-06,3.091155e-06,1.309249e-06,1.101190e-06,1.086457e-06,1.102687e-06,1.146277e-06,1.203981e-06,1.116437e-06
3,0.030215,0.569947,0.537033,0.472541,0.482542,0.531870,0.609241,0.538805,0.472551,0.501115,...,9.257737e-05,2.941957e-05,9.670855e-06,4.210385e-06,2.024705e-06,1.428292e-06,1.410470e-06,1.415407e-06,1.303740e-06,1.116235e-06
4,0.021918,0.572923,0.493467,0.462107,0.478409,0.544882,0.593050,0.524531,0.479771,0.534424,...,4.645133e-05,1.651665e-05,4.865162e-06,1.300631e-06,9.656201e-07,9.702152e-07,1.023277e-06,1.085091e-06,1.059010e-06,8.766840e-07
5,0.018410,0.635455,0.514304,0.476947,0.516035,0.538225,0.572309,0.577087,0.545820,0.521525,...,5.941955e-06,1.704344e-06,4.997832e-07,2.244980e-07,2.392561e-07,2.810812e-07,3.590882e-07,5.512045e-07,8.656837e-07,1.010199e-06
6,0.021358,0.674326,0.573935,0.544495,0.582778,0.596076,0.606969,0.584312,0.551458,0.539451,...,1.077356e-03,1.098892e-03,1.010258e-03,1.076277e-03,1.053284e-03,1.040401e-03,1.102087e-03,1.056235e-03,1.106351e-03,1.080339e-03
7,0.022215,0.615592,0.488605,0.476182,0.524209,0.535569,0.563341,0.556105,0.503203,0.519024,...,2.970176e-06,8.456141e-07,2.989775e-07,1.825489e-07,2.035096e-07,2.394244e-07,3.062109e-07,4.720540e-07,7.480351e-07,8.740788e-07
8,0.024039,0.678372,0.599596,0.484495,0.498956,0.522771,0.551485,0.574064,0.549995,0.485936,...,4.486512e-06,3.085529e-06,1.559661e-06,6.063991e-07,3.045934e-07,3.326508e-07,4.369107e-07,6.566398e-07,8.995153e-07,8.957224e-07
9,0.018528,0.620906,0.506448,0.474249,0.514571,0.531597,0.566020,0.576135,0.545980,0.529531,...,5.863161e-06,1.613714e-06,3.967908e-07,1.068409e-07,9.866380e-08,1.173109e-07,1.585820e-07,2.418386e-07,3.021565e-07,2.446617e-07


In [100]:
pd.DataFrame(Y)[0].value_counts()

happy       5772
sad         5769
fear        5769
disgust     5769
angry       5769
neutral     5685
surprise    1356
unknown      600
Name: 0, dtype: int64

In [26]:
Features = pd.DataFrame(X)
Features['labels'] = Y
Features.to_csv('features1.csv', index=False)
Features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,153,154,155,156,157,158,159,160,161,labels
0,0.02219,0.574264,0.48357,0.460849,0.481321,0.555484,0.593383,0.527021,0.487356,0.539324,...,1.7e-05,5e-06,2e-06,1.302462e-06,1.284306e-06,1e-06,1e-06,1e-06,1.307451e-06,sad
1,0.024034,0.60049,0.56604,0.508995,0.517747,0.564668,0.62869,0.570408,0.504077,0.523364,...,0.000694,0.000707,0.00076,0.0007191862,0.000734697,0.000748,0.00071,0.000721,0.0007125463,sad
2,0.026794,0.555865,0.481998,0.469905,0.48515,0.561153,0.599452,0.515703,0.467281,0.533736,...,9e-06,3e-06,1e-06,1.10119e-06,1.086457e-06,1e-06,1e-06,1e-06,1.116437e-06,sad
3,0.030215,0.569947,0.537033,0.472541,0.482542,0.53187,0.609241,0.538805,0.472551,0.501115,...,2.9e-05,1e-05,4e-06,2.024705e-06,1.428292e-06,1e-06,1e-06,1e-06,1.116235e-06,sad
4,0.021918,0.572923,0.493467,0.462107,0.478409,0.544882,0.59305,0.524531,0.479771,0.534424,...,1.7e-05,5e-06,1e-06,9.656201e-07,9.702152e-07,1e-06,1e-06,1e-06,8.76684e-07,sad


In [None]:
#drop unknown et surprise
Features=Features[Features.labels != 'unknown']
Features=Features[Features.labels != 'surprise']

In [102]:
X = Features.iloc[: ,:-1].values
Y = Features['labels'].values

In [103]:
# As this is a multiclass classification problem onehotencoding our Y.
encoder = OneHotEncoder()
Y = encoder.fit_transform(np.array(Y).reshape(-1,1)).toarray()

In [104]:
# splitting data
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=0, shuffle=True)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((27366, 162), (27366, 8), (9123, 162), (9123, 8))

In [105]:
# scaling our data with sklearn's Standard scaler
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((27366, 162), (27366, 8), (9123, 162), (9123, 8))

In [106]:
# making our data compatible to model.
#x_train = np.expand_dims(x_train, axis=2)
#x_test = np.expand_dims(x_test, axis=2)
#x_train.shape, y_train.shape, x_test.shape, y_test.shape

In [107]:
x_train

array([[ 1.78867448,  1.5145362 ,  1.44875584, ..., -0.17566308,
        -0.1617455 , -0.15360386],
       [-1.27257576,  0.37301025, -0.65820539, ..., -0.22554736,
        -0.21287732, -0.20400592],
       [ 2.07835725,  0.61000927,  0.2370615 , ..., -0.2122487 ,
        -0.20791637, -0.20245501],
       ...,
       [ 0.27679   ,  0.25155924,  0.44317351, ..., -0.18887466,
        -0.17612799, -0.16585193],
       [-0.51122219,  0.54016688,  0.63991329, ..., -0.22563789,
        -0.21301984, -0.20417274],
       [-0.01761639,  0.90060402,  0.54478415, ..., -0.22556401,
        -0.21299148, -0.20417192]])

In [113]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

model = RandomForestClassifier()

cv = cross_validate(model, x_train, y_train, cv = 5)

In [114]:
cv["test_score"].mean()

0.32427127398119615