# MFCC-based audio sentiment classification

Dataset is available at http://www.emodb.bilderbar.info/navi.html

The files and their 'Recognised (erkannt)' and 'Naturally (natuerlich)' attributes are in **erkennung.txt**

The datafiles of the audio contains the labels for each audio. 
The key for the labels is:
        1. W -> Trouble
		2. L -> Boredom
		3. E -> Disgust
		4. A -> Angst
		5. F -> Joy
		6. T -> Mourning
		7. N -> Neutral 

Exported the **erkennung.txt** to **erkennung.csv** using Numbers(Excel) export.

In [2]:
import pandas as pd #to manage the dataset

erkennung = '/Users/orionis/Computers/Audio Analysis/erkennung.csv'

df = pd.read_csv(erkennung, index_col=0)
df.head()

Unnamed: 0_level_0,Sentence,Recognised,Naturally
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,03a01Ab.wav,6500,3500
2,03a01Eb.wav,6000,3500
3,03a01Fa.wav,9000,7500
4,03a01Lb.wav,7000,3500
5,03a01Nc.wav,10000,9500


## Cleaning dataset

### 1. Removing file entries with no audio files
Using the `os` module will get the list of audio files, then remove the entries for which there is no audio file

In [3]:
import os
import pandas as pd

In [4]:
#list of audio files
files_loc = '/Users/orionis/Computers/Audio Analysis/download/wav'
all_audios = os.listdir(files_loc)

#making Sentence the index column, so that dropping unneccesary rows becomes easy
df = pd.read_csv(erkennung, index_col=1)
#list of all entries
all_entries = list(df.index)

In [5]:
remove_audios = []

for audio in all_entries:
    if audio not in all_audios:
        remove_audios.append(audio)
#list of unneccesary entries
# remove_audios

In [6]:
#removing the unneccesary entries from erkennung.csv
df.drop(remove_audios, inplace=True)

df.head()

Unnamed: 0_level_0,Number,Recognised,Naturally
Sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
03a01Fa.wav,3,9000,7500
03a01Nc.wav,5,10000,9500
03a01Wa.wav,7,9500,9000
08a01Ab.wav,9,8500,5500
08a01Fd.wav,11,9000,5000


We will have to drop the column `Number` because after removing unneccesary entries, it's value has become jumbled

In [7]:
#axis=1 signifies that we have to delete a whole column
df.drop('Number', axis=1, inplace=True)

#writing this to a csv
df.to_csv('clean_audio.csv')

df.head()

Unnamed: 0_level_0,Recognised,Naturally
Sentence,Unnamed: 1_level_1,Unnamed: 2_level_1
03a01Fa.wav,9000,7500
03a01Nc.wav,10000,9500
03a01Wa.wav,9500,9000
08a01Ab.wav,8500,5500
08a01Fd.wav,9000,5000


### 2. Replacing commas with periods in the percentage columns for typecasting to integers


In [8]:
df = pd.read_csv('clean_audio.csv')
df.head()

Unnamed: 0,Sentence,Recognised,Naturally
0,03a01Fa.wav,9000,7500
1,03a01Nc.wav,10000,9500
2,03a01Wa.wav,9500,9000
3,08a01Ab.wav,8500,5500
4,08a01Fd.wav,9000,5000


In [9]:
#lambda function helps in applying a particular operation to every row of the dataframe
df.Recognised = df.Recognised.apply(lambda x: x.replace(',', '.'))
df.Naturally = df.Naturally.apply(lambda x: x.replace(',', '.'))

#writing this to a csv
df.to_csv('clean_audio.csv')
df.head()

Unnamed: 0,Sentence,Recognised,Naturally
0,03a01Fa.wav,90.0,75.0
1,03a01Nc.wav,100.0,95.0
2,03a01Wa.wav,95.0,90.0
3,08a01Ab.wav,85.0,55.0
4,08a01Fd.wav,90.0,50.0


## Adding labels

In [10]:
df = pd.read_csv('clean_audio.csv', index_col=0)

#In the audio file name, the fifth character is denoting the label of that audio file
df['Emotion'] = df.Sentence.apply(lambda x: x[5])

#write this to a csv
df.to_csv('labelled_audio.csv')
df.head()

Unnamed: 0,Sentence,Recognised,Naturally,Emotion
0,03a01Fa.wav,90.0,75.0,F
1,03a01Nc.wav,100.0,95.0,N
2,03a01Wa.wav,95.0,90.0,W
3,08a01Ab.wav,85.0,55.0,A
4,08a01Fd.wav,90.0,50.0,F


## Extracting MFCC

In [11]:
import os
import librosa #to extract MFCC
import pandas as pd
from datetime import datetime #to time the extraction time
import sklearn.preprocessing as sp #for preprocessing the MFCC

In [12]:
files_loc = '/Users/orionis/Computers/Audio Analysis/download/wav/'
all_audios = os.listdir(files_loc)

In [13]:
'''Accepts a file, extracts its MFCC, creates a dictionary. Then
it makes a dataframe from that dictionary and adds a column with filename
to merge on it. Return a dataframe'''
def extract_mfcc(file):
    file_loc = files_loc + file
    data, sample_rate = librosa.load(file_loc)
    mfccs = librosa.feature.mfcc(data,sr=sample_rate)
    #Scaling MFCC features
    mfccs = sp.scale(mfccs, axis=1)
    #creating a dictionary of mfccs with key being the coefficients and the value of that coefficient for each frame being the value
    mfcc_dict = {}
    
    for counter in range(len(mfccs)):
        mfcc_dict['M_{}'.format(counter)] = mfccs[counter]
    
    #print(mfcc_dict)
        
    df = pd.DataFrame(mfcc_dict)
    #Creating a new column for each dataframe so that we can merge on this 
    df['Sentence'] = file
    return df

In [14]:
dataframes = [] #list to hold all the dataframes returned by extract_mfcc

start = datetime.now()

for file in all_audios:
    dataframes.append(extract_mfcc(file))

print("Time taken:", datetime.now()-start)


Time taken: 0:00:27.538122


In [15]:
df = pd.read_csv('labelled_audio.csv', index_col=0)

#combine all the dataframes in the list into a single dataframe

combined_mfcc = pd.concat(dataframes)

#write this to csv
combined_mfcc.to_csv('combined_mfcc.csv')
combined_mfcc.head()

Unnamed: 0,M_0,M_1,M_2,M_3,M_4,M_5,M_6,M_7,M_8,M_9,...,M_11,M_12,M_13,M_14,M_15,M_16,M_17,M_18,M_19,Sentence
0,-2.624497,-0.739436,1.971092,-1.655031,1.390185,-0.720173,0.923796,0.738661,1.215384,1.192241,...,0.998111,-0.139205,1.271257,0.805803,1.493068,0.784684,1.370507,1.144447,0.201124,16a02Lb.wav
1,-2.701389,-0.899317,1.983438,-1.609951,1.464169,-0.628228,0.933995,0.72045,1.219367,1.368975,...,1.32403,0.250345,1.209975,0.541998,1.66321,0.895256,1.217031,1.211316,0.375881,16a02Lb.wav
2,-2.867961,-1.41871,1.772862,-1.562292,1.7591,-0.158442,1.215917,0.971181,1.226372,1.314711,...,1.771689,1.042719,1.530129,0.405677,1.474771,0.788186,1.152959,1.577291,0.8566,16a02Lb.wav
3,-2.276688,-1.296687,1.018589,-1.166343,1.277585,-0.256387,1.152007,-0.100632,1.67037,1.672466,...,1.89619,1.501971,1.814007,0.13126,0.971504,0.901921,1.198477,1.075405,1.526938,16a02Lb.wav
4,-0.590484,-0.281102,0.118992,-0.014005,0.203977,-0.233153,0.218712,-0.88199,1.943134,0.962821,...,0.688567,1.710943,2.342106,0.269121,1.24978,0.772795,0.390299,-0.791247,1.186679,16a02Lb.wav


Merge this `combined_mfcc` dataframe with the `labelled_audio` dataframe

In [16]:
mfcc_dataframe = pd.merge(df, combined_mfcc, on='Sentence')

mfcc_dataframe.to_csv('mfcc_dataframe.csv')
mfcc_dataframe.head()

Unnamed: 0,Sentence,Recognised,Naturally,Emotion,M_0,M_1,M_2,M_3,M_4,M_5,...,M_10,M_11,M_12,M_13,M_14,M_15,M_16,M_17,M_18,M_19
0,03a01Fa.wav,90.0,75.0,F,-2.318348,-1.514336,1.301276,-0.473541,1.265903,-0.151309,...,-0.140783,2.252262,0.574939,1.61232,0.805134,1.759847,0.477768,0.079236,0.097404,0.769596
1,03a01Fa.wav,90.0,75.0,F,-2.431766,-1.385458,1.658972,-0.689407,1.26895,-0.395731,...,0.04899,1.894129,0.361678,1.427029,0.225025,1.240077,0.447766,0.293423,0.23232,0.813908
2,03a01Fa.wav,90.0,75.0,F,-2.473676,-1.456407,1.788837,-0.710647,1.225802,-0.192081,...,0.394599,1.744057,0.256377,1.352112,-0.300278,1.000595,0.527247,0.566962,0.362979,0.778115
3,03a01Fa.wav,90.0,75.0,F,-2.454754,-1.50207,1.711363,-0.643471,1.146031,-0.204295,...,0.465475,1.811752,0.315559,1.421318,-0.361594,1.134843,0.667627,0.691369,0.384714,0.517091
4,03a01Fa.wav,90.0,75.0,F,-0.464091,-0.397371,0.226352,0.052778,0.298242,0.068719,...,0.012065,0.640038,1.832713,1.418862,-0.466209,-0.721211,0.14432,1.744628,1.457505,0.284002


## Label Encode
Any machine learning algorithm requires a numerical value in the labels, that is why we have to encode the labels to some integer.
Typically we also perform one-hot encoding of labels but that is detrimental to the working of a tree-based machine learning algorithm. 
Thus, only label encoding would be applied here.

In [17]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder #to perform the encoding

df = pd.read_csv('mfcc_dataframe.csv', index_col=0)

label = LabelEncoder()

#create a new column containing these encoded labels
df['Emotion_encoded'] = label.fit_transform(df.Emotion)

df.to_csv('mfcc_df_encoded.csv')
df.head()

Unnamed: 0,Sentence,Recognised,Naturally,Emotion,M_0,M_1,M_2,M_3,M_4,M_5,...,M_11,M_12,M_13,M_14,M_15,M_16,M_17,M_18,M_19,Emotion_encoded
0,03a01Fa.wav,90.0,75.0,F,-2.318348,-1.514336,1.301276,-0.473541,1.265903,-0.151309,...,2.252262,0.574939,1.61232,0.805134,1.759847,0.477768,0.079236,0.097404,0.769596,2
1,03a01Fa.wav,90.0,75.0,F,-2.431766,-1.385458,1.658972,-0.689407,1.26895,-0.395731,...,1.894129,0.361678,1.427029,0.225025,1.240077,0.447766,0.293423,0.23232,0.813908,2
2,03a01Fa.wav,90.0,75.0,F,-2.473676,-1.456407,1.788837,-0.710647,1.225802,-0.192081,...,1.744057,0.256377,1.352112,-0.300278,1.000595,0.527247,0.566962,0.362979,0.778115,2
3,03a01Fa.wav,90.0,75.0,F,-2.454754,-1.50207,1.711363,-0.643471,1.146031,-0.204295,...,1.811752,0.315559,1.421318,-0.361594,1.134843,0.667627,0.691369,0.384714,0.517091,2
4,03a01Fa.wav,90.0,75.0,F,-0.464091,-0.397371,0.226352,0.052778,0.298242,0.068719,...,0.640038,1.832713,1.418862,-0.466209,-0.721211,0.14432,1.744628,1.457505,0.284002,2


## Applying RandomForest

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from datetime import datetime

In [19]:
df = pd.read_csv('mfcc_df_encoded.csv', index_col=0)
df.shape

(64316, 25)

In [25]:
labels = df.Emotion_encoded.values

#All MFCC coefficients
features_9 = df.iloc[:, 5:len(df.columns)-11]
features_all = df.iloc[:, 4:len(df.columns)-1]

feature_names = list(features_9.columns)
all_features = list(features_all.columns)
features_9 = features_9.values
features_all = features_all.values

print(feature_names, all_features)

['M_1', 'M_2', 'M_3', 'M_4', 'M_5', 'M_6', 'M_7', 'M_8', 'M_9'] ['M_0', 'M_1', 'M_2', 'M_3', 'M_4', 'M_5', 'M_6', 'M_7', 'M_8', 'M_9', 'M_10', 'M_11', 'M_12', 'M_13', 'M_14', 'M_15', 'M_16', 'M_17', 'M_18', 'M_19']


### Splitting the dataset into testing and training sections

In [26]:
#setting aside 20% of the data for testing
features_9_train, features_9_test, labels_9_train, labels_9_test = train_test_split(features_9,
                                                                           labels,
                                                                           test_size=0.2)
features_all_train, features_all_test, labels_all_train, labels_all_test = train_test_split(features_all,
                                                                           labels,
                                                                           test_size=0.2)


### Establishing the base prediction

In [27]:
#Computing the maximum frequency of a label
max_freq = np.bincount(labels_train).argmax()

#our baseline, frequency-based prediction
prediction = [max_freq for _ in range((len(labels_test)))]
prediction = np.array(prediction)

#calculating accuracy based on this prediction
accuracy = accuracy_score(labels_test, prediction)
print("Baseline accuracy:", accuracy)

Baseline accuracy: 0.22496890547263682


## Testing with only 1-9 MFCC coefficients

In [28]:
start = datetime.now()

rf = RandomForestClassifier(n_estimators=1000)
rf.fit(features_9_train, labels_9_train)
labels_pred = rf.predict(features_9_test)
accuracy = accuracy_score(labels_9_test, labels_pred)
print("Accuracy:", accuracy)
print("Time taken:", datetime.now()-start)

Accuracy: 0.44029850746268656
Time taken: 0:02:57.841831


## Testing with all MFCC coefficients

In [29]:
start = datetime.now()

rf = RandomForestClassifier(n_estimators=1000)
rf.fit(features_all_train, labels_all_train)
labels_pred = rf.predict(features_all_test)
accuracy = accuracy_score(labels_all_test, labels_pred)
print("Accuracy:", accuracy)
print("Time taken:", datetime.now()-start)

Accuracy: 0.5353700248756219
Time taken: 0:03:41.807175


## Applying SVM

In [84]:
from sklearn.svm import SVC

In [88]:
startTime = datetime.now()
svclassifier = SVC(kernel = 'poly', decision_function_shape = 'ovr')
svclassifier.fit(features_train, labels_train)
labels_pred = svclassifier.predict(features_test)
accuracy = accuracy_score(labels_test, labels_pred)
print("Accuracy:", accuracy)
print("Time taken:", datetime.now()-startTime)

Accuracy: 0.4010416666666667
Time taken: 0:02:53.298332
