In [None]:
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
import os

Load the audio file as a time series `y` and store its sampling rate as `sr` (measured in Hz).
The default `sr` is 22050 Hz (about 22 kHz).

In [None]:
def getFeatures(path):
    """Extract a fixed-length list of summary features from one audio file.

    The file is loaded with ``librosa.load`` (default sampling rate) and
    reduced to scalars in this order: tempo, sum of the beat frame indices,
    mean STFT chroma, mean RMS energy, mean spectral centroid, mean spectral
    bandwidth, mean spectral roll-off, mean zero-crossing rate, followed by
    the mean of every row returned by ``librosa.feature.mfcc`` (one value
    per MFCC coefficient).

    Parameters
    ----------
    path : str
        Path of an audio file readable by ``librosa.load``.

    Returns
    -------
    list
        Scalar features in the order described above.
    """
    y, sr = librosa.load(path)

    tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
    # Depending on the librosa version the tempo may come back as a
    # 1-element array instead of a plain float; normalise it to a scalar so
    # the feature list stays homogeneous and stacks into a 2-D array.
    tempo = float(np.atleast_1d(tempo)[0])

    # Only the features that end up in the output are computed; the CQT/CENS
    # chroma, mel spectrogram and MFCC delta were computed but never used.
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    rms = librosa.feature.rms(y=y)
    cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)

    features = [
        tempo,
        np.sum(beats),
        np.mean(chroma_stft),
        np.mean(rms),
        np.mean(cent),
        np.mean(spec_bw),
        np.mean(rolloff),
        np.mean(zcr),
    ]
    # One averaged value per MFCC coefficient (row of the MFCC matrix).
    features.extend(np.mean(coefficient) for coefficient in mfcc)
    return features

## Dataset:
1. Our training dataset can be found [here](https://iiitaphyd-my.sharepoint.com/personal/devansh_manu_research_iiit_ac_in/_layouts/15/onedrive.aspx?originalPath=aHR0cHM6Ly9paWl0YXBoeWQtbXkuc2hhcmVwb2ludC5jb20vOmY6L2cvcGVyc29uYWwvZGV2YW5zaF9tYW51X3Jlc2VhcmNoX2lpaXRfYWNfaW4vRWxZZm1zQ3h0Q2xJcjJLSVBqcWEzYUFCdFVva2xha2cwNnBCMDJMMmlKaWVIdz9ydGltZT1UNjRHc1lqQzEwZw&viewid=5db72c70%2D223f%2D4887%2Dbf50%2Dbd0fbf14638f&id=%2Fpersonal%2Fdevansh%5Fmanu%5Fresearch%5Fiiit%5Fac%5Fin%2FDocuments%2FMMT%20%2D%20Hands%20on%20activity%2FEmotion%20Classification)
2. From the above link, we picked up the Arousal and Valence data.
3. From [this](https://onedrive.live.com/?authkey=%21ABJMt2rGTQvCxyM&id=3E1ACB43A24F0BDA%21352&cid=3E1ACB43A24F0BDA) link, we picked up the already-annotated Tension data.
4. Our model was trained on the above-mentioned Arousal, Valence and Tension datasets.
5. Our testing, which you would see at the end of this notebook, is done on the un-annotated data mentioned in this [link](https://onedrive.live.com/?authkey=%21ABJMt2rGTQvCxyM&id=3E1ACB43A24F0BDA%21122&cid=3E1ACB43A24F0BDA)

Get training data-directory names

In [None]:
# Accumulators filled by the extraction cells: one feature vector and one
# label string are appended per training track.
feature_set = []
y_labels = []

# One training directory per emotion dimension, in the order
# Arousal, Valence, Tension.
dir_list = [
    "../emotion_dataset/training/" + emotion + "/"
    for emotion in ("Arousal", "Valence", "Tension")
]

In [None]:
os.getcwd()

Extracting features for Arousal and Valence music tracks

In [None]:
# The first two entries of dir_list are the Arousal ("A") and Valence ("V")
# directories; each contains a Positive/ and a Negative/ sub-directory, giving
# the labels AP, AN, VP and VN.
# NOTE: this cell appends to feature_set / y_labels. Re-run the cell that
# initialises them before re-running this one, otherwise tracks are duplicated.
for emotion_dir, label_prefix in zip(dir_list[:2], ("A", "V")):
    for polarity_dir, label_suffix in (("Positive/", "P"), ("Negative/", "N")):
        track_dir = emotion_dir + polarity_dir
        # os.listdir returns names in arbitrary order; sorting keeps the sample
        # order (and therefore the seeded train/validation split) reproducible.
        for music in sorted(os.listdir(track_dir)):
            currSongFeatures = getFeatures(os.path.join(track_dir, music))
            y_labels.append(label_prefix + label_suffix)
            feature_set.append(currSongFeatures)

This <font color="red">warning</font> will always occur when loading mp3 because libsndfile does not (yet/currently) support the mp3 format. Librosa tries to use libsndfile first, and if that fails, it will fall back on the audioread package, which is a bit slower and more brittle, but supports more formats.

The types of labels gathered so far, i.e. Arousal and Valence (each can be positive or negative):

In [None]:
print(set(y_labels))

In [None]:
# Tension tracks live in a single directory (the last entry of dir_list).
# The label is read from the file name: the first two characters after the
# last underscore, which must be "TP" or "TN".
dirName = dir_list[-1]
# os.listdir returns names in arbitrary order; sorting keeps the sample order
# (and therefore the seeded train/validation split) reproducible.
dirContents = sorted(os.listdir(dirName))
for music in dirContents:
    temp_y_label = music.split("_")[-1][:2]
    if temp_y_label in ("TN", "TP"):
        currSongFeatures = getFeatures(os.path.join(dirName, music))
        feature_set.append(currSongFeatures)
        y_labels.append(temp_y_label)
    else:
        # Make skipped files visible instead of dropping them silently.
        print("Skipping file without a TP/TN tag:", music)

In [None]:
print(len(y_labels))
print(len(feature_set))

In [None]:
print(set(y_labels))

In [None]:
feature_set = np.array(feature_set)

In [None]:
feature_set.shape

In [None]:
y = np.array(y_labels)
y.shape

In [None]:
# Human-readable names for the feature columns: eight summary statistics
# followed by the twenty MFCC means (mfcc1 .. mfcc20).
feature_list = [
    'tempo', 'beats', 'chromagram', 'rmse',
    'centroid', 'bandwidth', 'rolloff', 'zcr',
] + ['mfcc' + str(index) for index in range(1, 21)]

In [None]:
import seaborn as sns

In [None]:
# sns.distplot is deprecated; histplot with kde=True and stat="density" is the
# documented replacement for its histogram + density-curve output.
# Column 0 of feature_set is the tempo estimate.
ax = sns.histplot(feature_set[:, 0], kde=True, stat="density", kde_kws=dict(cut=3))
ax.set(title="Distribution of estimated tempo", xlabel="tempo (BPM)", ylabel="density");

In [None]:
import tensorflow as tf

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.backend import clear_session

## model1:
this uses categorical crossentropy as the loss function

In [None]:
# Reset Keras' global state before building a new model.
clear_session()

# Fully connected classifier: four ReLU hidden layers of growing width and a
# 6-way softmax output. The input width is the number of feature columns.
model1 = Sequential([
    Dense(128, input_shape=(feature_set.shape[1],), activation='relu'),
    Dense(256, activation='relu'),
    Dense(1024, activation='relu'),
    Dense(2048, activation='relu'),
    Dense(6, activation='softmax'),
])

In [None]:
# Stop training once the validation loss has failed to decrease for 3
# consecutive epochs; verbose=1 prints a message when that happens.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)

In [None]:
# Categorical cross-entropy matches the softmax output layer; the Adam
# optimiser is used and accuracy is tracked as a metric.
model1.compile(loss='categorical_crossentropy', 
              optimizer="adam",
              metrics=["accuracy"])

In [None]:
model1.summary()

In [None]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder, normalize
from sklearn.model_selection import train_test_split

In [None]:
# Map the string labels to integer class ids, then expand those ids into a
# one-hot matrix with to_categorical.
encoder = LabelEncoder().fit(y_labels)
encoded_Y = encoder.transform(y_labels)
Y = to_categorical(encoded_Y)

In [None]:
print(Y.shape)

In [None]:
# Hold out 10% of the samples for validation; random_state=42 fixes the shuffle.
x_train, x_val, y_train, y_val = train_test_split(feature_set, Y, test_size=0.1, random_state=42)

In [None]:
# The preceding split produces x_val / y_val. x_test / y_test are only defined
# by a later cell, so referencing them here raised NameError on a fresh kernel.
print("shape of training data is: ", x_train.shape)
print("shape of training label is: ", y_train.shape)
print("shape of validation data is: ", x_val.shape)
print("shape of validation label is: ", y_val.shape)

In [None]:
# Keras documents validation_data as a tuple (x_val, y_val); pass that
# documented form rather than a list.
results = model1.fit(x_train, y_train, epochs=20, callbacks=[es],
                     validation_data=(x_val, y_val), batch_size=32)

In [None]:
# Training vs. validation loss per epoch for model1.
for curve_label, history_key in (('train', 'loss'), ('val', 'val_loss')):
    plt.plot(results.history[history_key], label=curve_label)
plt.legend()


In [None]:
import IPython.display as ipd
# Embed an audio player for one Tension track.
# NOTE(review): this path has no "training/" component, unlike the entries in
# dir_list; confirm the file really lives at this location.
ipd.Audio("../emotion_dataset/Tension/20161058_TN2.mp3")

## model 2:
Here I try adding one extra layer (dense_4, with 1024 units) that is narrower than the layer before it (dense_3, with 2048 units).

In [None]:
# Reset Keras' global state before building a new model.
clear_session()

# Same stack as model1 plus one extra 1024-unit ReLU layer placed between the
# 2048-unit layer and the 6-way softmax output.
model2 = Sequential([
    Dense(128, input_shape=(feature_set.shape[1],), activation='relu'),
    Dense(256, activation='relu'),
    Dense(1024, activation='relu'),
    Dense(2048, activation='relu'),
    Dense(1024, activation='relu'),
    Dense(6, activation='softmax'),
])
model2.summary()

In [None]:
# Early stopping: halt once val_loss has not decreased for 3 consecutive epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
# model2 uses the RMSprop optimiser with categorical cross-entropy loss.
model2.compile(loss='categorical_crossentropy', optimizer="rmsprop", metrics=["accuracy"])

In [None]:
# Re-split with a 20% hold-out (same seed). This rebinds x_train / y_train,
# so the split used here differs from the one model1 was trained on.
x_train, x_test, y_train, y_test = train_test_split(feature_set, Y, test_size=0.2, random_state=42)

In [None]:
# Keras documents validation_data as a tuple (x, y); pass that documented
# form rather than a list.
result2 = model2.fit(x_train, y_train, callbacks=[es], batch_size=16,
                     validation_data=(x_test, y_test), epochs=20)

In [None]:
# Training loss per epoch: model1 vs. model2.
for run, curve_label in ((results, 'train_model1'), (result2, 'train_model2')):
    plt.plot(run.history['loss'], label=curve_label)
plt.legend()

In [None]:
# Held-out (validation) loss per epoch: model1 vs. model2.
for run, curve_label in ((results, 'test_model1'), (result2, 'test_model2')):
    plt.plot(run.history['val_loss'], label=curve_label)
plt.legend()

In [None]:
from tensorflow.keras.layers import InputLayer

In [None]:
# Reset Keras' global state before building a new model.
clear_session()

# Narrower network with mixed activations (relu, tanh, tanh, sigmoid, relu),
# doubling in width from 32 to 512 units before the 6-way softmax output.
model3 = Sequential([
    Dense(32, input_shape=(feature_set.shape[1],), activation='relu'),
    Dense(64, activation='tanh'),
    Dense(128, activation='tanh'),
    Dense(256, activation=tf.nn.sigmoid),
    Dense(512, activation='relu'),
    Dense(6, activation='softmax'),
])

In [None]:
model3.summary()

In [None]:
# Early stopping: halt once val_loss has not decreased for 3 consecutive epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
# model3 uses the RMSprop optimiser with categorical cross-entropy loss.
model3.compile(loss='categorical_crossentropy', optimizer="rmsprop", metrics=["accuracy"])
# Re-split with a 30% hold-out (same seed); this rebinds x_train / x_test / y_train / y_test.
x_train, x_test, y_train, y_test = train_test_split(feature_set, Y, test_size=0.3, random_state=42)

In [None]:
# Keras documents validation_data as a tuple (x, y); pass that documented
# form rather than a list.
result3 = model3.fit(x_train, y_train, callbacks=[es], batch_size=8,
                     validation_data=(x_test, y_test), epochs=20)

In [None]:
# Training loss per epoch for the first three models.
for run, curve_label in (
    (results, 'train_model1'),
    (result2, 'train_model2'),
    (result3, 'train_model3'),
):
    plt.plot(run.history['loss'], label=curve_label)
plt.legend()

In [None]:
# Held-out (validation) loss per epoch for the first three models.
for run, curve_label in (
    (results, 'test_model1'),
    (result2, 'test_model2'),
    (result3, 'test_model3'),
):
    plt.plot(run.history['val_loss'], label=curve_label)
plt.legend()

In [None]:
# Reset Keras' global state before building a new model.
clear_session()

# Same layers as model3 with an additional 2048-unit tanh layer inserted
# before the 6-way softmax output.
model4 = Sequential([
    Dense(32, input_shape=(feature_set.shape[1],), activation='relu'),
    Dense(64, activation='tanh'),
    Dense(128, activation='tanh'),
    Dense(256, activation=tf.nn.sigmoid),
    Dense(512, activation='relu'),
    Dense(2048, activation=tf.keras.activations.tanh),
    Dense(6, activation='softmax'),
])

In [None]:
model4.summary()

In [None]:
# model4 uses the Adam optimiser with categorical cross-entropy loss.
model4.compile(loss='categorical_crossentropy', optimizer="adam", metrics=["accuracy"])
# Re-split with a 25% hold-out (same seed); this rebinds x_train / x_test / y_train / y_test.
x_train, x_test, y_train, y_test = train_test_split(feature_set, Y, test_size=0.25, random_state=42)

In [None]:
# Keras documents validation_data as a tuple (x, y); pass that documented
# form rather than a list.
result4 = model4.fit(x_train, y_train, epochs=20,
                     validation_data=(x_test, y_test), batch_size=8)

In [None]:
# Training loss per epoch: model3 vs. model4.
for run, curve_label in ((result3, 'train3_loss'), (result4, 'train4_loss')):
    plt.plot(run.history['loss'], label=curve_label)
plt.legend()

In [None]:
# Held-out (validation) loss per epoch: model3 vs. model4.
for run, curve_label in ((result3, 'test3_loss'), (result4, 'test4_loss')):
    plt.plot(run.history['val_loss'], label=curve_label)
plt.legend()

In [None]:
# Feature vector of a single track, converted to a NumPy array in one step.
x_1 = np.array(getFeatures("../emotion_dataset/Tension/20161058_TN2.mp3"))
x_1.shape

In [None]:
x1 = x_1.reshape(1,x_1.shape[0])
x1.shape

In [None]:
y1_model3_pred = model3.predict(x1)
y1_model3_pred.shape

In [None]:
# Flatten the prediction to a vector whose length is its last dimension.
n_outputs = y1_model3_pred.shape[-1]
y1_model3_pred = y1_model3_pred.reshape(n_outputs,)
print(y1_model3_pred.shape)

# Take the index of the largest score and map it back to its string label.
best_class = np.argmax(y1_model3_pred)
y1_model3_pred = encoder.inverse_transform([best_class])
print(y1_model3_pred)

# Ground-truth label used for this track.
y1_true = ['TN']

In [None]:
from sklearn.metrics import accuracy_score
# accuracy_score(y_true, y_pred)
accuracy_score(y1_true, y1_model3_pred)