In [None]:
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
import os

Load the audio file as a time series `y` and store its sampling rate as `sr` (measured in Hz).
librosa's default `sr` is 22,050 Hz (about 22 kHz).

In [None]:
def getFeatures(path):
    """Extract a fixed-length feature vector from one audio file.

    The file is loaded with ``librosa.load`` using its default sampling rate
    and summarised as a flat list of numbers, in this order:

    * estimated tempo from ``librosa.beat.beat_track``
    * sum of the beat frame indices returned by ``beat_track``
    * mean of the STFT chromagram, RMS energy, spectral centroid,
      spectral bandwidth, spectral roll-off and zero-crossing rate
    * mean of every MFCC row (20 rows with librosa's default ``n_mfcc``)

    Parameters
    ----------
    path : str
        Path of the audio file to analyse.

    Returns
    -------
    list
        8 scalar features followed by one mean per MFCC coefficient.
    """
    y, sr = librosa.load(path)
    tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    rms = librosa.feature.rms(y=y)
    cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)

    # beat_track may hand tempo back as a 1-element array (recent librosa
    # releases) instead of a scalar; coerce it so every feature row has the
    # same flat shape and can be stacked with np.array().
    tempo = float(np.atleast_1d(tempo)[0])

    # NOTE(review): np.sum(beats) adds up the beat *frame positions*; if a beat
    # count was intended this should be len(beats). Kept as-is so the feature
    # stays comparable with previously extracted data.
    features = [
        tempo,
        np.sum(beats),
        np.mean(chroma_stft),
        np.mean(rms),
        np.mean(cent),
        np.mean(spec_bw),
        np.mean(rolloff),
        np.mean(zcr),
    ]
    # One value per MFCC coefficient: its mean over all frames.
    features.extend(np.mean(coefficient) for coefficient in mfcc)
    return features

## Dataset:
1. Our training dataset can be found [here](https://iiitaphyd-my.sharepoint.com/personal/devansh_manu_research_iiit_ac_in/_layouts/15/onedrive.aspx?originalPath=aHR0cHM6Ly9paWl0YXBoeWQtbXkuc2hhcmVwb2ludC5jb20vOmY6L2cvcGVyc29uYWwvZGV2YW5zaF9tYW51X3Jlc2VhcmNoX2lpaXRfYWNfaW4vRWxZZm1zQ3h0Q2xJcjJLSVBqcWEzYUFCdFVva2xha2cwNnBCMDJMMmlKaWVIdz9ydGltZT1UNjRHc1lqQzEwZw&viewid=5db72c70%2D223f%2D4887%2Dbf50%2Dbd0fbf14638f&id=%2Fpersonal%2Fdevansh%5Fmanu%5Fresearch%5Fiiit%5Fac%5Fin%2FDocuments%2FMMT%20%2D%20Hands%20on%20activity%2FEmotion%20Classification)
2. From the above link, we picked up the Arousal and Valence data.
3. From [this](https://onedrive.live.com/?authkey=%21ABJMt2rGTQvCxyM&id=3E1ACB43A24F0BDA%21352&cid=3E1ACB43A24F0BDA) link, we picked up the already-annotated Tension data.
4. Our model was trained on the above-mentioned Arousal, Valence and Tension datasets.
5. Our testing, which you would see at the end of this notebook, is done on the un-annotated data mentioned in this [link](https://onedrive.live.com/?authkey=%21ABJMt2rGTQvCxyM&id=3E1ACB43A24F0BDA%21122&cid=3E1ACB43A24F0BDA)

Get training data-directory names

In [None]:
# Accumulators filled by the feature-extraction cells below:
# one feature vector and one class label per track.
feature_set = []
y_labels = []

# Training folders, in the order Arousal, Valence, Tension.
training_root = "../emotion_dataset/training/"
dir_list = [training_root + emotion + "/" for emotion in ("Arousal", "Valence", "Tension")]

In [None]:
# Show the current working directory; the dataset paths in dir_list are relative to it.
os.getcwd()

Extracting features for Arousal and Valence music tracks

In [None]:
# Walk the Arousal and Valence folders (the first two entries of dir_list).
# Each one is read through its "Positive/" and "Negative/" sub-folders, and every
# track is labelled with two letters: A/V for the folder, P/N for the polarity.
for emotion_dir, emotion_code in zip(dir_list[:2], ("A", "V")):
    for polarity_dir, polarity_code in (("Positive/", "P"), ("Negative/", "N")):
        track_dir = emotion_dir + polarity_dir
        # os.listdir returns entries in arbitrary order; sort them so the row
        # order of feature_set / y_labels, which later cells slice by position,
        # is the same on every run and every machine.
        for music in sorted(os.listdir(track_dir)):
            feature_set.append(getFeatures(track_dir + music))
            y_labels.append(emotion_code + polarity_code)

This <font color="red">warning</font> will always occur when loading mp3 because libsndfile does not (yet/currently) support the mp3 format. Librosa tries to use libsndfile first, and if that fails, it will fall back on the audioread package, which is a bit slower and more brittle, but supports more formats.

The types of labels gathered until now, i.e. Arousal and Valence (each of which can be positive or negative)

In [None]:
# Distinct class labels collected so far.
print(set(y_labels))

In [None]:
# Tension tracks live in the last folder of dir_list and carry their label in the
# file name: the first two characters of the last "_"-separated chunk.
dirName = dir_list[-1]
# Sorted because os.listdir order is arbitrary and later cells depend on row order.
dirContents = sorted(os.listdir(dirName))
for music in dirContents:
    temp_y_label = music.split("_")[-1][:2]
    # Only tracks annotated as Tension-Negative / Tension-Positive are kept.
    if temp_y_label in ("TN", "TP"):
        feature_set.append(getFeatures(dirName + music))
        y_labels.append(temp_y_label)

In [None]:
# Number of labels and number of feature vectors collected; the two should match.
print(len(y_labels))
print(len(feature_set))

In [None]:
# Distinct class labels after the Tension tracks have been added.
print(set(y_labels))

In [None]:
# Convert the list of per-track feature lists into a NumPy array (re-binds feature_set).
feature_set = np.array(feature_set)

In [None]:
# Shape of the feature array; expected to be (tracks, features per track).
feature_set.shape

In [None]:
# Class labels as a NumPy array, one entry per track.
y = np.array(y_labels)
y.shape

Plotting the distribution of classes in our entire training dataset

In [None]:
# Histogram of how often each class label occurs in the full training set.
plt.hist(y)

We can clearly see that the training dataset is itself heavily biased w.r.t. the classes TN and TP

In [None]:
# Distinct class labels and the number of examples of each.
uniqueValues, occurCount = np.unique(y, return_counts=True)
print("Classes are: ", uniqueValues)
print("Frequency of each class is: ", occurCount)

In [None]:
# Size of the smallest class; used below as the per-class cap when balancing.
numExamplesPerClass = min(occurCount)
numExamplesPerClass

In [None]:
# Distinct labels among the first 81 tracks.
# NOTE(review): 81 is hard-coded — presumably the number of Arousal + Valence tracks; confirm.
print(set(y_labels[:81]))

In [None]:
# Keep only the first 81 labels / feature rows.
# NOTE(review): 81 is hard-coded — presumably the number of Arousal + Valence tracks; confirm.
# Both names are re-bound at the top of the next cell, so this selection is not used when
# the notebook is run top to bottom.
new_y_labels = y_labels[:81]
new_feature_set = feature_set[:81]
print(len(new_y_labels))
print(len(new_feature_set))

In [None]:
# Down-sample every class to at most numExamplesPerClass examples, keeping the
# first ones encountered in y_labels order.
picked_per_class = dict.fromkeys(('TN', 'TP', 'AP', 'AN', 'VN', 'VP'), 0)
new_y_labels = []
new_feature_set = []
for label, track_features in zip(y_labels, feature_set):
    # Labels outside the six known classes are ignored; full classes are skipped.
    if label in picked_per_class and picked_per_class[label] < numExamplesPerClass:
        new_y_labels.append(label)
        new_feature_set.append(track_features)
        picked_per_class[label] += 1

# Expose the per-class totals under the individual counter names.
tn_count, tp_count, vn_count, vp_count, an_count, ap_count = (
    picked_per_class[name] for name in ('TN', 'TP', 'VN', 'VP', 'AN', 'AP')
)

In [None]:
# Top the two Tension classes up to `tension_target` examples each, scanning the
# rows from `first_scanned_row` onwards.
#
# Differences from the previous version of this cell:
#   * the counters start from what is already in new_y_labels, and a row that is
#     already part of new_feature_set is skipped, so running this cell after the
#     class-balancing cell no longer adds the same tracks a second time
#     (duplicates would end up on both sides of the train/validation split);
#   * rows are collected in a plain list instead of calling np.append per row,
#     which copied the whole array on every iteration.
tension_target = 20
# NOTE(review): 81 is presumably the number of Arousal + Valence tracks — confirm.
first_scanned_row = 81

new_feature_set = list(new_feature_set)
tn_count = new_y_labels.count('TN')
tp_count = new_y_labels.count('TP')

for l in range(first_scanned_row, len(y_labels)):
    label = y_labels[l]
    if label not in ('TN', 'TP'):
        continue
    # Skip tracks whose feature vector has already been selected.
    if any(np.array_equal(feature_set[l], kept) for kept in new_feature_set):
        continue
    if label == 'TN' and tn_count < tension_target:
        new_y_labels.append(label)
        new_feature_set.append(feature_set[l])
        tn_count += 1
    elif label == 'TP' and tp_count < tension_target:
        new_y_labels.append(label)
        new_feature_set.append(feature_set[l])
        tp_count += 1

In [None]:
# Number of selected labels and feature rows; the two should match.
print(len(new_y_labels))
print(len(new_feature_set))

In [None]:
# Make sure the selected features are a NumPy array and show its shape.
new_feature_set = np.array(new_feature_set)
new_feature_set.shape

In [None]:
# Histogram of class frequencies in the selected subset.
plt.hist(new_y_labels)

In [None]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder, normalize

In [None]:
# Map the string labels to integer ids, then to one-hot rows for the softmax output.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(new_y_labels)  # fit and transform in one step
Y = to_categorical(encoded_Y)

In [None]:
# Human-readable names for the feature columns, in column order:
# eight summary features followed by the MFCC means mfcc1 .. mfcc20.
feature_list = ['tempo', 'beats', 'chromagram', 'rmse',
                'centroid', 'bandwidth', 'rolloff', 'zcr']
feature_list += ['mfcc%d' % index for index in range(1, 21)]

In [None]:
import seaborn as sns

In [None]:
# One density plot per feature column, laid out on a 7 x 4 grid.
n_features = new_feature_set.shape[1]
fig, axes = plt.subplots(ncols=4, nrows=7, figsize=(16, 21))

for i, ax in zip(range(n_features), axes.flat):
    # sns.distplot is deprecated; kdeplot is the replacement for distplot(hist=False).
    sns.kdeplot(new_feature_set[:, i], ax=ax)
    # Label each panel so the figure can be read on its own.
    ax.set_title(feature_list[i])
fig.tight_layout()
plt.show()

In [None]:
import tensorflow as tf

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.backend import clear_session
from sklearn.model_selection import train_test_split

## Model5: Neural Net
The hidden layers below use ReLU activations (not self-normalising exponential units), followed by a softmax output layer

In [None]:
# Fully connected classifier: eight ReLU layers whose width doubles each time
# (128, 256, ..., 16384), followed by a 6-way softmax.
clear_session()
hidden_widths = [128 * 2 ** power for power in range(8)]

model5 = Sequential()
# The first layer also fixes the input size to the number of feature columns.
model5.add(Dense(hidden_widths[0], input_shape=(new_feature_set.shape[-1],), activation='relu', kernel_initializer=tf.keras.initializers.RandomNormal))
for width in hidden_widths[1:]:
    model5.add(Dense(width, activation='relu', kernel_initializer='normal'))
model5.add(Dense(6, activation='softmax'))

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model5.summary()

In [None]:
# Stop training once the validation loss has not improved for 5 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
# Multi-class setup: categorical cross-entropy on the one-hot labels, Adam optimizer, accuracy metric.
model5.compile(loss='categorical_crossentropy', optimizer='adam', metrics=["acc"])

In [None]:
# Hold out 15% of the selected examples for validation; the fixed seed keeps the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(new_feature_set, Y, test_size=0.15, random_state=12345)

In [None]:
# Train for up to 100 epochs with early stopping on the validation loss.
# validation_data is passed as an (x, y) tuple, the form documented for Model.fit;
# a list is not the documented form and some Keras versions treat it as a multi-input x.
result5 = model5.fit(
    x_train,
    y_train,
    batch_size=16,
    validation_data=(x_val, y_val),
    callbacks=[es],
    epochs=100,
)

## model 5 predictions:

### Get the testing data as mentioned above

In [None]:
# Build the test set from the testing folder. The label is read from the file name:
# the first two characters of the second "_"-separated chunk, kept only when the
# second character is 'P' or 'N'.
testing_directory = "../emotion_dataset/testing/"
# Sorted so the row order of x_test / y_test is reproducible across runs.
dirContents = sorted(os.listdir(testing_directory))
x_test, y_test = [], []
for music in dirContents:
    name_parts = music.split('_')
    # Skip entries that do not follow the "<prefix>_<label>..." naming scheme
    # instead of failing with an IndexError.
    if len(name_parts) < 2 or len(name_parts[1]) < 2:
        continue
    labelled_music = name_parts[1]
    if labelled_music[1] in ('P', 'N'):
        music_features = getFeatures(testing_directory + music)
        x_test.append(music_features)
        y_test.append(labelled_music[:2])

In [None]:
# Stack the test feature vectors into a NumPy array and show its shape.
x_test = np.array(x_test)
x_test.shape

In [None]:
# Predict a class label for every test track with model5.
y_pred = []
if x_test.shape[0]:
    # One batched forward pass instead of one model5.predict call per track.
    class_probabilities = model5.predict(x_test)
    predicted_labels = encoder.inverse_transform(np.argmax(class_probabilities, axis=1))
    # Keep every prediction as a 1-element array: later cells read the label via x[0].
    y_pred = [predicted_labels[i:i + 1] for i in range(len(predicted_labels))]

In [None]:
# Labels parsed from the test file names.
print(y_test)

In [None]:
# Class distribution of the test labels.
plt.hist(y_test)

In [None]:
# Predicted labels as plain values: each y_pred entry is a 1-element array, so take element 0.
print(list(map(lambda x: x[0], y_pred)))

In [None]:
# Class distribution of the predictions on the test set.
plt.hist(list(map(lambda x: x[0], y_pred)))

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test tracks whose predicted label equals the label parsed from the file name.
accuracy_score(y_test, y_pred)

In [None]:
# Predict a class label for every validation example with model5.
y_pred = []
if x_val.shape[0]:
    # One batched forward pass instead of one model5.predict call per example.
    class_probabilities = model5.predict(x_val)
    predicted_labels = encoder.inverse_transform(np.argmax(class_probabilities, axis=1))
    # Keep every prediction as a 1-element array: later cells read the label via x[0].
    y_pred = [predicted_labels[i:i + 1] for i in range(len(predicted_labels))]

In [None]:
# Number of validation predictions.
print(len(y_pred))

In [None]:
# Raw validation predictions (a list of 1-element label arrays).
print(y_pred)

In [None]:
# Turn the one-hot validation rows back into string labels:
# position of the 1 in each row -> class id -> original label.
validation_class_ids = np.argmax(y_val, axis=1)
y_val_labelled = list(encoder.inverse_transform(validation_class_ids))

In [None]:
# Number of decoded validation labels.
print(len(y_val_labelled))

In [None]:
# Class distribution of the true validation labels.
plt.hist(y_val_labelled)

In [None]:
# Class distribution of the predictions on the validation set.
plt.hist(list(map(lambda x: x[0], y_pred)))

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of validation examples whose predicted label equals the decoded true label.
accuracy_score(y_val_labelled, y_pred)

## Gaussian Mixture Model

In [None]:
from sklearn.mixture import GaussianMixture
# Two-component Gaussian mixture.
# NOTE(review): no random_state is set, so the fitted components can differ between runs.
gmm = GaussianMixture(n_components=2)

In [None]:
# Fit the mixture on the training features only; no labels are passed.
result6 = gmm.fit(x_train)

In [None]:
# Component index assigned to each validation example by the mixture model.
# Stored under its own name: re-binding y_val here would replace the one-hot
# validation labels that the evaluation cells further down decode, and their
# "true" labels would all collapse to the first class.
gmm_val_clusters = result6.predict(x_val)
gmm_val_clusters

In [None]:
# Display the array currently bound to y_val.
y_val

## Random Forest Classifier model

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fixed seed so the forest is reproducible; same seed value as the train/validation split.
rf = RandomForestClassifier(n_estimators=100, random_state=12345)
# y_train is one-hot, so the forest is fitted as a multi-output model and
# rf.predict returns one indicator row per sample (decoded with argmax afterwards).
rf.fit(x_train, y_train)

In [None]:
# Predict a class label for every validation example with the random forest.
y_pred = []
if x_val.shape[0]:
    # One batched call instead of one rf.predict call per example.
    indicator_rows = rf.predict(x_val)
    # NOTE(review): a row with no class switched on decodes to class index 0,
    # because argmax of an all-zero row is 0.
    predicted_labels = encoder.inverse_transform(np.argmax(indicator_rows, axis=1))
    # Keep every prediction as a 1-element array, the structure used elsewhere for y_pred.
    y_pred = [predicted_labels[i:i + 1] for i in range(len(predicted_labels))]

In [None]:
# Decode each entry of y_val back to its string label via the fitted encoder.
# NOTE(review): this assumes y_val still holds one-hot rows; if y_val has been re-bound
# to a 1-D array, np.argmax of each scalar is 0 and every entry decodes to the first class.
y_val_labelled = []
for i in y_val:
    t = encoder.inverse_transform([np.argmax(i)])
    y_val_labelled.append(t[0])

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of validation examples where the random-forest label equals the decoded y_val label.
accuracy_score(y_val_labelled, y_pred)

In [None]:
# Class distribution of the decoded validation labels.
plt.hist(y_val_labelled)

In [None]:
# Raw random-forest predictions (a list of 1-element label arrays).
print(y_pred)