In [1]:
# Required at the top of every notebook: switch the working directory to the
# parent folder, so the relative paths used below ('data/...') and the 'src'
# package import are resolved from there.
# NOTE(review): ".." is resolved against the current working directory, so
# re-running this cell in the same kernel moves up one more level each time.
import os
import warnings
os.chdir(os.path.abspath(".."))
# Silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from src.emotion_translation import ekman_emotion

In [2]:
import keras
from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization

### Input Data

In [3]:
# Load the openSMILE feature table from CSV.
features_csv = 'data/FEATURES/OPENSMILE/opensmile_features.csv'
df_input = pd.read_csv(features_csv)

### Target Data

In [4]:
# Load the JSON that holds, for each audio file, its intervals and targets.
with open('data/targets_mean_vote.json', 'r') as targets_file:
    targets_mean_vote = json.load(targets_file)

In [5]:
# Main loop: flatten the per-audio intervals and targets into a single
# DataFrame with the columns Time, Target and Audio_Name (one row per interval).
# The per-audio frames are collected in a list and concatenated once; growing
# df_ranges with pd.concat inside the loop re-copies all earlier rows on every
# iteration.
range_frames = []

for _key, _entry in targets_mean_vote.items():
    print(_key)
    # zip() pairs each interval with its target and stops at the shorter list.
    rows = [[segment, target] for segment, target in zip(_entry['rangos'], _entry['targets'])]
    df = pd.DataFrame(rows, columns = ['Time','Target'])
    df['Audio_Name'] = _key
    range_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame that still
# has the expected columns when the JSON contains no audio entries.
if range_frames:
    df_ranges = pd.concat(range_frames, ignore_index = True)
else:
    df_ranges = pd.DataFrame(columns = ['Time','Target','Audio_Name'])

MSP-Conversation_0021.wav
MSP-Conversation_0023.wav
MSP-Conversation_0035.wav
MSP-Conversation_0043.wav
MSP-Conversation_0046.wav
MSP-Conversation_0047.wav
MSP-Conversation_0053.wav
MSP-Conversation_0054.wav
MSP-Conversation_0055.wav
MSP-Conversation_0061.wav
MSP-Conversation_0067.wav
MSP-Conversation_0079.wav
MSP-Conversation_0081.wav
MSP-Conversation_0083.wav
MSP-Conversation_0087.wav
MSP-Conversation_0088.wav
MSP-Conversation_0094.wav
MSP-Conversation_0101.wav
MSP-Conversation_0103.wav
MSP-Conversation_0110.wav
MSP-Conversation_0114.wav
MSP-Conversation_0125.wav
MSP-Conversation_0130.wav
MSP-Conversation_0135.wav
MSP-Conversation_0140.wav
MSP-Conversation_0147.wav
MSP-Conversation_0153.wav
MSP-Conversation_0156.wav
MSP-Conversation_0160.wav
MSP-Conversation_0166.wav
MSP-Conversation_0167.wav
MSP-Conversation_0172.wav
MSP-Conversation_0180.wav
MSP-Conversation_0183.wav
MSP-Conversation_0184.wav
MSP-Conversation_0190.wav
MSP-Conversation_0197.wav
MSP-Conversation_0201.wav
MSP-Conversa

In [6]:
# Join features and targets to obtain the final dataset.
# The interval in 'Time' is cast to str before joining.
# NOTE(review): presumably because df_input, read from CSV, stores 'Time' as
# text — confirm both sides use the same string format.
df_ranges['Time'] = df_ranges['Time'].astype(str)
merge_keys = ['Time','Audio_Name']
df_final = df_input.merge(df_ranges, how = 'inner', on = merge_keys)

In [7]:
# Preview the first rows of the merged dataset.
df_final.head(n = 5)

Unnamed: 0,audspec_lengthL1norm_sma_range,audspec_lengthL1norm_sma_maxPos,audspec_lengthL1norm_sma_minPos,audspec_lengthL1norm_sma_quartile1,audspec_lengthL1norm_sma_quartile2,audspec_lengthL1norm_sma_quartile3,audspec_lengthL1norm_sma_iqr1-2,audspec_lengthL1norm_sma_iqr2-3,audspec_lengthL1norm_sma_iqr1-3,audspec_lengthL1norm_sma_percentile1.0,...,mfcc_sma_de[14]_peakMeanMeanDist,mfcc_sma_de[14]_peakMeanRel,mfcc_sma_de[14]_minRangeRel,mfcc_sma_de[14]_meanRisingSlope,mfcc_sma_de[14]_stddevRisingSlope,mfcc_sma_de[14]_meanFallingSlope,mfcc_sma_de[14]_stddevFallingSlope,Time,Audio_Name,Target
0,0.551549,0.015873,0.539683,0.167293,0.317384,0.509439,0.15009,0.192055,0.342145,0.147738,...,1.749765,7.762097,0.592759,97.517975,37.66875,68.26676,50.541595,"[1206.6, 1207.3]",MSP-Conversation_2277.wav,"[0.08579601990049782, 29.539046434494196, 42.2..."
1,0.690996,0.046512,0.55814,0.325169,0.588467,0.697907,0.263298,0.109439,0.372738,0.193365,...,2.067588,-4.847102,0.526376,106.3773,39.065525,86.45745,54.577053,"[1206.1, 1206.6]",MSP-Conversation_2277.wav,"[-3.2808762254901955, 30.46595588235294, 43.38..."
2,0.432872,0.769231,0.0,0.264284,0.374606,0.567551,0.110321,0.192945,0.303266,0.176463,...,-0.469664,0.586188,0.0,93.73682,48.426956,75.64173,0.0,"[1205.9, 1206.1]",MSP-Conversation_2277.wav,"[-1.9068840579710133, 34.25552173913043, 42.65..."
3,1.237264,0.329519,0.881007,0.278488,0.487534,0.739226,0.209047,0.251691,0.460738,0.04996,...,2.011584,-19.782593,0.431766,93.567024,49.33782,88.56083,57.653084,"[1201.06, 1205.5]",MSP-Conversation_2277.wav,"[-5.735706099058535, 39.18666189111748, 48.285..."
4,0.077799,0.0,0.666667,0.368016,0.38575,0.406915,0.017735,0.021165,0.038899,0.35099,...,0.674784,-0.0,0.0,0.0,0.0,-0.146716,0.0,"[1200.32, 1200.42]",MSP-Conversation_2277.wav,"[-0.5096428571428557, 37.87074404761905, 44.18..."


### Remover Duplicados

In [8]:
# Keep a single row (the first one) per (Time, Audio_Name) pair.
# Passing both columns as the subset compares the pair directly, so no
# temporary concatenated key column has to be added and dropped again, and two
# different pairs can never collapse into the same concatenated string.
df_final = df_final.drop_duplicates(subset = ['Time', 'Audio_Name'])

### Votación

In [9]:
# Translate each row's Target into an emotion label by passing its first three
# components to ekman_emotion.
ekman_labels = []
for target in df_final['Target']:
    ekman_labels.append(ekman_emotion(target[0], target[1], target[2]))
df_final['Ekman'] = ekman_labels

### Sacar nulos

In [10]:
# Count the missing values in each column.
df_final.isnull().sum()

audspec_lengthL1norm_sma_range         85
audspec_lengthL1norm_sma_maxPos        85
audspec_lengthL1norm_sma_minPos        85
audspec_lengthL1norm_sma_quartile1     85
audspec_lengthL1norm_sma_quartile2     85
                                     ... 
mfcc_sma_de[14]_stddevFallingSlope     85
Time                                    0
Audio_Name                              0
Target                                  0
Ekman                                 760
Length: 6377, dtype: int64

In [11]:
# Keep only the rows that received an emotion label.
df_final = df_final[df_final['Ekman'].notna()]

In [12]:
# Keep only the rows whose first feature column is present.
# NOTE(review): only this one feature column is checked; it stands in for the
# whole feature set.
df_final = df_final[df_final['audspec_lengthL1norm_sma_range'].notna()]

In [13]:
# Re-count the missing values per column after filtering.
df_final.isnull().sum()

audspec_lengthL1norm_sma_range        0
audspec_lengthL1norm_sma_maxPos       0
audspec_lengthL1norm_sma_minPos       0
audspec_lengthL1norm_sma_quartile1    0
audspec_lengthL1norm_sma_quartile2    0
                                     ..
mfcc_sma_de[14]_stddevFallingSlope    0
Time                                  0
Audio_Name                            0
Target                                0
Ekman                                 0
Length: 6377, dtype: int64

### Preprocesamiento

#### Distribución de la data

In [20]:
# Class distribution: non-null count of every column for each emotion label.
emotion_counts = df_final.groupby('Ekman').count()
emotion_counts.reset_index()

Unnamed: 0,Ekman,audspec_lengthL1norm_sma_range,audspec_lengthL1norm_sma_maxPos,audspec_lengthL1norm_sma_minPos,audspec_lengthL1norm_sma_quartile1,audspec_lengthL1norm_sma_quartile2,audspec_lengthL1norm_sma_quartile3,audspec_lengthL1norm_sma_iqr1-2,audspec_lengthL1norm_sma_iqr2-3,audspec_lengthL1norm_sma_iqr1-3,...,mfcc_sma_de[14]_peakMeanMeanDist,mfcc_sma_de[14]_peakMeanRel,mfcc_sma_de[14]_minRangeRel,mfcc_sma_de[14]_meanRisingSlope,mfcc_sma_de[14]_stddevRisingSlope,mfcc_sma_de[14]_meanFallingSlope,mfcc_sma_de[14]_stddevFallingSlope,Time,Audio_Name,Target
0,anger,7101,7101,7101,7101,7101,7101,7101,7101,7101,...,7101,7101,7101,7101,7101,7101,7101,7101,7101,7101
1,disgust,5900,5900,5900,5900,5900,5900,5900,5900,5900,...,5900,5900,5900,5900,5900,5900,5900,5900,5900,5900
2,joy,18883,18883,18883,18883,18883,18883,18883,18883,18883,...,18883,18883,18883,18883,18883,18883,18883,18883,18883,18883
3,surprise,7484,7484,7484,7484,7484,7484,7484,7484,7484,...,7484,7484,7484,7484,7484,7484,7484,7484,7484,7484


#### Por motivos de investigación, se excluye la clase `sadness`

In [19]:
# Exclude every row labelled 'sadness'.
not_sadness = df_final['Ekman'] != 'sadness'
df_final = df_final.loc[not_sadness]

In [16]:
# Feature matrix: every column except the interval, audio name, raw target and
# emotion label. The columns are dropped by name so the selection does not
# depend on them being exactly the last four columns of df_final.
NON_FEATURE_COLUMNS = ['Time', 'Audio_Name', 'Target', 'Ekman']
X = df_final.drop(columns = NON_FEATURE_COLUMNS).values
# Labels: the emotion assigned to each row.
Y = df_final['Ekman'].values

In [17]:
# Encoder de las emociones
encoder = OneHotEncoder()
Y = encoder.fit_transform(np.array(Y).reshape(-1,1)).toarray()

In [18]:
# Split the data into train and test sets (shuffled, fixed seed; 0.25 is the
# default test fraction of train_test_split, written out explicitly).
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    shuffle=True,
    random_state=0,
)
tuple(split.shape for split in (x_train, y_train, x_test, y_test))

((29542, 6373), (29542, 5), (9848, 6373), (9848, 5))

In [19]:
# Standardize the features: the scaler is fitted on the training split only and
# the same transform is then applied to both splits.
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
tuple(split.shape for split in (x_train, y_train, x_test, y_test))

((29542, 6373), (29542, 5), (9848, 6373), (9848, 5))

In [20]:
# Insert a new axis at position 2 (a trailing channel axis of size 1 for the
# 2-D feature matrices), matching the (steps, 1) input shape of the Conv1D model.
# NOTE(review): re-running this cell adds yet another axis.
x_train = x_train[:, :, np.newaxis]
x_test = x_test[:, :, np.newaxis]
tuple(split.shape for split in (x_train, y_train, x_test, y_test))

((29542, 6373, 1), (29542, 5), (9848, 6373, 1), (9848, 5))

In [21]:
# 1D CNN over the feature vector: four Conv1D + MaxPooling1D stages followed by
# a small dense head with a softmax output.
model=Sequential()
model.add(Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu', input_shape=(x_train.shape[1], 1)))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))

model.add(Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))

model.add(Conv1D(128, kernel_size=5, strides=1, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))
model.add(Dropout(0.2))

model.add(Conv1D(64, kernel_size=5, strides=1, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))

model.add(Flatten())
model.add(Dense(units=32, activation='relu'))
model.add(Dropout(0.3))

# One output unit per one-hot column of the targets. Reading the width from
# y_train keeps the output layer aligned with the labels actually passed to
# fit(), even if df_final was filtered after Y was encoded (in which case the
# number of unique values in df_final['Ekman'] no longer matches the targets).
n_classes = y_train.shape[1]
model.add(Dense(units=n_classes, activation='softmax'))
model.compile(optimizer = 'adam' , loss = 'categorical_crossentropy' , metrics = ['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 6373, 256)         1536      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 3187, 256)        0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 3187, 256)         327936    
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 1594, 256)        0         
 1D)                                                             
                                                                 
 conv1d_2 (Conv1D)           (None, 1594, 128)         163968    
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 797, 128)         0

In [22]:
# Callback that multiplies the learning rate by 0.4 (never going below 1e-7)
# when the training loss plateaus, with a patience of 2 epochs.
rlrp = ReduceLROnPlateau(
    monitor='loss',
    factor=0.4,
    patience=2,
    min_lr=1e-7,
    verbose=0,
)
# NOTE(review): the test split is also used as the validation data here.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=50,
    batch_size=64,
    callbacks=[rlrp],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50

KeyboardInterrupt: 

In [None]:
# Report the accuracy on the test split and plot the loss / accuracy curves
# recorded by fit().
print("Accuracy of our model on test data : " , model.evaluate(x_test,y_test)[1]*100 , "%")

train_acc = history.history['accuracy']
train_loss = history.history['loss']
test_acc = history.history['val_accuracy']
test_loss = history.history['val_loss']

# Size the x axis from the recorded history instead of a hard-coded 50: the two
# differ whenever training ends before the last epoch, and plotting x/y
# sequences of different lengths raises a ValueError.
epochs = list(range(len(train_loss)))

fig , ax = plt.subplots(1,2)
fig.set_size_inches(20,6)

# Left panel: loss per epoch.
ax[0].plot(epochs , train_loss , label = 'Training Loss')
ax[0].plot(epochs , test_loss , label = 'Testing Loss')
ax[0].set_title('Training & Testing Loss')
ax[0].legend()
ax[0].set_xlabel("Epochs")

# Right panel: accuracy per epoch.
ax[1].plot(epochs , train_acc , label = 'Training Accuracy')
ax[1].plot(epochs , test_acc , label = 'Testing Accuracy')
ax[1].set_title('Training & Testing Accuracy')
ax[1].legend()
ax[1].set_xlabel("Epochs")
plt.show()

In [None]:
# Predict on the test split, then map both the predictions and the one-hot
# targets back to emotion names with the fitted encoder.
# NOTE(review): y_test is overwritten with label names, so this cell (and any
# cell that needs the one-hot y_test) cannot be re-run without re-splitting.
pred_test = model.predict(x_test)
y_pred, y_test = encoder.inverse_transform(pred_test), encoder.inverse_transform(y_test)

In [None]:
# Put predicted and actual labels side by side and preview the first ten rows.
df = pd.DataFrame({
    'Predicted Labels': y_pred.flatten(),
    'Actual Labels': y_test.flatten(),
})

df.head(10)

In [None]:
# Confusion matrix of actual (rows) vs. predicted (columns) emotions.
# encoder.categories_ is a list with one array per encoded column; the single
# label column was encoded, so its class names are the first entry.
class_names = encoder.categories_[0]
# The class order is passed explicitly so the matrix rows/columns always line
# up with the tick labels, even if a class is absent from y_test and y_pred.
cm = confusion_matrix(y_test.ravel(), y_pred.ravel(), labels = class_names)
plt.figure(figsize = (12, 10))
cm = pd.DataFrame(cm , index = class_names , columns = class_names)
sns.heatmap(cm, linecolor='white', cmap='Blues', linewidth=1, annot=True, fmt='')
plt.title('Confusion Matrix', size=20)
plt.xlabel('Predicted Labels', size=14)
plt.ylabel('Actual Labels', size=14)
plt.show()

In [None]:
# Text report of the per-class metrics comparing actual and predicted labels.
report = classification_report(y_test, y_pred)
print(report)