## This notebook includes:

* Creating a Bidirectional LSTM based model (results reported in Table IV and V of the paper)
* Running the model on multimodal (audio, text, visual) data for classification and evaluation

In [1]:
#import libraries
import numpy as np
import scipy as sp
import pandas as pd
from sklearn import metrics

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler

from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import layers

#for PCA
from sklearn.decomposition import PCA


In [2]:
# Read the lexical-acoustic (verbal) feature matrix.
# NOTE(review): allow_pickle=True unpickles objects stored in the file, which can
# execute arbitrary code -- only load .npy files that come from a trusted source.
al_data = np.load("regularOnly.npy", allow_pickle = True)

# Drop two columns by position. The second index is applied to the array that has
# already lost one column, so it is not an index into the file as loaded.
al_data = np.delete(al_data, 3889, axis=1)
al_data = np.delete(al_data, 3894, axis=1)

# Column names for the verbal features: three identifying columns followed by
# feature_2623 ... feature_9490, so they do not clash with the visual feature names
# on merge (presumably the visual file holds feature_1 ... feature_2622 -- verify).
feature_columns = ['feature_{}'.format(i) for i in range (2623, 9491)]
other_columns = ['id', 'speaker', 'label']
feature_cols = other_columns + feature_columns

# Verbal (acoustic-prosodic and linguistic) data as a labelled table.
al_data = pd.DataFrame(al_data, columns = feature_cols)

# Visual (OpenFace) features; the first CSV column is used as the index.
vis_data = pd.read_csv('/notebooks/visual_features/data_OpenFace_1.csv', index_col = [0])

# Merge on the utterance id. speaker/label are taken from the visual table only;
# a right join keeps every verbal row, and rows without visual features end up
# with NaNs that dropna() removes.
final_dataset = pd.merge(vis_data, al_data.drop(columns =['speaker', 'label']), how = 'right', on = 'id', sort = True)
final_dataset = final_dataset.rename(columns = {"speaker_x":"speaker", "label_x": "label"})
final_dataset = final_dataset.dropna()

# The modelling cells read the merged table through the name `df`, which was never
# assigned anywhere in this notebook; without this alias a fresh
# "Restart & Run All" fails with NameError.
df = final_dataset

Unnamed: 0,id,speaker,label,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_9481,feature_9482,feature_9483,feature_9484,feature_9485,feature_9486,feature_9487,feature_9488,feature_9489,feature_9490
0,s01e01-1,Dom Joly,1.0,0.005139,0.029560,-0.075024,-0.035568,-0.017763,0.110714,-0.089801,...,0.018490338698029518,0.08476332575082779,-0.11359433084726334,0.3834567070007324,0.2739666700363159,0.1074366569519043,0.46812331676483154,0.1457563191652298,0.12616167962551117,-0.029206672683358192
1,s01e01-2,Duncan Bannatyne,1.0,0.007152,0.088031,-0.045368,-0.046058,0.067916,0.189048,-0.013381,...,-0.09824812412261963,0.16229026019573212,0.16081875562667847,0.32505500316619873,0.34945249557495117,-0.09781775623559952,0.23907776176929474,0.09811551123857498,0.052620500326156616,-0.08417987823486328
2,s01e01-3,Natalie Cassidy,0.0,0.048681,-0.008859,-0.113927,-0.007059,0.075472,0.131093,0.009876,...,0.08289637416601181,0.21903474628925323,-0.07876124978065491,0.1481068730354309,0.18827761709690094,-0.14032037556171417,0.05450335144996643,0.13083963096141815,0.04116374999284744,-0.15561625361442566
3,s01e01-6,Dom Joly,0.0,0.031647,0.064775,-0.043998,-0.019308,0.028442,0.141178,-0.005597,...,0.09112167358398438,0.03759665787220001,0.19848668575286865,0.28718501329421997,-0.0735669955611229,0.061839908361434937,0.12194333225488663,0.263722687959671,-0.13163550198078156,-0.2921786606311798
4,s01e01-7,Frankie Boyle,1.0,0.003951,0.159030,-0.007606,-0.043233,0.070793,0.147742,-0.038791,...,0.37825965881347656,0.2722756564617157,-0.14988566935062408,0.24854867160320282,0.17865000665187836,-0.2249843329191208,-0.3870030343532562,0.11379333585500717,0.11868665367364883,-0.4633199870586395
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,s14e08-3,Roisin Conaty,1.0,0.040339,-0.011892,-0.050313,-0.040630,0.020426,0.106055,0.018815,...,0.08795725554227829,0.0237351655960083,-0.032572004944086075,0.15692858397960663,0.10141167044639587,-0.17155317962169647,0.2923780679702759,0.26045724749565125,-0.09290068596601486,-0.22868959605693817
352,s14e08-4,Roman Kemp,1.0,-0.028110,0.169771,0.022322,-0.031299,0.017207,0.127590,-0.002619,...,0.12971431016921997,0.037154700607061386,0.026378804817795753,0.23197659850120544,0.1548031121492386,-0.14210765063762665,0.3862755596637726,0.3016236424446106,0.015786344185471535,-0.2246006727218628
353,s14e09-1,Alex Jones,1.0,0.027228,-0.017874,-0.041564,-0.041545,0.013167,0.119052,0.008405,...,0.17405150830745697,0.12252228707075119,0.1441657841205597,0.1589970886707306,0.15254013240337372,-0.07880286127328873,0.3323584198951721,0.34674471616744995,-0.025951789692044258,-0.1538473516702652
354,s14e09-3,Martin Lewis,1.0,-0.007263,0.050732,-0.050518,-0.090673,0.044292,0.085449,-0.058811,...,0.1738501787185669,0.09341058135032654,0.12290232628583908,-0.006683960556983948,0.08657991141080856,-0.01124399434775114,0.519538402557373,0.24855166673660278,-0.03214416280388832,-0.2781267464160919


In [3]:
# Optional: function to save the test set for summary statistics
def test(testIndex, test_Y, test_X, r):
    '''
    Save the test split of run `r` to /notebooks/data_<r>.npy.

    The saved 2-D array holds the row index in its first column, the columns of
    test_X in the middle and the label in its last column.

    parameters:
    testIndex: row positions of the test samples
    test_Y: labels of the test samples
    test_X: 2-D feature array of the test samples
    r: run number, used in the output file name
    '''
    index_column = np.array(testIndex)[:, np.newaxis]
    label_column = np.array(test_Y)[:, np.newaxis]

    # index | features | label
    with_index = np.concatenate((index_column, test_X), axis = 1)
    table = np.concatenate((with_index, label_column), axis = 1)

    np.save('/notebooks/data_{}.npy'.format(r), table)

In [4]:
def train_test_split_by_speaker(data, test_size=0.2, pca=False, numPC=300,
                                random_state=None, fit_on_train_only=False):
    '''
    Split a dataset into train and test sets so that no speaker is in both.

    parameters:
    data: DataFrame or 2-D array-like whose columns are, by position,
          [id, speaker, label, feature, feature, ...]
    test_size: fraction of the *speakers* (not of the rows) sent to the test set,
               by default 0.2
    pca: Bool. whether to project the standardized features with PCA, by default False
    numPC: number of principal components kept when pca is True, by default 300
    random_state: optional seed for the speaker shuffle. None (default) shuffles
                  with the global NumPy random state, as before
    fit_on_train_only: Bool. if True, the scaler and the PCA are fitted on the
                  training rows only and then applied to all rows, so that no
                  test-set statistics leak into the features. An integer numPC is
                  then capped at what the training rows allow. By default False,
                  which fits both on all rows (the original behaviour).

    return:
    train_x: float array with the (standardized, optionally PCA-projected) training features
    train_y: int array with the training labels
    test_x: float array with the test features
    test_y: int array with the test labels
    nameIndexDic: dict mapping each speaker to the list of its row positions
    testIndex: list of the row positions that went to the test set
    '''

    data = np.array(data)

    # group row positions by speaker (column 1), keeping first-appearance order
    nameIndexDic = {}
    for i, speaker in enumerate(data[:, 1]):
        nameIndexDic.setdefault(speaker, []).append(i)

    nameList = np.array(list(nameIndexDic))
    if random_state is None:
        np.random.shuffle(nameList)
    else:
        np.random.default_rng(random_state).shuffle(nameList)

    # the first `numOfVector` shuffled speakers form the test set
    numOfVector = round(len(nameList)*test_size)

    testIndex = []
    trainIndex = []
    for j, speaker in enumerate(nameList):
        target = testIndex if j < numOfVector else trainIndex
        target.extend(nameIndexDic[speaker])

    # drop id and speaker; column 0 is now the label, the rest are features
    data = data[:,2:].astype(float)
    labels = data[:, 0].astype(int)
    features = data[:, 1:]

    # NOTE(review): with fit_on_train_only=False the scaler and the PCA below see
    # the test rows as well, i.e. test-set statistics influence the training features.
    scaler = StandardScaler()
    if fit_on_train_only:
        scaler.fit(features[trainIndex])
        scaled_data = scaler.transform(features)
    else:
        scaled_data = scaler.fit_transform(features)

    if pca:
        if fit_on_train_only:
            n_components = numPC
            if isinstance(numPC, (int, np.integer)):
                # PCA cannot keep more components than training rows or features
                n_components = min(numPC, len(trainIndex), scaled_data.shape[1])
            pcaModel = PCA(n_components)
            pcaModel.fit(scaled_data[trainIndex])
            scaled_data = pcaModel.transform(scaled_data)
        else:
            pcaModel = PCA(numPC)
            scaled_data = pcaModel.fit_transform(scaled_data)

        print('Explained variance:', np.sum(pcaModel.explained_variance_ratio_))

    test_x = scaled_data[testIndex].astype(float)
    test_y = labels[testIndex]

    train_x = scaled_data[trainIndex].astype(float)
    train_y = labels[trainIndex]

    return train_x, train_y, test_x, test_y, nameIndexDic, testIndex

# One speaker-disjoint split with PCA switched on; the call prints the share of
# variance explained by the retained components.
train_X, train_Y, test_X, test_Y, dic, testIndex = train_test_split_by_speaker(
    df,
    test_size=0.2,
    pca=True,
)

Explained variance: 0.9727965360097709


In [5]:
def _build_bilstm_model(max_features=1000):
    '''Return a new, untrained Embedding -> Bidirectional LSTM -> sigmoid classifier.'''
    # Input for variable-length sequences of integers
    inputs = keras.Input(shape=(None,), dtype="int32")
    # Embed each integer in a 128-dimensional vector
    x = layers.Embedding(max_features, 128)(inputs)
    # One bidirectional LSTM layer
    x = layers.Bidirectional(layers.LSTM(256))(x)
    # Binary classifier head
    outputs = layers.Dense(1, activation="sigmoid")(x)
    return keras.Model(inputs, outputs)


def run_model_n_times(n=10, epochs=100, batch_size=32):
    '''
    Train and evaluate the Bidirectional LSTM on `n` random speaker-disjoint splits.

    Every run draws a new split of the global `df`, saves the test split and the
    trained model to disk, prints a classification report and plots a confusion
    matrix. A new model is built for every run, so no weights (and therefore no
    knowledge of an earlier run's training speakers) carry over into the next run.

    parameters:
    n: number of independent runs, by default 10
    epochs: training epochs per run, by default 100
    batch_size: batch size for training and prediction, by default 32

    return:
    a tuple (quartiles, mean): the 0.25 / 0.5 / 0.75 quantiles and the mean over
    the runs of [accuracy, precision, recall, auc, f1], each taken on the test
    split after the last epoch
    '''
    #create the final statistics array
    final = []

    max_features = 1000 # size of the Embedding vocabulary
    maxlen = 100
    metric_names = ["accuracy", "precision", "recall", "auc"]

    for r in range(n):
        print(r)
        train_X, train_Y, test_X, test_Y, dic, testIndex = train_test_split_by_speaker(df, test_size=0.2, pca=True, numPC=300)

        test(testIndex, test_Y, test_X, r)

        # report the position of every NaN in the training features
        for i, j in np.argwhere(np.isnan(train_X)):
            print(i, j)

        # NOTE(review): pad_sequences defaults to dtype="int32" and to truncating
        # from the front, so the float PCA scores are cast to integers and only
        # the last `maxlen` of the 300 components are kept; the Embedding layer
        # then treats those integers as token ids (negative or >= max_features
        # ids are presumably invalid -- verify). Left unchanged so that the
        # model stays the one described in the notebook header; confirm this is intended.
        train_X = keras.preprocessing.sequence.pad_sequences(train_X, maxlen=maxlen)
        test_X = keras.preprocessing.sequence.pad_sequences(test_X, maxlen=maxlen)

        # fresh weights for every run; reusing one model would keep training it
        # across splits and evaluate it on speakers it has already been trained on
        model = _build_bilstm_model(max_features)

        # explicit metric names keep the history keys stable from run to run
        model.compile(
            "adam",
            "binary_crossentropy",
            metrics=[
                "accuracy",
                keras.metrics.Precision(name="precision"),
                keras.metrics.Recall(name="recall"),
                keras.metrics.AUC(name="auc"),
            ],
        )
        # NOTE(review): the test split doubles as validation data and the
        # reported numbers are the last-epoch validation metrics.
        result = model.fit(train_X, train_Y, batch_size=batch_size, epochs=epochs, validation_data=(test_X, test_Y))

        model.save('/notebooks/best_model/model_{}.keras'.format(r))

        #from: https://datascience.stackexchange.com/questions/45165/how-to-get-accuracy-f1-precision-and-recall-for-a-keras-model
        y_pred = model.predict(test_X, batch_size=batch_size, verbose=1)
        y_pred_bool = np.round(y_pred).astype(int).ravel()
        print(classification_report(test_Y, y_pred_bool))

        # confusion_matrix orders rows/columns by ascending label, so the tick
        # labels must be [0, 1]; passing labels= keeps the matrix 2x2 even when
        # a split happens to contain a single class
        confusion = confusion_matrix(test_Y, y_pred_bool, labels=[0, 1])
        disp = ConfusionMatrixDisplay(confusion_matrix=confusion, display_labels=[0, 1])
        disp.plot()
        disp.ax_.set_title("Run {}".format(r))

        # last-epoch metrics on the test split, looked up by name
        statistics = [result.history["val_" + name][-1] for name in metric_names]
        precision, recall = statistics[1], statistics[2]
        denominator = precision + recall
        # F1 is defined as 0 when precision and recall are both 0
        statistics.append(2*precision*recall/denominator if denominator else 0.0)
        final.append(statistics)

    final = pd.DataFrame(final, columns=metric_names + ["f1"])

    # final statistic array should look like
    # [accuracy, precision, recall, auc, f1]
    return final.quantile([0.25, 0.5, 0.75]), final.mean()
    

In [None]:
# Run the whole train/evaluate cycle 10 times and print the returned
# (quartiles, mean) summary of the per-run metrics.
print(
    "\n\n\n\n\n",
    run_model_n_times(n=10),
)