### This notebook generates the input data for the LSTM models. It reads the (aligned) input modalities and computes a serialised version of the data; that is, it generates temporal data using a sliding window. It also computes the dynamics (first-order derivative) of each feature. In my dissertation, window lengths (timesteps) of 10, 30 and 60 frames were investigated.

In [1]:
import os
import csv
import numpy as np
import pandas as pd

In [2]:
def load(no_data=False, eGeMAPS=False, verbose=False):
    """Load the preprocessed, aligned features, labels and instance ids from CSV.

    Parameters
    ----------
    no_data : bool
        If True, read only the train/dev label and instance files and skip the
        feature matrices.
    eGeMAPS : bool
        Selects the directory that is read: ``aligned_EAV`` when True,
        ``aligned_AV`` otherwise.
    verbose : bool
        If True, print the shape of everything that was read.

    Returns
    -------
    tuple
        With ``no_data=True``: ``(y_train, inst_train, y_dev, inst_dev)``, each
        the transposed values of the corresponding CSV as a NumPy array.
        Otherwise: ``(X_train_A, X_dev_A, X_test_A, X_train_V, X_dev_V,
        X_test_V, y_train, inst_train, y_dev, inst_dev)``. The six feature
        DataFrames are returned without their first column; the last four
        entries are the same arrays as in the ``no_data`` case.

    Raises
    ------
    FileNotFoundError
        If ``no_data`` is False and ``train_data_A.csv`` is not present in the
        selected directory (previously the function silently returned None,
        which only surfaced as an unpacking error in the caller).
    """
    output_dir = '/home/ceccarelli/Work/Bipolar/aligned_AV' if not eGeMAPS else '/home/ceccarelli/Work/Bipolar/aligned_EAV'

    def _read(name, **kwargs):
        # All processed CSVs are stored without a header row.
        return pd.read_csv(os.path.join(output_dir, name), header=None, **kwargs)

    def _read_labels():
        # Same four files, in the same order, for both loading modes.
        return (_read('train_label.csv'), _read('train_inst.csv'),
                _read('dev_label.csv'), _read('dev_inst.csv'))

    def _print_label_sizes(y_train, inst_train, y_dev, inst_dev):
        print("train label size", y_train.T.shape)
        print("dev label size", y_dev.T.shape)
        print("train inst size", inst_train.T.shape)
        print("dev inst size", inst_dev.T.shape)
        print("--" * 20)

    if no_data:
        print("\nprocessed files exist, starting loading (w/o raw data) ...")
        y_train, inst_train, y_dev, inst_dev = _read_labels()

        if verbose:
            print("--" * 20)
            _print_label_sizes(y_train, inst_train, y_dev, inst_dev)

        return y_train.T.values, inst_train.T.values, y_dev.T.values, inst_dev.T.values

    marker = os.path.join(output_dir, 'train_data_A.csv')
    if not os.path.isfile(marker):
        # Fail loudly instead of falling through and returning None.
        raise FileNotFoundError("processed data file not found: %s" % marker)

    print("\nprocessed files exist, starting loading ...")
    X_train_A = _read('train_data_A.csv')
    X_dev_A = _read('dev_data_A.csv')
    X_test_A = _read('test_data_A.csv')
    X_train_V = _read('train_data_V.csv', low_memory=False)
    X_dev_V = _read('dev_data_V.csv', low_memory=False)
    X_test_V = _read('test_data_V.csv', low_memory=False)
    y_train, inst_train, y_dev, inst_dev = _read_labels()

    if verbose:
        print("--" * 20)
        print("train data (A) size", X_train_A.shape)
        print("train data (V) size", X_train_V.shape)
        print("dev data (A) size", X_dev_A.shape)
        print("dev data (V) size", X_dev_V.shape)
        print("test data (A) size", X_test_A.shape)
        print("test data (V) size", X_test_V.shape)
        print("--" * 20)
        _print_label_sizes(y_train, inst_train, y_dev, inst_dev)

    # Column 0 of every feature file is dropped before returning.
    features = tuple(frame.iloc[:, 1:] for frame in
                     (X_train_A, X_dev_A, X_test_A, X_train_V, X_dev_V, X_test_V))
    return features + (y_train.T.values, inst_train.T.values, y_dev.T.values, inst_dev.T.values)
    

In [3]:
# Choose what gets serialised by index: the modality and, for "visual",
# which group of visual features.
modalities = ["visual", "audio", "audioE"]
visual_modalities = ["facial", "gaze", "pose", "action"]
modality = modalities[2]
visual_modality = visual_modalities[0]

print("Loading %s modality" % modality)

if modality == "visual":
    # load() returns the visual train/dev frames at positions 3 and 4.
    visual_frames = load(verbose=True)[3:5]

    # Column range of the requested feature group; any other value is
    # treated as "action".
    if visual_modality == "facial":
        print("Visual (facial)")
        cols = slice(None, 136)
    elif visual_modality == "gaze":
        print("Visual (gaze)")
        cols = slice(136, 142)
    elif visual_modality == "pose":
        print("Visual (pose)")
        cols = slice(142, 148)
    else:
        print("Visual (action)")
        cols = slice(148, None)

    X_train, X_dev = (frame.iloc[:, cols].to_numpy() for frame in visual_frames)

    # Drop the DataFrame references; only the NumPy arrays are used later.
    del visual_frames, cols

    print("Autoencoder visual train data has this shape", X_train.shape)
    print("Autoencoder visual dev data has this shape", X_dev.shape)

elif modality == "audio":
    # Acoustic train/dev frames are the first two items returned by load().
    X_train, X_dev = (frame.to_numpy() for frame in load(verbose=True)[:2])

    print("Autoencoder audio train data has this shape", X_train.shape)
    print("Autoencoder audio dev data has this shape", X_dev.shape)

else:
    # Every remaining choice loads the acoustic frames from the eGeMAPS directory.
    X_train, X_dev = (frame.to_numpy() for frame in load(eGeMAPS=True, verbose=True)[:2])

    print("Autoencoder audioE train data has this shape", X_train.shape)
    print("Autoencoder audioE dev data has this shape", X_dev.shape)


Loading audioE modality

processed files exist, starting loading ...
----------------------------------------
train data (A) size (759575, 70)
train data (V) size (759575, 184)
dev data (A) size (317104, 70)
dev data (V) size (317104, 184)
test data (A) size (372734, 70)
test data (V) size (372734, 184)
----------------------------------------
train label size (759576, 1)
dev label size (317105, 1)
train inst size (759576, 1)
dev inst size (317105, 1)
----------------------------------------
Autoencoder audioE train data has this shape (759575, 69)
Autoencoder audioE dev data has this shape (317104, 69)


In [4]:
def get_dynamics(X_0th, time=0.1):
    """Compute the first-order dynamics (frame-to-frame differences) of the data.

    Parameters
    ----------
    X_0th : array-like, shape (n_frames, n_features)
        Frame-level features, one row per frame.
    time : float
        Divisor applied to every difference (default 0.1).
        NOTE(review): presumably the spacing between frames - confirm the unit.

    Returns
    -------
    np.ndarray, shape (max(n_frames - 1, 0), n_features), dtype float64
        Row ``i`` is ``(X_0th[i + 1] - X_0th[i]) / time``. Inputs with zero or
        one frame give an empty result instead of raising.
    """
    # Cast first so the result is always float64, as with the former
    # zeros-initialised output buffer.
    frames = np.asarray(X_0th, dtype=np.float64)
    # np.diff along axis 0 replaces the per-row Python loop.
    return np.diff(frames, axis=0) / time

def frame2session(X, y, inst, verbose=False):
    """Group frame-level data into one array per session.

    Parameters
    ----------
    X : array-like, shape (n_frames, n_features)
        Frame-level data.
    y : array-like, shape (n_frames,) or (n_frames, 1)
        Frame-level labels; all frames of a session must share one label.
    inst : array-like, shape (n_frames,) or (n_frames, 1)
        Session (instance) id of every frame. Every integer between the
        smallest and the largest id must occur at least once.
    verbose : bool
        If True, print the data shape of every session.

    Returns
    -------
    X_sess : np.ndarray
        A regular ``(n_sessions, n, n_features)`` array when every session has
        the same number of frames; otherwise a 1-D object array holding one
        ``(n_i, n_features)`` array per session.
    y_sess : np.ndarray, shape (n_sessions,)
        The label of each session.

    Raises
    ------
    ValueError
        If a session id in the range has no frames, or its frames carry more
        than one distinct label.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    inst = np.asarray(inst)
    print(X.shape, y.shape, inst.shape)
    assert X.shape[0] == y.shape[0] == inst.shape[0], "X, y and inst need one row per frame"
    # Collapse (n, 1) column vectors to 1-D.
    if y.ndim == 2 and y.shape[1] == 1:
        y = y[:, 0]
    if inst.ndim == 2 and inst.shape[1] == 1:
        inst = inst[:, 0]

    min_inst = int(inst.min())
    max_inst = int(inst.max())
    X_sess, y_sess = [], []
    for i in range(min_inst, max_inst + 1):
        idx = np.where(inst == i)[0]
        X_temp = X[idx]
        y_temp = y[idx]
        # Report the offending session right away instead of skipping its
        # label and failing on a count mismatch at the very end.
        distinct = np.unique(y_temp)
        if distinct.size != 1:
            raise ValueError(
                "instance %d has %d frames and %d distinct labels; expected exactly one label"
                % (i, len(idx), distinct.size)
            )
        X_sess.append(X_temp)
        y_sess.append(y_temp[0])
        if verbose:
            print("instance %d data shape" % i, X_temp.shape)

    # Count sessions relative to the smallest id so ids need not start at 1.
    assert max_inst - min_inst + 1 == len(X_sess) == len(y_sess)

    if len({sess.shape for sess in X_sess}) == 1:
        X_out = np.array(X_sess)
    else:
        # Sessions of different lengths cannot form a rectangular array, and
        # np.array() raises on ragged input on NumPy >= 1.24, so fill an
        # object array element by element.
        X_out = np.empty(len(X_sess), dtype=object)
        for k, sess in enumerate(X_sess):
            X_out[k] = sess
    return X_out, np.array(y_sess)

In [5]:
# Read only the frame-level labels and instance ids (no feature matrices).
# load() reports one more row for these arrays than for the feature matrices,
# so the last row of each is dropped to make the lengths match.
# NOTE(review): confirm that the surplus entry really is the last one.
y_train_frame, inst_train, y_dev_frame, inst_dev = (
    frame_level[:-1, :] for frame_level in load(no_data=True, verbose=True)
)


processed files exist, starting loading (w/o raw data) ...
----------------------------------------
train label size (759576, 1)
dev label size (317105, 1)
train inst size (759576, 1)
dev inst size (317105, 1)
----------------------------------------


In [6]:
# Regroup the frame-level arrays into one entry per session.
X_train_session, y_train_session = frame2session(X=X_train, y=y_train_frame, inst=inst_train)
X_dev_session, y_dev_session = frame2session(X=X_dev, y=y_dev_frame, inst=inst_dev)

# Persist the per-session labels (adjust the target directory as needed).
# frame2session already returns the labels as a NumPy array.
np.save("../TemporalData30/session_label_train.npy", y_train_session)
np.save("../TemporalData30/session_label_dev.npy", y_dev_session)

(759575, 69) (759575, 1) (759575, 1)
(317104, 69) (317104, 1) (317104, 1)


In [7]:
# Serialise the data with a sliding window ("look back") of variable length
# (10, 30 and 60 frames were used). The window length is set once here; the
# output directory below is named after it, so change both together.
look_back = 30


def _serialise_sessions(sessions, look_back):
    """Cut the first-order dynamics of every session into overlapping windows.

    Parameters
    ----------
    sessions : iterable of 2-D arrays, each (n_frames, n_features)
    look_back : int
        Number of consecutive rows per window.

    Returns
    -------
    windows : np.ndarray, shape (total_windows, look_back, n_features)
        Windows of all sessions stacked in session order.
    lengths : np.ndarray, shape (n_sessions,)
        Number of windows each session contributed.

    Raises
    ------
    ValueError
        If a session has fewer than ``look_back`` rows of dynamics.
    """
    windows, lengths = [], []
    offsets = np.arange(look_back)
    for position, session in enumerate(sessions):
        dynamics = get_dynamics(session)
        # Same count as before: window k covers rows k .. k + look_back - 1
        # for k < nb_samples, so the final row of `dynamics` is never part
        # of a window.
        nb_samples = dynamics.shape[0] - look_back
        if nb_samples < 0:
            raise ValueError(
                "session %d has only %d rows of dynamics, fewer than look_back=%d"
                % (position, dynamics.shape[0], look_back)
            )
        # (nb_samples, look_back) row indices extract every window in one go.
        starts = np.arange(nb_samples)[:, None]
        windows.append(dynamics[starts + offsets])
        lengths.append(nb_samples)
    return np.vstack(windows), np.asarray(lengths)


X_train, x_train_length = _serialise_sessions(X_train_session, look_back)
print("Current length of train data is ", len(x_train_length))

X_dev, x_dev_length = _serialise_sessions(X_dev_session, look_back)
print("Current length of dev data is ", len(x_dev_length))

# Store the number of windows produced per video so that the individual
# video sequences can easily be reconstructed from the stacked array.
np.save("../TemporalData30/train_length.npy", x_train_length)
np.save("../TemporalData30/dev_length.npy", x_dev_length)

# Save the dynamic, sequential features.
# Change path and name as needed.
np.save("../TemporalData30/train_audioE.npy", X_train)
np.save("../TemporalData30/dev_audioE.npy", X_dev)
print(X_train.shape)
print(X_dev.shape)

Current length of train data is  104
Current length of dev data is  60
(756351, 30, 69)
(315244, 30, 69)
