In [113]:
import pandas as pd
import os

# Root of the YAAD dataset. The location can be overridden with the
# YAAD_DATA_DIR environment variable; joining with "" guarantees a trailing
# separator, which matters because the paths below are built by concatenation.
path = os.path.join(
    os.environ.get("YAAD_DATA_DIR", "/workspaces/codespaces-jupyter/data/YAAD/"),
    "",
)
stimulus_descripttion_file = pd.read_excel(path + 'Stimulus_Description.xlsx')

# Video ID of the first stimulus row whose target emotion is "fear".
fear_video_id = stimulus_descripttion_file.query('`Target Emotion` == "fear"').head(1).get('Video ID').item()

path_single_modal_data = path + "RawData/Singlemodal/ECG/"
path_multi_modal_data = path + "RawData/Multimodal/ECG/"

# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# everything derived from these lists stable between runs and machines.
single_modal_data_files = sorted(os.listdir(path_single_modal_data))
multi_modal_data_files = sorted(os.listdir(path_multi_modal_data))


def _read_recordings_for_video(directory, filenames, video_id):
    """Read every file whose name contains ``"v<video_id>"``.

    Parameters
    ----------
    directory : str
        Directory prefix (with trailing separator) prepended to each name.
    filenames : list of str
        Candidate file names inside ``directory``.
    video_id :
        Video identifier; its ``str()`` form is matched after a literal "v".

    Returns
    -------
    list of pandas.DataFrame
        One tab-delimited ``read_csv`` result per matching file, in the order
        of ``filenames``.
    """
    tag = "v" + str(video_id)
    return [
        pd.read_csv(directory + name, delimiter='\t')
        for name in filenames
        if tag in name
    ]


# Raw (still header-encoded) frames of the fear recordings, per modality.
single_modal_dfs = _read_recordings_for_video(
    path_single_modal_data, single_modal_data_files, fear_video_id
)
multi_modal_dfs = _read_recordings_for_video(
    path_multi_modal_data, multi_modal_data_files, fear_video_id
)

In [114]:
def str_dfs_to_real_dfs(dfs) -> pd.api.extensions.ExtensionArray:
    """Parse recordings whose samples are stored in a DataFrame's header.

    For every frame in ``dfs`` only the name of the first column is read; it
    is expected to be one string of comma-separated numbers. Any rows the
    frame may contain are not looked at.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames whose first column name holds the comma-separated samples.

    Returns
    -------
    pandas.api.extensions.ExtensionArray
        ``pd.array`` built over the per-frame float arrays, in input order.

    Raises
    ------
    IndexError
        If a frame has no columns.
    ValueError
        If a token cannot be converted to ``float``.
    """
    real_dfs = []
    for df in dfs:
        header = df.columns.values[0]
        # A trailing separator would otherwise leave an empty last token,
        # which float() rejects; empty tokens elsewhere still raise.
        tokens = header.strip().rstrip(',').split(',')
        real_dfs.append(pd.array([float(token) for token in tokens]))
    return pd.array(real_dfs)

def mean_of_means(dfs) -> float:
    """Return the mean of the per-recording means.

    Each element of ``dfs`` must expose ``.mean()``; every recording counts
    equally, whatever its length.
    """
    per_recording = [recording.mean() for recording in dfs]
    return pd.array(per_recording).mean()

def min_of_mins(dfs) -> float:
    """Return the smallest of the per-recording minima.

    Each element of ``dfs`` must expose ``.min()``.
    """
    per_recording = [recording.min() for recording in dfs]
    return pd.array(per_recording).min()

def max_of_maxs(dfs) -> float:
    """Return the largest of the per-recording maxima.

    Each element of ``dfs`` must expose ``.max()``.
    """
    per_recording = [recording.max() for recording in dfs]
    return pd.array(per_recording).max()

def mean_of_vars(dfs) -> float:
    """Return the mean of the per-recording variances.

    Each element of ``dfs`` must expose ``.var()``; it is called with its
    default arguments.
    """
    per_recording = [recording.var() for recording in dfs]
    return pd.array(per_recording).mean()

In [115]:
# Convert the header-encoded fear recordings into numeric arrays,
# single-modal first and multimodal second.
real_sm_dfs, real_mm_dfs = (
    str_dfs_to_real_dfs(raw_frames)
    for raw_frames in (single_modal_dfs, multi_modal_dfs)
)

single refers to singlemodal data
multi refers to multimodal data

In [116]:
# Print every summary statistic for the single-modal recordings first and the
# multimodal recordings second.
for label, statistic in (
    ("mean of means", mean_of_means),
    ("min of mins", min_of_mins),
    ("max of maxs", max_of_maxs),
    ("mean of vars", mean_of_vars),
):
    print(f"{label} single: ", statistic(real_sm_dfs))
    print(f"{label} multi: ", statistic(real_mm_dfs))

mean of means single:  -18.522971550372002
mean of means multi:  -10.029048553400726
min of mins single:  -297.21
min of mins multi:  -21.106
max of maxs single:  18.678
max of maxs multi:  8.1532
mean of vars single:  0.13076550723324526
mean of vars multi:  0.041651835869248255


In [117]:
# One column per sample (1..5000) followed by the class label.
columns = list(range(1, 5001)) + ["Vtype"]


def _load_labelled_recordings(directory, filenames):
    """Build one labelled, min-shifted row per recording file.

    Every file is read and parsed with ``str_dfs_to_real_dfs``, shifted so
    that its smallest sample becomes 0, and given a trailing label: 1 when the
    file name contains ``"v<fear_video_id>"`` (the same rule the first cell
    uses to pick the fear recordings), 0 otherwise.

    Parameters
    ----------
    directory : str
        Directory prefix (with trailing separator) prepended to each name.
    filenames : list of str
        Recording file names inside ``directory``.

    Returns
    -------
    pandas.DataFrame
        One row per file, with the module-level ``columns`` as column labels.
        NOTE(review): the column list assumes 5000 samples per recording.
    """
    fear_tag = "v" + str(fear_video_id)
    rows = []
    for file in filenames:
        df = pd.read_csv(directory + file, delimiter='\t')
        # We put vtype 1 for fear sessions, and vtype 0 for everything else
        vtype = 1 if fear_tag in file else 0

        rdf = str_dfs_to_real_dfs([df])[0]
        # Vectorised shift: subtract the recording's minimum from every sample.
        shifted = rdf - rdf.min()
        rows.append(pd.array(list(shifted) + [vtype]))
    return pd.DataFrame(rows, columns=columns)


single_modal_df = _load_labelled_recordings(path_single_modal_data, single_modal_data_files)

multi_modal_df = _load_labelled_recordings(path_multi_modal_data, multi_modal_data_files)


In [118]:
# Separate the label column from the sample columns (1..5000) of each frame.
single_y = single_modal_df["Vtype"]
single_X = single_modal_df.drop(columns="Vtype")

multi_y = multi_modal_df["Vtype"]
multi_X = multi_modal_df.drop(columns="Vtype")

First, we train using the single-modal data and validate on the multimodal data.

In [119]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_absolute_error

# Baseline: an unconstrained tree fitted on the single-modal recordings and
# checked against the multimodal ones.
fear_detect_model = DecisionTreeClassifier(random_state=1).fit(single_X, single_y)

val_predictions = fear_detect_model.predict(multi_X)
# With 0/1 labels the mean absolute error is the share of misclassified rows.
print(mean_absolute_error(y_true=multi_y, y_pred=val_predictions))

0.11507936507936507


Next, we train using the multimodal data and validate on the single-modal data (the inverse).

In [120]:
# Roles swapped: refit the same estimator on the multimodal recordings and
# check it against the single-modal ones.
val_predictions = fear_detect_model.fit(multi_X, multi_y).predict(single_X)
print(mean_absolute_error(y_true=single_y, y_pred=val_predictions))

0.12337662337662338


Training on the single-modal data gives a slightly lower mean error.
Now we will try to find the optimal max_leaf_nodes.

In [121]:
def try_leaves(max_leaf_nodes, train_X, train_y, val_X, val_y):
    """Fit a tree capped at ``max_leaf_nodes`` leaves and score it.

    A new ``DecisionTreeClassifier`` (``random_state=1``) is fitted on
    ``train_X``/``train_y``; the return value is the mean absolute error of
    its predictions for ``val_X`` against ``val_y``.
    """
    tree = DecisionTreeClassifier(random_state=1, max_leaf_nodes=max_leaf_nodes)
    predicted = tree.fit(train_X, train_y).predict(val_X)
    return mean_absolute_error(val_y, predicted)

In [122]:
# Each entry: (tag used in the printed line, train X, train y, val X, val y).
sweep_directions = (
    ("single", single_X, single_y, multi_X, multi_y),
    ("multi", multi_X, multi_y, single_X, single_y),
)
for leaves in (10, 50, 100, 500, 1000, 5000):
    for tag, fit_X, fit_y, check_X, check_y in sweep_directions:
        print(f"max_leaf_nodes = {leaves}, train-{tag} mae: ", try_leaves(leaves, fit_X, fit_y, check_X, check_y))

max_leaf_nodes = 10, train-single mae:  0.11507936507936507
max_leaf_nodes = 10, train-multi mae:  0.12987012987012986


max_leaf_nodes = 50, train-single mae:  0.11507936507936507
max_leaf_nodes = 50, train-multi mae:  0.12337662337662338
max_leaf_nodes = 100, train-single mae:  0.11507936507936507
max_leaf_nodes = 100, train-multi mae:  0.12337662337662338
max_leaf_nodes = 500, train-single mae:  0.11507936507936507
max_leaf_nodes = 500, train-multi mae:  0.12337662337662338
max_leaf_nodes = 1000, train-single mae:  0.11507936507936507
max_leaf_nodes = 1000, train-multi mae:  0.12337662337662338
max_leaf_nodes = 5000, train-single mae:  0.11507936507936507
max_leaf_nodes = 5000, train-multi mae:  0.12337662337662338


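max_leaf_nodes=50 with single-modal data as training data seems to be the optimal setting for the current configuration: the train-single error is the same for every value tried, and 50 is the smallest value at which the train-multi error also reaches its minimum.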

In [125]:
fear_detect_model = DecisionTreeClassifier(max_leaf_nodes=50, random_state=1)

fear_detect_model.fit(single_X, single_y)

# Test the model on the multimodal fear sessions only (rows labelled Vtype == 1).
fear_multi_modal = multi_modal_df.query("`Vtype` == 1")
val_X = fear_multi_modal[list(range(1, 5001))]
val_predictions = fear_detect_model.predict(val_X)
print(val_predictions)
# Every row here is a fear session, so each prediction should ideally be the
# label 1; count how many are.
counter = int((val_predictions == 1).sum())
print(counter)
# Percentage of fear sessions that were detected (measured on fear rows only).
print(100 * (counter / len(val_predictions)))


[0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0.
 0. 0. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1.]
17
47.22222222222222


Accuracy on the fear sessions is only about 47% (17 of 36 detected).
TT

Let's try another model.