In [1]:
import pandas as pd
import re
import numpy as np
import scipy.stats as stats
from scipy.stats import wilcoxon
from constants import diffMappingToScore, questions, labelsToElements
from functools import reduce
from utils import fixationProportionThresholdAnalysis, phaseDetection, dwellRegressionOnRelevantElements, periodCalculation, scanPathPrecision, averageFixationDuration, averageSaccadeAmplitudeForPhases, addQuestionInfo

In [2]:
#load data
data = pd.read_csv("data/eventsDataWithAois.csv")

In [3]:
#enrich questions with relevant elements
questions = [ {**question,**{'Relevant elements labels': re.findall('"(.+?)"', question["question"])}}  for question in questions ]

for question in questions:
    for idx, label in enumerate(question["Relevant elements labels"]):
        if re.compile("\[(.+?)\]").match(label):
            question["Relevant elements labels"][idx-1] = f'{question["Relevant elements labels"][idx-1]} {label}'
            question["Relevant elements labels"].remove(label)
            
questions = [ {**question,**{'Relevant elements count': len(question["Relevant elements labels"])}}  for question in questions ]

In [4]:
#get activities labels
questions = [ {**question,**{'Relevant elements names':  [ labelsToElements[activity] for  activity in question["Relevant elements labels"] ]   }}  for question in questions ]

In [5]:
#################
#
# Phase detection
#
#################

In [6]:
#drop na
fixationData = data.loc[(~data['FixID'].isna()) & (~data['currentQuestion'].isna())].copy(deep=True)
#add question info
fixationData = addQuestionInfo(fixationData,questions)

"""Q13 (local) and Q25 (global) need to be removed for SP11 due to low data quality"""
fixationData = fixationData.drop(fixationData[(fixationData['participant'] == 'SP11-no') & (fixationData['Type3'] == 'Exclusiveness')].index)

In [7]:
#detect phases (phase 1: pre/post to the point when all relevant activities identified)
phDectFix = phaseDetection(fixationData,questions)

In [8]:
#add Timestamp_formatted column
phDectFix["timestamp_formatted"] = pd.to_datetime(phDectFix['Fixation Start'], unit='ms')

In [9]:
#prepare datasets

In [10]:
#######################
#
# Average fixation duration
#
#######################

In [11]:
avFDPT = averageFixationDuration(phDectFix,['Type1','Type2','Type3','Phase'])
#filter out those with N/A
avFDPT = avFDPT.loc[avFDPT["Phase"]!="N/A"].copy(deep=True)
#Keep only control-flow
avFDPT = avFDPT.loc[avFDPT["Type2"]=="Control-flow"].copy(deep=True)
#sorting (extra)
avFDPT = avFDPT.sort_values(by=['participant','currentQuestion','timestamp'])

In [12]:
####################
#
# Average Saccade amplitude
#
####################

In [13]:
#filter out those with N/A
phases = phDectFix.loc[phDectFix["Phase"]!="N/A"].copy(deep=True)
#Keep only control-flow
phases = phases.loc[phases["Type2"]=="Control-flow"].copy(deep=True)
#calculate avSacAmplitude
avSacAmplitude = averageSaccadeAmplitudeForPhases(phases,data,['currentQuestion','participant','Type1','Type2','Type3','Phase'])
#sorting (extra)
avSacAmplitude = avSacAmplitude.sort_values(by=['participant','currentQuestion','timestamp'])

In [14]:
####################
#
# Scan-path precision
#
####################

In [15]:
scanPathPrecisionData = scanPathPrecision(phDectFix,['Type1','Type2','Type3','Phase'])
#filter out those with N/A
scanPathPrecisionData = scanPathPrecisionData.loc[scanPathPrecisionData["Phase"]!="N/A"].copy(deep=True)
#Keep only control-flow
scanPathPrecisionData = scanPathPrecisionData.loc[scanPathPrecisionData["Type2"]=="Control-flow"].copy(deep=True)
#sorting (extra)
scanPathPrecisionData = scanPathPrecisionData.sort_values(by=['participant','currentQuestion','timestamp'])

In [16]:
#######################
#
# Fixation threshold proportion analysis
#
#######################

In [17]:
fxThresholdsData = fixationProportionThresholdAnalysis(phDectFix,['Type1','Type2','Type3','Phase'])
#filter out those with N/A
fxThresholdsData = fxThresholdsData.loc[fxThresholdsData["Phase"]!="N/A"].copy(deep=True)
#Keep only control-flow
fxThresholdsData = fxThresholdsData.loc[fxThresholdsData["Type2"]=="Control-flow"].copy(deep=True)
#sorting (extra)
fxThresholdsData = fxThresholdsData.sort_values(by=['participant','currentQuestion','timestamp'])

In [18]:
#######################
#
# All measures in one dataframe
#
######################

In [19]:
#merge all dataframes (computed previously)
dfs = [avFDPT,avSacAmplitude,scanPathPrecisionData,fxThresholdsData]
all_measures = reduce(lambda left,right: pd.merge(left,right,on=['participant', 'currentQuestion', 'Type1', 'Type2', 'Type3', 'Phase','timestamp'], how='inner'), dfs)
all_measures.columns

Index(['participant', 'currentQuestion', 'Type1', 'Type2', 'Type3', 'Phase',
       'Average_Fixation_Duration', 'timestamp', 'avSaccadeAmplitude',
       'scan_path_precision', 'timeInterval', 'shortFixationsProp',
       'longFixationsProp'],
      dtype='object')

In [20]:
#all_measures.to_csv("all_measures.csv")
all_measures

Unnamed: 0,participant,currentQuestion,Type1,Type2,Type3,Phase,Average_Fixation_Duration,timestamp,avSaccadeAmplitude,scan_path_precision,timeInterval,shortFixationsProp,longFixationsProp
0,KP1-no,10,Local,Control-flow,Concurrency,search,159.351696,1970-01-01 01:51:21.415415000,3.090691,0.019608,48663.093,0.810458,0.013072
1,KP1-no,10,Local,Control-flow,Concurrency,inference,211.228100,1970-01-01 01:52:10.295124500,2.372972,0.300000,7590.798,0.700000,0.050000
2,KP1-no,13,Local,Control-flow,Exclusiveness,search,142.890213,1970-01-01 01:35:15.800703500,2.578242,0.019802,35750.635,0.930693,0.009901
3,KP1-no,13,Local,Control-flow,Exclusiveness,inference,171.710651,1970-01-01 01:35:52.142963500,3.072699,0.091398,68223.252,0.833333,0.032258
4,KP1-no,19,Global,Control-flow,Ordering,search,155.017721,1970-01-01 01:54:28.652575000,2.500522,0.069767,17977.707,0.837209,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
641,SP9-no,19,Global,Control-flow,Ordering,inference,275.219120,1970-01-01 02:40:59.025503499,3.649313,0.210000,33029.763,0.580000,0.090000
642,SP9-no,22,Global,Control-flow,Concurrency,search,217.570035,1970-01-01 02:33:27.850005000,3.314333,0.030303,29521.790,0.686869,0.080808
643,SP9-no,22,Global,Control-flow,Concurrency,inference,277.670664,1970-01-01 02:33:57.596808000,3.216007,0.292793,77515.817,0.572072,0.108108
644,SP9-no,25,Global,Control-flow,Exclusiveness,search,192.338002,1970-01-01 02:12:21.324865500,3.323614,0.036232,73555.341,0.775362,0.025362


In [21]:
#######################
#
# Descriptives and Inferentials
#
######################

In [22]:
allstats = all_measures.groupby(['Phase']).agg({'Average_Fixation_Duration':'mean',
'avSaccadeAmplitude':'mean',
'scan_path_precision':'mean', 
'shortFixationsProp':'mean', 
'longFixationsProp':'mean'},as_index=False)

In [23]:
allstats.round(2)

Unnamed: 0_level_0,Average_Fixation_Duration,avSaccadeAmplitude,scan_path_precision,shortFixationsProp,longFixationsProp
Phase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
inference,212.0,3.85,0.17,0.76,0.06
search,186.29,3.64,0.04,0.81,0.02


In [24]:
measures = ['Average_Fixation_Duration', 'avSaccadeAmplitude',
       'scan_path_precision', 'shortFixationsProp', 
       'longFixationsProp']

# new dict with measure values
values = []


for measure in measures:
    print(f'--{measure}')
    
    #get one measure per participant/phase
    all_measures_part = all_measures.groupby(['participant','Phase'], as_index=False).agg({measure:"mean"})
    

    measure_a = all_measures_part.loc[(all_measures_part["Phase"]=='inference')][['participant',measure]]
    measure_b = all_measures_part.loc[(all_measures_part["Phase"]=='search')][['participant',measure]]
    print(len(measure_a), len(measure_b))
    measure_merge = measure_a.merge(measure_b, on=['participant'], suffixes=('_a', '_b'), how='inner')
    print(len(measure_merge))
    pvalue = stats.wilcoxon(measure_merge[f'{measure}_a'], measure_merge[f'{measure}_b']).pvalue
    print(pvalue)
    values.append(pvalue)
    

# append the new row to the dataframe
new_row = pd.Series(values, index=allstats.columns, name='p-values')

# add the new row to the dataframe using loc
allstats_withInf = allstats.append(new_row)

--Average_Fixation_Duration
44 44
44
8.662937034387141e-11
--avSaccadeAmplitude
44 44
44
0.0027197802502314516
--scan_path_precision
44 44
44
1.1368683772161603e-13
--shortFixationsProp
44 44
44
1.0963003660435788e-07
--longFixationsProp
44 44
44
5.684341886080801e-13


In [25]:
allstats_withInf

Unnamed: 0_level_0,Average_Fixation_Duration,avSaccadeAmplitude,scan_path_precision,shortFixationsProp,longFixationsProp
Phase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
inference,212.0015,3.851871,0.1667027,0.7621073,0.05503833
search,186.2891,3.635792,0.0376342,0.8092834,0.02242044
p-values,8.662937e-11,0.00272,1.136868e-13,1.0963e-07,5.684342e-13


In [26]:
allstats_withInf.applymap(lambda x: '<0.001' if x < 0.001 else x).round(3)


Unnamed: 0_level_0,Average_Fixation_Duration,avSaccadeAmplitude,scan_path_precision,shortFixationsProp,longFixationsProp
Phase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
inference,212.002,3.852,0.166703,0.762107,0.0550383
search,186.289,3.636,0.0376342,0.809283,0.0224204
p-values,<0.001,0.003,<0.001,<0.001,<0.001
