In [None]:
import pandas as pd
import re
import numpy as np
import scipy.stats as stats
from scipy.stats import wilcoxon
from constants import diffMappingToScore, questions, labelsToElements
from functools import reduce
from utils import fixationProportionThresholdAnalysis, phaseDetection, dwellRegressionOnRelevantElements, periodCalculation, scanPathPrecision, averageFixationDuration, averageSaccadeAmplitudeForPhases, addQuestionInfo

In [None]:
# Read the events CSV (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer="data/eventsDataWithAois.csv")

In [None]:
#enrich questions with relevant elements
# Labels are the double-quoted substrings of each question's text. A label that
# starts with a bracketed token ("[...]") is not kept as its own entry: it is
# appended, space-separated, to the label that precedes it.
bracketLabelPattern = re.compile(r"\[(.+?)\]")


def _mergeBracketLabels(labels):
    """Fold every bracket-prefixed label into the label that precedes it.

    Builds a new list instead of removing items from the list being iterated,
    so consecutive bracket labels are all merged and no element is skipped.

    Parameters:
        labels: list of label strings in the order they appear in the question.

    Returns:
        A new list of labels. A bracket-prefixed label that has no predecessor
        is kept unchanged because there is nothing to attach it to.
    """
    merged = []
    for label in labels:
        if merged and bracketLabelPattern.match(label):
            merged[-1] = f'{merged[-1]} {label}'
        else:
            merged.append(label)
    return merged


questions = [
    {
        **question,
        'Relevant elements labels': _mergeBracketLabels(re.findall('"(.+?)"', question["question"])),
    }
    for question in questions
]

# The count is taken after merging, so a bracket label never counts on its own.
questions = [
    {**question, 'Relevant elements count': len(question["Relevant elements labels"])}
    for question in questions
]

In [None]:
# Translate each relevant label into its element name via labelsToElements.
# A label missing from labelsToElements raises here (plain [] lookup).
questions = [
    {
        **question,
        'Relevant elements names': [
            labelsToElements[label] for label in question["Relevant elements labels"]
        ],
    }
    for question in questions
]

In [None]:
#################
#
# Phase detection
#
#################

In [None]:
# Keep only the rows that have both a FixID and a currentQuestion value.
hasFixation = data['FixID'].notna()
hasQuestion = data['currentQuestion'].notna()
fixationData = data.loc[hasFixation & hasQuestion].copy()
# Attach the question info to the fixation rows.
fixationData = addQuestionInfo(fixationData, questions)

# Q13 (local) and Q25 (global) need to be removed for SP11 due to low data quality
lowQualityRows = fixationData.loc[
    (fixationData['participant'] == 'SP11-no') & (fixationData['Type3'] == 'Exclusiveness')
]
fixationData = fixationData.drop(index=lowQualityRows.index)

In [None]:
#detect phases (phase 1: pre/post to the point when all relevant activities identified)
# phaseDetection receives the filtered fixation rows and the enriched questions.
# Later cells read a "Phase" column from the result and discard rows where it is "N/A".
phDectFix = phaseDetection(fixationData,questions)

In [None]:
# Add a datetime view of 'Fixation Start', interpreting the values as milliseconds.
fixationStartMs = phDectFix['Fixation Start']
phDectFix["timestamp_formatted"] = pd.to_datetime(fixationStartMs, unit='ms')

In [None]:
#prepare datasets

In [None]:
#######################
#
# Average fixation duration
#
#######################

In [None]:
# Average fixation duration, restricted to rows with a phase other than "N/A"
# on Control-flow questions, ordered by participant, question and timestamp.
avFDPT = (
    averageFixationDuration(phDectFix, ['Type1', 'Type2', 'Type3', 'Phase'])
    .loc[lambda frame: (frame["Phase"] != "N/A") & (frame["Type2"] == "Control-flow")]
    .sort_values(by=['participant', 'currentQuestion', 'timestamp'])
)

In [None]:
####################
#
# Average Saccade amplitude
#
####################

In [None]:
# Fixations that have a phase other than "N/A" and belong to Control-flow questions.
hasPhase = phDectFix["Phase"] != "N/A"
isControlFlow = phDectFix["Type2"] == "Control-flow"
phases = phDectFix.loc[hasPhase & isControlFlow].copy()

# The helper gets the filtered fixations plus the full event data
# (presumably to look up the saccades between fixations - TODO confirm).
saccadeGroupingColumns = ['currentQuestion', 'participant', 'Type1', 'Type2', 'Type3', 'Phase']
avSacAmplitude = averageSaccadeAmplitudeForPhases(phases, data, saccadeGroupingColumns)
# Same ordering as the other measure tables.
avSacAmplitude = avSacAmplitude.sort_values(by=['participant', 'currentQuestion', 'timestamp'])

In [None]:
####################
#
# Scan-path precision
#
####################

In [None]:
# Scan-path precision, restricted to rows with a phase other than "N/A"
# on Control-flow questions, ordered by participant, question and timestamp.
scanPathPrecisionData = (
    scanPathPrecision(phDectFix, ['Type1', 'Type2', 'Type3', 'Phase'])
    .loc[lambda frame: (frame["Phase"] != "N/A") & (frame["Type2"] == "Control-flow")]
    .sort_values(by=['participant', 'currentQuestion', 'timestamp'])
)

In [None]:
#######################
#
# Fixation threshold proportion analysis
#
#######################

In [None]:
# Fixation-proportion threshold analysis, restricted to rows with a phase other
# than "N/A" on Control-flow questions, ordered by participant, question and timestamp.
fxThresholdsData = (
    fixationProportionThresholdAnalysis(phDectFix, ['Type1', 'Type2', 'Type3', 'Phase'])
    .loc[lambda frame: (frame["Phase"] != "N/A") & (frame["Type2"] == "Control-flow")]
    .sort_values(by=['participant', 'currentQuestion', 'timestamp'])
)

In [None]:
#######################
#
# All measures in one dataframe
#
######################

In [None]:
# Join the four measure tables on their shared identifying columns.
# Inner joins: a row survives only if it is present in every table.
mergeKeys = ['participant', 'currentQuestion', 'Type1', 'Type2', 'Type3', 'Phase', 'timestamp']
dfs = [avFDPT, avSacAmplitude, scanPathPrecisionData, fxThresholdsData]

all_measures = dfs[0]
for measureFrame in dfs[1:]:
    all_measures = pd.merge(all_measures, measureFrame, on=mergeKeys, how='inner')

all_measures.columns

In [None]:
# Optional export: uncomment the next line to write the merged measures to a CSV file.
#all_measures.to_csv("all_measures.csv")
# Display the merged measures table.
all_measures

In [None]:
#######################
#
# Descriptives and Inferentials
#
######################

In [None]:
# Mean of every measure per phase. "Phase" is kept as the index so that
# allstats.columns contains exactly the five measure columns; the p-value row
# built further down is labelled with those same five names.
# NOTE: as_index is an option of groupby(), not of agg(); it was dropped here
# because passing it to agg() does not turn "Phase" into a column.
allstats = all_measures.groupby(['Phase']).agg({
    'Average_Fixation_Duration': 'mean',
    'avSaccadeAmplitude': 'mean',
    'scan_path_precision': 'mean',
    'shortFixationsProp': 'mean',
    'longFixationsProp': 'mean',
})

In [None]:
# Display the per-phase means rounded to two decimals; allstats itself is not modified.
allstats.round(2)

In [None]:
measures = ['Average_Fixation_Duration', 'avSaccadeAmplitude',
            'scan_path_precision', 'shortFixationsProp',
            'longFixationsProp']

# One Wilcoxon signed-rank p-value per measure, in the same order as `measures`.
values = []

for measure in measures:
    print(f'--{measure}')

    # Collapse to one mean per participant and phase so that every participant
    # contributes a single paired observation.
    all_measures_part = all_measures.groupby(['participant', 'Phase'], as_index=False).agg({measure: "mean"})

    measure_a = all_measures_part.loc[all_measures_part["Phase"] == 'inference', ['participant', measure]]
    measure_b = all_measures_part.loc[all_measures_part["Phase"] == 'search', ['participant', measure]]
    print(len(measure_a), len(measure_b))

    # Inner join: only participants with a value in both phases form a pair.
    measure_merge = measure_a.merge(measure_b, on=['participant'], suffixes=('_a', '_b'), how='inner')
    print(len(measure_merge))

    pvalue = stats.wilcoxon(measure_merge[f'{measure}_a'], measure_merge[f'{measure}_b']).pvalue
    print(pvalue)
    values.append(pvalue)

# Label the p-values by measure name (not by column position) so each value
# lands under its own measure when the row is aligned with allstats.
new_row = pd.Series(values, index=measures, name='p-values')

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# The one-row frame reuses allstats' index name so the result keeps it.
allstats_withInf = pd.concat([allstats, new_row.to_frame().T.rename_axis(allstats.index.name)])

In [None]:
# Display the per-phase means together with the appended 'p-values' row.
allstats_withInf

In [None]:
# Round first: DataFrame.round leaves non-numeric (object) columns untouched, so
# rounding after the '<0.001' strings are inserted would skip those columns.
allstats_formatted = allstats_withInf.round(3).astype(object)

# Only the 'p-values' row gets the '<0.001' marker; the descriptive means in the
# other rows stay numeric even when they are very small.
allstats_formatted.loc['p-values'] = [
    '<0.001' if pvalue < 0.001 else round(pvalue, 3)
    for pvalue in allstats_withInf.loc['p-values']
]

allstats_formatted
