## 1 - GUI Data Preparation

##### Imports

In [2]:
import os
import warnings
import json
import pandas as pd
import numpy as np
import scipy as su
import ast
# from typing import Dict
# from scipy.signal import
# from scikit-learn import

import matplotlib.pyplot as plt
import seaborn as sns

import dask.dataframe as dd

participant_ids = [1,2, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

### 1. Single-File Corrections

##### P01
Time blocks, Time-to-completes

In [2]:
dtypes = {
        'Event Label': 'str',
        'MarkerDescription': 'str',
        'MarkerName': 'str'
    }
columns_p1 = [
    'Timestamp',
    'timestamp_Pupil.3',  # for synchronization

    'MarkerName',
    'MarkerDescription',
    'Event Label',
    'Event Index',
 
    'task_block instance',
    'time_to_complete instance',
    'time_to_start instance',

    'task_block_additional instance',   #
    'task_block2 instance',             #
    'time_to_complete2 instance',       #
    'time_to_complete3 instance',       #
]
p1_df = pd.read_csv("./iMotions/exports/001_vp01.csv", skiprows=36, dtype=dtypes)[columns_p1].dropna(how='all', subset=[col for col in columns_p1 if col != 'Timestamp'])

  p1_df = pd.read_csv("./iMotions/exports/001_vp01.csv", skiprows=36, dtype=dtypes)[columns_p1].dropna(how='all', subset=[col for col in columns_p1 if col != 'Timestamp'])


In [3]:
# Task block correction
p1_df['task_block instance'] = p1_df['task_block2 instance']
p1_df.loc[p1_df['task_block_additional instance'] == 1, 'task_block instance'] = 1
p1_df.loc[p1_df['task_block instance'] >= 3, 'task_block instance'] += 1
p1_df.loc[p1_df['task_block_additional instance'] == 2, 'task_block instance'] = 3
p1_df.loc[p1_df['task_block_additional instance'] == 3, 'task_block instance'] = 6

In [4]:
# Time-to-complete correction
p1_df['time_to_complete instance'] = p1_df['time_to_complete2 instance']
p1_df.loc[p1_df['time_to_complete instance'] >= 7, 'time_to_complete instance'] += 1
p1_df.loc[p1_df['time_to_complete3 instance'] == 1, 'time_to_complete instance'] = 7

##### P02
Time-to-completes

In [5]:
columns_p2 = [
    'Timestamp',
    'timestamp_Pupil.3',  # for synchronization

    'MarkerName',
    'MarkerDescription',
    'Event Label',
    'Event Index',

    'task_block instance',
    'time_to_complete instance',
    'time_to_start instance',

    'time_to_complete2 instance',       #
]
p2_df = pd.read_csv("./iMotions/exports/001_vp02.csv", skiprows=36, dtype=dtypes)[columns_p2].dropna(how='all', subset=[col for col in columns_p2 if col != 'Timestamp'])

  p2_df = pd.read_csv("./iMotions/exports/001_vp02.csv", skiprows=36, dtype=dtypes)[columns_p2].dropna(how='all', subset=[col for col in columns_p2 if col != 'Timestamp'])


In [6]:
# Time-to-complete correction
p2_df['time_to_complete instance'] = p2_df['time_to_complete2 instance']

##### P04
To delete

In [7]:
columns_p4 = [
    'Timestamp',

    'MarkerName',
    'MarkerDescription',
    'Event Label',
    'Event Index',

    'task_block instance',
    'time_to_complete instance',
    'time_to_start instance',

    'To_Delete active'                #
]
p4_df = pd.read_csv("./iMotions/exports/002_vp03.csv", skiprows=35, dtype=dtypes)[columns_p4].dropna(how='all', subset=[col for col in columns_p4 if col != 'Timestamp'])

  p4_df = pd.read_csv("./iMotions/exports/002_vp03.csv", skiprows=35, dtype=dtypes)[columns_p4].dropna(how='all', subset=[col for col in columns_p4 if col != 'Timestamp'])


In [8]:
p4_df = p4_df.loc[(p4_df['Timestamp'] > 50000) & (p4_df['To_Delete active'].isna())]
p4_df.loc[p4_df['task_block instance'].notna(), 'task_block instance'] -= 1
p4_df.loc[p4_df['time_to_complete instance'].notna(), 'time_to_complete instance'] -= 1

##### P01 extension
- Extensions thanks to logging file

In [9]:
# Prepare log data
column_names = ["UTC_time", "MarkerName", "MarkerDescription", 'Errors+']
column_names.extend([f"Column{i+1}" for i in range(4, 20)])
log_file_path = f"./iMotions/logging/p01_long.log"
p_log = pd.read_csv(log_file_path, sep='\t', names=column_names, header=None)[["UTC_time", "MarkerName", "MarkerDescription", "Errors+"]]
p_log['UTC_time'] = pd.to_datetime(p_log['UTC_time'], format='%Y-%m-%d_%H:%M:%S.%f')
p_log['unix_timestamp'] = p_log['UTC_time'].apply(lambda x: pd.Timestamp(x).tz_localize('Europe/Berlin').timestamp())

In [10]:
# Prepare gui data
markers = p_log['MarkerName'].unique().tolist()
p1_df_markers = p1_df[['Timestamp', 'MarkerName', 'MarkerDescription']].dropna()
p1_df_markers_filtered = p1_df_markers[p1_df_markers['MarkerName'].isin(markers)]
p1_df_markers_filtered['Timestamp'] = p1_df_markers_filtered['Timestamp'] /1000
p1_df_markers_filtered = p1_df_markers_filtered.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  p1_df_markers_filtered['Timestamp'] = p1_df_markers_filtered['Timestamp'] /1000


In [11]:
# Merge log data with gui
merged_df = pd.merge(
    p_log,
    p1_df_markers_filtered,
    left_index=True,
    right_index=True,
    how='left',
    suffixes=('', '_gui')
)
discrepancies = (int((merged_df['MarkerName'] != merged_df['MarkerName_gui']).sum())) - (len(p_log)-len(p1_df_markers_filtered))
discrepancies

0

In [12]:
# Compute unix to gui timestamp offset and fill na values in Timestamp
unix_to_gui_offset_mean = np.mean(merged_df['unix_timestamp'] - merged_df['Timestamp'])
unix_to_gui_offset_std = np.std(merged_df['unix_timestamp'] - merged_df['Timestamp'])
merged_df['Timestamp'] = merged_df['Timestamp'].fillna(merged_df['unix_timestamp'] - unix_to_gui_offset_mean)

In [13]:
# Prepare extension markers
extension_df = merged_df[merged_df[['MarkerName_gui', 'MarkerDescription_gui']].isna().all(axis=1)]
extension_df['MarkerName'] = extension_df['MarkerName'].replace('Start_Task', 'StartTask')
extension_df['Timestamp'] = extension_df['Timestamp'] * 1000
extension_df.loc[extension_df['MarkerName'] == 'Result', 'MarkerDescription'] = (extension_df['MarkerDescription'] + '\t' + extension_df['Errors+'].astype(str))
extension_df = extension_df[['Timestamp', 'MarkerName', 'MarkerDescription']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extension_df['MarkerName'] = extension_df['MarkerName'].replace('Start_Task', 'StartTask')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extension_df['Timestamp'] = extension_df['Timestamp'] * 1000


In [14]:
# Compute annotations
task_block_instance = 6
time_to_complete_instance = 40
time_to_start_instance = 44

task_block_values = []
time_to_complete_values = []
time_to_start_values = []

inside_time_to_complete = False
inside_time_to_start = False

for idx, row in extension_df.iterrows():
    # Task Block Instance
    if row['MarkerName'] == 'StartTask':
        task_block_instance += 1
        task_block_values.append(task_block_instance)
    elif row['MarkerName'] == 'Stop_Tasks':
        task_block_values.append(task_block_instance)
    else:
        task_block_values.append(task_block_instance)
    
    # Time to Complete Instance
    if row['MarkerName'] == 'on_item_clicked':
        time_to_complete_instance += 1
        inside_time_to_complete = True
    
    if inside_time_to_complete:
        time_to_complete_values.append(time_to_complete_instance)
    else:
        time_to_complete_values.append(None)
    
    if row['MarkerName'] == 'Result':
        inside_time_to_complete = False
    
    # Time to Start Instance
    if row['MarkerName'] == 'Task':
        time_to_start_instance += 1
        inside_time_to_start = True
    
    if inside_time_to_start:
        time_to_start_values.append(time_to_start_instance)
    else:
        time_to_start_values.append(None)
    
    if row['MarkerName'] == 'on_item_clicked':
        inside_time_to_start = False

# Assign the computed lists to the DataFrame
extension_df['task_block instance'] = task_block_values
extension_df['time_to_complete instance'] = time_to_complete_values
extension_df['time_to_start instance'] = time_to_start_values

In [15]:
#  Extend p1_df
extension_reindexed_df = extension_df.reindex(columns=p1_df.columns)
p1_df_long = pd.concat([p1_df, extension_reindexed_df], ignore_index=True)

In [16]:
p1_df_long

Unnamed: 0,Timestamp,timestamp_Pupil.3,MarkerName,MarkerDescription,Event Label,Event Index,task_block instance,time_to_complete instance,time_to_start instance,task_block_additional instance,task_block2 instance,time_to_complete2 instance,time_to_complete3 instance
0,2.200430e+01,-672862.749103,,,,,,,,,,,
1,2.200430e+01,-672862.749103,,,,,,,,,,,
2,2.200430e+01,-672862.749102,,,,,,,,,,,
3,2.200430e+01,-672862.749102,,,,,,,,,,,
4,2.905510e+01,-672862.741050,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2049489,3.908052e+06,,view2,,,,8.0,64.0,,,,,
2049490,3.924983e+06,,button_click2,False,,,8.0,64.0,,,,,
2049491,3.924983e+06,,__init__,,,,8.0,64.0,,,,,
2049492,3.924984e+06,,Result,MAV#2226\t0,,,8.0,64.0,,,,,


### 2. Aggregate Files
 Read csv files, skip first rows and select the columns

In [17]:
# Set root directory and columns
root_dir = './iMotions/exports'
os.makedirs(root_dir, exist_ok=True)
relevant_columns = [
    'Timestamp',
    'MarkerName',
    'MarkerDescription',
    'task_block instance',
    'time_to_complete instance',
    'time_to_start instance'
]

In [18]:
dtypes = {
        'MarkerName': 'str',
        'MarkerDescription': 'str',
}
new_columns = {
    'Timestamp': 'gui_timestamp',
    'timestamp_Pupil.3': 'pupil_timestamp',
    'MarkerName': 'marker_name',
    'MarkerDescription': 'marker_data',
    'task_block instance': 'task_block',
    'time_to_complete instance': 'time_to_complete',
    'time_to_start instance': 'time_to_start'
}
def add_pid_column(df, pid):   # Add 'participant_id' as first column
    df['participant_id'] = pid
    col_order = ['participant_id'] + [col for col in df.columns if col != 'participant_id']
    return df[col_order]

def aggregate_files(root_dir, columns): # Read, filter and aggregate into one file
    participant_id = 0
    df_list = []

    for subdir, _, files in os.walk(root_dir):
        for file in files:
            if file.endswith('.csv'):
                participant_id += 1
                if participant_id == 1:
                    df = p1_df_long#_long
                    print(f"{participant_id}: is_read by file")
                elif participant_id == 2:
                    df = p2_df
                    print(f"{participant_id}: is_read by file")
                elif participant_id == 3: #skip participant3
                    participant_id += 1
                    df = p4_df
                    print(f"{participant_id}: is_read by file")
                else: #read the file, skipping rows until "Row,"
                    file_path = os.path.join(subdir, file)
                    with open(file_path, 'r') as f:
                        lines = f.readlines()
                    start_idx = next(i for i, line in enumerate(lines) if line.startswith("Row,"))
                    df = pd.read_csv(file_path, skiprows=start_idx, dtype=dtypes)
                    print(f"{participant_id}: is_read {file_path}")

                df = df[columns]
                df = df.dropna(how='all', subset=[col for col in columns if col != 'Timestamp'])
                df.loc[:,'Timestamp'] /= 1000
                df = df.rename(columns=new_columns)
                
                # Add participant_id column
                df = add_pid_column(df, participant_id)       

                # Append to list
                df_list.append(df)
    
    return pd.concat(df_list, ignore_index=True)

In [None]:
# Run aggregation
gui_df = aggregate_files(root_dir, relevant_columns)

In [None]:
rel_values = [
    'StartTask',
    'Task',
    'on_item_clicked',
    'view1',
    'view2',
    'view3',
    'view4',
    'view5',
    'view6',
    'button_click1',
    'button_click2',
    'button_click3',
    'Result',
    'Stop_Task'
]
def process_participant_markers(df):
    # Select relevant data
    df = df.dropna(subset=['marker_name'])[['participant_id', 'gui_timestamp', 'marker_name', 'marker_data']]
    df = df.loc[df['marker_name'].isin(rel_values)]

    # Arrange data
    start_mask = df['marker_name'] == 'StartTask'
    task_mask = df['marker_name'] == 'Task'
    result_mask = df['marker_name'] == 'Result'
    df.loc[start_mask, 'marker_info'] = df.loc[start_mask, 'marker_data'].apply(str)  # block type (1-8)
    df.loc[task_mask, 'marker_info'] = df.loc[task_mask, 'marker_data'].apply(lambda x: str(x.split('\t')[1]) if len(x.split('\t')) > 1 else str(x.split('\t')[0]))  # task code (MAV#XXX)
    df.loc[result_mask, 'marker_info'] = df.loc[result_mask, 'marker_data'].apply(lambda x: str(x.split('\t')[2]) if len(x.split('\t')) > 2 else str(x.split('\t')[1]))  # n_errors

    # Reorder columns
    col_order = [col for col in df.columns if col != 'marker_data'] + ['marker_data']
    new_columns = {
        'marker_name': 'type',
        'marker_info': 'add',
        'marker_data': 'base_data',
    }
    df = df[col_order].rename(columns=new_columns)

    return df

In [None]:
# Process aggregated df
markers_dfs_list = []
for pid in participant_ids:
    gui_pid_df = gui_df[gui_df['participant_id'] == pid]
    markers_dfs_list.append(process_participant_markers(gui_pid_df))
gui_markers_df = pd.concat(markers_dfs_list, ignore_index=True)

### 3. Re-structure data (GUI Timestamp)
Based on task block, with nested events (completion, start) and relative data

In [None]:
# Functions and lists
participant_ids_list = []
block_ids = []
block_types = []
task_ids = []
start_timestamps = []
reaction_times = []
clicked_timestamps = []
completion_times = []
end_timestamps = []
errors = []
task_codes = []
block_data = []

# Function to extract marker info ( to be refined)
def extract_marker_info(df):
    start_mask = df['marker_name'] == 'StartTask'
    task_mask = df['marker_name'] == 'Task'
    result_mask = df['marker_name'] == 'Result'
    start_types = df.loc[start_mask, 'marker_data'].apply(lambda x: int(float(x))).tolist()
    task_codes_list = df.loc[task_mask, 'marker_data'].apply(lambda x: x.split('\t')[1] if len(x.split('\t')) > 1 else x.split('\t')[0]).tolist()
    error_counts = df.loc[result_mask, 'marker_data'].apply(lambda x: int(x.split('\t')[2]) if len(x.split('\t')) > 2 else int(x.split('\t')[1])).tolist()
    
    return {
        'block_types': start_types,
        'task_codes': task_codes_list,
        'errors': error_counts,
    }

def rearrange_data(df, marker_info, pid):
    task_blocks = df['task_block'].dropna().unique()
    
    if pid == 1:#handle additional result
        marker_info['errors'].insert(6,0)
    
    for tb in task_blocks:  # Iterate over task blocks
        block_df = df[df['task_block'] == tb]
        start_ts = block_df['gui_timestamp'].min()
        end_ts = block_df['gui_timestamp'].max()
        duration = end_ts - start_ts
        block_type_row = marker_info['block_types'].pop(0)
        block_data_row = [float(start_ts), float(duration), float(end_ts)]

        # Process time_to_start and time_to_complete events
        task_id_counter = 1
        start_events = block_df['time_to_start'].dropna().unique()
        complete_events = block_df['time_to_complete'].dropna().unique()

        if len(start_events) != len(complete_events): # handle exception in P01 and P02
            start_events = start_events[:-1]
            print(f"Exception handled in P{pid}")
        
        if len(start_events) == len(complete_events):
            for se, ce in zip(start_events, complete_events):
                event_start_df = block_df[block_df['time_to_start'] == se]
                event_complete_df = block_df[block_df['time_to_complete'] == ce]

                start_timestamp = event_start_df['gui_timestamp'].min()
                clicked_timestamp = event_start_df['gui_timestamp'].max()
                reaction_time = clicked_timestamp - start_timestamp

                end_timestamp = event_complete_df['gui_timestamp'].max()
                completion_time = end_timestamp - clicked_timestamp

                error = marker_info['errors'].pop(0)
                task_code = marker_info['task_codes'].pop(0)

                participant_ids_list.append(pid)
                block_ids.append(int(tb))
                block_types.append(block_type_row)
                task_ids.append(task_id_counter)
                start_timestamps.append(start_timestamp)
                reaction_times.append(reaction_time)
                clicked_timestamps.append(clicked_timestamp)
                completion_times.append(completion_time)
                end_timestamps.append(end_timestamp)
                errors.append(error)
                task_codes.append(task_code)
                block_data.append(block_data_row)

                task_id_counter += 1

        else:
            print(f"Len of events is different for block {tb}, participant {pid}")
            print(f"len of start: {len(start_events)}")
            print(f"len of complete: {len(complete_events)}")

In [None]:
# Run re-arrangement
filtered_df = gui_df[['participant_id', 'gui_timestamp', 'marker_name', 'marker_data', 'task_block', 'time_to_complete', 'time_to_start']]

# Iterate over participant ids
for pid in participant_ids:
    participant_df = filtered_df[filtered_df['participant_id'] == pid]
    participant_marker_info = extract_marker_info(participant_df)
    rearrange_data(participant_df, participant_marker_info, pid)
    print(f"Computed pid {pid}")

# Create the final dataframe
new_gui_df = pd.DataFrame({
    'participant_id': participant_ids_list,
    'block_id': block_ids,    
    'block_type': block_types,
    'task_id': task_ids,
    'start_timestamp': start_timestamps,
    'reaction_time': reaction_times,
    'clicked_timestamp': clicked_timestamps,
    'completion_time': completion_times,
    'end_timestamp': end_timestamps,
    'errors': errors,
    'task_code': task_codes,
    'block_data': block_data
})
new_gui_df = pd.concat([group.sort_values(by='start_timestamp') for _, group in new_gui_df.groupby('participant_id')]).reset_index(drop=True)

Exception handled in P1
Exception handled in P1
Exception handled in P1
Exception handled in P1
Computed pid 1
Exception handled in P2
Computed pid 2
Computed pid 4
Computed pid 5
Computed pid 6
Computed pid 7
Computed pid 8
Computed pid 9
Computed pid 10
Computed pid 11
Computed pid 12
Computed pid 13
Computed pid 14
Computed pid 15
Computed pid 16
Computed pid 17
Computed pid 18
Computed pid 19
Computed pid 20


In [None]:
new_gui_df.to_csv('./aggregated_data/all_gui_data.csv', index=False)

### 4. Unix Timestamps

##### Data Loading

In [None]:
# Load ddfs into dictionary (exception for 'all_surf_pos', df)
root_dir = './aggregated_data'
def load_data(root_dir, exception=False):    # Get csv file reads into one dictionary
    data = {}
    for subdir, _, files in os.walk(root_dir):
        for file in files:
            file_path = os.path.join(subdir, file)
            file_name = os.path.splitext(file)[0]   #file name without '.csv'
            if file.endswith('.csv') and file != 'all_surf_positions_HiDrive_Studie2.csv':
                data[file_name] = dd.read_csv(file_path)    #read and attach to dict
            elif exception and file == 'all_surf_positions_HiDrive_Studie2.csv':
                data [file_name] = pd.read_csv(file_path, converters=converters)    #read as normal and attach to dict
    return data
def parse(filedata): # Manually read the column
    output = []
    for line in filedata.split('\n'): # split into lines
        line = line.strip().rstrip(']').lstrip('[') #remove whitespace and brackets
        if not line:  
            continue    #skip empty lines
        line = line.split() #split into cell
        row = []
        for cell in line:
            cell = cell.strip()     #remove whitespace
            if not cell.strip():
                continue    #skip empty cells
            row.append(float(cell)) #convert to float and add
        output.append(row)
    return output
converters = {
    "img_to_surf_trans": parse,
    "surf_to_img_trans": parse,
    "dist_img_to_surf_trans": parse,
    "surf_to_dist_img_trans": parse,
}

data = load_data(root_dir)

In [None]:
participant_ids = [1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
ddf = data['all_gui_data']
df = ddf.compute()
df['block_data'] = df['block_data'].apply(lambda x: eval(x))

##### Conversion parameter

In [None]:
# Manually creating the DataFrame with data from the provided text files
offset_data = {
    'participant_id': [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
    'pupil_mean': [
        None, None, None, None, None, None,
        4109.257553304065, 
        22187.97703672082,
        5416.92263870804,
        16531.296070843073,
        84199.35689554777, 
        93620.139733137,
        99778.58807834735,
        3135.9419403789816,
        87722.83890598793, 
        2655.8941165745155,
        4067.6027457861605,
        71583.83790804863,
        80480.59764496642
    ],
    'pupil_std': [
        None, None, None, None, None, None,
        0.0015540616445236736,
        0.0009565443258474121,
        0.0010185332871037824,
        0.0008931335310408102,
        0.0009202319056514787,
        0.0010068354754641231,
        0.0010086043613726362,
        0.0010346697359956188,
        0.000898015048832067,
        0.0008651488841632027, 
        0.004909002020535595,
        0.0015035274843810211,
        0.004411814755109096
    ],
    'gaze_mean': [
        -672862.8254689132,
        -664541.5826019697,
        -588022.8872828754,
        -578918.4627699563,
        82154.6509890003,
        349976.70950270735,
        4109.203127473147,
        22187.92287464568,
        5416.8687206548875,
        16531.242376280596,
        84199.30305764821,
        93620.08597532041,
        99778.53439845197,
        3135.88798966699,
        87722.78483034433,
        2655.8404129786127,
        4067.548879011448,
        71583.78398800883,
        80480.5438803016
    ],
    'gaze_std': [
        0.010101152431444825,
        0.011213037927546807,
        0.016895081555496176,
        0.015107109620048447,
        0.012640083799283774,
        0.01461043801255117,
        0.01599184351455052,
        0.01780569830867164,
        0.010848758750081483,
        0.009713719881618923,
        0.01239760723051959,
        0.010683433863253248,
        0.010193936300467025,
        0.013376659433486439,
        0.019103793234118906,
        0.01194004737632296,
        0.012154765751067805,
        0.012422883072455681,
        0.010797759985148002
    ],
    'log_pupil_mean': [
        None, None, None, None, None, None,
        1710841379.7298098,
        1710859458.4487002,
        1711021383.8964255,
        1711032498.1677978,
        1711100165.706887,
        1711109586.6234353,
        1711115745.0770855,
        1711380580.6615686,
        1711465166.2651808,
        1711530420.2930207,
        1711549938.8896227,
        1711617454.089709,
        1711626350.8593812
    ],
    'log_pupil_std': [
        None, None, None, None, None, None,
        0.002654590308012348,
        0.0019606005221617867,
        0.002596132207761823,
        0.0016050413399801406,
        0.008953334304134418,
        0.002252080743393519,
        0.001826714344806407,
        0.02498242118477865,
        0.006965123911033478,
        0.005961296218748378,
        0.002292924818227741,
        0.0019064997533042869,
        0.0016124437080255506
    ],
    'log_gaze_mean': [
        1710323060.2391326,
        1710331381.5050497,
        1710407899.148977,
        1710417003.592421,
        1710504954.71339,
        1710772775.5559084,
        1710841379.6769013,
        1710859458.3949537,
        1711021383.84353,
        1711032498.1141572,
        1711100165.6529558,
        1711109586.5698,
        1711115745.0236366,
        1711380580.6074257,
        1711465166.2607026,
        1711530420.2390208,
        1711549938.835953,
        1711617454.0360806,
        1711626350.8057556
    ],
    'log_gaze_std': [
        0.009267704577657755,
        0.009865290472380293,
        0.010213497413320831,
        0.010608973953347407,
        0.009436665703948553,
        0.009804471214128545,
        0.013381547240806944,
        0.009739741282879723,
        0.0095273627454879,
        0.009402437996693207,
        0.012820757323793652,
        0.009573894656712597,
        0.009919684910694278,
        0.026339731913368473,
        0.009384245142926977,
        0.009558716130033616,
        0.009923130672874592,
        0.009438752643090517,
        0.009601663696196626
    ],
}

offset_df = pd.DataFrame(offset_data)
offset_df

Unnamed: 0,participant_id,pupil_mean,pupil_std,gaze_mean,gaze_std,log_pupil_mean,log_pupil_std,log_gaze_mean,log_gaze_std
0,1,,,-672862.825469,0.010101,,,1710323000.0,0.009268
1,2,,,-664541.582602,0.011213,,,1710331000.0,0.009865
2,4,,,-588022.887283,0.016895,,,1710408000.0,0.010213
3,5,,,-578918.46277,0.015107,,,1710417000.0,0.010609
4,6,,,82154.650989,0.01264,,,1710505000.0,0.009437
5,7,,,349976.709503,0.01461,,,1710773000.0,0.009804
6,8,4109.257553,0.001554,4109.203127,0.015992,1710841000.0,0.002655,1710841000.0,0.013382
7,9,22187.977037,0.000957,22187.922875,0.017806,1710859000.0,0.001961,1710859000.0,0.00974
8,10,5416.922639,0.001019,5416.868721,0.010849,1711021000.0,0.002596,1711021000.0,0.009527
9,11,16531.296071,0.000893,16531.242376,0.009714,1711032000.0,0.001605,1711032000.0,0.009402


In [None]:
# Get best methods
std_columns = ['pupil_std', 'gaze_std', 'log_pupil_std', 'log_gaze_std']
min_columns = offset_df[std_columns].idxmin(axis=1)
offset_df['best_method'] = min_columns.str.replace('_std', '')

# Add the best mean value by appending '_mean' to the best method
offset_df['best_offset'] = offset_df.apply(lambda row: row[row['best_method'] + '_mean'], axis=1)
offset_df


Unnamed: 0,participant_id,pupil_mean,pupil_std,gaze_mean,gaze_std,log_pupil_mean,log_pupil_std,log_gaze_mean,log_gaze_std,best_method,best_offset
0,1,,,-672862.825469,0.010101,,,1710323000.0,0.009268,log_gaze,1710323000.0
1,2,,,-664541.582602,0.011213,,,1710331000.0,0.009865,log_gaze,1710331000.0
2,4,,,-588022.887283,0.016895,,,1710408000.0,0.010213,log_gaze,1710408000.0
3,5,,,-578918.46277,0.015107,,,1710417000.0,0.010609,log_gaze,1710417000.0
4,6,,,82154.650989,0.01264,,,1710505000.0,0.009437,log_gaze,1710505000.0
5,7,,,349976.709503,0.01461,,,1710773000.0,0.009804,log_gaze,1710773000.0
6,8,4109.257553,0.001554,4109.203127,0.015992,1710841000.0,0.002655,1710841000.0,0.013382,pupil,4109.258
7,9,22187.977037,0.000957,22187.922875,0.017806,1710859000.0,0.001961,1710859000.0,0.00974,pupil,22187.98
8,10,5416.922639,0.001019,5416.868721,0.010849,1711021000.0,0.002596,1711021000.0,0.009527,pupil,5416.923
9,11,16531.296071,0.000893,16531.242376,0.009714,1711032000.0,0.001605,1711032000.0,0.009402,pupil,16531.3


##### Conversion computation

In [None]:
# Preparation
convertion_df_list = []
info_player_names = {
    1: 'p01',
    2: 'p02',
    3: '-',
    4: 'p04',
    5: 'p05',
    6: 'p06',
    7: 'p07',
    8: 'p08',
    9: 'p09',
    10: 'p10',
    11: 'p11',
    12: 'p12',
    13: 'p13',
    14: 'p14',
    15: 'p15',
    16: 'p16',
    17: 'p17',
    18: 'p18',
    19: 'p19',
    20: 'p20',
}

def convert_gui_timestamps(dataf, offset):
    timetstamp_columns = ['start_timestamp', 'clicked_timestamp', 'end_timestamp']
    unix_dataf = dataf.copy()
    for col in timetstamp_columns:
        unix_dataf[col] = unix_dataf[col].apply(lambda x: float(x) + offset)
    unix_dataf['block_data'] = unix_dataf['block_data'].apply(
    lambda lst: [lst[0] + offset, lst[1], lst[2] + offset]
    )

    return unix_dataf

In [None]:
# Pupil method
pupil_offset_df = offset_df[offset_df['best_method'] == 'pupil']
pupil_participant_ids = pupil_offset_df['participant_id'].tolist()

for pid in pupil_participant_ids:
    participant_df = df.query(f"participant_id == {pid}")
    participant_offset_to_pupil = offset_df.query(f"participant_id == {pid}")['best_offset'].values[0]

    with open(f"./info_players/{info_player_names[pid]}.json") as file:
        meta_info = json.load(file)
    start_timestamp_diff = meta_info["start_time_system_s"] - meta_info["start_time_synced_s"]
    participant_offset_to_unix = participant_offset_to_pupil + start_timestamp_diff  
    
    unix_participant_df = convert_gui_timestamps(participant_df, participant_offset_to_unix)
    convertion_df_list.append(unix_participant_df)

In [None]:
# Gaze method
gaze_offset_df = offset_df[offset_df['best_method'] == 'gaze']
gaze_participant_ids = gaze_offset_df['participant_id'].tolist()

for pid in gaze_participant_ids:
    participant_df = df.query(f"participant_id == {pid}")
    participant_offset_to_pupil = offset_df.query(f"participant_id == {pid}")['best_offset'].values[0]

    with open(f"./info_players/{info_player_names[pid]}.json") as file:
        meta_info = json.load(file)
    start_timestamp_diff = meta_info["start_time_system_s"] - meta_info["start_time_synced_s"]
    participant_offset_to_unix = participant_offset_to_pupil + start_timestamp_diff  
    
    unix_participant_df = convert_gui_timestamps(participant_df, participant_offset_to_unix)
    convertion_df_list.append(unix_participant_df)

In [None]:
# Log-Pupil method
log_pupil_offset_df = offset_df[offset_df['best_method'] == 'log_pupil']
log_pupil_participant_ids = log_pupil_offset_df['participant_id'].tolist()

for pid in log_pupil_participant_ids:
    participant_df = df.query(f"participant_id == {pid}")
    participant_offset_to_unix = offset_df.query(f"participant_id == {pid}")['best_offset'].values[0]
    
    unix_participant_df = convert_gui_timestamps(participant_df, participant_offset_to_unix)
    convertion_df_list.append(unix_participant_df)

In [None]:
# Log-Gaze method
log_gaze_offset_df = offset_df[offset_df['best_method'] == 'log_gaze']
log_gaze_participant_ids = log_gaze_offset_df['participant_id'].tolist()

for pid in log_gaze_participant_ids:
    participant_df = df.query(f"participant_id == {pid}")
    participant_offset_to_unix = offset_df.query(f"participant_id == {pid}")['best_offset'].values[0]
    
    unix_participant_df = convert_gui_timestamps(participant_df, participant_offset_to_unix)
    convertion_df_list.append(unix_participant_df)

In [None]:
new_gui_df2 = pd.concat(convertion_df_list).sort_values(by=['participant_id', 'block_id', 'task_id']).reset_index(drop=True)

In [None]:
new_gui_df2.to_csv('./aggregated_data/all_gui_data.csv', index=False)