# This notebook is used to extract fixations for each participant

### Import libraries

In [14]:
import pandas as pd
import numpy as np
import os

# Enable interactive Matplotlib plots in the notebook
%matplotlib qt5

import matplotlib.pyplot as plt
from matplotlib import cm
import os
import astropy.convolution as krn
import scipy.stats as stats
import sys

import matplotlib.pyplot as plt
from matplotlib.widgets import Button



### Preprocess, extract fixations and add them to the dataframe

In [15]:
import os
import numpy as np
import pandas as pd
from FixationDetection.I2MC import runI2MC

def extract_fixations(df, path):
    """
    Detect fixations with I2MC and attach them, sample by sample, to the eye-tracking data.

    The function:
    1. Cleans the dataframe: drops rows without a file name, with a non-numeric
       'frameNr' or 'sampTime', or with missing gaze coordinates; converts columns
       to numeric where every value parses; sorts by 'frameNr'; drops duplicate
       'sampTime' entries.
    2. Runs the I2MC fixation detection algorithm on the file at `path`.
    3. Walks through the samples and the detected fixations in parallel and adds,
       per sample, the fixation position, a 'fix_start'/'fix_end' label, the
       fixation duration (on the 'fix_end' sample) and, on each 'fix_start' sample,
       the position/time of the previously ended fixation and the distance to it.
    4. Keeps only samples that fall inside a fixation and whose fixation and gaze
       coordinates are all positive.
    5. Saves the result next to the input file as '<path without extension>_extra.csv'.

    Parameters
    ----------
    df : pandas.DataFrame
        Eye-tracking samples. Must contain the columns 'fName', 'frameNr',
        'sampTime', 'user_pred_px_x' and 'user_pred_px_y'. The caller's dataframe
        is not modified.

    path : str
        Path of the recording file. It is passed to `runI2MC` and is also used to
        derive the name of the output CSV file.

    Returns
    -------
    pandas.DataFrame
        The cleaned samples that lie inside a detected fixation, with the added
        columns 'FixXPos', 'FixYPos', 'FixStartEnd', 'FixDur', 'DistFromPrevFix',
        'PrevFixSampTime', 'PrevFixXPos' and 'PrevFixYPos'. No filtering on
        'event' is applied.
    """

    def _numeric_where_possible(column):
        # Equivalent of the deprecated pd.to_numeric(..., errors='ignore'):
        # convert the column only if every value parses, otherwise keep it as is.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    # Preprocess the input dataframe: remove invalid or missing data.
    # Explicit copies avoid chained-assignment warnings when columns are replaced.
    df = df[df['fName'].notna()].copy()
    df['frameNr'] = pd.to_numeric(df['frameNr'], errors='coerce')  # non-numeric -> NaN
    df = df[df['frameNr'].notna()].copy()

    df['sampTime'] = pd.to_numeric(df['sampTime'], errors='coerce')  # non-numeric -> NaN
    df = df[df['sampTime'].notna()]
    df = df[df['user_pred_px_x'].notna() & df['user_pred_px_y'].notna()]
    df = df.apply(_numeric_where_possible)  # convert any remaining numeric strings
    df = df.sort_values('frameNr').reset_index(drop=True)
    df = df.drop_duplicates(subset=['sampTime'], ignore_index=True)

    # Run the I2MC fixation detection algorithm to extract fixation data
    fixDF = runI2MC(path, plotData=False)

    # Per-sample output arrays; 0 / '' mean "no fixation information for this sample"
    n_samples = df.shape[0]
    FixXPos = np.zeros(n_samples)
    FixYPos = np.zeros(n_samples)
    FixStartEnd = np.empty(n_samples, dtype='U10')
    FixStartEnd.fill('')
    FixDur = np.zeros(n_samples)
    DistFromPrevFix = np.zeros(n_samples)
    PrevFixXPos = np.zeros(n_samples)
    PrevFixYPos = np.zeros(n_samples)
    PrevFixSampTime = np.zeros(n_samples)

    n_fix = fixDF.shape[0]
    if n_fix > 0:
        # Pull the fixation table into arrays once instead of indexing per sample
        fix_start = fixDF['FixStart'].to_numpy()
        fix_end = fixDF['FixEnd'].to_numpy()
        fix_x = fixDF['XPos'].to_numpy()
        fix_y = fixDF['YPos'].to_numpy()
        fix_dur = fixDF['FixDur'].to_numpy()

        # Previous fixation = the last fixation whose end sample was seen in df
        prev_fix_x = None
        prev_fix_y = None
        prev_fix_sampTime = 0
        fix_idx = 0  # index of the current candidate fixation in fixDF

        for pos, samp_time in enumerate(df['sampTime'].to_numpy()):
            # Skip every fixation that ended before this sample. Advancing by only
            # one fixation per sample would lag behind whenever a fixation has no
            # surviving sample, and later in-fixation samples would be missed.
            while fix_idx < n_fix and samp_time > fix_end[fix_idx]:
                fix_idx += 1
            if fix_idx >= n_fix:
                break  # no fixations left

            # Sample lies within the current fixation
            if fix_start[fix_idx] <= samp_time <= fix_end[fix_idx]:
                FixXPos[pos] = fix_x[fix_idx]
                FixYPos[pos] = fix_y[fix_idx]

            # Label the samples on which the fixation starts and ends
            if samp_time == fix_start[fix_idx]:
                FixStartEnd[pos] = 'fix_start'
                if prev_fix_x is not None:
                    DistFromPrevFix[pos] = np.hypot(fix_x[fix_idx] - prev_fix_x,
                                                    fix_y[fix_idx] - prev_fix_y)
                    PrevFixXPos[pos] = prev_fix_x
                    PrevFixYPos[pos] = prev_fix_y
                    PrevFixSampTime[pos] = prev_fix_sampTime

            elif samp_time == fix_end[fix_idx]:
                FixStartEnd[pos] = 'fix_end'
                FixDur[pos] = fix_dur[fix_idx]
                prev_fix_x = fix_x[fix_idx]
                prev_fix_y = fix_y[fix_idx]
                prev_fix_sampTime = samp_time

    # Add extracted fixation data to the cleaned dataframe
    df['FixXPos'] = FixXPos
    df['FixYPos'] = FixYPos
    df['FixStartEnd'] = FixStartEnd
    df['FixDur'] = FixDur
    df['DistFromPrevFix'] = DistFromPrevFix
    df['PrevFixSampTime'] = PrevFixSampTime
    df['PrevFixXPos'] = PrevFixXPos
    df['PrevFixYPos'] = PrevFixYPos

    # Keep only samples inside a fixation with positive fixation and gaze coordinates
    df = df[(df['FixXPos'] > 0) & (df['FixYPos'] > 0) &
            (df['user_pred_px_x'] > 0) & (df['user_pred_px_y'] > 0)]

    # Save the pre-processed dataframe to a CSV file
    output_file = os.path.splitext(path)[0] + '_extra.csv'
    df.to_csv(output_file, index=False)

    return df


### Batch extract fixations

In [16]:
def process_folder_for_fixations(path_to_data):
    """
    Run extract_fixations on every participant entry found in a data folder.

    For each entry name `<name>` in `path_to_data`, the file
    `<path_to_data>/<name>/<name>_record.csv` is read and handed to
    extract_fixations. Entries whose record file is missing or unreadable, or
    whose fixation extraction raises, are reported on stdout and skipped.

    Parameters
    ----------
    path_to_data : str
        Path to the folder containing one entry per participant.

    Returns
    -------
    None
        Results are written by extract_fixations as new CSV files.
    """
    for participant in os.listdir(path_to_data):
        record_path = os.path.join(path_to_data, participant, participant + '_record.csv')

        print(f'Processing participant {participant}...')

        # Load the recording; malformed rows are skipped via 'on_bad_lines'
        try:
            samples = pd.read_csv(record_path, on_bad_lines='skip')
        except FileNotFoundError:
            print(f'File does not exist: {record_path}')
            continue
        except Exception as read_error:
            print(f'Error reading file {record_path}: {read_error}')
            continue

        # Extract fixations; a failure for one participant must not stop the batch
        try:
            extract_fixations(samples, record_path)
        except Exception as extraction_error:
            print(f'Error during fixation extraction for {participant}: {extraction_error}')

    print('Processing complete.')
   

# Example usage:
# path_to_data = 'D:/Dropbox/Appliedwork/CognitiveSolutions/Projects/DeepEye/TechnicalReports/TechnicalReport1/Test_Spaak/data/approved/tmp'
# process_folder_for_fixations(path_to_data)


Processing participant 2024_06_14_09_02_37...



Importing and processing: "D:/Dropbox/Appliedwork/CognitiveSolutions/Projects/DeepEye/TechnicalReports/TechnicalReport1/Test_Spaak/data/approved/tmp\2024_06_14_09_02_37\2024_06_14_09_02_37_record.csv"
	Searching for valid interpolation windows
	Replace interpolation windows with Steffen interpolation
	2-Means clustering started for averaged signal
	Determining fixations based on clustering weight mean for averaged signal and separate eyes + 2*std


I2MC took 7.044744491577148s to finish!


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Processing complete.


Unnamed: 0,frameNr,fName,sampTime,trialNr,respKey,respTime,mouseXY,pp_id,event,imageSet,...,webcamFrameRate,webcamLabel,FixXPos,FixYPos,FixStartEnd,FixDur,DistFromPrevFix,PrevFixSampTime,PrevFixXPos,PrevFixYPos
0,1.0,2024_06_14_09_02_37_00001.jpg,291242.5,0,-1,-1.0,-1,-1,fixation_on,['A'],...,30,PC Camera (035f:3211),964.845500,450.338815,fix_start,0.0,0.0,0.0,0.0,0.0
1,2.0,2024_06_14_09_02_37_00002.jpg,291275.2,0,-1,-1.0,-1,-1,fixation_on,['A'],...,30,PC Camera (035f:3211),964.845500,450.338815,,0.0,0.0,0.0,0.0,0.0
2,3.0,2024_06_14_09_02_37_00003.jpg,291309.0,0,-1,-1.0,-1,-1,fixation_on,['A'],...,30,PC Camera (035f:3211),964.845500,450.338815,,0.0,0.0,0.0,0.0,0.0
3,4.0,2024_06_14_09_02_37_00004.jpg,291341.2,0,-1,-1.0,-1,-1,fixation_on,['A'],...,30,PC Camera (035f:3211),964.845500,450.338815,,0.0,0.0,0.0,0.0,0.0
4,5.0,2024_06_14_09_02_37_00005.jpg,291374.9,0,-1,-1.0,-1,-1,fixation_on,['A'],...,30,PC Camera (035f:3211),964.845500,450.338815,,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8943,10824.0,2024_06_14_09_02_37_10824.jpg,720083.3,61,arrowleft,2068.0,-1,-1,log_response_on,['A'],...,30,PC Camera (035f:3211),1281.796059,485.542933,,0.0,0.0,0.0,0.0,0.0
8944,10825.0,2024_06_14_09_02_37_10825.jpg,720116.6,61,arrowleft,2068.0,-1,-1,log_response_on,['A'],...,30,PC Camera (035f:3211),1281.796059,485.542933,,0.0,0.0,0.0,0.0,0.0
8945,10826.0,2024_06_14_09_02_37_10826.jpg,720149.3,61,arrowleft,2068.0,-1,-1,log_response_on,['A'],...,30,PC Camera (035f:3211),1281.796059,485.542933,,0.0,0.0,0.0,0.0,0.0
8946,10827.0,2024_06_14_09_02_37_10827.jpg,720182.0,61,arrowleft,2068.0,-1,-1,log_response_on,['A'],...,30,PC Camera (035f:3211),1281.796059,485.542933,,0.0,0.0,0.0,0.0,0.0


In [17]:
# def extract_fixations(df, path):
    
#     # sys.path.append('./FixationDetection')
#     from FixationDetection.I2MC import runI2MC
    
#     # order frames and drop duplicate samples (with same sampleTime)
#     df = df[df.fName.notna()]
#     df.frameNr = df.frameNr.apply(pd.to_numeric, errors='coerce') # if framerNr is not a number, it is replaces with nan
#     df = df[df.frameNr.notna()] # filter out rows where frameNr is a nan

#     df.sampTime = df.sampTime.apply(pd.to_numeric, errors='coerce') # if sampTime is not a number, it is replaces with nan
#     df = df[df.sampTime.notna()]
#     df = df[df.user_pred_px_x.notna()]
#     df = df[df.user_pred_px_y.notna()]
#     df = df.apply(pd.to_numeric, errors='ignore') # if str convert str to numbers

#     df = df.sort_values('frameNr')
#     df = df.reset_index(drop=True)
#     # df = df.drop_duplicates(subset=['user_pred_px_x', 'user_pred_px_y'], ignore_index=True)
#     df = df.drop_duplicates(subset=['sampTime'], ignore_index=True)
       
    
#     # get fixations for the original datafile for each participant
#     fixDF = runI2MC(path, plotData = False)

#     # add extracted fixations to the original data file (two new columns)
#     # for each timestamp where fixation was detected, FixXPos and FixYPos are added
#     idx = 0 # index of fixDF
#     FixXPos = np.zeros(df.shape[0])
#     FixYPos = np.zeros(df.shape[0])
#     FixStartEnd = np.empty(df.shape[0], dtype='U10')
#     FixStartEnd.fill('') # explicitly fill the array (good practice)
#     FixDur = np.zeros(df.shape[0])

#     DistFromPrevFix = np.zeros(df.shape[0])
#     PrevFixXPos = np.zeros(df.shape[0])
#     PrevFixYPos = np.zeros(df.shape[0])
#     prev_fix_x = False # keep track of xy when fixation ends
#     prev_fix_y = False

#     PrevFixSampTime = np.zeros(df.shape[0])
#     prev_fix_sampTime = 0

#     # iterate thru the original dataframe, thru each sample
#     for index, row in df.iterrows():

#         # make sure not to iterate out of range
#         if idx < fixDF.shape[0]:

#             # go to next fixation when fixation ends
#             if row['sampTime'] > np.array(fixDF.FixEnd)[idx]:
#                     idx += 1

#             # make sure not to iterate out of range
#             if idx < fixDF.shape[0]:

#                 # when samples are within fixation, accumulate FixXPos and FixYPos
#                 if row['sampTime'] >= np.array(fixDF.FixStart)[idx] and row['sampTime'] <= np.array(fixDF.FixEnd)[idx]:

#                     FixXPos[index] = (np.array(fixDF.XPos)[idx])
#                     FixYPos[index] = (np.array(fixDF.YPos)[idx])

#                 # label samples on which fixation starts and ends
#                 if row['sampTime'] == np.array(fixDF.FixStart)[idx]:             
#                     FixStartEnd[index] = 'fix_start'

#                     if prev_fix_x != False:

#                         PrevFixXPos[index] = prev_fix_x
#                         PrevFixYPos[index] = prev_fix_y

#                         DistFromPrevFix[index] = np.sqrt((np.array(fixDF.XPos)[idx] - prev_fix_x)**2 
#                                                 + (np.array(fixDF.YPos)[idx] - prev_fix_y)**2)
#                         PrevFixSampTime[index] = prev_fix_sampTime


#                 elif row['sampTime'] == np.array(fixDF.FixEnd)[idx]:                
#                     FixStartEnd[index] = 'fix_end' 
#                     FixDur[index] = np.array(fixDF.FixDur)[idx]

#                     prev_fix_x = np.array(fixDF.XPos)[idx]
#                     prev_fix_y = np.array(fixDF.YPos)[idx]
#                     prev_fix_sampTime = np.array(row['sampTime'])




#     # add fixations to original dataframe
#     df['FixXPos'] = np.array(FixXPos)
#     df['FixYPos'] = np.array(FixYPos)
#     df['FixStartEnd'] = FixStartEnd
#     df['FixDur'] = np.array(FixDur)
#     df['DistFromPrevFix'] = DistFromPrevFix
#     df['PrevFixSampTime'] = PrevFixSampTime
#     df['PrevFixXPos'] = PrevFixXPos
#     df['PrevFixYPos'] = PrevFixYPos
    
    
#     # Remove all negative xs, ys
#     df = df[(df['FixXPos'] > 0) & (df['FixYPos'] > 0) & (df['user_pred_px_x'] > 0) & (df['user_pred_px_y'] > 0)]


#     # Save the pre-processed dataframe
#     df.to_csv((os.path.splitext(path)[0] + '_extra.csv'), index=False)  


#     # Extract only samples when the target was presented
#     df = df[df.event=='target_on']
    
#     return df
    

# # # # Label trials with too few data points
# # # a = df.groupby('trialNr').count().reset_index()
# # # a = a[['trialNr', 'sampTime']]
# # # # 3) rename the columns so they would be added
# # # a.columns = ['trialNr', 'samplesPerTrial']
# # # df = pd.merge(df, a, on="trialNr")


## Main Part
### For each participant, fixations are extracted, added to the original data file, and saved as '[participant]_record_extra.csv'

In [18]:
# Path to data folders
path_to_data = 'D:/Dropbox/Appliedwork/CognitiveSolutions/Projects/DeepEye/TechnicalReports/TechnicalReport1/Test_Spaak/data/approved/tmp'
# path_to_data = 'C:/Users/artem/Dropbox/Appliedwork/CognitiveSolutions/Projects/DeepEye/TechnicalReports/TechnicalReport1/Test_Spaak/data/approved/data'

# Get all folder names; sorted so the processing order (and therefore which
# participant's `df` / `df1` remain defined after the loop) is reproducible
folder_names = sorted(os.listdir(path_to_data))

# Read and process the original data file for each participant
for fn in folder_names:
    path_to_file = os.path.join(path_to_data, fn, fn + '_record.csv')

    print(f'Processing participant {fn}...')

    # Read the file and skip the bad rows. Only read failures are handled here:
    # a bare `except` would also swallow KeyboardInterrupt and mislabel every
    # other problem as a missing file.
    try:
        df = pd.read_csv(path_to_file, on_bad_lines='skip')
    except (FileNotFoundError, NotADirectoryError):
        print('File does not exist: ' + path_to_file)
        continue
    except Exception as e:
        print(f'Error reading file {path_to_file}: {e}')
        continue

    # Errors raised during fixation extraction are intentionally not caught here
    df1 = extract_fixations(df, path_to_file)
    
   

Processing participant 2024_06_14_09_02_37...



Importing and processing: "D:/Dropbox/Appliedwork/CognitiveSolutions/Projects/DeepEye/TechnicalReports/TechnicalReport1/Test_Spaak/data/approved/tmp\2024_06_14_09_02_37\2024_06_14_09_02_37_record.csv"
	Searching for valid interpolation windows
	Replace interpolation windows with Steffen interpolation
	2-Means clustering started for averaged signal
	Determining fixations based on clustering weight mean for averaged signal and separate eyes + 2*std


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)




I2MC took 7.0530805587768555s to finish!


In [19]:
# Inspect the raw 'sampTime' values of the most recently loaded participant file.
# Optional numeric conversion (disabled):
# df.sampTime = df.sampTime.apply(pd.to_numeric, errors='coerce')
print(df['sampTime'].values)

['291572.4000000004' '291605.60000000056' '291638.30000000075' ...
 '720182.0' '720215.3000000007' '720248.1000000006']
