In [None]:
#################################################
#
# Preprocessing pipeline for eyetracking data
# Inputs: a folder of .mat files with eyetracking data
# Outputs: preprocessed data that can be fed into ML algorithms
# Authors:
#   Benedetta Franceschiello, Ph.D., benedetta.franceschiello@gmail.com (project leader)
#   Alicja Olszewska, a.olszewska@nencki.edu.pl
#   Monika Puchalska, m.puchalska@nencki.edu.pl
#
#   Brainhack Warsaw 2022, 25-27.03.2022 https://brainhackwarsaw.github.io/
#
###################################################

In [None]:
'''
This code takes raw eye-tracking trajectories (x, y coordinates)
and transforms them in a way that can be used for machine learning.

Output: .mat files with coordinates for x, y, labels and RTs, properly formatted
Required input data: x, y, label (neglect/healthy), RTs (optional)

If response times (RTs) are present, they can be included in the data.
In the example RTs are not included.
'''

In [1]:
import scipy.io
import os
import os.path as op
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [9]:
# Set the dataset path and list the .mat files it contains.
# The dataset path should point to the folder where the eyetracking data is located.
# The prompt repeats until a folder holding at least one .mat file is given,
# or until the user types 'q' to quit (Dataset_folder is then left empty).
Dataset_path = ''
Dataset_folder = []
while not Dataset_folder:
    Dataset_path = str(input('Please provide your Dataset path:\t')).strip()
    # Check for 'q' first so quitting is not reported as an invalid path.
    if Dataset_path == 'q':
        break
    if not op.isdir(Dataset_path):
        print('That\'s not a valid path, let\'s try again or \'q\' to quit.')
        continue
    # Match on the file extension, so names such as 'x.mat.bak' are not picked up.
    Dataset_folder = sorted(
        f.path for f in os.scandir(Dataset_path)
        if f.is_file() and f.name.endswith('.mat')
    )
    if not Dataset_folder:
        print('Your path does not contain valid files. Please try again.')

if Dataset_folder:
    print('Dataset path set to :\n', Dataset_path)
else:
    print('No dataset path was set.')

Dataset path set to :
 /Volumes/GoogleDrive/.shortcut-targets-by-id/1GP7DKWJranQy6jycoCLbBHoyU1gq8cLp/2_TeamWork_VisualNeglect/1_Dataset


In [None]:
# Load the first eyetracking file to find out which variables the .mat files contain.
# These variable names are the basis for choosing how all the files should be read.
params_keys = ['Coords_X', 'Coords_Y', 'Labels', 'RTs']

if not Dataset_folder:
    raise RuntimeError('No .mat files were found; set a valid Dataset path first.')

mat = scipy.io.loadmat(Dataset_folder[0])
# loadmat adds bookkeeping entries ('__header__', '__version__', '__globals__')
# next to the stored variables; they are not data fields, so leave them out of
# the list the user picks from.
param_dict = sorted(key for key in mat.keys() if not key.startswith('__'))
print('We assume the files need to contain the parameters:\n - X, Y coordinates in time (separately)\n - Ground Truth labels\n - (optional) Response Times.')

In [None]:
# Indicate whether response times should be used
rtflag = str(input('Do you want to make use of Response Times? [Y/N/q]:\t'))
# Accept the answer regardless of case or surrounding spaces.
# Any answer other than yes (including 'q') means response times are not used.
RT_Present = rtflag.strip().lower() in ('y', 'yes')

# Rebuild the key list from scratch instead of removing 'RTs' from it,
# so that this cell can be re-run with a different answer without errors.
params_keys = ['Coords_X', 'Coords_Y', 'Labels']
if RT_Present:
    params_keys.append('RTs')

# List the fields found in the first file, numbered for the selection step
print('Your files contain the following fields:')
for i, elem in enumerate(param_dict):
    print('[',i,']\t',elem)

In [None]:
# Indicate which field of the .mat files holds each required parameter.
# The number typed for each parameter is the index shown in the field listing.
if not param_dict:
    raise ValueError('There are no fields to choose from.')

params_vals = []
for key in params_keys:
    # Keep asking until the answer is a whole number that indexes param_dict.
    # Negative numbers are rejected so they cannot silently pick from the end.
    while True:
        answer = input('Please select a number corresponding to the field:\t'+key)
        try:
            choice = int(answer)
        except ValueError:
            choice = -1
        if 0 <= choice < len(param_dict):
            break
        print('Please enter a whole number between 0 and', len(param_dict) - 1)
    params_vals.append(param_dict[choice])

# Map each parameter name to the field name chosen for it
Parameters = dict(zip(params_keys, params_vals))

In [None]:
# Indicate the maximum trial length (number of samples per trial).
# Every trial is padded or truncated to this length when the dataset is read.
# Pressing Enter without typing anything selects the default of 3000.
DEFAULT_MAX_EXP_LENGTH = 3000
MaxExpLength = None
while MaxExpLength is None:
    answer = str(input('Please provide the maximum trial length [default: '
                       + str(DEFAULT_MAX_EXP_LENGTH) + ']:\t')).strip()
    if answer == '':
        MaxExpLength = DEFAULT_MAX_EXP_LENGTH
        continue
    try:
        candidate = int(answer)
    except ValueError:
        candidate = 0
    # A length of zero or less would produce empty trials, so ask again.
    if candidate > 0:
        MaxExpLength = candidate
    else:
        print('Please enter a positive whole number.')

In [None]:
# Echo the chosen settings back so they can be checked before the data is read
print('Here are the settings you chose:')
for heading, setting in (
    ('Parameters:\n', Parameters),
    ('Maximum trial length:\t', MaxExpLength),
):
    print(heading, setting)

In [7]:
# Defaults (uncomment the lines below to override the interactively entered settings)
# Dataset_path = '/Volumes/GoogleDrive/.shortcut-targets-by-id/1GP7DKWJranQy6jycoCLbBHoyU1gq8cLp/2_TeamWork_VisualNeglect/1_Dataset'
# Dataset_folder = sorted([ f.path for f in os.scandir(Dataset_path) if '.mat' in f.name ])
# RT_Present = False
# MaxExpLength = 3000
# Parameters = {'Coords_X': 'EyeX_', 'Coords_Y': 'EyeY_', 'Labels': 'Label'}

In [None]:
# Read the dataset.
# Dataset becomes a list with one entry per subject file; each entry is a list
# of per-trial DataFrames with columns 'X', 'Y', 'label' and, if enabled, 'RT'.
def _pad_and_fill(samples, length):
    """Return `samples` as a Series with exactly `length` rows and gaps filled.

    Reindexing to 0..length-1 pads shorter input with NaN and drops samples
    beyond `length`. NaNs are then filled with the previous valid value, and
    any NaNs at the very start with the next valid value. A trial that has no
    valid sample at all stays NaN.
    """
    return pd.Series(samples).reindex(range(length)).ffill().bfill()


Dataset = []
for subject_number, subject in enumerate(Dataset_folder, start=1):
    # read the .mat file
    m = scipy.io.loadmat(subject)
    coords_x = m[Parameters['Coords_X']]
    coords_y = m[Parameters['Coords_Y']]

    # label, 1 = healthy, -1 = neglect; a single value per file, shared by all its trials
    label = m[Parameters['Labels']][0, 0]

    # fill the data structure with trial information (one row of the X/Y data per trial)
    singlesubj_trials = []
    for i in range(len(coords_x)):
        # Build the frame directly from the two columns. This replaces the
        # DataFrame.append call, which no longer exists in pandas 2.0 and later.
        trial_df = pd.DataFrame({
            'X': _pad_and_fill(coords_x[i], MaxExpLength),
            'Y': _pad_and_fill(coords_y[i], MaxExpLength),
        })
        trial_df['label'] = label

        # response times: one value per trial, repeated on every row of the trial
        if RT_Present:
            trial_df['RT'] = m[Parameters['RTs']][i][0]

        singlesubj_trials.append(trial_df)
    Dataset.append(singlesubj_trials)
    print('Finished subject\t', subject_number)
print('Done!')

In [None]:
# Plot the gaze trajectory of one trial.
# Both indices are zero-based positions in Dataset and in the subject's trial list.
PLOT_SUBJECT = 7
PLOT_TRIAL = 29

# Fail with an explanatory message when the dataset is smaller than the example indices.
if PLOT_SUBJECT >= len(Dataset) or PLOT_TRIAL >= len(Dataset[PLOT_SUBJECT]):
    raise IndexError(
        'Cannot plot subject index {} / trial index {}: the dataset has {} subject(s); '
        'change PLOT_SUBJECT and PLOT_TRIAL.'.format(PLOT_SUBJECT, PLOT_TRIAL, len(Dataset))
    )

subj = Dataset[PLOT_SUBJECT]
trial = subj[PLOT_TRIAL]

f, ax = plt.subplots(figsize=(18, 18))

# Draw on the axes created above and label the figure so it can be read on its own.
ax.plot(trial['X'], trial['Y'], '-o')
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_title('Subject index {}, trial index {}'.format(PLOT_SUBJECT, PLOT_TRIAL))
plt.show()

In [None]:
# Save X, Y, and labels (and response times, if enabled) to .mat files.
# Each output array has one row per trial, with subjects taken in Dataset order.
# The files are written to the current working directory.
x_rows, y_rows, label_rows, rt_rows = [], [], [], []
for i, subj in enumerate(Dataset):
    for trial in subj:
        # Reshape to MaxExpLength rather than a fixed 3000, so that any chosen
        # trial length works; a trial of a different length still raises here.
        x_rows.append(np.reshape(np.array(trial['X']), MaxExpLength))
        y_rows.append(np.reshape(np.array(trial['Y']), MaxExpLength))
        # The label (and RT) is constant within a trial, so the first row is enough.
        label_rows.append(trial['label'][0])

        if RT_Present:
            rt_rows.append(trial['RT'][0])
    print('Finished adding subj ', i+1)

# Collect rows in lists and build each array once: np.append in the loop would
# copy the whole array for every trial. The explicit reshape keeps the
# (0, MaxExpLength) / (0, 1) shapes when there are no trials.
Xall = np.array(x_rows, dtype=float).reshape(len(x_rows), MaxExpLength)
Yall = np.array(y_rows, dtype=float).reshape(len(y_rows), MaxExpLength)
Lall = np.array(label_rows, dtype=float).reshape(len(label_rows), 1)
if RT_Present:
    RTall = np.array(rt_rows, dtype=float).reshape(len(rt_rows), 1)

mdx = {'X_all': Xall}
mdy = {'Y_all': Yall}
mdl = {'L_all': Lall}
scipy.io.savemat('X_all_preprocessed.mat', mdx)
scipy.io.savemat('Y_all_preprocessed.mat', mdy)
scipy.io.savemat('Labels_all_preprocessed.mat', mdl)
if RT_Present:
    mdRT = {'RT_all' : RTall}
    scipy.io.savemat('Rt_all_preprocessed.mat', mdRT)