### Note: I received four participant CSV files and determined that the second and third files were from the same participant, so I removed participant_3.csv from the analysis. As a result, participant_4.csv is labeled "Participant 3" below.

### Import necessary packages

In [414]:
import pandas as pd
import numpy as np

## Read each participant's CSV file to their own dataframe

In [415]:
# Load the three retained recordings in one pass. participant_3.csv is skipped
# on purpose, so the third frame (p3) comes from participant_4.csv.
p1, p2, p3 = map(
    pd.read_csv,
    ['participant_1.csv', 'participant_2.csv', 'participant_4.csv'],
)

## Merge the participant dataframes into a single dataframe

In [416]:
# Order matters: the position of each frame here must line up with the
# participant labels passed as `keys` to pd.concat in the next cell.
frames = [p1, p2, p3]

In [417]:
# Stack the per-participant frames. The labels become the outer index level,
# and each file's original row number is kept as the inner level.
participant_labels = ['Participant 1', 'Participant 2', 'Participant 3']
participants = pd.concat(frames, keys=participant_labels)
participants

Unnamed: 0,Unnamed: 1,timestamp,r_size,r_conf,r_x_pos,r_y_pos,l_size,l_conf,l_x_pos,l_y_pos,block_number,block_type,correct,difficulty
Participant 1,0,0.000,80.530000,0.940600,302.805,158.092,108.692000,0.583333,302.805,158.092,,,,
Participant 1,1,0.016,80.811200,0.940600,302.514,157.777,65.287100,0.029412,302.514,157.777,,,,
Participant 1,2,0.032,81.175500,0.940600,302.604,158.067,-1.806250,-1.000000,302.604,158.067,,,,
Participant 1,3,0.048,82.891400,0.935294,303.046,159.584,35.821400,0.000000,303.046,159.584,,,,
Participant 1,4,0.065,82.385700,0.940600,302.613,160.625,37.239300,0.000000,302.613,160.625,,,,
Participant 1,5,0.084,82.284400,0.940600,303.238,159.134,53.304600,0.117647,303.238,159.134,,,,
Participant 1,6,0.100,82.902300,0.940600,302.769,160.591,70.391600,0.029412,302.769,160.591,,,,
Participant 1,7,0.116,82.526900,0.947200,302.112,159.561,74.799500,0.529412,302.112,159.561,,,,
Participant 1,8,0.132,83.706400,0.941765,302.712,160.014,72.951500,0.542857,302.712,160.014,,,,
Participant 1,9,0.148,84.172400,0.947200,302.632,160.196,38.073800,0.031250,302.632,160.196,,,,


### Check shape of dataframe to see the number of datapoints before cleaning

In [418]:
# (rows, columns) of the merged frame before any cleaning.
participants.shape

(94373, 13)

## Remove null values and missing data

In [419]:
# Treat 0 as "missing" only in the eye-tracking measurement columns.
# Replacing 0 across the whole frame also wiped legitimate zeros elsewhere:
# the first sample's timestamp (0.000) became NaN, and any genuine 0 in the
# task columns (block_number, correct, difficulty) would be lost as well.
eye_cols = ['r_size', 'r_conf', 'r_x_pos', 'r_y_pos',
            'l_size', 'l_conf', 'l_x_pos', 'l_y_pos']
ptemp = participants.copy()  # leave the merged raw frame untouched
ptemp[eye_cols] = ptemp[eye_cols].replace(0, np.nan)

In [420]:
# Keep only the samples in which every eye-tracking measurement is present.
required_cols = ['r_size', 'r_conf', 'r_x_pos', 'r_y_pos',
                 'l_size', 'l_conf', 'l_x_pos', 'l_y_pos']
p_clean = ptemp.dropna(subset=required_cols)

### Fix timestamp at 0 seconds

In [421]:
# Set the timestamp of Participant 1's first sample (inner index 0) to 0 s.
# One .loc call with a boolean row mask writes into p_clean itself. The
# earlier chained form (`.loc[...]` followed by `.at[...] = 0`) assigned
# through an intermediate object and is not guaranteed to reach p_clean.
# Using a mask also means no row is added if that sample was dropped earlier.
p_clean.loc[p_clean.index.isin([('Participant 1', 0)]), 'timestamp'] = 0

In [422]:
# Set the timestamp of Participant 2's first sample (inner index 0) to 0 s.
# One .loc call with a boolean row mask writes into p_clean itself. The
# earlier chained form (`.loc[...]` followed by `.at[...] = 0`) assigned
# through an intermediate object and is not guaranteed to reach p_clean.
# Using a mask also means no row is added if that sample was dropped earlier.
p_clean.loc[p_clean.index.isin([('Participant 2', 0)]), 'timestamp'] = 0

In [423]:
# Set the timestamp of Participant 3's first sample (inner index 0) to 0 s.
# One .loc call with a boolean row mask writes into p_clean itself. The
# earlier chained form (`.loc[...]` followed by `.at[...] = 0`) assigned
# through an intermediate object and is not guaranteed to reach p_clean.
# Using a mask also means no row is added if that sample was dropped earlier.
p_clean.loc[p_clean.index.isin([('Participant 3', 0)]), 'timestamp'] = 0

### Check shape of files post-cleaning to make sure there are still enough datapoints

In [424]:
# (rows, columns) left after dropping samples with missing eye-tracking values.
p_clean.shape

(88507, 13)

Only ~6000 datapoints (~6%) were lost from cleaning; it is probably worth seeing how many more datapoints are lost when a threshold is applied to left/right eye confidence.

## Apply a confidence threshold

In [425]:
# Keep samples whose right-eye confidence is strictly above 0.25.
right_conf_ok = p_clean['r_conf'].gt(0.25)
p_temp = p_clean.loc[right_conf_ok]

In [426]:
# Of those, keep samples whose left-eye confidence is also strictly above 0.25.
left_conf_ok = p_temp['l_conf'].gt(0.25)
p_corr = p_temp.loc[left_conf_ok]

In [427]:
# (rows, columns) remaining after both per-eye confidence filters.
p_corr.shape

(83810, 13)

Requiring each eye to have a confidence value above 0.25 removes only about 4,700 further datapoints (83,810 of the original 94,373 remain), yet likely confers a benefit by removing outliers.

I also noticed some of the pupil sizes were odd (a few were the value of Pi, for example), so I will create another threshold.

## Apply a pupil size threshold

In [428]:
# Keep samples whose right pupil size is strictly greater than 5.
p_temp_2 = p_corr.loc[lambda d: d['r_size'] > 5]

In [429]:
# Of those, keep samples whose left pupil size is also strictly greater than 5.
p_final = p_temp_2.loc[lambda d: d['l_size'] > 5]

In [430]:
# (rows, columns) after the right-eye pupil-size filter only.
p_temp_2.shape

(83810, 13)

In [431]:
# (rows, columns) after both pupil-size filters.
p_final.shape

(83810, 13)

It seems the previous threshold for confidence was enough to take care of pupil size outliers (the row count is unchanged at 83,810).

### Reset index

In [432]:
# Drop the inner index level (each file's original row number) so rows are
# indexed by participant label only.
# Rebinding the name replaces inplace=True, which mutated a frame produced
# by boolean filtering. The nlevels guard makes the cell safe to re-run:
# once the level is gone, a second reset_index(level=1) would raise.
if p_final.index.nlevels > 1:
    p_final = p_final.reset_index(level=1, drop=True)
# Preview the first rows instead of rendering the whole frame.
p_final.head(10)

Unnamed: 0,timestamp,r_size,r_conf,r_x_pos,r_y_pos,l_size,l_conf,l_x_pos,l_y_pos,block_number,block_type,correct,difficulty
Participant 1,0.000,80.530000,0.940600,302.805,158.092000,108.692000,0.583333,302.805,158.092000,,,,
Participant 1,0.116,82.526900,0.947200,302.112,159.561000,74.799500,0.529412,302.112,159.561000,,,,
Participant 1,0.132,83.706400,0.941765,302.712,160.014000,72.951500,0.542857,302.712,160.014000,,,,
Participant 1,0.168,83.795700,0.946122,254.151,-138.958000,47.485600,0.666667,254.151,-138.958000,,,,
Participant 1,0.201,84.952300,0.940600,263.454,-155.037000,61.161700,0.685714,263.454,-155.037000,,,,
Participant 1,0.216,84.053500,0.947200,302.817,161.019000,53.016600,0.411765,302.817,161.019000,,,,
Participant 1,0.337,84.478900,0.946122,301.612,161.122000,97.669900,0.600000,301.612,161.122000,,,,
Participant 1,0.385,84.623200,0.940600,301.170,160.773000,35.204700,0.636364,301.170,160.773000,,,,
Participant 1,0.400,84.140800,0.940600,300.214,161.264000,35.204700,0.636364,300.214,161.264000,,,,
Participant 1,0.452,83.843800,0.939388,328.365,190.017000,188.981000,0.694444,328.365,190.017000,,,,


## Perform basic data analysis

### Group by block number and type

In [433]:
# Re-display the cleaned frame (indexed by participant label) before grouping.
p_final

Unnamed: 0,timestamp,r_size,r_conf,r_x_pos,r_y_pos,l_size,l_conf,l_x_pos,l_y_pos,block_number,block_type,correct,difficulty
Participant 1,0.000,80.530000,0.940600,302.805,158.092000,108.692000,0.583333,302.805,158.092000,,,,
Participant 1,0.116,82.526900,0.947200,302.112,159.561000,74.799500,0.529412,302.112,159.561000,,,,
Participant 1,0.132,83.706400,0.941765,302.712,160.014000,72.951500,0.542857,302.712,160.014000,,,,
Participant 1,0.168,83.795700,0.946122,254.151,-138.958000,47.485600,0.666667,254.151,-138.958000,,,,
Participant 1,0.201,84.952300,0.940600,263.454,-155.037000,61.161700,0.685714,263.454,-155.037000,,,,
Participant 1,0.216,84.053500,0.947200,302.817,161.019000,53.016600,0.411765,302.817,161.019000,,,,
Participant 1,0.337,84.478900,0.946122,301.612,161.122000,97.669900,0.600000,301.612,161.122000,,,,
Participant 1,0.385,84.623200,0.940600,301.170,160.773000,35.204700,0.636364,301.170,160.773000,,,,
Participant 1,0.400,84.140800,0.940600,300.214,161.264000,35.204700,0.636364,300.214,161.264000,,,,
Participant 1,0.452,83.843800,0.939388,328.365,190.017000,188.981000,0.694444,328.365,190.017000,,,,


In [434]:
# Group Participant 1's samples by (block_number, block_type).
# NOTE(review): samples recorded before the first block have NaN in both key
# columns; with pandas' default groupby settings they are presumably left out
# of every group. Confirm this is intended.
p1_blocknum = (
    p_final
    .loc['Participant 1']
    .groupby(by=['block_number', 'block_type'])
)

In [435]:
# Summary statistics (count, mean, std, min, quartiles, max) of left-eye
# confidence for each (block_number, block_type) group of Participant 1.
p1_blocknum['l_conf'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
block_number,block_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1.0,TRIAL_START,200.0,0.931108,0.076176,0.342857,0.914286,0.942857,0.971429,1.000000
2.0,BEGIN_SPAN,432.0,0.913083,0.060334,0.294118,0.901457,0.914286,0.941176,1.000000
3.0,CUE_ANSWER,625.0,0.904783,0.088940,0.257143,0.904320,0.916667,0.942857,1.000000
4.0,RESPONSE,84.0,0.706652,0.208448,0.257143,0.558824,0.710084,0.911765,1.000000
5.0,BEGIN_SPAN,188.0,0.711440,0.218821,0.257143,0.555482,0.742857,0.914222,1.000000
6.0,CUE_ANSWER,357.0,0.851200,0.164670,0.257143,0.885714,0.914286,0.942857,1.000000
7.0,RESPONSE,233.0,0.921373,0.080690,0.272727,0.914286,0.937500,0.944444,1.000000
8.0,BEGIN_SPAN,346.0,0.884500,0.114298,0.277778,0.888889,0.914286,0.942857,1.000000
9.0,CUE_ANSWER,57.0,0.712022,0.195067,0.264706,0.600000,0.696970,0.916022,0.957000
10.0,RESPONSE,129.0,0.825799,0.196027,0.264706,0.800000,0.923714,0.944444,1.000000


In [436]:
# Mean left-eye confidence for each (block_number, block_type) group of Participant 1.
p1_blocknum['l_conf'].mean()

block_number  block_type 
1.0           TRIAL_START    0.931108
2.0           BEGIN_SPAN     0.913083
3.0           CUE_ANSWER     0.904783
4.0           RESPONSE       0.706652
5.0           BEGIN_SPAN     0.711440
6.0           CUE_ANSWER     0.851200
7.0           RESPONSE       0.921373
8.0           BEGIN_SPAN     0.884500
9.0           CUE_ANSWER     0.712022
10.0          RESPONSE       0.825799
11.0          BEGIN_SPAN     0.759037
12.0          CUE_ANSWER     0.863183
13.0          RESPONSE       0.948947
14.0          BEGIN_SPAN     0.900418
15.0          CUE_ANSWER     0.789531
16.0          RESPONSE       0.905016
17.0          BEGIN_SPAN     0.936370
18.0          CUE_ANSWER     0.900095
19.0          RESPONSE       0.885677
20.0          BEGIN_SPAN     0.883542
21.0          CUE_ANSWER     0.863928
22.0          RESPONSE       0.922537
23.0          BEGIN_SPAN     0.922848
24.0          CUE_ANSWER     0.907850
25.0          RESPONSE       0.940643
26.0          BEGIN_SPAN

In [None]:
test = 