### Note: I received four participant CSV files and determined that the second and third file were the same participant, so I removed participant_3.csv from the analysis

### Import necessary packages

In [468]:
import pandas as pd
import numpy as np

## Read each participant's CSV file to their own dataframe

In [469]:
p1 = pd.read_csv('participant_1.csv')
p2 = pd.read_csv('participant_2.csv')
p3 = pd.read_csv('participant_4.csv')

### Check shape of dataframes to see the number of datapoints before cleaning

In [470]:
p1.shape[0]+p2.shape[0]+p3.shape[0]

94373

## Remove null values and missing data

In [471]:
p1temporary = p1.replace(0,np.nan)

In [472]:
p2temporary = p2.replace(0,np.nan)

In [473]:
p3temporary = p3.replace(0,np.nan)

In [474]:
p1_clean = p1temporary.dropna(subset=['r_size','r_conf','r_x_pos','r_y_pos','l_size','l_conf','l_x_pos','l_y_pos'])

In [475]:
p2_clean = p2temporary.dropna(subset=['r_size','r_conf','r_x_pos','r_y_pos','l_size','l_conf','l_x_pos','l_y_pos'])

In [476]:
p3_clean = p3temporary.dropna(subset=['r_size','r_conf','r_x_pos','r_y_pos','l_size','l_conf','l_x_pos','l_y_pos'])

### Fix timestamp at 0 seconds

In [477]:
p1_clean.at[0,'timestamp']=0

In [478]:
p2_clean.at[0,'timestamp']=0

In [479]:
p3_clean.at[0,'timestamp']=0

### Check shape of file post-cleaning to make sure there are still enough datapoints

In [480]:
p1_clean.shape[0]+p2_clean.shape[0]+p3_clean.shape[0]

88507

Only ~6000 datapoints (~6%) were lost by removing zeros from the data; probably worth seeing how many more datapoints are lost due to a threshhold for left/right eye confidence

## Insert a confidence threshhold

In [481]:
p1_temp = p1_clean[p1_clean['r_conf'] > 0.25]

In [482]:
p1_corr = p1_temp[p1_temp['l_conf'] > 0.25].reset_index(drop=True)

In [483]:
p2_temp = p2_clean[p2_clean['r_conf'] > 0.25]

In [484]:
p2_corr = p2_temp[p2_temp['l_conf'] > 0.25].reset_index(drop=True)

In [485]:
p3_temp = p3_clean[p3_clean['r_conf'] > 0.25]

In [486]:
p3_corr = p3_temp[p3_temp['l_conf'] > 0.25].reset_index(drop=True)

In [487]:
p1_corr.shape[0]+p2_corr.shape[0]+p3_corr.shape[0]

83810

Requiring each eye to have a minimum confidence value of 0.25 removes a mere 4700 datapoints from the original 95000, yet likely confers a benefit from removing outliers.

I also noticed some of the pupil sizes were odd (a few were the value of Pi for example) so I will create another threshhold.

## Insert a pupil size threshhold

In [488]:
p1_pupil = p1_corr[p1_corr['r_size'] > 5]

In [489]:
p1_final = p1_pupil[p1_pupil['l_size'] > 5]

In [490]:
p2_pupil = p2_corr[p2_corr['r_size'] > 5]

In [491]:
p2_final = p2_pupil[p2_pupil['l_size'] > 5]

In [492]:
p3_pupil = p3_corr[p3_corr['r_size'] > 5]

In [493]:
p3_final= p3_pupil[p3_pupil['l_size'] > 5]

In [494]:
p1_final.shape[0]+p2_final.shape[0]+p3_final.shape[0]

83810

It seems the previous threshhold for confidence was enough to take care of pupil size outliers.

## Merge the participant dataframes into a single dataframe

In [495]:
frames = [p1_final, p2_final, p3_final]

In [496]:
part = pd.concat(frames, keys=['Participant 1', 'Participant 2', 'Participant 3'])

#### random shit testing hierarchy for multiindex

In [497]:
participants = pd.concat([part, part], keys=['Test'], axis=1)

In [498]:
participants

Unnamed: 0_level_0,Unnamed: 1_level_0,Test,Test,Test,Test,Test,Test,Test,Test,Test,Test,Test,Test,Test
Unnamed: 0_level_1,Unnamed: 1_level_1,timestamp,r_size,r_conf,r_x_pos,r_y_pos,l_size,l_conf,l_x_pos,l_y_pos,block_number,block_type,correct,difficulty
Participant 1,0,0.000,80.530000,0.940600,302.805,158.092000,108.692000,0.583333,302.805,158.092000,,,,
Participant 1,1,0.116,82.526900,0.947200,302.112,159.561000,74.799500,0.529412,302.112,159.561000,,,,
Participant 1,2,0.132,83.706400,0.941765,302.712,160.014000,72.951500,0.542857,302.712,160.014000,,,,
Participant 1,3,0.168,83.795700,0.946122,254.151,-138.958000,47.485600,0.666667,254.151,-138.958000,,,,
Participant 1,4,0.201,84.952300,0.940600,263.454,-155.037000,61.161700,0.685714,263.454,-155.037000,,,,
Participant 1,5,0.216,84.053500,0.947200,302.817,161.019000,53.016600,0.411765,302.817,161.019000,,,,
Participant 1,6,0.337,84.478900,0.946122,301.612,161.122000,97.669900,0.600000,301.612,161.122000,,,,
Participant 1,7,0.385,84.623200,0.940600,301.170,160.773000,35.204700,0.636364,301.170,160.773000,,,,
Participant 1,8,0.400,84.140800,0.940600,300.214,161.264000,35.204700,0.636364,300.214,161.264000,,,,
Participant 1,9,0.452,83.843800,0.939388,328.365,190.017000,188.981000,0.694444,328.365,190.017000,,,,


## Perform basic data analysis

### Group by block number and type

In [499]:
part

Unnamed: 0,Unnamed: 1,timestamp,r_size,r_conf,r_x_pos,r_y_pos,l_size,l_conf,l_x_pos,l_y_pos,block_number,block_type,correct,difficulty
Participant 1,0,0.000,80.530000,0.940600,302.805,158.092000,108.692000,0.583333,302.805,158.092000,,,,
Participant 1,1,0.116,82.526900,0.947200,302.112,159.561000,74.799500,0.529412,302.112,159.561000,,,,
Participant 1,2,0.132,83.706400,0.941765,302.712,160.014000,72.951500,0.542857,302.712,160.014000,,,,
Participant 1,3,0.168,83.795700,0.946122,254.151,-138.958000,47.485600,0.666667,254.151,-138.958000,,,,
Participant 1,4,0.201,84.952300,0.940600,263.454,-155.037000,61.161700,0.685714,263.454,-155.037000,,,,
Participant 1,5,0.216,84.053500,0.947200,302.817,161.019000,53.016600,0.411765,302.817,161.019000,,,,
Participant 1,6,0.337,84.478900,0.946122,301.612,161.122000,97.669900,0.600000,301.612,161.122000,,,,
Participant 1,7,0.385,84.623200,0.940600,301.170,160.773000,35.204700,0.636364,301.170,160.773000,,,,
Participant 1,8,0.400,84.140800,0.940600,300.214,161.264000,35.204700,0.636364,300.214,161.264000,,,,
Participant 1,9,0.452,83.843800,0.939388,328.365,190.017000,188.981000,0.694444,328.365,190.017000,,,,


In [500]:
p1_blocknum = part.loc['Participant 1'].groupby(['block_number', 'block_type'], as_index = False)

In [501]:
p2_blocknum = part.loc['Participant 2'].groupby(['block_number', 'block_type'], as_index = False)

## Preliminary data analysis for participants 1 and 2

In [502]:
p1_analysis = p1_blocknum['r_size','r_conf','r_x_pos','r_y_pos','l_size','l_conf','l_x_pos','l_y_pos'].describe()
p1_analysis

Unnamed: 0_level_0,r_size,r_size,r_size,r_size,r_size,r_size,r_size,r_size,r_conf,r_conf,...,l_x_pos,l_x_pos,l_y_pos,l_y_pos,l_y_pos,l_y_pos,l_y_pos,l_y_pos,l_y_pos,l_y_pos
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
0,200.0,75.119788,1.534451,72.5879,73.939775,74.63810,76.303475,79.0882,200.0,0.937972,...,308.92550,320.159,200.0,147.304420,5.539411,135.51900,142.793000,146.8120,152.26925,159.245
1,432.0,81.517885,10.616744,30.7274,75.523350,81.96940,84.437275,202.6180,432.0,0.941314,...,312.50000,365.384,432.0,142.857718,6.499474,59.43940,140.923250,143.4530,145.52275,183.195
2,625.0,77.298602,2.750458,56.4005,75.035300,77.50660,79.452400,85.1388,625.0,0.946478,...,323.71700,485.366,625.0,146.091203,38.447144,-194.09300,145.299000,149.0540,156.07600,173.716
3,84.0,109.344910,66.105883,74.6420,76.076975,78.10700,84.619550,262.1780,84.0,0.921450,...,314.53425,357.348,84.0,109.308963,101.467617,-181.20700,140.673000,145.9400,152.57825,173.384
4,188.0,78.865911,2.201778,74.7267,77.550250,78.63180,79.766425,89.5324,188.0,0.932073,...,308.47550,495.957,188.0,90.728021,105.467075,-256.48300,86.160825,140.8570,147.07650,183.620
5,357.0,78.568041,3.421106,40.1633,77.047800,78.59410,80.154900,104.9320,357.0,0.939046,...,358.12500,675.260,357.0,126.967931,64.796623,-268.08200,135.133000,147.9620,154.02800,176.449
6,233.0,74.736615,1.796160,70.5051,73.893700,74.76510,75.562600,82.4849,233.0,0.942468,...,305.71000,442.974,233.0,144.973524,7.367489,119.72500,140.220000,144.1090,147.18300,168.333
7,346.0,81.716425,13.622830,27.7907,77.546775,81.66275,84.507225,163.9620,346.0,0.916952,...,311.52100,345.828,346.0,136.702612,40.790930,-221.68100,139.012000,142.2180,144.97825,168.566
8,57.0,80.431168,2.654904,75.4359,77.607000,81.52120,82.397600,83.7776,57.0,0.948173,...,401.20900,414.678,57.0,94.839630,144.054553,-220.55000,140.501000,146.3850,185.46400,224.314
9,129.0,78.921981,5.605579,72.7842,75.370900,78.76170,80.246800,121.8560,129.0,0.942033,...,333.42600,414.160,129.0,115.835724,90.579868,-241.53900,111.981000,118.9600,143.35200,223.551


In [503]:
p2_analysis = p2_blocknum['r_size','r_conf','r_x_pos','r_y_pos','l_size','l_conf','l_x_pos','l_y_pos'].describe()
p2_analysis

Unnamed: 0_level_0,r_size,r_size,r_size,r_size,r_size,r_size,r_size,r_size,r_conf,r_conf,...,l_x_pos,l_x_pos,l_y_pos,l_y_pos,l_y_pos,l_y_pos,l_y_pos,l_y_pos,l_y_pos,l_y_pos
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
0,218.0,103.893111,5.005330,96.3320,98.897750,104.01650,108.224250,113.9510,218.0,0.937777,...,313.88275,337.483,218.0,200.578312,14.172825,59.14300,197.45425,203.2710,205.76150,232.257
1,465.0,96.261304,10.075218,81.8861,85.875900,96.15760,102.682000,117.5470,465.0,0.936325,...,308.38900,311.580,465.0,207.996088,4.506407,190.65100,204.23900,208.6140,210.82900,260.022
2,765.0,117.363801,8.495578,96.3905,112.856000,119.41600,122.624000,173.5030,765.0,0.937029,...,309.02000,474.583,765.0,213.203612,14.512430,158.32700,205.26400,211.2170,217.19000,273.145
3,232.0,102.217900,5.640954,54.5974,99.163350,101.38450,106.027500,112.1830,232.0,0.935222,...,309.99100,318.175,232.0,264.752418,20.359642,239.85300,245.92700,258.5620,280.28125,303.740
4,460.0,107.914500,6.755175,94.6174,103.240250,107.41550,111.239750,122.6710,460.0,0.939993,...,313.99875,323.892,460.0,256.767989,13.256745,229.64200,245.25225,251.4120,266.64275,293.671
5,516.0,120.093529,10.805415,108.0980,115.031000,118.95550,122.766000,250.7220,516.0,0.937276,...,321.00000,333.698,516.0,223.866537,17.706408,197.83300,210.19700,217.0050,244.12675,327.556
6,232.0,102.091268,8.236619,87.3244,94.313050,104.96150,109.854000,113.0760,232.0,0.936344,...,317.12100,323.448,232.0,218.228746,4.339316,207.19900,215.45275,217.4835,220.10700,230.978
7,469.0,105.127349,9.717891,74.8186,97.682900,102.96100,110.001000,142.4770,469.0,0.937253,...,319.37800,327.363,469.0,217.987030,4.528076,191.19800,215.38500,218.2660,220.59700,251.551
8,495.0,108.038330,7.517870,89.4264,101.022000,107.83300,113.239500,125.3280,495.0,0.937153,...,319.41650,324.270,495.0,215.262592,5.501399,197.34300,213.34400,215.0800,218.24250,262.420
9,231.0,97.434909,8.897610,82.1637,91.424650,97.06740,105.075500,113.6980,231.0,0.936619,...,319.67250,324.331,231.0,212.378602,3.609560,196.34700,209.97650,211.3240,215.16400,233.383


### Participant 3 has no data for block number or type, so I cannot group the eye data by question

In [504]:
p3group = part.loc['Participant 3']

In [505]:
p3group.describe()

Unnamed: 0,timestamp,r_size,r_conf,r_x_pos,r_y_pos,l_size,l_conf,l_x_pos,l_y_pos,block_number,difficulty
count,15423.0,15423.0,15423.0,15423.0,15423.0,15423.0,15423.0,15423.0,15423.0,0.0,0.0
mean,285.559568,2852.633492,0.959265,355.483903,72.265906,10908.206851,0.910927,355.483903,72.265906,,
std,163.1992,4509.885676,0.066277,47.894952,62.061069,6327.209585,0.042625,47.894952,62.061069,,
min,0.0,857.12421,0.264706,108.849,-451.685,1544.691925,0.257143,108.849,-451.685,,
25%,140.083,1410.812084,0.956957,318.4195,44.91785,9024.120915,0.904375,318.4195,44.91785,,
50%,291.427,1638.301183,0.97,358.046,60.2516,10263.304281,0.913224,358.046,60.2516,,
75%,426.619,1938.099744,1.0,387.1,100.428,11761.513862,0.9208,387.1,100.428,,
max,569.619,56062.410156,1.0,1269.8,4273.34,178922.711408,1.0,1269.8,4273.34,,


### MERGE?????!?!?

In [506]:
analysis = pd.concat([p1_analysis, p2_analysis], keys=['Participant 1','Participant 2'])

In [508]:
pd.merge(part,analysis,how='left',left_index=True,right_index=True)



Unnamed: 0,Unnamed: 1,timestamp,r_size,r_conf,r_x_pos,r_y_pos,l_size,l_conf,l_x_pos,l_y_pos,block_number,...,"(l_x_pos, 75%)","(l_x_pos, max)","(l_y_pos, count)","(l_y_pos, mean)","(l_y_pos, std)","(l_y_pos, min)","(l_y_pos, 25%)","(l_y_pos, 50%)","(l_y_pos, 75%)","(l_y_pos, max)"
Participant 1,0,0.000,80.530000,0.940600,302.805,158.092000,108.692000,0.583333,302.805,158.092000,,...,308.92550,320.159,200.0,147.304420,5.539411,135.5190,142.793000,146.8120,152.26925,159.245
Participant 1,1,0.116,82.526900,0.947200,302.112,159.561000,74.799500,0.529412,302.112,159.561000,,...,312.50000,365.384,432.0,142.857718,6.499474,59.4394,140.923250,143.4530,145.52275,183.195
Participant 1,2,0.132,83.706400,0.941765,302.712,160.014000,72.951500,0.542857,302.712,160.014000,,...,323.71700,485.366,625.0,146.091203,38.447144,-194.0930,145.299000,149.0540,156.07600,173.716
Participant 1,3,0.168,83.795700,0.946122,254.151,-138.958000,47.485600,0.666667,254.151,-138.958000,,...,314.53425,357.348,84.0,109.308963,101.467617,-181.2070,140.673000,145.9400,152.57825,173.384
Participant 1,4,0.201,84.952300,0.940600,263.454,-155.037000,61.161700,0.685714,263.454,-155.037000,,...,308.47550,495.957,188.0,90.728021,105.467075,-256.4830,86.160825,140.8570,147.07650,183.620
Participant 1,5,0.216,84.053500,0.947200,302.817,161.019000,53.016600,0.411765,302.817,161.019000,,...,358.12500,675.260,357.0,126.967931,64.796623,-268.0820,135.133000,147.9620,154.02800,176.449
Participant 1,6,0.337,84.478900,0.946122,301.612,161.122000,97.669900,0.600000,301.612,161.122000,,...,305.71000,442.974,233.0,144.973524,7.367489,119.7250,140.220000,144.1090,147.18300,168.333
Participant 1,7,0.385,84.623200,0.940600,301.170,160.773000,35.204700,0.636364,301.170,160.773000,,...,311.52100,345.828,346.0,136.702612,40.790930,-221.6810,139.012000,142.2180,144.97825,168.566
Participant 1,8,0.400,84.140800,0.940600,300.214,161.264000,35.204700,0.636364,300.214,161.264000,,...,401.20900,414.678,57.0,94.839630,144.054553,-220.5500,140.501000,146.3850,185.46400,224.314
Participant 1,9,0.452,83.843800,0.939388,328.365,190.017000,188.981000,0.694444,328.365,190.017000,,...,333.42600,414.160,129.0,115.835724,90.579868,-241.5390,111.981000,118.9600,143.35200,223.551


## Attempt to merge with multiindex

In [248]:
test = pd.merge(participants, p1_analysis, how='left', on='index')

KeyError: 'index'

## Attempt to merge dataframes without multiindexing

In [190]:
a = p1_blocknum['l_conf'].agg([np.mean,np.std,np.var]).rename(columns={'mean':'l_conf_mean','std':'l_conf_std','var':'l_conf_var'})

In [191]:
a

Unnamed: 0_level_0,Unnamed: 1_level_0,l_conf_mean,l_conf_std,l_conf_var
block_number,block_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,TRIAL_START,0.931108,0.076176,0.005803
2.0,BEGIN_SPAN,0.913083,0.060334,0.003640
3.0,CUE_ANSWER,0.904783,0.088940,0.007910
4.0,RESPONSE,0.706652,0.208448,0.043451
5.0,BEGIN_SPAN,0.711440,0.218821,0.047882
6.0,CUE_ANSWER,0.851200,0.164670,0.027116
7.0,RESPONSE,0.921373,0.080690,0.006511
8.0,BEGIN_SPAN,0.884500,0.114298,0.013064
9.0,CUE_ANSWER,0.712022,0.195067,0.038051
10.0,RESPONSE,0.825799,0.196027,0.038427


In [192]:
dframes = [p_final, p1_blocknum['l_conf'].mean()]

In [193]:
dframes

[                     timestamp        r_size    r_conf   r_x_pos      r_y_pos  \
 Participant 1 0          0.000     80.530000  0.940600   302.805   158.092000   
               7          0.116     82.526900  0.947200   302.112   159.561000   
               8          0.132     83.706400  0.941765   302.712   160.014000   
               10         0.168     83.795700  0.946122   254.151  -138.958000   
               12         0.201     84.952300  0.940600   263.454  -155.037000   
               13         0.216     84.053500  0.947200   302.817   161.019000   
               20         0.337     84.478900  0.946122   301.612   161.122000   
               23         0.385     84.623200  0.940600   301.170   160.773000   
               24         0.400     84.140800  0.940600   300.214   161.264000   
               27         0.452     83.843800  0.939388   328.365   190.017000   
               28         0.468     83.822100  0.940600   304.063   152.556000   
               2

In [241]:
test = pd.merge(p_final, a, how='left')

MergeError: No common columns to perform merge on. Merge options: left_on=None, right_on=None, left_index=False, right_index=False