# Pilot data check - TODO

- frame rate/subject
- number of grasps/subject
- number of eye samples/object
- number of eye samples/grasped object
- number of eye samples/grasped object/sorting type --> shouldn't be different
- number of samples with NaN
- validation error across trials/subject
- azimuth and elevation angles/subject (histograms)
- Amplitude of head movements based on velocity
- Saccade amplitude based on velocity

# Quick notes
Subjects 1001, 1002, 1003, 1004, 1005, 1007 --> no grasped object recorded

In [1]:
import ProtobufTypes_pb2 
import pandas as pd
import sys
from google.protobuf.json_format import MessageToDict
import collections
import numpy as np
import glob
import seaborn as sns
import os
import matplotlib.pyplot as plt
import pickle
import pprint
from IPython.display import display
# Folder that receives every figure saved by this notebook; created up front.
PLOT_DIR = './PLOTS/data_quality/'
os.makedirs(os.path.dirname(PLOT_DIR), exist_ok=True)

# Global seaborn look used by all plots below (11.7 x 8.27 inch default figure).
sns.set(
    context="talk",
    style="white",
    palette="dark",
    font_scale=1,
    rc={'figure.figsize': (11.7, 8.27)},
)

# Hex colours keyed by a one-letter colour code.
COLORS = {
    'g': '#CAE6CB',
    'r': '#E6CACA',
    'b': '#C9D4F5',
}

In [2]:
# pickle the data
def pickleData(filepath, filename, data):
    """Pickle *data* into the file ``filename`` inside the directory ``filepath``.

    Args:
        filepath: Directory that receives the file. It is created (with
            parents) when missing; a trailing path separator is optional.
        filename: Name of the file to write inside ``filepath``.
        data: Any picklable object.
    """
    # Join the parts instead of concatenating strings, so './Data' and
    # './Data/' both end up writing './Data/<filename>'.
    target = os.path.join(filepath, filename)
    directory = os.path.dirname(target)
    if directory:
        # os.makedirs('') raises, so only create a directory when there is one.
        os.makedirs(directory, exist_ok=True)
    with open(target, 'wb') as fh:
        pickle.dump(data, fh)

# read from pickled data
def read_dataFrameFromFile(filename):
    """Return the object stored in the pickle file ``filename``.

    NOTE: unpickling can execute arbitrary code, so only load files that
    were written by a trusted source.
    """
    with open(filename, mode='rb') as pickled:
        return pickle.load(pickled)

In [None]:
# Parse every subject recording (./subject*.etd) and flatten it into two tables:
#   samples_df - one row per sample: timestamps, gaze raycast hit and, when
#                the trial recorded it, the grasped object
#   shelf_df   - one row per item of each trial's initial configuration
# Both tables are pickled to ./Data/ and the names are deleted afterwards.
protobuf_obj = ProtobufTypes_pb2.EyetrackingDataSet()

# pandas >= 1.0 exposes json_normalize at the top level; older releases only
# provide it as pd.io.json.json_normalize.
json_normalize = getattr(pd, 'json_normalize', None)
if json_normalize is None:
    json_normalize = pd.io.json.json_normalize

# glob returns files in arbitrary order; sorting makes the row order reproducible.
files = sorted(glob.glob("./subject*.etd"))

# Collect the per-trial frames and concatenate once at the end: calling
# pd.concat inside the loop copies all previously loaded rows on every trial.
sample_frames = []
shelf_frames = []
for fi in files:
    # Only the read can raise FileNotFoundError, so only the read is guarded.
    try:
        with open(fi, "rb") as f:
            raw = f.read()
    except FileNotFoundError:
        print("{} not found, moving on!".format(fi))
        continue

    protobuf_obj.ParseFromString(raw)
    dict_obj = MessageToDict(protobuf_obj)
    for nT, trial in enumerate(dict_obj['trials']):
        # Trials without a trialID get NaN (np.nan: the np.NaN alias is gone in NumPy 2).
        trial_id = trial['metaData'].get('trialID', np.nan)

        tmpdf = json_normalize(data=trial['samples'])
        keep = ['unixTimestamp', 'timestamp', 'combinedEye.raycastHitObject']
        # Not every trial has a grasped-object column; keep it when present.
        if 'handData.graspedObject' in tmpdf.columns:
            keep.append('handData.graspedObject')
        tmpdf = tmpdf.loc[:, keep]
        tmpdf['trialID'] = trial_id
        tmpdf['subjectID'] = dict_obj['subjectID']
        tmpdf['trialNum'] = nT  # position of the trial inside this file
        sample_frames.append(tmpdf)

        tmpdf = json_normalize(data=trial['metaData']['initialConfiguration']['items'])
        tmpdf['trialID'] = trial_id
        tmpdf['subjectID'] = dict_obj['subjectID']
        tmpdf['trialNum'] = nT
        shelf_frames.append(tmpdf)

# pd.concat raises on an empty list, so fall back to an empty frame.
samples_df = (
    pd.concat(sample_frames, ignore_index=True, sort=False)
    if sample_frames else pd.DataFrame()
)
shelf_df = (
    pd.concat(shelf_frames, ignore_index=True, sort=False)
    if shelf_frames else pd.DataFrame()
)
del sample_frames, shelf_frames

pickleData('./Data/','reducedAllData',samples_df)
del samples_df
pickleData('./Data/','allShelfData',shelf_df)
del shelf_df

In [None]:
# Reload the sample table from the pickle file ./Data/reducedAllData.
samples_df = read_dataFrameFromFile('./Data/reducedAllData')

In [None]:
# Show the first 20 rows of the sample table.
samples_df.head(20)

In [None]:
# Print column dtypes, non-null counts and memory usage of the sample table.
samples_df.info()

In [None]:
# Fraction of missing (NaN) values in each column.
samples_df.isna().mean()

In [None]:
# Keep only samples that have a trialID and whose timestamp is not -1
# (presumably -1 marks an invalid sample - confirm with the recording code);
# the unixTimestamp column is dropped.
filtered_df = (
    samples_df
    .dropna(subset=['trialID'])
    .drop(columns='unixTimestamp')
    .loc[lambda df: df.timestamp != -1, :]
    # Explicit copy: a later cell assigns a column on filtered_df, which would
    # otherwise raise SettingWithCopyWarning on this .loc selection.
    .copy()
)

In [None]:
# Subject IDs that still have samples after filtering.
filtered_df.subjectID.unique()

In [None]:
# Number of filtered samples per trial index (trialNum), pooled over subjects.
filtered_df.trialNum.value_counts()

In [None]:
# Frame rate per (subject, trial): reciprocal of the mean difference between
# consecutive timestamps. Plotted per subject as mean with SD error bars.
# NOTE(review): this uses samples_df, which still contains the timestamp == -1
# rows that filtered_df removes - confirm that is intended.
tmpdf = (
    samples_df
    .groupby(['subjectID','trialNum'])['timestamp']
    .apply(lambda x: 1 / x.diff().mean())
    .rename('Frame_Rate')
    .reset_index()
)
# display(tmpdf)

_, ax = plt.subplots(1,1,figsize=(20,10))
sns.pointplot(data=tmpdf, x='subjectID', y='Frame_Rate', color='b',
              errwidth=2, capsize=0, saturation=0.5,ci='sd', ax=ax)

plt.title('Frame Rate')
# No `quality` argument: it only ever applied to JPEG output and newer
# Matplotlib versions reject it for PNG files.
plt.savefig(PLOT_DIR+'/frame_rate.png')

In [None]:
# Number of distinct (non-NaN) grasped-object values per subject and trial,
# shown as one box per subject.
tmpdf = (
    samples_df
    .groupby(['subjectID','trialID','trialNum'])['handData.graspedObject']
    .nunique()
    .rename('Number of Grasped Objects')
    .reset_index()
)
# display(tmpdf)
_, ax = plt.subplots(figsize=(20,10))
sns.boxplot(data=tmpdf, x='subjectID', y='Number of Grasped Objects', palette='GnBu', ax=ax)
# sns.boxplot(data=tmpdf, x='trialID', y='Number of Grasped Objects', palette='GnBu')

plt.title('Number of Unique Grasped Objects over Subjects')
# No `quality` argument: it only ever applied to JPEG output and newer
# Matplotlib versions reject it for PNG files.
plt.savefig(PLOT_DIR+'/graspedObjects_perSubject.png')

In [None]:
# Distinct grasped-object values per subject and trial, minus one, plotted
# against the trial type (trialID).
# NOTE(review): the "- 1" presumably discounts a placeholder value meaning
# "nothing grasped"; groups without any grasp data end up at -1 - confirm.
tmpdf = (
    samples_df
    .groupby(['subjectID','trialID','trialNum'])['handData.graspedObject']
    .nunique()
    .sub(1)
    .rename('Number of Grasped Objects')
    .reset_index()
)
# display(tmpdf)
_, ax = plt.subplots(figsize=(20,10))
# sns.pointplot(data=tmpdf, x='trialID', y='Number of Grasped Objects', color='b',
#               errwidth=2, capsize=0, saturation=0.5,ci='sd')
sns.stripplot(data=tmpdf, x='trialID', y='Number of Grasped Objects', palette='GnBu', size=20, ax=ax)
plt.title('Number of Unique Grasped Objects')
# No `quality` argument: it only ever applied to JPEG output and newer
# Matplotlib versions reject it for PNG files.
plt.savefig(PLOT_DIR+'/graspedObjects_perTrial.png')

In [None]:
# Number of samples per (subject, trial, gaze raycast target), plotted per
# subject as mean with SD error bars.
tmpdf = (
    samples_df
    .groupby(['subjectID','trialNum','combinedEye.raycastHitObject'])
    .agg({'trialID':'count'})  # counts the rows whose trialID is not NaN
    .rename(columns={'trialID':'numSamples'})
    .reset_index()
)
# display(tmpdf)

_, ax = plt.subplots(figsize=(20,10))
sns.pointplot(data=tmpdf, x='subjectID', y='numSamples', color='b',
              errwidth=2, capsize=0, saturation=0.5,ci='sd', ax=ax)

plt.title('Number of Raycast Hits per Trial')
# No `quality` argument: it only ever applied to JPEG output and newer
# Matplotlib versions reject it for PNG files.
plt.savefig(PLOT_DIR+'/SamplesPerObject_subject.png')

In [None]:
def replace_objs(s):
    """Return ``s`` unchanged if it is a known object name, else ``'Other'``.

    The known names are the 16 shape/colour combinations listed below;
    any other value (including NaN) is mapped to ``'Other'``.
    """
    known_names = (
        'Cube_Blue', 'Cube_Green', 'Cube_Red', 'Cube_Yellow',
        'Cylinder_Blue', 'Cylinder_Green', 'Cylinder_Red', 'Cylinder_Yellow',
        'Sphere_Blue', 'Sphere_Green', 'Sphere_Red', 'Sphere_Yellow',
        'Tetraeder_Blue', 'Tetraeder_Green', 'Tetraeder_Red', 'Tetraeder_Yellow',
    )
    return s if s in known_names else 'Other'



In [None]:
# Relabel every gaze target that replace_objs does not know as 'Other'.
# This deliberately overwrites the column in samples_df itself (tmpdf below is
# an alias, not a copy), so cells that run afterwards see the relabelled values.
samples_df['combinedEye.raycastHitObject'] = (
    samples_df['combinedEye.raycastHitObject'].apply(replace_objs)
)
tmpdf = samples_df
display(tmpdf)

# Samples per (subject, trial, gazed object), one line per object.
tmpdf = (
    tmpdf
    .groupby(['subjectID','trialNum','combinedEye.raycastHitObject'])
    .agg({'trialID': 'count'})  # counts the rows whose trialID is not NaN
    .rename(columns={'trialID': 'numSamples'})
    .reset_index()
)
# display(tmpdf)
_, ax = plt.subplots(1,1,figsize=(20,10))
sns.pointplot(data=tmpdf, x='subjectID', y='numSamples', hue='combinedEye.raycastHitObject',
              errwidth=2, capsize=0, saturation=0.5,ci='sd', palette='dark', dodge=0.2, ax=ax)
plt.legend(loc='upper left')
plt.title('Number of Raycast Hits per Trial')
# No `quality` argument: it only ever applied to JPEG output and newer
# Matplotlib versions reject it for PNG files.
plt.savefig(PLOT_DIR+'/SamplesPerObject_Shelf.png')

In [None]:
# Number of samples per grasped-object value (NaN is not counted).
samples_df['handData.graspedObject'].value_counts()

In [None]:
# Show the first five rows of the sample table.
samples_df.head()

In [None]:
# Samples per (trial type, subject, gazed object), shown as one box per
# object within each trial type.
tmpdf = (
    samples_df
    .groupby(['trialID','subjectID','combinedEye.raycastHitObject'])
    .agg({'trialID':'count'})
    .rename(columns={'trialID':'numSamples'})
    .reset_index()
)
# display(tmpdf)
_, ax = plt.subplots(1,1,figsize=(20,10))
# boxplot's `dodge` is a boolean switch; the original passed 0.1, which only
# acted as a truthy value.
sns.boxplot(data=tmpdf, x='trialID', y='numSamples', hue='combinedEye.raycastHitObject',
              saturation=0.5, palette='GnBu', dodge=True, ax=ax)
# sns.pointplot(data=tmpdf, x='trialID', y='numSamples', hue='combinedEye.raycastHitObject',
#               errwidth=2, capsize=0, saturation=0.5,ci='sd', palette='dark', dodge=0.1, ax=ax)
# sns.stripplot(data=tmpdf, x='trialID', y='numSamples', hue='combinedEye.raycastHitObject',
#                palette='dark', jitter=0.1,dodge=0.2, size=10, ax=ax)
plt.ylim(0, 40000)  # fixed y-range; larger counts are cut off
plt.legend(loc='upper left')
plt.title('Number of Raycast Hits per Task Type')
# No `quality` argument: it only ever applied to JPEG output and newer
# Matplotlib versions reject it for PNG files.
plt.savefig(PLOT_DIR+'/SamplesPerObject_trials.png')

In [None]:
# Trial duration per (subject, trial type, trial): last timestamp minus first
# timestamp of the group, in row order. Shown as one box per trial type.
tmpdf = (
    samples_df.groupby(['subjectID','trialID','trialNum'])['timestamp']
    .apply(lambda x: (x.iloc[-1]-x.iloc[0]))
    .reset_index()
)
# display(tmpdf)
# ax = sns.pointplot(data=tmpdf, x='trialID', y='timestamp',
#               errwidth=2, capsize=0, saturation=0.5,ci='sd')
_, ax = plt.subplots(1,1,figsize=(20,10))
sns.boxplot(data=tmpdf, x='trialID', y='timestamp', palette='GnBu',
              saturation=0.5, ax=ax)
ax.set_ylabel('Duration (sec)')
plt.title('Duration of Trials over Subjects')
# No `quality` argument: it only ever applied to JPEG output and newer
# Matplotlib versions reject it for PNG files.
plt.savefig(PLOT_DIR+'/durationPertrials_taskType.png')

In [None]:
# Trial duration per (subject, trial): last timestamp minus first timestamp of
# the group, in row order. Shown as one box per subject.
tmpdf = (
    samples_df.groupby(['subjectID', 'trialNum'])['timestamp']
    .apply(lambda x: (x.iloc[-1]-x.iloc[0]))
    .reset_index()
)
# display(tmpdf)
# ax = sns.pointplot(data=tmpdf, x='trialID', y='timestamp',
#               errwidth=2, capsize=0, saturation=0.5,ci='sd')
_, ax = plt.subplots(1,1,figsize=(20,10))
sns.boxplot(data=tmpdf, x='subjectID', y='timestamp', palette='GnBu',
              saturation=0.5, ax=ax)
ax.set_ylabel('Duration (sec)')
plt.title('Duration of Trials/Subjects')
# No `quality` argument: it only ever applied to JPEG output and newer
# Matplotlib versions reject it for PNG files.
plt.savefig(PLOT_DIR+'/durationPertrials_subject.png')

In [None]:
# Load the shelf-configuration table from ./Data/allShelfData and inspect it.
shelf_df = read_dataFrameFromFile('./Data/allShelfData')
shelf_df.info()  # column dtypes and non-null counts
shelf_df.isna().mean()  # fraction of missing values per column (cell output)

In [None]:
%matplotlib inline
%matplotlib inline
tmpdf = (
    shelf_df
    .groupby('subjectID')
    .apply(lambda x: x.isna().mean())
    .drop(columns=['subjectID','trialID'])
    .reset_index()
    .set_index('subjectID')
)
_, _ = plt.subplots(1,1,figsize=(12,12))
ax = sns.heatmap(tmpdf, vmin=0, vmax=0.25, annot=True, linewidths=.5, cmap='viridis_r', annot_kws={"size": 25})
ax.set_ylim(len(tmpdf)-0.1, -0.5)
plt.yticks(rotation=0, fontsize=25) 
plt.xticks(rotation=45, fontsize=25)
plt.xlabel(fontsize=20)
plt.title('Proportion of Missing Vlues of Shelf Configuration', fontsize=30)
plt.savefig(PLOT_DIR+'/shelf_missingVals.png', quality=90, transparent=True)

In [None]:
# Show the first five rows of the sample table.
samples_df.head()
    

In [None]:
# Mark where the gazed object and the grasped object change from one row to the
# next, and show the rows that directly follow a grasp change.
# NOTE(review): the row-to-row comparisons run over the whole table, so they
# also compare across trial and subject boundaries - confirm that is intended.
grasp_col = 'handData.graspedObject'
gaze_col = 'combinedEye.raycastHitObject'

# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: under pandas Copy-on-Write the in-place call only changes a
# temporary object and leaves samples_df untouched.
samples_df[grasp_col] = samples_df[grasp_col].fillna(0)

# True on every row whose gaze target differs from the previous row's.
gaze_changed = samples_df[gaze_col] != samples_df[gaze_col].shift()

# numHits: length of the run of identical consecutive gaze targets the row
# belongs to (the cumulative sum of the change flags numbers the runs).
samples_df['numHits'] = (
        samples_df[gaze_col]
        .groupby(gaze_changed.cumsum())
        .transform('size')
)
samples_df['eyeChange'] = gaze_changed
samples_df['handChange'] = samples_df[grasp_col] != samples_df[grasp_col].shift()

# samples_df['dwellTime'] = (
#                 samples_df['combinedEye.raycastHitObject']!=samples_df['combinedEye.raycastHitObject'].shift()
# )

# tmpdf is an alias, not a copy: the next line edits samples_df itself.
tmpdf = samples_df
# Keep the grasped-object value only on rows where it changes; zero elsewhere.
tmpdf.loc[(~tmpdf['handChange']),'handData.graspedObject' ] = 0

# Positions of the grasp changes and of the rows right after them.
# Computed as grasp + 1 rather than np.where(handChange.shift()): the shift
# puts NaN in the first row and np.where treats NaN as True, which wrongly
# selected row 0.
grasp = np.where(tmpdf.handChange)[0]
next_grasp = grasp + 1
next_grasp = next_grasp[next_grasp < len(tmpdf)]
# The values are positions, so index with iloc rather than by label.
display(tmpdf.iloc[next_grasp])

In [None]:
# Show the first five rows of the filtered sample table.
filtered_df.head()

In [None]:
# Relabel every gaze target that replace_objs does not know as 'Other',
# then show the last five rows as a check.
filtered_df['combinedEye.raycastHitObject'] = (
    filtered_df['combinedEye.raycastHitObject'].map(replace_objs)
)
display(filtered_df.tail())

In [None]:
# %matplotlib notebook
# Gaze timeline for one trial (subject 1006, trial index 6): one line per gazed
# object, drawn at its own integer height while that object is hit and at 0
# otherwise. Samples labelled "Other" are left out.
# NOTE(review): the query compares subjectID with the integer 1006 - confirm
# the column holds integers and not strings.
tmpdf = (
    filtered_df
    # Column names use a dot ('handData.graspedObject'); the original key used
    # a colon, so that rename silently never matched.
    .rename(columns={'combinedEye.raycastHitObject':'eyeHit', 'handData.graspedObject':'grasp'})
    .query('subjectID == 1006 & trialNum == 6')
    .query('eyeHit != "Other"')
    .sort_values('timestamp')
    [['timestamp', 'eyeHit']]
    .set_index('timestamp')
)
sns.set(context = "talk", style="whitegrid", palette="dark", font_scale=1, rc={'figure.figsize':(11.7,8.27)})
_, ax = plt.subplots(figsize=(20,10))
# One indicator column per gazed object; cast to int so the scaling below works
# the same whether get_dummies returns integers or booleans.
tmpdf = pd.get_dummies(tmpdf, prefix='').astype(int)
# Scale column k by k. The multiplier length follows the number of objects
# actually gazed at; a fixed 1..16 fails when the trial has fewer than 16.
tmpdf = tmpdf.mul(np.arange(1, tmpdf.shape[1] + 1))
tmpdf.plot(ax=ax)


In [None]:
# A bare `raise` with no active exception raises RuntimeError. Presumably a
# deliberate stop so "Run All" does not continue into the scratch cells below
# - confirm.
raise

# Scratch

In [None]:
# Scratch check: the integers 1..16 (the end value 17 is exclusive).
np.arange(1,17)