# Groupings


In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import os
import re
import sys
sys.path.append('../process data/')
import scipy.stats as stats
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.offline as pyo
import plotly.express as px
from encode_processed_data import encode_data

In [4]:
# with open('../data objects/batch_processing_object.pkl', 'rb') as file2:
#     bp = pickle.load(file2)

# ---- fetch data object ----x
# Unpickle the batch-processing object (the file name suggests it already
# carries the encodings) and bind it to `ed` for the rest of the notebook.
# NOTE(review): pickle can execute arbitrary code on load; only open trusted files.
encoded_object_path = '../data objects/batch_processing_object_with_encodings.pkl'
with open(encoded_object_path, mode='rb') as pickle_file:
    ed = pickle.load(pickle_file)

# List the attribute names stored on the loaded object.
vars(ed).keys()

dict_keys(['raw', 'fitts_summary_stats', 'corsi_summary_stats', 'navon_summary_stats', 'nback_summary_stats', 'demographics_plot', 'demographics'])

In [5]:
# Show the data object's attribute reference (appears to print the listing below).
ed.describe_data()



        ------------------------------------------------------------------
            self.path            : raw data loc
            self.metadata        : mturk metadata
            self.mapping         : reference table
            self.data_times      : reference times table
            self.participants    : list of participant identifiers
            self.parti_code      : list of participant codes
            self.n               : total number of samples
            self.wcst_paths      : paths to wcst  raw data
            self.nback_paths     : paths to nback raw data
            self.corsi_paths     : paths to corsi raw data
            self.fitts_paths     : paths to fitts raw data
            self.navon_paths     : paths to navon raw data
            self.wcst_data       : wcst  dataframe
            self.nback_data      : nback dataframe
            self.corsi_data      : corsi dataframe
            self.fitts_data      : fitts dataframe
            self.navon_data    

In [6]:
# Show the per-task column reference (appears to print the listing below).
ed.clean_data_info()



                WCST - Wisconsin Card Sorting Task                                                  DataFrame: ed.raw.wcst_date
            ---------------------------------------------------------------------------------------------------------------------------
            
                participant                     : key               : participant ID
                card_no                         : categorical       : the card shown
                correct_card                    : categorical       : the card that should be clicked of the top four on screen      
                correct_persevering             : categorical       : the card that would be clicked if the participant is persevering
                seq_no                          : numeric           : trial number
                rule                            : categorical       : matching rule  
                card_shape                      : categorical       : current card shape
                card_num

In [7]:
# ----- all categories descriptors -----x
# Column keys used as categorical descriptors.
cats_demographics = [
    'gender_a',
    'handedness_a',
    'education_a',
    'age_group',
]
cats_navon = [('level_of_target', '')]

# ---- add numerical descriptors ----x
# Column keys used as numerical descriptors; tuples are (variable, statistic).
num_demographics = [
    'age_a',
    'income_a',
    'computer_hours_a',
    'mean_reation_time_ms',
]
num_nback = [('block_number', '')] + [
    (measure, stat)
    for measure in ('score', 'status', 'miss', 'false_alarm', 'reaction_time_ms')
    for stat in ('mean', 'std')
]
num_navon = [
    (measure, stat)
    for measure in ('correct', 'too_slow', 'reaction_time_ms')
    for stat in ('mean', 'std')
]
num_corsi = [
    ('highest_span', 'max'),
    ('status', 'mean'),
    ('status', 'std'),
]
num_fitts = [
    ('delta', 'mean'),
    ('delta', 'std'),
    ('status', 'mean'),
]


# ---- user selected menus ---x

# Fitts Law

In [8]:
def fitts_law_deviation(n=10, data=None):
    """Bin rows by their mean Fitts-law deviation and build a histogram of it.

    Parameters
    ----------
    n : int
        Number of equally spaced bin edges laid between the smallest and the
        largest ``('delta', 'mean')`` value, which gives ``n - 1`` bins.
    data : pandas.DataFrame, optional
        Frame with a ``('delta', 'mean')`` column. When omitted, that column
        of ``ed.fitts_summary_stats`` is used; it is looked up at call time
        rather than frozen when the function is defined.

    Returns
    -------
    dict
        ``'data'``: a copy of ``data`` with two added columns, ``'bin'`` (a
        text label) and ``'group'`` (1-based bin index); rows that fall in no
        bin keep ``'na'``.
        ``'figure'``: a plotly histogram of ``('delta', 'mean')``.
    """
    if data is None:
        data = ed.fitts_summary_stats[[('delta', 'mean')]]

    # ---- compute bins ----x
    def compute_fitts_bins(data, n=n):
        # Work on a copy so neither the caller's frame nor a shared default
        # is modified by the column assignments below.
        data = data.copy()
        deltas = data[('delta', 'mean')]
        # Series.min/max skip NaN, unlike the builtin min/max.
        rng = np.linspace(deltas.min(), deltas.max(), n)
        data['bin'] = 'na'
        data['group'] = 'na'
        for r in range(n - 1):
            lower, upper = rng[r], rng[r + 1]
            # Bins are (lower, upper]; the first one is closed on the left as
            # well, otherwise the row holding the minimum matches no bin.
            above_lower = (deltas >= lower) if r == 0 else (deltas > lower)
            in_bin = above_lower & (deltas <= upper)
            # Label format is kept as 'upper-lower' (rounded edges).
            data.loc[in_bin, 'bin'] = str(round(upper)) + '-' + str(round(lower))
            data.loc[in_bin, 'group'] = r + 1
        return data

    data = compute_fitts_bins(data=data)

    # ---- plot ----x
    trace = go.Histogram(x=data[('delta', 'mean')], marker_color='#4ca3dd')
    layout = go.Layout(
        title='Mean Fitts Law Deviation',
        xaxis={'title': 'deviation from expected (Fitts Law Predicted) RT'},
        yaxis={'title': 'Frequency'},
        template='none', width=700, height=500,
    )
    fig = go.Figure(data=trace, layout=layout)

    return {'data': data, 'figure': fig}


# Run with the default arguments; the trailing expression displays the figure.
fitts = fitts_law_deviation()
fitts['figure']

In [9]:
# Preview the first rows of the binned Fitts data.
fitts['data'].head()

Unnamed: 0_level_0,delta,bin,group
Unnamed: 0_level_1,mean,Unnamed: 2_level_1,Unnamed: 3_level_1
0,-5.7,44--137,9
1,-203.05,-137--318,8
2,-342.65,-318--499,7
3,-85.05,44--137,9
4,-141.8,-137--318,8


# Corsi Block Span

In [10]:
# Preview the first rows of the Corsi summary statistics.
data = ed.corsi_summary_stats
data.head()

Unnamed: 0_level_0,participant,highest_span,n_items,status,status
Unnamed: 0_level_1,Unnamed: 1_level_1,max,max,mean,std
0,100934.0,4,5,0.428571,0.534522
1,103322.0,6,7,0.625,0.517549
2,107700.0,6,7,0.625,0.517549
3,117200.0,4,5,0.5,0.547723
4,117306.0,5,6,0.571429,0.534522


In [11]:
  # ---- plot ----x
# Histogram of the ('highest_span', 'max') column of the Corsi summary table.
data = ed.corsi_summary_stats
trace = go.Histogram(
    marker_color='#c43078',
    x=data[('highest_span', 'max')],
)
layout = go.Layout(
    title='Max corsi block span distribution',
    xaxis=dict(title='Max Corsi Block Span'),
    yaxis=dict(title='Frequency'),
    template='none',
    width=700,
    height=500,
)
fig = go.Figure(data=trace, layout=layout)
fig.show()

#   # ---- plot: % Correct (NA) ----x
# trace = go.Histogram(x=data[('status','mean')], marker_color='#18d4e4')
# layout = go.Layout( title='Max corsi block span distribution', xaxis={'title':'Max Corsi Block Span'}, yaxis={'title':'Frequency'}, 
# template='none', width=700, height=500)
# fig    = go.Figure(data=trace, layout=layout)
# fig.show()


In [13]:
# ---- nback: distribution of a summary variable (mean and std), one series per block ----x
colors = ['#A56CC1', '#A6ACEC', '#63F5EF']
data = ed.nback_summary_stats
data.head()

var = 'score'
fig = make_subplots(rows=2, cols=1, subplot_titles=(f'{var} mean', f'{var} std'))

# Collect the mean / std series per block, listed from block 3 down to block 1.
hist_data_means, hist_data_std, group_labels = [], [], []
for i in (3, 2, 1):
    in_block = data['block_number'] == i
    group_labels.append(f'block {i}')
    hist_data_means.append(data.loc[in_block, (var, 'mean')])
    hist_data_std.append(data.loc[in_block, (var, 'std')])

# Both distplots share the same options; they are used only as trace sources.
distplot_options = dict(colors=colors, show_rug=False, bin_size=0.015)
fig2 = ff.create_distplot(hist_data_means, group_labels, **distplot_options)
fig3 = ff.create_distplot(hist_data_std, group_labels, **distplot_options)

# ---- add subplots ----x
# Traces 0-2 of each distplot are copied as histograms (in reverse order),
# traces 3-5 as scatter traces; row 1 shows the means, row 2 the stds.
for i in reversed(range(3)):
    fig.add_trace(go.Histogram(fig2['data'][i], legendgroup=1), row=1, col=1)
    fig.add_trace(go.Histogram(fig3['data'][i], showlegend=False), row=2, col=1)
for j in range(3, 6):
    fig.add_trace(go.Scatter(fig2['data'][j]), row=1, col=1)
    fig.add_trace(go.Scatter(fig3['data'][j]), row=2, col=1)

# Add title
fig.update_layout(title_text='Nback Task Distribution', template='none')
fig.show()

In [17]:
# Rebind `data` to the nback summary table and show its descriptive statistics.
data = ed.nback_summary_stats
data.head()  # result is discarded; the table below comes from describe()
data.describe()

Unnamed: 0_level_0,participant,block_number,trial_counter,score,score,status,status,miss,miss,false_alarm,false_alarm,reaction_time_ms,reaction_time_ms
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,mean,std,mean,std,mean,std,mean,std
count,762.0,762.0,762.0,762.0,762.0,762.0,762.0,762.0,762.0,762.0,762.0,762.0,762.0
mean,546111.299213,2.0,26.666667,0.308727,0.460112,0.749694,0.366391,0.198381,0.340992,0.110346,0.232506,2214.854812,855.473684
std,261751.306114,0.817033,4.717141,0.090589,0.0424,0.193614,0.148714,0.134232,0.174419,0.126008,0.178033,640.105743,417.501856
min,100934.0,1.0,20.0,0.05,0.223607,0.15,0.0,0.0,0.0,0.0,0.0,149.366667,0.0
25%,317920.0,1.0,20.0,0.25,0.444262,0.633333,0.305129,0.1,0.305129,0.0,0.0,1975.783333,706.367101
50%,528567.5,2.0,30.0,0.3,0.470162,0.8,0.410391,0.2,0.410391,0.05,0.223607,2301.191667,1029.825048
75%,790956.0,3.0,30.0,0.366667,0.490133,0.9,0.48936,0.3,0.466092,0.2,0.406838,2657.325,1159.9737
max,998593.0,3.0,30.0,0.6,0.512989,1.0,0.512989,0.566667,0.512989,0.566667,0.512989,3000.0,1361.925104


In [56]:
# Scratch values: a sample count and 100 evenly spaced points on [0, 10].
N = 1000
t = np.linspace(start=0, stop=10, num=100)



def scatter_plot(data, xvar, yvar, group_var=False, xlab='', ylab='', title='', cols=px.colors.qualitative.Pastel):
    """Build a plotly scatter figure of ``yvar`` against ``xvar``.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table.
    xvar, yvar : column key
        Columns plotted on the x and y axes.
    group_var : column key or False
        When given, one trace is drawn per unique value of this column;
        otherwise all rows form a single trace.
    xlab, ylab, title : str
        Axis labels and figure title.
    cols : sequence of str
        Colour palette. The ungrouped trace uses ``cols[0]``; grouped traces
        start at ``cols[1]`` and wrap around when there are more groups than
        colours.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    if not group_var:
        traces = [go.Scatter(x=data[xvar], y=data[yvar], mode='markers', marker_color=cols[0])]
        layout = go.Layout(title=title, xaxis={'title': xlab}, yaxis={'title': ylab}, template='none')
    else:
        traces = []
        # Counting starts at 1 so existing figures keep their colours; the
        # modulo avoids an IndexError once groups outnumber the palette.
        for c, g in enumerate(np.unique(data[group_var]), start=1):
            dt = data.loc[data[group_var] == g]
            # Numeric groups are labelled by their rounded value; anything
            # that cannot be rounded (e.g. string categories, NaN) is
            # labelled as-is instead of raising.
            try:
                label = str(round(g))
            except (TypeError, ValueError):
                label = str(g)
            traces.append(go.Scatter(x=dt[xvar], y=dt[yvar], mode='markers',
                                     marker_color=cols[c % len(cols)], name=label))
        layout = go.Layout(title=title, xaxis={'title': xlab}, yaxis={'title': ylab}, template='none',
                           legend_title_text='Trend')
    fig = go.Figure(data=traces, layout=layout)
    return fig

# % correct against mean reaction time, one trace per nback block.
scatter_plot(
    data=data,
    xvar=('reaction_time_ms', 'mean'),
    yvar=('status', 'mean'),
    group_var='block_number',
    xlab='Reaction Time (ms)',
    ylab='% Correct',
    title='Performance by Reaction Time (RT)',
)


    

In [77]:
# ----- NBack -----x
# Average the ('status', 'mean') and ('reaction_time_ms', 'mean') columns over
# all rows belonging to each participant, then flatten the column names.
participant_means = {
    ('status', 'mean'): ['mean'],
    ('reaction_time_ms', 'mean'): ['mean'],
}
x = ed.nback_summary_stats.groupby('participant').agg(participant_means)
x.columns = ['nback_status', 'nback_reaction_time_ms']
x.head()

Unnamed: 0_level_0,nback_status,nback_reaction_time_ms
participant,Unnamed: 1_level_1,Unnamed: 2_level_1
100934.0,0.894444,2218.077778
103322.0,0.916667,2366.011111
107700.0,0.95,2137.211111
117200.0,0.766667,2359.627778
117306.0,0.966667,2323.222222


In [58]:
# Descriptive statistics for every column of the nback summary table.
ed.nback_summary_stats.describe()

Unnamed: 0_level_0,participant,block_number,trial_counter,score,score,status,status,miss,miss,false_alarm,false_alarm,reaction_time_ms,reaction_time_ms
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,mean,std,mean,std,mean,std,mean,std
count,762.0,762.0,762.0,762.0,762.0,762.0,762.0,762.0,762.0,762.0,762.0,762.0,762.0
mean,546111.299213,2.0,26.666667,0.308727,0.460112,0.749694,0.366391,0.198381,0.340992,0.110346,0.232506,2214.854812,855.473684
std,261751.306114,0.817033,4.717141,0.090589,0.0424,0.193614,0.148714,0.134232,0.174419,0.126008,0.178033,640.105743,417.501856
min,100934.0,1.0,20.0,0.05,0.223607,0.15,0.0,0.0,0.0,0.0,0.0,149.366667,0.0
25%,317920.0,1.0,20.0,0.25,0.444262,0.633333,0.305129,0.1,0.305129,0.0,0.0,1975.783333,706.367101
50%,528567.5,2.0,30.0,0.3,0.470162,0.8,0.410391,0.2,0.410391,0.05,0.223607,2301.191667,1029.825048
75%,790956.0,3.0,30.0,0.366667,0.490133,0.9,0.48936,0.3,0.466092,0.2,0.406838,2657.325,1159.9737
max,998593.0,3.0,30.0,0.6,0.512989,1.0,0.512989,0.566667,0.512989,0.566667,0.512989,3000.0,1361.925104


In [None]:
# ---------------- demographic data
# Per-column settings (labels, colours, title, short name) for the categorical
# demographic variables; presumably consumed by pie-chart plotting code.
demo_pie_map = dict(
    gender_a=dict(
        dummy_var='gender_a',
        labels=['male', 'female', 'other'],
        colors=['steelblue', 'darkred', 'cyan'],
        title='Gender Distribution',
        name='gender',
    ),
    education_a=dict(
        dummy_var='education_a',
        labels=['university', 'graduate school', 'high school'],
        colors=['rgb(177, 127, 38)', 'rgb(129, 180, 179)', 'rgb(205, 152, 36)'],
        title='Education Distribution',
        name='education',
    ),
    handedness_a=dict(
        dummy_var='handedness_a',
        labels=['right', 'left', 'ambidextrous'],
        colors=px.colors.sequential.RdBu,
        title='Handedness Distribution',
        name='handedness',
    ),
    age_group=dict(
        dummy_var='age_group',
        # Age labels are read from the data rather than hard-coded.
        labels=np.unique(ed.demographics[['age_group']]).tolist(),
        colors=px.colors.sequential.GnBu,
        title='Age Distribution',
        name='age',
    ),
)
    
# Axis labels and title prefix for each continuous demographic column.
demo_continuous_naming = {
    column: {'xlab': x_label, 'ylab': 'Count', 'name': title_prefix}
    for column, x_label, title_prefix in (
        ('age_a', 'Age', 'Age Distribution by '),
        ('income_a', 'Income', 'Income Distribution by '),
        ('computer_hours_a', 'Computer hours', 'Computer Hours Distribution by '),
        ('mean_reation_time_ms', 'RT (reaction time (ms))', 'RT Distribution by '),
    )
}

In [49]:
# ----- all categories descriptors -----x
# Categorical descriptor columns.
cats_demographics = ['gender_a', 'handedness_a', 'education_a', 'age_group']
cats_navon = [('level_of_target', '')]

# ---- add numerical descriptors ----x
# Numerical descriptor columns; each tuple is (variable, statistic).
num_demographics = ['age_a', 'income_a', 'computer_hours_a', 'mean_reation_time_ms']

num_nback = [('block_number', '')]
num_nback += [
    (variable, statistic)
    for variable in ('score', 'status', 'miss', 'false_alarm', 'reaction_time_ms')
    for statistic in ('mean', 'std')
]

num_navon = [
    (variable, statistic)
    for variable in ('correct', 'too_slow', 'reaction_time_ms')
    for statistic in ('mean', 'std')
]

num_corsi = [('highest_span', 'max')] + [('status', statistic) for statistic in ('mean', 'std')]
num_fitts = [('delta', statistic) for statistic in ('mean', 'std')] + [('status', 'mean')]


# Grouping variables offered for selection, listed by the table they come from.
categorical_vars = (
    ['gender_a', 'education_a', 'handedness_a', 'age_group']  # ------ demographics -----x
    + [('delta', 'mean')]                                     # ------ Fitts ------x
    + ['nback_status', 'nback_reaction_time_ms']              # ------ Nback ------x
)