# Selecting demographic variables
### Model free analysis 

This notebook employs t-tests to select `demographic variables` by testing whether statistically significant differences in average `wcst` performance are observable between different demographic groups.

Where statistically significant differences are observed, one would like to keep the demographic information in the model. However, where no significant relationship can be drawn, it is best to remove the superfluous parameters for parsimony.

---------
```
author:         Zach Wolpe
email:          zachcolinwolpe@gmail.com
reviewer:       n/a
date:           06 Dec 2021
```
---------

In [6]:
import sys
sys.path.append('../process data')
from dependencies import *
# import nbformatx

In [7]:
# load data --------------*
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open pickle files that were produced by this project.
with open('../data objects/batch_processing_object_with_encodings.pkl', 'rb') as pkl_file:
    bp = pickle.load(pkl_file)

# create object ----------*
# wraps the loaded object in the plotting/summary helper used in the rest of the notebook
spf = summary_plots_and_figures(bp)

# view keys --------------*
def show_keys(obj, name):
    """Print the instance-attribute names stored on `obj`.

    Writes a blank line, then the header '<name> object attributes: ',
    then one '   - <attribute>' line for every key of ``obj.__dict__``.

    Parameters
    ----------
    obj : any object exposing ``__dict__``
    name : str
        Label used in the header line.
    """
    print()
    header = name + ' object attributes: '
    print(header)
    for attribute in obj.__dict__:
        print('   -', attribute)

# list the attributes of each working object; the second element is only the
# label printed in the header line
for target, label in ((bp, 'ed'), (bp.raw, 'bp'), (spf, 'spf')):
    show_keys(target, label)


ed object attributes: 
   - raw
   - summary_table
   - fitts_summary_stats
   - corsi_summary_stats
   - navon_summary_stats
   - nback_summary_stats
   - demographics_plot
   - demographics

bp object attributes: 
   - path
   - metadata
   - mapping
   - data_times
   - participants
   - parti_code
   - n
   - wcst_paths
   - nback_paths
   - corsi_paths
   - fitts_paths
   - navon_paths
   - wcst_data
   - nback_data
   - corsi_data
   - fitts_data
   - navon_data
   - individual_data
   - MTurk

spf object attributes: 
   - ed
   - wcst_performance
   - final_data_aggregated
   - final_data_unaggregated
   - continuous_vars
   - categorical_vars
   - demographic_groups
   - demographic_cont_vars
   - demo_pie_map
   - demo_continuous_naming


# Outlier removal

In [30]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots



# psychological variables shown in the 2x3 grid (one histogram per panel)
columns = ['wcst_accuracy', 'wcst_RT', 'nback_status', 'nback_reaction_time_ms', 'fitts_mean_deviation', 'corsi_block_span']
# colours
clrs = ['#1a3dea', '#fbab75', '#2180c7', '#7ba053', '#ffc34c', '#006b68', '#537fbb', '#80c080', '#537fbb']

# grid positions: panels are filled column by column (top cell, then bottom cell)
row = [1,2]*3
col = [1,1,2,2,3,3]

df = bp.summary_table

fig = make_subplots(rows=2, cols=3)

# zip stops at the shortest sequence, so only the first six colours are consumed
for clr, colmn, r, c in zip(clrs, columns, row, col):
    # the trace name appears in the legend and identifies the panel's variable;
    # marker_color applies the palette entry paired with this variable
    fig.add_trace(go.Histogram(x=df[colmn], name=colmn, marker_color=clr), row=r, col=c)

fig.update_layout(template='none')

fig.show()

In [15]:
# show_keys(bp, 'ed')
# show_keys(bp.raw, 'bp')
# show_keys(spf, 'spf')




#
def plot_threshold(df, variable, threshold, var_name=None, colour='steelblue'):
    """Show a histogram of one variable with a dashed vertical line at `threshold`.

    The figure title reports the percentage of rows whose value is strictly
    below `threshold`.

    Parameters
    ----------
    df : table indexable by column name (e.g. a pandas DataFrame)
    variable : str
        Column of `df` to plot.
    threshold : number
        Cut-off value; drawn as a dark-red dashed line.
    var_name : str, optional
        Display name used in the title; falls back to `variable`.
    colour : str
        Bar colour passed to plotly.
    """
    # mean of the boolean mask is a 0-1 proportion; scale it so the value really
    # is the percentage announced in the title. With pandas/numpy data, missing
    # values compare as "not below" and stay in the denominator.
    below = round(100 * np.mean(df[variable] < threshold), 1)
    var_name = var_name or variable
    fig = px.histogram(df, x=variable, marginal="box", template='none', title=f'{var_name}: % subjects below threshold={below}', color_discrete_sequence=[colour])
    fig.add_vline(x=threshold, line_width=3, line_dash="dash", line_color="darkred")
    fig.show()

# psychological variables
columns = ['nback_status', 'nback_reaction_time_ms', 'fitts_mean_deviation', 'corsi_block_span', 'navon_level_of_target', 'navon_perc_correct', 'navon_reaction_time_ms', 'wcst_accuracy', 'wcst_RT']

# colours: one hex code per variable above ('indigo-red' was not a valid colour
# value and left the palette one entry longer than the variable list)
clrs = ['#1a3dea', '#fbab75', '#2180c7', '#7ba053', '#ffc34c', '#006b68', '#537fbb', '#80c080', '#537fbb']


# Demographics selection

In [4]:
# draw one participant id at random from the summary table's index
p = np.random.choice(bp.summary_table.index)

# rows of the raw WCST data that belong to that participant
is_selected = bp.raw.wcst_data.participant == p
sub = bp.raw.wcst_data.loc[is_selected, :]

# summary-table rows for the same participant, restricted to the two WCST columns
wcst_summary_cols = ['wcst_accuracy', 'wcst_RT']
bp.summary_table.loc[bp.summary_table.index == p, wcst_summary_cols]



Unnamed: 0_level_0,wcst_accuracy,wcst_RT
participant,Unnamed: 1_level_1,Unnamed: 2_level_1
469441.0,0.85,1336.19
469441.0,0.85,1336.19
469441.0,0.85,1336.19


In [5]:

# Sanity check of the t-test: both samples are drawn from the same normal
# distribution (mean 5, sd 10), so a large p-value is the expected (though not
# guaranteed) outcome.
TTEST_DEMO_SEED = 2021  # fixed so a fresh kernel run reproduces the same statistic
rng = np.random.default_rng(TTEST_DEMO_SEED)
rvs1 = stats.norm.rvs(loc=5, scale=10, size=500, random_state=rng)
rvs2 = stats.norm.rvs(loc=5, scale=10, size=500, random_state=rng)
stats.ttest_ind(rvs1, rvs2)

Ttest_indResult(statistic=0.10759511119051789, pvalue=0.9143385009811279)