# Imports

In [11]:
import pandas as pd
import numpy as np
from os import path
from IPython.display import display
from tqdm.notebook import tqdm

# Combine raw data files
Concatenate raw data and codify participant IDs into a more readable form.

In [None]:
# Directory that receives the combined file. The outlier-exclusion step reads
# 'data/combined_main.csv', so the combined data must be written under 'data'.
data_path = 'data'

# Open main files and combine them; ignore_index gives the stacked frame a
# unique row index instead of repeating each file's 0..n-1 labels.
df = pd.concat(
    (
        pd.read_csv('data/raw/ig_main.csv'),
        pd.read_csv('data/raw/eg_main.csv'),
    ),
    ignore_index=True,
)

# Codify subject IDs: replace each distinct raw ID with its integer category code
df['sid'] = df['sid'].astype('category').cat.codes

# Save combined data
df.to_csv(path.join(data_path, 'combined_main.csv'), index=False)

# Exclude outliers
Exclude participants whose allocation variance (spread of trial counts across the four activities) or response bias is too high.

In [162]:
def make_clean_dataset(input_data_path, save_path, **kwargs):
    """Drop outlier participants from the combined data and save the rest.

    Two per-participant statistics are computed and stored in new columns:

    * ``alloc_var``: standard deviation (``np.std``) of the number of rows
      the participant has for each of the activities A1-A4.
    * ``resp_bias``: for each ``family``, the share of rows taken by the most
      frequent ``response``; averaged over families.

    A participant is excluded when ``alloc_var >= av_crit``, or, if not
    already flagged that way, when ``resp_bias`` exceeds the across-participant
    mean by more than ``rb_crit`` standard deviations.

    Parameters
    ----------
    input_data_path : str
        CSV file with at least the columns ``sid``, ``group``, ``activity``,
        ``family`` and ``response``.
    save_path : str
        Destination CSV for the cleaned data; nothing is written if falsy.
    **kwargs
        ``av_crit`` (allocation variance cut-off) and ``rb_crit`` (response
        bias cut-off, in standard deviations). Both are required.

    Raises
    ------
    KeyError
        If ``av_crit`` or ``rb_crit`` is not supplied.
    """
    # Read the criteria first so a missing one fails before the slow loop
    av_crit = kwargs['av_crit']
    rb_crit = kwargs['rb_crit']

    # Define a response bias function
    def rbf(responses):
        _, response_counts = np.unique(responses, return_counts=True)
        return np.max(response_counts) / np.sum(response_counts)

    # Open combined data file
    df = pd.read_csv(input_data_path, index_col=None).set_index('sid')

    # Initialize columns to record values of interest. They are created as
    # floats because the statistics written below are floats; starting from
    # integer zeros would force a dtype change on the first assignment.
    df['alloc_var'], df['resp_bias'] = 0.0, 0.0

    # Calculate values of interest
    activities = ('A1', 'A2', 'A3', 'A4')
    for sid, sdf in tqdm(df.groupby(by='sid'), desc='Progress: '):
        # Allocation variance
        counts = [(sdf.activity == a).sum() for a in activities]
        df.loc[sid, 'alloc_var'] = np.std(counts)

        # Response bias (only the response column is handed to rbf, so the
        # grouping column is never part of the applied data)
        response_bias = sdf.groupby('family')['response'].apply(rbf).mean()
        df.loc[sid, 'resp_bias'] = response_bias

    # Detect high allocation variance and response bias (one row per subject)
    df_ = df.reset_index().groupby('sid').head(1).reset_index()
    df_['high_av'] = df_.alloc_var >= av_crit
    rb_threshold = df_.resp_bias.mean() + rb_crit * df_.resp_bias.std()
    # A subject already flagged for allocation variance is not counted again
    df_['high_rb'] = np.logical_and(df_.resp_bias > rb_threshold, ~df_.high_av)

    display(df_.groupby(by='group')[['high_av', 'high_rb']].sum().astype(int))
    print('Found {} outliers'.format(np.logical_or(df_.high_av, df_.high_rb).sum()))

    # Exclude outliers
    outlier = df_.loc[df_.high_av | df_.high_rb, 'sid']
    df = df.loc[~df.index.isin(outlier), :]
    display(df.reset_index().groupby(by='group')['sid'].nunique())

    # Save data
    if save_path:
        df.reset_index().to_csv(save_path, index=False)
    

# Run the outlier-exclusion step (switch the guard to False to skip this cell)
if True:
    make_clean_dataset(
        'data/combined_main.csv',   # combined raw data
        'data/clean_data.csv',      # where the cleaned data is written
        av_crit=100,                # allocation variance critical value
        rb_crit=2,                  # response bias critical value
    )

HBox(children=(HTML(value='Progress: '), FloatProgress(value=0.0, max=400.0), HTML(value='')))




Unnamed: 0_level_0,high_av,high_rb
group,Unnamed: 1_level_1,Unnamed: 2_level_1
0,35,10
1,20,5


Found 70 outliers


group
0    154
1    176
Name: sid, dtype: int64

# Calculate heuristics
|Heuristic|Description ($t_i$ = trial number $i$; $w$ = window size)|
|:-------:|:--------------------------------------------------------|
| **PC**  | overall competence ($t_0$ to $t_i$)                     |
| **rPC** | recent competence ($t_{i-w}$ to $t_i$)                  |
| **rLP** | recent learning progress ($t_{i-w}$ to $t_i$)           |
| **SC**  | self-challenge ($1 -$ min–max-normalized rPC of the activity chosen at $t_i$) |

In [161]:
# Define a function for computing recent LP
def rlp_func(x, subwindow_1, subwindow_2, abs_lp=True):
    """Compare the mean of the start of a window with the mean of its end.

    Returns mean(first ``subwindow_1`` items) minus mean(last ``subwindow_2``
    items) of ``x``; the absolute value is returned when ``abs_lp`` is true.
    """
    head_mean = np.mean(x[:subwindow_1])
    tail_mean = np.mean(x[-subwindow_2:])
    delta = head_mean - tail_mean
    if abs_lp:
        return np.abs(delta)
    return delta


def make_heuristics_dataset(input_data_path, save_path, **kwargs):
    """Compute per-trial performance heuristics for every subject and save them.

    For each subject and each activity A1-A4 the following columns are filled
    (``N`` is the activity digit):

    * ``pcN``:  running mean of ``correct`` over all of the subject's rows for
      that activity so far.
    * ``rpcN``: rolling mean of ``correct`` over the last ``window_size`` rows
      of that activity.
    * ``rlpN``: ``rlp_func`` applied to the same rolling window.

    Values are forward-filled across the subject's rows so that every row
    carries the latest value of each activity. ``sc`` is one minus the
    current activity's ``rpc`` after min-max scaling between the running
    minimum and maximum ``rpc`` (over activities and the last 250 rows); it
    is set to NaN on rows whose ``stage`` is ``'train'``.

    Parameters
    ----------
    input_data_path : str
        CSV file with at least the columns ``sid``, ``activity``, ``group``,
        ``stage``, ``trial`` and ``correct``.
    save_path : str
        Destination CSV; nothing is written if falsy.
    **kwargs
        ``window_size``, ``subwindow_size_1`` and ``subwindow_size_2``
        (all required); the last two are passed on to ``rlp_func``.

    Raises
    ------
    KeyError
        If one of the required keyword arguments is missing.
    """
    # Read the settings first so a missing one fails before the slow loop
    window_size = kwargs['window_size']
    subwindow_size_1 = kwargs['subwindow_size_1']
    subwindow_size_2 = kwargs['subwindow_size_2']

    # Read clean data and drop unused data
    df = pd.read_csv(input_data_path, index_col=None).set_index(['sid', 'activity'])
    df = df.loc[:, ['group', 'stage', 'trial', 'correct']]
    df = df.loc[df.trial <= 60+250]
    # Rows are left in file order (the index is not sorted): the forward-fill
    # and the rolling min/max below run across a subject's rows in this order.
    # NOTE(review): this assumes the file lists each subject's trials
    # chronologically; confirm against the upstream data.

    # Add new columns
    activities = ['A1', 'A2', 'A3', 'A4']
    for heuristic in ['pc', 'rpc', 'rlp']:
        for a in activities:
            df['{}{}'.format(heuristic, a[1])] = np.nan
    df['sc'] = np.nan

    # Calculate dynamic performance heuristics for each subject
    act_codes = {'A1': 1, 'A2': 2, 'A3': 3, 'A4': 4}
    for i, sdf in tqdm(df.groupby('sid'), desc='Progress: '):
        for a in activities:
            # A subject who never chose this activity has no rows to process;
            # its columns simply stay NaN instead of raising a KeyError.
            if (i, a) not in sdf.index:
                continue
            x = sdf.loc[(i, a), 'correct'].astype(int)

            # Overall competence (pc)
            pc = np.cumsum(x) / np.arange(1, x.size+1)
            df.loc[(i, a), 'pc{}'.format(a[1])] = pc

            # Recent competence (rpc)
            rpc = x.rolling(min_periods=window_size, window=window_size).mean()
            df.loc[(i, a), 'rpc{}'.format(a[1])] = rpc

            # Recent learning progress (rlp)
            rlp = x.rolling(min_periods=window_size, window=window_size).apply(
                rlp_func, args=(subwindow_size_1, subwindow_size_2), raw=False
            )
            df.loc[(i, a), 'rlp{}'.format(a[1])] = rlp

        # Carry each activity's latest values forward onto the rows of the
        # other activities (ffill() replaces the deprecated fillna(method=...))
        df.loc[(i, slice(None)), :] = df.loc[(i, slice(None)), :].ffill(axis=0)

        # Self-challenge (sc)
        rpc_max = df.loc[(i, slice(None)), 'rpc1':'rpc4'].max(axis=1).rolling(min_periods=1, window=250).max()
        rpc_min = df.loc[(i, slice(None)), 'rpc1':'rpc4'].min(axis=1).rolling(min_periods=1, window=250).min()
        act_inds = np.array([act_codes[a] for a in sdf.index.get_level_values(1).tolist()]) - 1
        # Pick, on every row, the rpc column of the activity played on that
        # row. The row positions come from the subject's actual row count, so
        # subjects with fewer than 60+250 rows are handled as well.
        current_rpc = df.loc[(i, slice(None)), 'rpc1':'rpc4'].values[np.arange(act_inds.size), act_inds]
        sc = 1 - (current_rpc-rpc_min)/(rpc_max-rpc_min)
        df.loc[(i, slice(None)), 'sc'] = sc

    df = df.reset_index().sort_values(by=['sid', 'trial'])
    df.loc[df.stage=='train', 'sc'] = np.nan    # make sure SC is NaN during training
    display(df.loc[(df.sid==0)&(df.trial>=1)&(df.trial<70), :])

    # Save data
    if save_path:
        # NOTE: df was already reset above, so this second reset_index() adds
        # an extra 'index' column to the CSV; kept so the saved schema that
        # downstream code may read does not change.
        df.reset_index().to_csv(save_path, index=False)
    
    
# Run the heuristics step (switch the guard to False to skip this cell)
if True:
    make_heuristics_dataset(
        'data/clean_data.csv',      # cleaned data from the outlier step
        'heuristics_data.csv',      # where the heuristics table is written
        window_size=15,
        subwindow_size_1=10,
        subwindow_size_2=6,
    )

HBox(children=(HTML(value='Progress: '), FloatProgress(value=0.0, max=330.0), HTML(value='')))




Unnamed: 0,sid,activity,group,stage,trial,correct,pc1,pc2,pc3,pc4,rpc1,rpc2,rpc3,rpc4,rlp1,rlp2,rlp3,rlp4,sc
0,0,A3,0,train,1,False,,,0.000000,,,,,,,,,,
1,0,A3,0,train,2,False,,,0.000000,,,,,,,,,,
2,0,A3,0,train,3,False,,,0.000000,,,,,,,,,,
3,0,A3,0,train,4,True,,,0.250000,,,,,,,,,,
4,0,A3,0,train,5,False,,,0.200000,,,,,,,,,,
5,0,A3,0,train,6,True,,,0.333333,,,,,,,,,,
6,0,A3,0,train,7,False,,,0.285714,,,,,,,,,,
7,0,A3,0,train,8,False,,,0.250000,,,,,,,,,,
8,0,A3,0,train,9,False,,,0.222222,,,,,,,,,,
9,0,A3,0,train,10,True,,,0.300000,,,,,,,,,,


In [86]:
# Load a previously pickled dataset through the project's loc_utils helper.
# NOTE(review): 'data/lpreds_data.pkl' is not produced by any cell shown in
# this notebook; confirm which script creates it.
import python_scripts.utils.loc_utils as lut
df = lut.unpickle('data/lpreds_data.pkl')

In [87]:
# Preview the first rows of the loaded data
df.head()

Unnamed: 0,sid,grp,ntm,good,post,pre,lapt,sc_grand,sc_sw,sc_lep,sc_blk,ei,ri
0,0,0,3,False,0.966667,0.5,0.622222,0.25,0.454545,0.250783,0.335569,0.46,0.486957
1,1,0,3,True,0.788889,0.722222,0.711111,0.450756,0.471605,0.310822,0.433688,0.5,0.4
2,2,0,1,True,0.488889,0.488889,0.533333,0.433218,0.35,0.511014,0.514412,0.54,0.733333
3,3,0,3,True,0.955556,0.866667,0.866667,0.406033,0.36453,0.204654,0.35507,0.412,0.990291
4,4,0,2,False,0.733333,0.588889,0.6,0.323278,0.551587,0.262039,0.431748,0.308,0.519481
