## this NB generates `view_df` - a dataframe containing information from viewing runs
- steps:
- load `logfile_df` from psychopy logfile
    - NB `logfile_df` has a row for each timestamp
    - extract TR information and remove rows corresponding to timestamps before first TR
- from `logfile_df`, extract `state_dict`: which events were visited in each wedding
- from `logfile_df` and `state_dict`, make `timing_df` 
    - NB `timing_df` has a row for each event in the experiment
- include wedding level information (schema and wed_id)
- `timing_df` = `view_df`

In [1]:
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt

import sklearn
import brainiak
import nilearn as nl
from nilearn import image, plotting, input_data

from glob import glob as glob

from scipy.spatial import distance

pd.options.display.max_rows = 200

### load logfile_df and estimate the TR of each timestep


In [2]:
def read_logfile(sub_num):
  """ 
  read psychopy logfile into dataframe

  args:
    sub_num: int subject number; the logfile is read from
      data/behav/silvy_buckets/sub<100+sub_num>day2/<sub_num>_viewing.log
  returns:
    dataframe indexed by timestamp, with columns
      'tstamp'  (float) first tab-separated field of the row
      'logdata' (str)   third tab-separated field of the row
    rows that do not have exactly three tab-separated fields are skipped
  raises:
    OSError if the logfile cannot be opened
    ValueError if the first field of a three-field row is not a number
  """
  fpath = 'data/behav/silvy_buckets/sub%iday2/%i_viewing.log'%(100+sub_num,sub_num)
  # tstamp -> logdata; a repeated tstamp keeps its first position and its last value
  logdata_D = {}
  # context manager closes the file even if parsing raises
  with open(fpath, "r") as f:
    # loop over logfile rows
    for line in f:
      fields = line.split('\t')
      # original note: "deal with S11,39" - skip rows without exactly three fields
      if len(fields) != 3: continue
      # strip only the line terminator, so a last row without newline is kept whole
      logdata_D[float(fields[0])] = fields[2].rstrip('\n')
  # build the dataframe in one go rather than growing it row by row
  tstamps = list(logdata_D.keys())
  df = pd.DataFrame({'tstamp':tstamps,'logdata':list(logdata_D.values())},
                    index=tstamps,columns=['tstamp','logdata'])
  df = df.astype({'tstamp':float})
  return df

def estimate_TR(df, TR_rate=1.5, hrf_lag_TRs=3):
  """
  takes logdf (indexed by tstamps)
  includes the TR on each row
  corrects for hemodynamic lag (HRF)

  args:
    df: dataframe with columns 'tstamp' and 'logdata'; it is not modified
    TR_rate: duration of one TR, in the units of 'tstamp'
    hrf_lag_TRs: number of TRs added to every row to account for the lag
  returns:
    new dataframe, indexed by row number, where
      'tstamp' is centered on the first 'Keypress: equal' row
      'TR' is ceil(tstamp/TR_rate) + hrf_lag_TRs
    rows whose TR (before the lag is added) is negative are removed;
    NB because of the ceil, rows less than one TR before the first
    'Keypress: equal' get TR 0 and are kept
  raises:
    IndexError if no row has logdata 'Keypress: equal'
  """
  # first TR
  trigger_rows = df[df.logdata=='Keypress: equal']
  if len(trigger_rows) == 0:
    # same exception type as indexing the empty selection, but with the reason
    raise IndexError("no 'Keypress: equal' row in logfile: cannot locate first TR")
  first_TR_tstamp = trigger_rows.iloc[0].tstamp
  # work on a copy so that the caller's dataframe is left untouched
  out = df.copy()
  # center tstamps on first TR
  out['tstamp'] = out['tstamp']-first_TR_tstamp
  # include TR column
  out['TR'] = np.ceil(out['tstamp']/TR_rate)
  out = out.astype({'TR':int})
  # remove negative TRs (explicit copy: the next line assigns into the result)
  out = out[out.TR>-1].copy()
  #  HRF hemodynamic lag 
  out['TR'] = out['TR'] + hrf_lag_TRs
  # reindex by row number
  out.index = np.arange(len(out))
  return out

def make_logfile_df(sub_num):
  """
  load the viewing logfile of subject `sub_num` and label its rows with TRs
  (read_logfile followed by estimate_TR)
  """
  return estimate_TR(read_logfile(sub_num))

### go from logfile_df (indexed by timestamps) to timing_df (TR level information)

In [3]:
def init_sub_timing_df(logdf,sub_num):
  """ 
  this fun transforms the logfile_df (each timestamp a row) 
  into the timing_df (each experiment event a row)
  edit needed: 
    fun takes dict with info about the state_value

  args:
    logdf: dataframe with columns 'logdata' and 'TR'
    sub_num: subject number, copied into every output row
  returns:
    dataframe indexed by row number with columns
      'sub_num', 'vid_str', 'wed_num', 'onsetTR'
    onsetTR is the TR of the rows whose logdata is '<vid_str>: autoDraw = True'
    '_q' videos are assigned to weddings 0 and 11, the others to weddings 1-10
  raises:
    ValueError if the number of onsets logged for a video differs from the
      number of weddings that video is assigned to
  """
  vid_strL = ['vid1a','vid1b','vid2','vid3','vid4','vid5',
           'vid1a_q','vid1b_q','vid2_q','vid3_q','vid4_q','vid5_q']
  df_row_L = []
  ## loop over video strings, extract onset TRs
  for vid_str in vid_strL:
    ## extract onset TR for wedding string
    TR_vals = logdf[logdf.logdata == '%s: autoDraw = True'%vid_str].TR.values
    ## weddings in which this video is shown
    if vid_str.endswith('_q'):
      wed_nums = [0,11]
    else:
      wed_nums = np.arange(1,11)
    num_weds = len(wed_nums)
    ## a mismatch would otherwise surface as an uninformative pandas length error
    if len(TR_vals) != num_weds:
      raise ValueError(
        "expected %i onsets of '%s' for subject %s, found %i in logfile"
        %(num_weds,vid_str,sub_num,len(TR_vals)))
    df_row = pd.DataFrame.from_dict({
      'sub_num':np.repeat(sub_num,num_weds),
      'vid_str':np.repeat(vid_str,num_weds),
      'wed_num':wed_nums,
      'onsetTR':TR_vals
    })
    df_row_L.append(df_row)
  df = pd.concat(df_row_L)
  df.index = np.arange(len(df))
  return df

def include_offset_TRs(df):
  """
  adds the columns 'len_TRs' and 'offsetTR' to a timing df (in place)
  and returns that same df

  len_TRs is the duration of the row's video converted to TRs (1.5 per TR,
  rounded), offsetTR is onsetTR + len_TRs

  note from Silvy on the durations:
    26 seconds of intro, then 9 seconds start-event,
    17 seconds campfire or flower (depending on label in pkl),
    23 seconds coin or torch, 24 seconds egg or painting, and remainder gifts.
    NB CURRENTLY USING 10S AS PLACEHOLDER FOR FINAL EVENT (gifts)
  """
  TR_rate = 1.5
  ## duration in seconds of each video (vid5 is the 10s placeholder)
  vid_secs_D = {'vid1a':26,'vid1b':9,'vid2':17,'vid3':23,'vid4':24,'vid5':10}
  ## the '_q' variant of a video has the same duration as the video itself
  for suffix in ('','_q'):
    for vid,secs in vid_secs_D.items():
      n_TRs = int(np.round(secs/TR_rate))
      is_vid = df['vid_str']==vid+suffix
      df.loc[is_vid,'len_TRs'] = n_TRs
  df['offsetTR'] = df['onsetTR'] + df['len_TRs']
  return df

### label each TR with state

In [4]:
def get_state_dict(logdf):
  """
  returns a dict indexed by (wed_num,state_type/depth)
  which gives the state_value [2a,2b,3a,3b,4a,4b]

  args:
    logdf: dataframe with a str column 'logdata'
  returns:
    dict {(wed_num,depth): state_id} where
      wed_num counts the rows whose logdata starts with 'Created vid1a_q',
        in order of appearance
      state_id is parsed from each of the 6 rows starting at such a row
      depth is the int value of the first character of state_id
    a later row with the same (wed_num,depth) overwrites an earlier one
  raises:
    IndexError / ValueError if one of those rows does not have the
      expected 'a,b,c,d,e,<dir>/<name>.<state_id>.<ext>' layout
  """
  prefix = 'Created vid1a_q'
  is_wed_start = (logdf.logdata.str[:len(prefix)] == prefix)
  # row positions (not index labels) since the rows are sliced with iloc below
  init_wed_pos_L = np.flatnonzero(is_wed_start.to_numpy(dtype=bool,na_value=False))
  # from these rows, uncover what states were used for given wedding 
  state_dict = {}
  for wed_num,init_wed_pos in enumerate(init_wed_pos_L):
    for logdata in logdf.logdata.iloc[init_wed_pos:init_wed_pos+6]:
      # sixth comma-separated field holds a path; state_id sits between the dots of its file name
      state_id = logdata.split(',')[5].split('/')[1].split('.')[1]
      state_dict[(wed_num,int(state_id[0]))] = state_id
  return state_dict

def include_state_value(timing_df,state_dict):
  """ 
  adds the column 'state' to timing_df (in place) and returns timing_df

  the state of a row is state_dict[(wed_num,depth)], where depth is the
  int value of the fourth character of the row's vid_str
  """
  # placeholder value, overwritten row by row below
  timing_df.loc[:,'state'] = 'N/A'
  row_info = zip(timing_df.index,timing_df.wed_num,timing_df.vid_str)
  for row_idx,wed_num,vid_str in row_info:
    depth = int(vid_str[3])
    timing_df.loc[row_idx,'state'] = state_dict[(wed_num,depth)]
  return timing_df

### wedding level info

In [5]:
def load_wed_df(sub_num):
  """
  loads the wedding level info (schema and id) of the viewing run of a subject

  reads the first 12 rows of the alphabetically last file matching
  data/behav/silvy_buckets/sub<100+sub_num>day2/<sub_num>_viewing_*trials.csv
  returns a dataframe with columns
    'wed_schema': capitalized first letter of the 'northOrSouth' column
    'wed_id':     part of 'stimFile1a' after its last '-' and before the next '.'
    'wed_num':    0-11, in row order
    'sub_num':    the given sub_num
  """
  pattern = 'data/behav/silvy_buckets/sub%iday2/%i_viewing_*trials.csv'%(100+sub_num,sub_num)
  last_fpath = sorted(glob(pattern))[-1]
  trials_df = pd.read_csv(last_fpath)
  orderdf = trials_df.iloc[:12].loc[:,['northOrSouth','stimFile1a']]
  orderdf.columns = ['wed_schema','wed_id']
  # schema: first letter of the text before the first '.'
  schema_stem = orderdf['wed_schema'].str.split('.').str[0]
  orderdf['wed_schema'] = schema_stem.str[0].str.capitalize()
  # id: text after the last '-' up to the following '.'
  id_tail = orderdf['wed_id'].str.split('-').str[-1]
  orderdf['wed_id'] = id_tail.str.split('.').str[0]
  orderdf.loc[:,'wed_num'] = np.arange(12)
  orderdf.loc[:,'sub_num'] = sub_num
  return orderdf

def include_wed_info(timing_df,wed_df):
  """
  copies 'wed_id' and 'wed_schema' from wed_df into timing_df (in place)
  and returns timing_df

  a wed_df row is applied to every timing_df row that has the same
  sub_num and wed_num
  """
  wed_rows = zip(wed_df.sub_num,wed_df.wed_num,wed_df.wed_id,wed_df.wed_schema)
  for sub_num,wed_num,wed_id,wed_schema in wed_rows:
    # timing rows that belong to this subject's wedding
    row_mask = (timing_df.sub_num == sub_num) & (timing_df.wed_num == wed_num)
    timing_df.loc[row_mask,'wed_id'] = wed_id
    timing_df.loc[row_mask,'wed_schema'] = wed_schema
  return timing_df

### wrapper for gathering all info of given subject

In [6]:
def load_sub_timing_df(sub_num):
  """ 
  builds the timing df (each experiment event a row) of one subject

  loads the subject's logfile_df and wedding df, extracts the state_dict,
  then adds state, offset TR and wedding info columns to the timing df;
  the result has int TR columns and is sorted by onsetTR
  """
  ## per-subject inputs
  log_df = make_logfile_df(sub_num)
  wed_df = load_wed_df(sub_num)
  state_dict = get_state_dict(log_df)
  ## from logfile_df to timing_df, one group of columns at a time
  df = (init_sub_timing_df(log_df,sub_num)
        .pipe(include_state_value,state_dict)
        .pipe(include_offset_TRs)
        .pipe(include_wed_info,wed_df))
  ## TR columns as ints, rows in order of onset
  int_cols = {'onsetTR':int,'len_TRs':int,'offsetTR':int}
  return df.astype(int_cols).sort_values('onsetTR')

### build `timing_df` for all subjects (each subject's df includes wedding id and wedding schema)

In [7]:
""" 
loop over subjects
"""

# timing df of every subject that loaded successfully
sub_df_L = []
# sub_num -> exception raised while loading that subject, for later inspection
sub_err_D = {}
for sub_num in np.arange(45):
  print(sub_num)
  try:
    sub_timing_df = load_sub_timing_df(sub_num)
  except Exception as err:
    # best-effort: skip the subject, but keep the reason;
    # KeyboardInterrupt / SystemExit are no longer swallowed
    print('err',sub_num)
    sub_err_D[sub_num] = err
  else:
    sub_df_L.append(sub_timing_df)

# stack the per-subject dfs (raises ValueError if no subject loaded)
timing_df = pd.concat(sub_df_L)



0
err 0
1
err 1
2
err 2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
err 16
17
18
19
20
err 20
21
err 21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
err 37
38
39
err 39
40
41
42
43
44


# cleanup and save

In [8]:
# order rows by subject and, within subject, by onset TR
timing_df = timing_df.sort_values(by=['sub_num','onsetTR'])
# renumber the rows 0..N-1
timing_df.index = np.arange(timing_df.shape[0])

In [10]:
# write the final dataframe (the `view_df`) to disk; the index is written as well
timing_df.to_csv('deriv/view_df.csv')
# last expression of the cell: displays the dataframe
timing_df

Unnamed: 0,sub_num,vid_str,wed_num,onsetTR,state,len_TRs,offsetTR,wed_id,wed_schema
0,3,vid1a_q,0,12,1b,17,29,20,S
1,3,vid1b_q,0,29,1b,6,35,20,S
2,3,vid2_q,0,35,2b,11,46,20,S
3,3,vid3_q,0,49,3a,15,64,20,S
4,3,vid4_q,0,67,4b,16,83,20,S
...,...,...,...,...,...,...,...,...,...
2659,44,vid1b_q,11,1091,1b,6,1097,38,S
2660,44,vid2_q,11,1097,2a,11,1108,38,S
2661,44,vid3_q,11,1111,3b,15,1126,38,S
2662,44,vid4_q,11,1129,4a,16,1145,38,S
