* creates `deriv/recall_df.csv`


In [1]:
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt

import sklearn
import brainiak
import nilearn as nl

from nilearn import image, plotting, input_data
from scipy.spatial import distance

pd.options.display.max_rows = 200

In [2]:
view_df = pd.read_csv('deriv/view_df.csv',index_col=0)

### make the initial logfile df with onsetTR, offsetTR and wed_id - 12 rows, one per wedding

In [3]:
def read_mylogfile(sub_num):
  """ 
  read logfile into dataframe
    NB using a different logfile from the one used in view
  returns df with 12 rows: each row recall cue
  """
  TR_rate = 1.5
  # load sub log file
  fpath = 'data/behav/silvy_buckets/sub%.2iday2/%i_recall_mylog.log'%(100+sub_num,sub_num)
  f = open(fpath, "r")
  first_line = f.readline()
  # extract first TR tstamp
  first_tstamp = float(first_line.split()[1])
  # init df
  sub_df = pd.DataFrame(columns=['sub_num','wed_id','onsetTR'])
  # loop over rows in logfile (stim onsets)
  for x in f:
    # extract info
    RV,tstamp,stim = x.split(' ')
    tstamp = float(tstamp[:-1])
    wed_id = stim.split('/')[-1].split('.')[0][1:]
    onsetTR = np.floor((tstamp-first_tstamp)/TR_rate)
    # populate df with info
    sub_df.loc[tstamp,'sub_num'] = sub_num
    sub_df.loc[tstamp,'wed_id'] = wed_id
    sub_df.loc[tstamp,'onsetTR'] = onsetTR 
  return sub_df


def get_final_recall_TR(sub_num):
  """
  currently using nuisance var file to find last TR
  NB roi files have extra TR, so I adjust
  """
  final_TR = pd.read_csv(
    "data/fmri/selected_nuisance/sub-%i_ses-02_task-recall_confounds_selected.txt"%(100+sub_num)).shape[0]
  # adjust for extra TR in ROi files compared to nuisance
  final_TR += 1 
  return final_TR


def include_offsetTR_col(df):
  """ set offset of even as onset of next event
  """
  df = df.sort_values('onsetTR')
  df.loc[:,'wed_num_recall'] = np.arange(12)
  df.index = np.arange(12)
  df.loc[np.arange(11),'offsetTR'] = df.iloc[1:].onsetTR.values
  # final TR from roi file
  df.loc[11,'offsetTR'] = get_final_recall_TR(sub_num)
  return df



### from logfile_df make recall_df where each row is a TR

In [20]:
def init_recall_df(logdf):
  """
  expand logdf so that each row is a TR
  """
  L = []
  for idx,log_df_row in logdf.iterrows(): 
    for TR in np.arange(log_df_row.onsetTR,log_df_row.offsetTR):
      D = {}
      D['TR'] = int(TR)
      D['sub_num'] = int(log_df_row.sub_num)
      D['wed_id'] = int(log_df_row.wed_id)
      D['wed_num_recall'] = int(log_df_row.wed_num_recall)
      D['onsetTR'] = int(log_df_row.onsetTR)
      D['offsetTR'] = int(log_df_row.offsetTR)
      L.append(D)
  return pd.DataFrame(L)



### include recall transcriptions for each TR of recall_df

In [21]:
def load_transcript_df(sub_num):
  """ 
  reindex from seconds to TRs
    NB this causes duplicates
    to resolve duplicates, non-zero entries break the tie
  """
  TR_rate = 1.5
  tdf = pd.read_csv('data/behav/silvy_buckets/recallTranscriptions/S%i.csv'%sub_num,index_col=0).T.fillna(0)
  if sub_num==6: 
    tdf = tdf.replace('\n',0)   
  # transform index from seconds to TRs
  tdf.index = (tdf.index.astype(int)/TR_rate).astype(int)
  tdf = tdf.astype(int)
  return resolve_tdf_duplicates(tdf)

def resolve_tdf_duplicates(tdf):
  """
  when converting tdf seconds to TRs,
    there are duplicates
    to resolve duplicates, 
      take row with most non-zeros
      *NB could be improved*
  """
  L = []
  for idx in tdf.index.unique():
    rows = tdf.loc[idx,:]
    # detect if there are duplicates
    if len(rows.shape)==1:
      row = rows
    elif len(rows.shape)>1:
      # resolve duplicate entries
      keep_row_num = np.sum(rows != 0,1).argmax()    
      row = rows.iloc[keep_row_num]
    L.append(row)
  return pd.concat(L,1).T

In [22]:
def include_recall_transcription(recall_df,transcript_df):
  """ 
  using wedding_id to match between 
  information in log_df with transcriptions
  """
  for idx,recall_df_row in recall_df.iterrows():
    transcript_df_row_num = recall_df_row.TR - int(recall_df_row.onsetTR)
    recall = transcript_df.loc[transcript_df_row_num,"W%i"%int(recall_df_row.wed_id)]
    recall_df.loc[idx,'recall'] = int(recall)
  recall_df = recall_df.astype({'recall':int})
  return recall_df

### schema and path information

In [34]:
sub_num = 33
recall_df = build_sub_recall_df(sub_num)

for idx,recall_df_row in recall_df.iterrows():
  ## select rows corresponding to subject/wedding
  sub_bool = (view_df.sub_num == recall_df_row.sub_num)
  wed_bool = (view_df.wed_id == recall_df_row.wed_id)
  sub_wed_view_df = view_df[sub_bool & wed_bool]
  ## get row corresponding to first event
  sub_wed_view_row = sub_wed_view_df[sub_wed_view_df.state.str[0] == '2']
  path = sub_wed_view_row.wed_schema + sub_wed_view_row.state.str[1]
#   recall_df.loc[idx,'wed_schema'] = sub_wed_view_row.wed_schema
  print(idx
  recall_df.loc[recall_df_row,'wed_path'] = path
  
recall_df

#   # find path for given sub/wed in view_df
#   view_row = sub_wed_view_df[sub_wed_view_df.state.str[0] == '2']
#   path = view_row.schema + view_row.state.str[1]
#   # include path in recall_df
#   try:
#     recall_df.loc[idx,'schema'] = str(view_row.schema.values[0])
#     recall_df.loc[idx,'path'] = str(path.values[0])
#   except:
#     print('err S=',row.sub_num)
    
# sub_wed_view_df

0


ValueError: Incompatible indexer with Series

In [None]:
assert False

### loop over subjects

In [7]:
def build_sub_recall_df(sub_num):
  log_df = read_mylogfile(sub_num)
  log_df = include_offsetTR_col(log_df)
  recall_df = init_recall_df(log_df)
  ## include transcribed recall
  transcript_df = load_transcript_df(sub_num)
  transcript_df = resolve_tdf_duplicates(transcript_df)
  recall_df = include_recall_transcription(recall_df,transcript_df)
  return recall_df

In [12]:
sub_num=33
L = []
for sub_num in np.arange(33,35):
  sub_recall_df = build_sub_recall_df(sub_num)
  L.append(sub_recall_df)
  
group_recall_df = pd.concat(L)

Unnamed: 0,TR,sub_num,wed_id,wed_num_recall,onsetTR,offsetTR,recall
0,0,33,22,0,0,40,0
1,1,33,22,0,0,40,0
2,2,33,22,0,0,40,0
3,3,33,22,0,0,40,0
4,4,33,22,0,0,40,1
...,...,...,...,...,...,...,...
1028,504,34,17,11,483,509,0
1029,505,34,17,11,483,509,0
1030,506,34,17,11,483,509,0
1031,507,34,17,11,483,509,0


### clean-up and save

In [None]:
group_recall_df.index = np.arange(len(group_recall_df))
group_recall_df

In [9]:
assert False

AssertionError: 

In [None]:
""" loop over subejcts to make group recall_df"""
rm_subs = [2,15]

L = []
err_sub_L = []
for sub_num in np.arange(45):
  if sub_num in rm_subs:
    continue
  print('sub',sub_num)
  try:
    ldf = load_logdf(sub_num)
    tdf = load_transcript_df(sub_num)
    sub_recall_df = make_sub_recall_df(ldf,tdf)
    L.append(sub_recall_df)
  except:
    err_sub_L.append(sub_num)
    print('ERR. sub',sub_num)

## CONCAT AND REIDNEX
recall_df = pd.concat(L)
recall_df.index = np.arange(len(recall_df))

In [None]:
'missing subjects'
err_sub_L

### include path and schema 

In [None]:
recall_df = recall_df.astype({'TR':int})

In [None]:
recall_df.to_csv('deriv/recall_df.csv')