* creates `deriv/recall_df.csv`


In [1]:
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt

import sklearn
import brainiak
import nilearn as nl

from nilearn import image, plotting, input_data
from scipy.spatial import distance

# show up to 200 rows when a dataframe is displayed in the notebook
pd.set_option('display.max_rows', 200)

In [2]:
# load the view dataframe from deriv/; the first csv column becomes the index
# NOTE(review): view_df is not referenced again in the cells shown here - confirm it is still needed
view_df = pd.read_csv(filepath_or_buffer='deriv/view_df.csv', index_col=0)

### make the initial logfile df with onsetTR, offsetTR and wed_id - 12 rows, one per wedding

In [3]:
def read_mylogfile(sub_num):
  """ 
  read the recall logfile of one subject into a dataframe
    NB using a different logfile from the one used in view

  sub_num: integer subject number
    (the folder name uses 100+sub_num, the file name uses sub_num)
  returns df with one row per stimulus line of the logfile
    (12 expected: each row a recall cue), indexed by the raw timestamp,
    with columns sub_num, wed_id (str) and onsetTR
    (whole TRs elapsed since the timestamp on the first line)
  raises the error from open() if the logfile does not exist
  """
  TR_rate = 1.5
  # load sub log file
  fpath = 'data/behav/silvy_buckets/sub%.2iday2/%i_recall_mylog.log'%(100+sub_num,sub_num)
  # init df
  sub_df = pd.DataFrame(columns=['sub_num','wed_id','onsetTR'])
  # context manager closes the file handle even if a line fails to parse
  with open(fpath, "r") as f:
    first_line = f.readline()
    # extract first TR tstamp
    first_tstamp = float(first_line.split()[1])
    # loop over rows in logfile (stim onsets)
    for x in f:
      # blank lines (e.g. an empty line at the end of the file) hold no event
      if not x.strip():
        continue
      # extract info
      RV,tstamp,stim = x.split(' ')
      # the timestamp token carries one trailing non-numeric character
      tstamp = float(tstamp[:-1])
      # file stem without its first character, e.g. stem 'W12' -> '12'
      wed_id = stim.split('/')[-1].split('.')[0][1:]
      onsetTR = np.floor((tstamp-first_tstamp)/TR_rate)
      # populate df with info
      sub_df.loc[tstamp,'sub_num'] = sub_num
      sub_df.loc[tstamp,'wed_id'] = wed_id
      sub_df.loc[tstamp,'onsetTR'] = onsetTR
  return sub_df


def get_final_recall_TR(sub_num):
  """
  final recall TR of a subject, derived from the number of rows
  in the selected nuisance (confounds) file
  NB roi files have one extra TR compared to the nuisance file,
    so the row count is bumped by one
  """
  nuisance_path = "data/fmri/selected_nuisance/sub-%i_ses-02_task-recall_confounds_selected.txt"%(100+sub_num)
  n_nuisance_rows = len(pd.read_csv(nuisance_path))
  # +1: extra TR in ROI files compared to nuisance
  return n_nuisance_rows + 1


def include_offsetTR_col(df):
  """ 
  add wed_num_recall and offsetTR columns to a 12-row log df
    rows are ordered by onsetTR and re-indexed 0..11
    offset of an event is set to the onset of the next event
    offset of the last event comes from get_final_recall_TR
  returns the sorted, extended copy
  """
  n_cues = 12
  last_cue = n_cues - 1
  ordered = df.sort_values('onsetTR')
  # position of each cue in recall order
  ordered.loc[:,'wed_num_recall'] = np.arange(n_cues)
  ordered.index = np.arange(n_cues)
  # every cue but the last ends where the following cue starts
  next_onsets = ordered.iloc[1:].onsetTR.values
  ordered.loc[np.arange(last_cue),'offsetTR'] = next_onsets
  # last cue ends at the final TR of the run
  subject = ordered.sub_num.unique()[0]
  ordered.loc[last_cue,'offsetTR'] = get_final_recall_TR(subject)
  return ordered



### from logfile_df make recall_df where each row is a TR

In [4]:
def init_recall_df(logdf):
  """
  unroll logdf (one row per cue) into a df with one row per TR:
    each TR in [onsetTR, offsetTR) of a cue yields a row holding
    the TR plus that cue's sub_num, wed_id, wed_num_recall,
    onsetTR and offsetTR, all cast to int
  """
  per_TR_records = [
    {
      'TR': int(TR),
      'sub_num': int(cue.sub_num),
      'wed_id': int(cue.wed_id),
      'wed_num_recall': int(cue.wed_num_recall),
      'onsetTR': int(cue.onsetTR),
      'offsetTR': int(cue.offsetTR),
    }
    for _, cue in logdf.iterrows()
    for TR in np.arange(cue.onsetTR, cue.offsetTR)
  ]
  return pd.DataFrame(per_TR_records)



### include recall transcriptions for each TR of recall_df

In [5]:
def load_transcript_df(sub_num):
  """ 
  load the recall transcription csv of a subject, transposed,
  with missing entries filled with 0 and everything cast to int
  the index is converted from seconds to TRs (1.5 s per TR, truncated)
    NB this causes duplicate index values
    duplicates are resolved by resolve_tdf_duplicates
  """
  TR_rate = 1.5
  csv_path = 'data/behav/silvy_buckets/recallTranscriptions/S%i.csv'%sub_num
  tdf = pd.read_csv(csv_path,index_col=0).T.fillna(0)
  # subject 6 only: newline entries are replaced by 0
  if sub_num==6: 
    tdf = tdf.replace('\n',0)   
  # transform index from seconds to TRs
  seconds = tdf.index.astype(int)
  tdf.index = (seconds/TR_rate).astype(int)
  return resolve_tdf_duplicates(tdf.astype(int))

def resolve_tdf_duplicates(tdf):
  """
  when converting tdf seconds to TRs,
    there are duplicate index values
    to resolve duplicates, 
      take row with most non-zeros
      (the first such row wins a tie)
      *NB could be improved*
  returns df with one row per unique index value of tdf,
    in order of first appearance
  raises ValueError (from pd.concat) if tdf has no rows
  """
  L = []
  for idx in tdf.index.unique():
    rows = tdf.loc[idx,:]
    # a unique label comes back as a Series, a duplicated one as a DataFrame
    if rows.ndim==1:
      row = rows
    else:
      # resolve duplicate entries
      # argmax on the numpy array so the result is always a row position
      keep_row_num = (rows != 0).sum(axis=1).values.argmax()
      row = rows.iloc[keep_row_num]
    L.append(row)
  # axis must be given by keyword: pandas >= 2.0 rejects it positionally
  return pd.concat(L,axis=1).T

In [6]:
def include_recall_transcription(recall_df,transcript_df):
  """ 
  look up the transcribed recall value for every TR row of recall_df
    transcript row    : TR minus the row's onsetTR
    transcript column : "W<wed_id>"
  NB the 'recall' column is written into the passed recall_df itself;
    the returned df is a copy with 'recall' cast to int
  """
  for row_label,TR_row in recall_df.iterrows():
    # number of TRs elapsed since the cue onset selects the transcript row
    TRs_since_onset = TR_row.TR - int(TR_row.onsetTR)
    wed_col = "W%i"%int(TR_row.wed_id)
    recall_df.loc[row_label,'recall'] = int(transcript_df.loc[TRs_since_onset,wed_col])
  return recall_df.astype({'recall':int})

### loop over subjects

In [7]:
def build_sub_recall_df(sub_num):
  """
  build the per-TR recall df of one subject:
    logfile -> per-cue df with onset/offset TRs -> per-TR df
    -> add transcribed recall for each TR
  any error raised by the steps (e.g. missing file) propagates to the caller
  """
  log_df = read_mylogfile(sub_num)
  log_df = include_offsetTR_col(log_df)
  recall_df = init_recall_df(log_df)
  ## include transcribed recall
  # load_transcript_df already returns the duplicate-resolved transcript,
  # so no second resolve_tdf_duplicates pass is needed here
  transcript_df = load_transcript_df(sub_num)
  recall_df = include_recall_transcription(recall_df,transcript_df)
  return recall_df

In [8]:
# build one recall df per subject; subjects whose files are missing or
# malformed are reported and skipped
dfL = []
for sub_num in np.arange(45):
  try:
    dfL.append(build_sub_recall_df(sub_num))
  except Exception as err:
    # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through;
    # the reason is printed so a skipped subject can be diagnosed
    print('err, sub',sub_num,'-',type(err).__name__,err)

recall_df = pd.concat(dfL)

err, sub 0
err, sub 1
err, sub 15
err, sub 16
err, sub 20
err, sub 21


### clean-up and save

In [9]:
## REINDEX: rows numbered 0..n-1 across all subjects, TR stored as int
recall_df.index = np.arange(recall_df.shape[0])
recall_df = recall_df.astype({'TR': int})

In [10]:
# save the per-TR recall table (index included) to deriv/
recall_df.to_csv(path_or_buf='deriv/recall_df.csv')