### 02_interpolate_decision_mt.ipynb
In this script, we resample the existing mouse-tracking data into a dataframe with one sample every 20 ms. When the mouse did not move during an interval, the last recorded position is carried forward to the interpolated timepoints (zero-order hold interpolation).

In [None]:
import os 
import pandas as pd 
import numpy as np 
from glob import glob

%autosave 180

In [None]:
# Establish directories (the project root is taken to be two levels above the
# current working directory)
currPath = os.getcwd()
rootPath = os.path.abspath(os.path.join(currPath, os.pardir, os.pardir))
dataPath = os.path.join(rootPath, 'data/raw/raw_trajectories')
proc_dataPath = os.path.join(rootPath, 'data/processed/trajectories') # processed data path

# Get the subject files, sorted by name.
# os.listdir also returns hidden entries (e.g. .DS_Store, .ipynb_checkpoints),
# which sort first and are not trajectory files, so skip anything starting with '.'.
sub_files = sorted(f for f in os.listdir(dataPath) if not f.startswith('.'))

In [None]:
# Are we saving files?
# Set to True to write each subject's formatted CSV into proc_dataPath.
# When False, the processing loop only prints a reminder and writes nothing.
# Files that already exist are never overwritten, regardless of this toggle.
saveToggle = False

#### Interpolate the mouse tracking data for each subject

In [None]:
def _resample_trial(curr_trial, step_ms=20):
    """Resample one trial's mouse samples onto a regular time grid (zero-order hold).

    The grid starts at the trial's smallest 'Elapsed' value and advances in
    steps of ``step_ms`` until it covers the largest one.

    * The first grid point takes the first row of the trial as-is.
    * Every later grid point takes the last row whose 'Elapsed' falls in
      ``(grid_time - step_ms, grid_time]``.
    * If no sample falls in that window (the mouse did not move), the
      previously emitted row is carried forward.

    Parameters
    ----------
    curr_trial : pd.DataFrame
        Rows of a single trial; must contain an 'Elapsed' column.
    step_ms : int, optional
        Spacing of the output grid, in the units of 'Elapsed' (default 20).

    Returns
    -------
    pd.DataFrame
        One row per grid point, with two added columns: 'Elapsed_new' (the
        grid time) and 'timepoint_rel_0' (the grid time relative to 0).
    """
    elapsed = curr_trial['Elapsed'].values
    tp = np.arange(np.min(elapsed), np.max(elapsed) + step_ms, step_ms)

    trial_rows = []
    # enumerate gives the grid index directly (no list(tp).index(...) lookup)
    for tp_index, curr_time in enumerate(tp):
        if tp_index == 0:
            # First timepoint: keep the first recorded sample and its own time
            time_df = pd.DataFrame(curr_trial.iloc[0]).T.copy()
            new_elapsed = time_df['Elapsed'].values[0]
        else:
            in_window = (elapsed > curr_time - step_ms) & (elapsed <= curr_time)
            curr_time_match = curr_trial[in_window]
            if len(curr_time_match) > 0:
                # Take the last row, in case several samples fall within one step
                time_df = pd.DataFrame(curr_time_match.iloc[-1]).T.copy()
            else:
                # No movement in this window: hold the previous position
                time_df = trial_rows[-1].copy()
            new_elapsed = curr_time

        time_df['Elapsed_new'] = new_elapsed
        time_df['timepoint_rel_0'] = tp_index * step_ms
        trial_rows.append(time_df)

    return pd.concat(trial_rows)


def _join_as_str(values):
    """Return the values of a column as a single space-separated string."""
    return ' '.join(values.values.astype(str))


# Columns kept from the encoding file. The 'Spreadsheet: ' prefix is dropped on
# output, and 'Spreadsheet: trial' is renamed to 'block_trial'.
enc_base_cols = ['Current Spreadsheet', 'Trial Number', 'Display', 'Screen',
                 'Response Type', 'Response', 'Reaction Time']
enc_sheet_cols = ['display', 'block', 'trial', 'stimulus_id',
                  'expected_goal', 'fold',
                  'stimulus_name', 'c_group',
                  'condition', 'within_block_dist',
                  'paired_status', 'stimulus_path',
                  'expected_location', 'block_split',
                  'respStim_1', 'respStim_2']
enc_keep_cols = enc_base_cols + ['Spreadsheet: ' + c for c in enc_sheet_cols] + ['sub_label']
enc_new_cols = (enc_base_cols
                + ['block_trial' if c == 'trial' else c for c in enc_sheet_cols]
                + ['sub_label'])

# Number of decision trials each subject's encoding file is expected to contain
n_expected_trials = 160

# Columns of the raw mouse file that are not needed downstream
mouse_drop_cols = ['Experiment ID', 'Experiment Version', 'Screen ID', 'Screen Index',
                   'Screen Counter', 'Spreadsheet Index', 'Unique ID']

# For each file....
# NOTE(review): the [:1] slice restricts processing to the first file only;
# remove it to process every subject.
for m in sub_files[:1]:
    # The subject label is the part of the file name before the first underscore
    sub_label = os.path.basename(m).split('_')[0]

    # sub_files holds bare file names, so build the full path before reading
    # (os.path.join leaves m untouched if it is already an absolute path)
    curr_mouse = pd.read_csv(os.path.join(dataPath, m))

    # Remove the practice trials
    curr_mouse.columns = [x.strip(' ') for x in curr_mouse.columns] # get rid of extra spaces
    # .copy() so the column assignment below acts on an independent frame
    curr_mouse = curr_mouse[curr_mouse['Screen'] != 'practice_stim'].copy()

    # Place the subject label in the dataframe
    # NOTE(review): the original took m.split('/')[-2], which fails on a bare
    # file name; the file-name-derived label is assumed to be what was intended.
    curr_mouse['sub'] = sub_label

    # Remove columns and rows we don't need
    curr_mouse = curr_mouse.drop(columns=mouse_drop_cols)

    #######################################################################################################################
    ####### Interpolate mousetracking data

    # Resample every trial separately, then stack the trials into one dataframe
    trial_nums = np.unique(curr_mouse['Trial Count'].values)
    mouse_interp = pd.concat(
        [_resample_trial(curr_mouse[curr_mouse['Trial Count'] == t]) for t in trial_nums]
    )

    #######################################################################################################################
    ####### Link to decision data

    # Import the decision phase metadata
    # (contains non-anonymized data, not on Github repo)
    # NOTE(review): the original pattern had no %s placeholder, so the `%`
    # formatting raised TypeError. The pattern below assumes the encoding file
    # is named '<sub_label>_..._encoding.csv' -- confirm against the raw_acc folder.
    enc_pattern = os.path.join(rootPath, 'data/raw/raw_acc', '%s_*encoding.csv' % sub_label)
    enc_matches = sorted(glob(enc_pattern))
    if not enc_matches:
        raise FileNotFoundError('No encoding file found for %s (pattern: %s)'
                                % (sub_label, enc_pattern))
    enc_file = pd.read_csv(enc_matches[0])

    # Keep only the response rows of the 'stim' screen
    enc_file = enc_file[enc_file.columns[33:]]
    enc_file = enc_file[(enc_file['Screen'] == 'stim') & (enc_file['Response Type'] == 'response')]
    # The comparison must sit outside len(); len(enc_file == 160) only counts rows
    assert len(enc_file) == n_expected_trials, (
        'Expected %d decision trials for %s, found %d'
        % (n_expected_trials, sub_label, len(enc_file)))

    # Select and rename columns (copy so later assignments do not touch a slice)
    enc_file = enc_file[enc_keep_cols].copy()
    enc_file.columns = enc_new_cols
    enc_file.reset_index(drop=True, inplace=True)

    # Attach the interpolated trajectory of every answered trial to its row
    trial_rows = zip(enc_file.index.tolist(),
                     enc_file['Trial Number'].tolist(),
                     enc_file['Response'].tolist())
    for trial_idx, t, response in trial_rows:
        # Nothing to attach if the subject did not answer this trial
        if response == 'miss':
            continue

        # Get the mouse data for this trial
        trial_mouse = mouse_interp[mouse_interp['Trial Count'] == t]

        # Store positions and times as space-separated strings
        enc_file.loc[trial_idx, 'x_pos'] = _join_as_str(trial_mouse['X Normalised'])
        enc_file.loc[trial_idx, 'y_pos'] = _join_as_str(trial_mouse['Y Normalised'])
        enc_file.loc[trial_idx, 'time_trajectories'] = _join_as_str(trial_mouse['Elapsed_new'])

    # Then save this file to our processed data folder
    enc_mouse_file_dir = os.path.join(proc_dataPath, '%s_enc_mt_formattedData.csv' % (sub_label))
    if not saveToggle:
        print('File not saved—trigger save toggle.')
    elif os.path.exists(enc_mouse_file_dir):
        # Never overwrite an existing processed file
        print('File already exists...')
    else:
        os.makedirs(proc_dataPath, exist_ok=True) # make sure the output folder exists
        enc_file.to_csv(enc_mouse_file_dir, index=False)
        print('File saved:', os.path.basename(enc_mouse_file_dir))
