In [1]:
# Collection of functions to load log files for envelope discrimination experiment.
# Version : 3.0
# Date : June 24, 2016
# Author: Sergey Antopolsky

import pandas as pd
import os
import numpy as np
from nptdms import TdmsFile

In [2]:
def load_behavioral_data(path_txt, path_tdms, path_csv, rats, 
                         load_streams = False, save_CSV = True,
                         force_TDMS_load = False):
    """
    Loads behavioral data for envelope discrimination experiment. The data
    formats have changed over the years, so it has some built-in redundancy
    to work with older formats.
    
    The newest data format is NI TDMS, in which not only core data is saved,
    but also streams from sensors. They occupy a lot of space and take
    considerable time to load. To reduce time, this function supports saving
    core data to CSV format and loading it next time the log is requested, unless
    TDMS is newly added or the script is explicitly forced to load TDMS (useful
    when modifications have been made to the parsing algorithm).
    
    Session names have the form 'YYYYMMDD' + rat name (e.g. '20160516S8').
    TXT and CSV files are named '<session>.txt' / '<session>.csv'; TDMS files
    are named '<rat>_YYMMDD.tdms'. A file belongs to a rat only when the rat
    part of its name matches exactly, so requesting 'S1' does not load 'S10'.
    
    Parameters
    ----------
    path_txt : string
        Path to the folder with TXT log files (older format of logs)
    path_tdms : string
        Path to the folder with TDMS log files (from cRIO setup)
    path_csv : string
        Path to the folder with CSV files (the folder used to save CSV
        files if needed)
    rats : list of strings
        Names of rats whose data need to be loaded
    load_streams : boolean
        Whether to load streams of sensor data (licking sensors, nosepoke sensor).
        Currently not used by this function; kept for interface compatibility.
    save_CSV : boolean
        Save every log parsed from TXT or TDMS to a CSV file in *path_csv*
    force_TDMS_load : boolean
        Ignore existing CSV files and parse everything from .tdms and .txt
    
    Returns
    ----------
    out : Series of DataFrames 
        Contains behavioral data for every session requested, indexed by
        session name
        
    """
    
    # length of the 'YYYYMMDD' prefix of a standard session name
    date_len = 8
    
    # in some sessions data is misplaced in the TDMS log; these sessions are
    # parsed with the dedicated correction parser
    sessions_to_correct = {'20160516S8',  '20160517S8',  '20160518S8',
                           '20160516S9',  '20160517S9',  '20160518S9',
                           '20160516S10', '20160517S10', '20160518S10'}
    
    def _tdms_log_name_convert(s,reverse=False):
        """
        Converts tdms filename (Rat_YYMMDD.tdms) to standard session name (YYYYMMDDRat) and back.
        """
        if reverse:
            # everything after the 8-digit date is the rat name, whatever
            # letter it starts with
            return s[date_len:] + '_' + s[2:date_len] + '.tdms'
        name_parts = s[:-len('.tdms')].split('_')
        return '20' + name_parts[1] + name_parts[0]
    
    def _sessions_of_rat(file_names, extension, rat):
        """
        Session names of '<session><extension>' files that belong to *rat*.
        """
        stems = [f[:-len(extension)] for f in file_names]
        return [stem for stem in stems if stem[date_len:] == rat]
    
    # sessions are collected in a dict (insertion ordered, later loads of the
    # same session overwrite earlier ones) and wrapped in a Series at the end
    loaded = {}
    
    # get list of files for each format
    files_txt = [s for s in os.listdir(path_txt) if s.endswith('.txt')]
    files_tdms = [s for s in os.listdir(path_tdms) if s.endswith('.tdms')]
    files_csv = [s for s in os.listdir(path_csv) if s.endswith('.csv')]

    # go through all rats
    for rat in rats:
        
        # extract all sessions whose file name carries exactly this rat's name
        sessions_txt_rat = _sessions_of_rat(files_txt, '.txt', rat)
        sessions_tdms_rat = [_tdms_log_name_convert(s)
                             for s in files_tdms if s.split('_')[0] == rat]
        
        if not force_TDMS_load:
            sessions_csv_rat = _sessions_of_rat(files_csv, '.csv', rat)
            
            # sessions already converted to CSV are not parsed again
            already_converted = set(sessions_csv_rat)
            sessions_tdms_rat = [s for s in sessions_tdms_rat
                                 if s not in already_converted]
            sessions_txt_rat = [s for s in sessions_txt_rat
                                if s not in already_converted]
        
            # load sessions from CSV
            for s in sessions_csv_rat:
                loaded[s] = pd.read_csv(os.path.join(path_csv, s + '.csv'),
                                        index_col=0)
        
        # process and save all txt logs
        for s in sessions_txt_rat:
            loaded[s] = parse_txt_log(os.path.join(path_txt, s + '.txt'))
            
            if save_CSV:
                loaded[s].to_csv(os.path.join(path_csv, s + '.csv'))
        
        # for tdms files
        for s in sessions_tdms_rat:
            tdms_path = os.path.join(path_tdms,
                                     _tdms_log_name_convert(s,reverse=True))
            
            # parse TDMS, save to DataFrame
            if s in sessions_to_correct:
                loaded[s] = parse_TDMS_log_correction(tdms_path)
            else:
                loaded[s] = parse_TDMS_log(tdms_path)
            
            if save_CSV:
                loaded[s].to_csv(os.path.join(path_csv, s + '.csv'))

    return pd.Series(loaded, dtype=object)

In [None]:
def convert_mixed(s):
    """
    Converts string of the form '5.000000-1.000000' etc to string of the form '5-1'.
    
    Every '-'-separated field is parsed as a float, truncated to an integer
    and written back without the decimal part.
    
    Parameters
    ----------
    s : string or NaN
        Raw mixed-trial description
    
    Returns
    ----------
    out : string or NaN
        Cleaned description; NaN (np.nan) when *s* is null or an empty string
    
    20160829 by Sergey Antopolskiy
    """
    if pd.isnull(s) or len(s)==0:
        return np.nan
    
    # plain Python conversion: no need to build a pandas Series and a numpy
    # array for every single cell
    return '-'.join(str(int(float(field))) for field in s.split('-'))

In [15]:
def parse_TDMS_log(file_path):
    """
    Parse NI TDMS log file (cRIO setup) into pandas DataFrame.
    
    Every group of the TDMS file except the last one is treated as a trial;
    per-trial TDMS properties are copied into DataFrame columns, one row per
    trial. After that the 'Correct?' column is cross-checked against the
    rewarded side and the rat's choice (see *reconstruct_correct*), the
    absolute trial timestamps are made continuous across the first backward
    jump, and trials without a choice are marked as not correct.
    
    20160517 Version 1.1: shorter code (added get_property func) + got rid of
    inefficient assignments, which raised warnings. Doc started.
    
    To add:
    There are some new properties, which should be parsed, if they exist, e.g. SEED
    
    Parameters
    ----------
    file_path : string
        Path to the .tdms log file
    
    Returns
    ----------
    log_data : pandas.DataFrame
        One row per trial. Prints a message when logged correctness does not
        match reward side and choice.
    """
    
    tdms_file = TdmsFile(file_path)
    
    # get group names (in this case just "stim")
    group_names = tdms_file.groups()

    log_data = pd.DataFrame()
    # the last group is left out
    # NOTE(review): presumably because the last trial is incomplete - confirm
    log_data['Trial'] = group_names[:-1]

    ## REASSIGNING
    def get_property(name):
        """
        Utility function for pulling out different properties (columns) for TDMS file.
        Returns a numpy array with the value of property *name* for every trial
        listed in log_data['Trial'].
        """
        return np.array([tdms_file.object(trial).property(name) for trial in log_data['Trial']])

    # is the trial considered correct? (i.e. valid and choice was correct)
    # (the property name is looked up with a trailing space)
    log_data['Correct?'] = to_boolean(get_property('TRIAL RESULT '))

    # trial is valid when rat waited enough, i.e. until the go cue
    log_data['Trial_valid'] = to_boolean(get_property('TRIAL VALID'))

    # the side rat had to go to get reward (capitalised to 'Left' / 'Right')
    log_data['Rewarded Side'] = get_property('REWARD SIDE')
    log_data.loc[log_data['Rewarded Side'] == 'left','Rewarded Side'] = 'Left'
    log_data.loc[log_data['Rewarded Side'] == 'right','Rewarded Side'] = 'Right'

    # the side the rat has actually gone to (capitalised; 'Not applicable'
    # becomes NaN)
    log_data["Rat's Choice"] = get_property('RAT CHOICE')
    log_data.loc[log_data["Rat's Choice"] == 'left',"Rat's Choice"] = 'Left'
    log_data.loc[log_data["Rat's Choice"] == 'right',"Rat's Choice"] = 'Right'
    log_data.loc[log_data["Rat's Choice"] == 'Not applicable',"Rat's Choice"] = np.nan

    # modulation frequency of the sine envelope of the stimulus on this trial
    log_data['Stim_modulation_freq'] = get_property('STIM MOD FREQ')

    # gain of the stimulus; makes sense in the context of other gain values
    # (i.e. comparison between lower and higher gain trials)
    log_data['Stim_gain'] = get_property('STIM GAIN')
    
    # correct gain values (some changes in the labview code were made to make the 3 gains I used
    # to be 0.8, 1.0 and 1.2; the old values need to be corrected, but physically these are the same
    # stimuli gains): rows with gain exactly 1.38, 1.15 or 0.92 are divided by 1.15

    log_data.loc[(log_data['Stim_gain']==1.38) | 
                 (log_data['Stim_gain']==1.15) | 
                 (log_data['Stim_gain']==0.92),'Stim_gain'] = \
    log_data.loc[(log_data['Stim_gain']==1.38) | 
                 (log_data['Stim_gain']==1.15) | 
                 (log_data['Stim_gain']==0.92),'Stim_gain'] / 1.15

    
    # stimuli are seeded: same seed has the same noise sequence
    # (this can also be used to match stimuli with tracked offline sequences)
    # the column is only created when the object named 'trial 1' has the property
    if 'SEED ID' in tdms_file.object('trial 1').properties:
        log_data['Seed_ID'] = get_property('SEED ID').astype(int)

    # delay between nosepoke sensor activation and stimulus start
    log_data['PreDelay'] = get_property('PRE STIMULUS DELAY (ms)')

    # delay after stimulus has ended and the go cue
    log_data['PostDelay'] = get_property('POST STIMULUS DELAY (ms)')

    # absolute timestamp for the beginning of the trial
    # (this timestamp is what other timestamps are relative to)
    # NOTE(review): the original comment said "green LED on", while the
    # property name says BLUE LED - confirm which one is right
    log_data['T0_trial_begin_abs_sec'] = get_property('T0 BLUE LED ON  absolute time (s)')
    log_data['T1_nosepoke_in_rel_sec'] = get_property('T1 NOSEPOKE IN  relative time (s)')
    log_data['T2_stimulus_start_rel_sec'] = get_property('T2 STIMULUS START  relative time (s)')
    log_data['T3_nosepoke_out_rel_sec'] = get_property('T3 NOSEPOKE OUT  relative time (s)')
    log_data['T4_first_lick_rel_sec'] = get_property('T4 FIRST LICK  relative time (s)')
    log_data['T5_reward_delivery_rel_sec'] = get_property('T5 REWARD DELIVERY relative time (s)')
    log_data['T6_next_trial_begin_rel_sec'] = get_property('T6 NEXT BLUE LED ON relative time (s)')

    # second thought refers to when the rat in training mode has made a mistake on a previous trial
    # and went to the other side to pick up reward; second thought therefore refers to the 
    # previous trial
    log_data['Second_thought'] = to_boolean(get_property('SECOND THOUGHT'))
    log_data['T7_second_thought_abs_sec'] = get_property('SECOND THOUGHT absolute time (s)')

    log_data['Pre_reward_delay'] = get_property('PRE REWARD DELAY (ms)')
    # 'REWARD PUMP' is stored twice: here and as 'Reward_size' below
    log_data['Reward_amount_turns'] = get_property('REWARD PUMP')
    log_data['Hint_volume'] = get_property('HINT VOLUME')
    log_data['Reward_sound_volume'] = get_property('REWARD VOLUME')

    # the delay that is applied after the trial has been finished and before
    # the new trial has begun; e.g. punishment delay in case of error
    log_data['Delay_after_trial'] = get_property('FINAL DELAY (ms)')

    log_data['Stim_duration_ms'] = get_property('STIM DURATION (ms)')
    log_data['Stim_sampling_freq_Hz'] = get_property('STIM SAMPLING RATE (Sps)')
    log_data['Stim_modulation_phase_deg'] = get_property('STIM PHASE')
    log_data['Manual_reward_bias'] = get_property('REWARD SIDE BIAS')
    log_data['Reward_side_alternation_protection_threshold'] = get_property('REW SIDE ALT PROT THRESHOLD')
    log_data['Reward_side_max_repetitions'] = get_property('REW SIDE MAX REPETITIONS')
    log_data['Choice_side_alternation_protection_threshold'] = get_property('CHOICE SIDE ALT PROT THRESHOLD')
    log_data['Choice_side_repetition_protection_threshold'] = get_property('CHOICE SIDE REP PROT THRESHOLD')
    log_data['Reward_size'] = get_property('REWARD PUMP')
    log_data['N_for_choice_protections'] = get_property('N')
    log_data['Training?'] = to_boolean(get_property('TRAINING MODE'))
    log_data['Water_barrier'] = to_boolean(get_property('BARRIER'))
    log_data['Pre_lick_reward'] = to_boolean(get_property('PRE-LICK REWARD'))
    log_data['Detection?'] = to_boolean(get_property('DETECTION ONLY'))
    log_data['Hint?'] = to_boolean(get_property('HINT'))
    log_data['Stimulus_type'] = get_property('STIM TYPE')
    log_data['Reward_rule'] = get_property('STIMULUS RULE')

    # information about mixed ('catch') trials; only parsed when the first
    # group of the file has the property
    if 'CATCH TRIAL INFO' in tdms_file.object(tdms_file.groups()[0]).properties:
        log_data['Mixed_trial_info'] = get_property('CATCH TRIAL INFO')
        
        # clean up mixed trial info ('5.000000-1.000000' -> '5-1')
        log_data['Mixed_trial_info'] = log_data['Mixed_trial_info'].apply(convert_mixed)
        
        # previous clean-up, superseded by convert_mixed above
#         log_data['Mixed_trial_info'] = log_data['Mixed_trial_info'].str.replace('0','').str.replace('.','')

    # cross-check logged correctness against reward side and choice; the
    # returned vector replaces 'Correct?' and mismatching rows are reported
    log_data['Correct?'], mismatched = reconstruct_correct(log_data["Rewarded Side"],
                                                           log_data["Rat's Choice"],
                                                           log_data['Correct?'])
    if mismatched.size:
        print('Mismatched indexes: ', mismatched, ' in the file ', file_path)
    
    # sometimes there is a day transition during training, and it resets times of
    # trial begin; the following fixes the timestamps to be continuous
    # (only the first backward jump of T0 is handled)
    if np.where(log_data['T0_trial_begin_abs_sec'].diff()<0)[0].size != 0:
        # row of the first trial whose T0 is smaller than the previous one
        day_transition_ix = np.where(log_data['T0_trial_begin_abs_sec'].diff()<0)[0][0]
        # T0 and T6 (next trial begin, relative) of the trial just before the jump
        last_correct_time = log_data.at[day_transition_ix-1,'T0_trial_begin_abs_sec']
        inter_trial_time = log_data.at[day_transition_ix-1,'T6_next_trial_begin_rel_sec']
        real_delay = inter_trial_time - log_data.at[day_transition_ix,'T0_trial_begin_abs_sec']
        
        # shift all trials from the jump onwards, so that the first shifted
        # trial starts at last_correct_time + inter_trial_time
        log_data.loc[day_transition_ix:,'T0_trial_begin_abs_sec']+= (last_correct_time + real_delay)
    
    #### OVERRIDES TO FIX LOG
    # a trial without a choice can never be correct
    log_data.loc[log_data["Rat's Choice"].isnull(),'Correct?'] = False
    
    return log_data

In [4]:
def parse_txt_log(file_path):
    """
    Parse a legacy comma-separated TXT log file into pandas DataFrame.
    
    The first line of the file holds the column headers, every following
    non-blank line holds one trial. Known columns are converted to ints,
    floats or booleans with *transform_column*; all other columns stay
    strings. If the log has no "Rat's Choice" column but has
    "PostDelayElapsed", the choice is reconstructed from the rewarded side
    and the correctness of the trial.
    
    Parameters
    ----------
    file_path : string
        Path to the .txt log file
    
    Returns
    ----------
    log_data : pandas.DataFrame
        One row per trial, one column per header
    """
    
    # open file and extract lines (the file is closed even if reading fails)
    with open(file_path, 'r') as text_file:
        lines = text_file.read().splitlines()
    
    # first line is headers
    headers = lines[0].split(',')
    
    # split all other lines; blank lines (sometimes present in the beginning,
    # especially in the legacy log files) are skipped wherever they occur
    all_values = [line.split(',') for line in lines[1:] if line.strip()]
    
    # go through all headers and assign corresponding values 
    # to the column with the header name
    log_data = pd.DataFrame()
    for (index,key) in enumerate(headers):
        log_data[key] = [row[index] for row in all_values]
        
    # this look-up table is used to transform columns to certain data types
    # NOTE(review): 'High Freq Mod Begin Phase' is listed as boolean while
    # 'Low Freq Mod Begin Phase' is a float - confirm this is intended
    transform_columns_lookup = {'to_ints': ['Trial Number','PreDelay','PreDelayElapsed','PostDelay','PostDelayElapsed',
                                'Modulation Frequency Low','Modulation Frequency High','ByHowMuchSlide'],
                    'to_floats': ['LowerFrequencyGain','HigherFrequencyGain','HintGain',
                                    'Low Freq Mod Begin Phase','HintDelay'],
                    'to_booleans': ['Correct?','Stimuli Delivered','SlideStimuli','RightGoesFirst',
                                    'High Freq Mod Begin Phase'],
                    'to_time': ['Time','Time Go Cue','Time Withdraw','Time Response']}

    # transforms all columns present in the log to data types according
    # to the look up table *transform_columns_lookup*
    # (.items() works on both Python 2 and Python 3, unlike .iterkeys())
    for to_what, column_names in transform_columns_lookup.items():
        for column_name in column_names:
            if column_name in log_data.columns:
                log_data[column_name] = transform_column(log_data[column_name],to_what)
    
    if "Rat's Choice" in log_data.columns:
        log_data.loc[log_data["Rat's Choice"]=='None',"Rat's Choice"] = np.nan
        
    # in some legacy log files (e.g. 20130827V1) there is no "Rat's Choice"
    # logged, so I need to recreate it from other data
    if "PostDelayElapsed" in log_data.columns and "Rat's Choice" not in log_data.columns:
        other_side = {'Right': 'Left', 'Left': 'Right'}
        choices = []
        for i in log_data.index.values:
            # no choice unless the rat waited the whole post-stimulus delay
            choice = np.nan
            if log_data.at[i,"PostDelayElapsed"]>=log_data.at[i,"PostDelay"]:
                rewarded_side = log_data.at[i,"Rewarded Side"]
                if rewarded_side in other_side:
                    # correct trial: rat went to the rewarded side,
                    # incorrect trial: rat went to the opposite side
                    if log_data.at[i,"Correct?"]:
                        choice = rewarded_side
                    else:
                        choice = other_side[rewarded_side]
            choices.append(choice)
        log_data["Rat's Choice"] = pd.Series(choices, index=log_data.index, dtype=object)
                
    return log_data

In [5]:
def get_channels_from_tdms(file_path):
    """
    This function extracts channels (waveforms) from TDMS log file and saves it 
    to pd.DataFrame. It is not robust to the number and sequence of channels, 
    so care should be taken if structure of TDMS log is changed.
    
    Channels are addressed by their position inside every group:
    0 - nosepoke (also the source of the sensors time track), 1 - right
    licking sensor, 2 - left licking sensor, 3 - left shaker (also the source
    of the shakers time track), 4 - right shaker and, only when a group has
    exactly 7 channels, 5 - left stimulus, 6 - right stimulus.
    
    Parameters
    ----------
    file_path : string
        Path to the .tdms log file
    
    Returns
    ----------
    channels : pandas.DataFrame
        One row per TDMS group, one column per waveform; every filled cell
        holds a whole array. Cells of groups without channels stay NaN.
    
    20151106 Sergey Antopolskiy 
    """
    
    tdms_file = TdmsFile(file_path)
    
    # get group names (in this case just "stim")
    trials = tdms_file.groups()
    
    column_names = ['Time sensors','Nosepoke','Licking_R','Licking_L',
                    'Time shakers','Shaker_R','Shaker_L','Stimulus_R','Stimulus_L']
    
    # every column is collected in its own object array and the DataFrame is
    # built once at the end; this avoids chained assignment
    # (channels[col][trial] = ...), which pandas does not guarantee to write
    # into the original frame
    columns = {}
    for name in column_names:
        column = np.empty(len(trials), dtype=object)
        column[:] = np.nan
        columns[name] = column
    
    for row, trial in enumerate(trials):
        # fetch the channel list once per trial
        trial_channels = tdms_file.group_channels(trial)
        if not trial_channels:
            continue
        
        columns['Time sensors'][row] = trial_channels[0].time_track()
        columns['Nosepoke'][row] = trial_channels[0].data
        columns['Licking_R'][row] = trial_channels[1].data
        columns['Licking_L'][row] = trial_channels[2].data
        columns['Time shakers'][row] = trial_channels[3].time_track()
        columns['Shaker_R'][row] = trial_channels[4].data
        columns['Shaker_L'][row] = trial_channels[3].data
        
        # stimulus waveforms are only read when the group has exactly 7 channels
        if len(trial_channels)==7:
            columns['Stimulus_R'][row] = trial_channels[6].data
            columns['Stimulus_L'][row] = trial_channels[5].data
    
    return pd.DataFrame(columns, index=trials, columns=column_names)

In [6]:
def transform_column(column,to_what):
    """
    Convert the values of a data column to the requested data type.
    
    Parameters
    ----------
    column : iterable
        Values to convert (strings as read from a log file)
    to_what : string
        'to_ints', 'to_floats' or 'to_booleans' (a value is True only when it
        equals the string '1'). Any other request, including 'to_time', is
        not implemented: the column is handed back unchanged.
    
    Returns
    ----------
    out : list
        Converted values, or *column* itself when no conversion applies
    """ 
    if to_what == 'to_ints':
        return list(map(int, column))
    if to_what == 'to_floats':
        return list(map(float, column))
    if to_what == 'to_booleans':
        return list(map(lambda value: value == '1', column))
    
    # 'to_time' conversion is not implemented yet
    return column



In [7]:
## UTILITY

def to_boolean(array_with_TRUE_FALSE_strings):
    """
    Convert an array of textual truth values to a numpy boolean array.
    
    'FALSE', 'False', 'false', 'F' and '0' become False; 'TRUE', 'True',
    'true', 'T' and '1' become True. Whatever is left is converted by the
    final cast of the object array to bool.
    """
    flags = np.array(array_with_TRUE_FALSE_strings,dtype=object)
    
    # false spellings are replaced first, then the true ones
    for spelling in ('FALSE', 'False', 'false', 'F', '0'):
        flags[flags==spelling] = False
    
    for spelling in ('TRUE', 'True', 'true', 'T', '1'):
        flags[flags==spelling] = True
    
    return flags.astype(bool)

In [8]:
def parse_TDMS_log_correction(file_path):
    """
    Because of some changes Fabio did to the setup, some sessions 
    had a log file with mixed up data. Some data were lost (timing of 
    events), but the majority of data were just misplaced. Here I 
    correct these log files manually: every column is read from the TDMS
    property where its values actually ended up, and the lost timing
    columns are filled with NaN.
    
    This function only returns the corrected DataFrame; it does not write
    any file itself.
    
    affected sessions:
    ss = ['20160516S8',  '20160517S8',  '20160518S8',
          '20160516S9',  '20160517S9',  '20160518S9',
          '20160516S10', '20160517S10', '20160518S10']
    
    Parameters
    ----------
    file_path : string
        Path to the .tdms log file of an affected session
    
    Returns
    ----------
    log_data : pandas.DataFrame
        One row per trial (every TDMS group except the last one)
    """
    
    tdms_file = TdmsFile(file_path)
    
    # get group names (in this case just "stim")
    group_names = tdms_file.groups()

    log_data = pd.DataFrame()
    # the last group is left out
    log_data['Trial'] = group_names[:-1]

    ## REASSIGNING
    def get_property(name):
        """
        Utility function for pulling out different properties (columns) for TDMS file.
        Returns a numpy array with the value of property *name* for every trial
        listed in log_data['Trial'].
        """
        return np.array([tdms_file.object(trial).property(name) for trial in log_data['Trial']])

    # is the trial considered correct? (i.e. valid and choice was correct)
    # (the property name is looked up with a trailing space)
    log_data['Correct?'] = to_boolean(get_property('TRIAL RESULT '))

    # trial is valid when rat waited enough, i.e. until the go cue
    log_data['Trial_valid'] = to_boolean(get_property('TRIAL VALID'))

    # the side rat had to go to get reward (capitalised to 'Left' / 'Right')
    log_data['Rewarded Side'] = get_property('REWARD SIDE')
    log_data.loc[log_data['Rewarded Side'] == 'left','Rewarded Side'] = 'Left'
    log_data.loc[log_data['Rewarded Side'] == 'right','Rewarded Side'] = 'Right'

    # the side the rat has actually gone to (capitalised; 'Not applicable'
    # becomes NaN)
    log_data["Rat's Choice"] = get_property('RAT CHOICE')
    log_data.loc[log_data["Rat's Choice"] == 'left',"Rat's Choice"] = 'Left'
    log_data.loc[log_data["Rat's Choice"] == 'right',"Rat's Choice"] = 'Right'
    log_data.loc[log_data["Rat's Choice"] == 'Not applicable',"Rat's Choice"] = np.nan

    # From here on the comments describe the meaning of the DataFrame column;
    # the TDMS property it is read from is deliberately a different one.

    # modulation frequency of the sine envelope of the stimulus on this trial
    # (read from the 'HINT VOLUME' property)
    log_data['Stim_modulation_freq'] = get_property('HINT VOLUME')

    # gain of the stimulus; makes sense in the context of other gain values
    # (i.e. comparison between lower and higher gain trials)
    # (read from the 'REWARD VOLUME' property)
    log_data['Stim_gain'] = get_property('REWARD VOLUME')

    # stimuli are seeded: same seed has the same noise sequence
    # (this can also be used to match stimuli with tracked offline sequences)
    # the presence check uses 'SEED ID' of the object named 'trial 1', the
    # values are read from the 'REW SIDE ALT PROT THRESHOLD' property
    if 'SEED ID' in tdms_file.object('trial 1').properties:
        log_data['Seed_ID'] = get_property('REW SIDE ALT PROT THRESHOLD').astype(int)

    # delay between nosepoke sensor activation and stimulus start
    # (read from the 'T2 STIMULUS START' property)
    log_data['PreDelay'] = get_property('T2 STIMULUS START  relative time (s)')

    # delay after stimulus has ended and the go cue
    # (read from the 'T3 NOSEPOKE OUT' property)
    log_data['PostDelay'] = get_property('T3 NOSEPOKE OUT  relative time (s)')

    # timing of events: only T1 is read from the file, all the other
    # timestamps are set to NaN; the property each of them would normally come
    # from is kept in the trailing comment
    log_data['T0_trial_begin_abs_sec'] = np.nan#get_property('T0 BLUE LED ON  absolute time (s)')
    log_data['T1_nosepoke_in_rel_sec'] = get_property('T1 NOSEPOKE IN  relative time (s)')
    log_data['T2_stimulus_start_rel_sec'] = np.nan#get_property('T2 STIMULUS START  relative time (s)')
    log_data['T3_nosepoke_out_rel_sec'] = np.nan#get_property('T3 NOSEPOKE OUT  relative time (s)')
    log_data['T4_first_lick_rel_sec'] = np.nan#get_property('T4 FIRST LICK  relative time (s)')
    log_data['T5_reward_delivery_rel_sec'] = np.nan#get_property('T5 REWARD DELIVERY relative time (s)')
    log_data['T6_next_trial_begin_rel_sec'] = np.nan#get_property('T6 NEXT BLUE LED ON relative time (s)')

    # second thought refers to when the rat in training mode has made a mistake on a previous trial
    # and went to the other side to pick up reward; second thought therefore refers to the 
    # previous trial
    log_data['Second_thought'] = to_boolean(get_property('SECOND THOUGHT'))
    # (read from the 'REWARD SIDE BIAS' property)
    log_data['T7_second_thought_abs_sec'] = get_property('REWARD SIDE BIAS')

    # these four columns are read from the T4, T5, T0 and T6 timing properties
    log_data['Pre_reward_delay'] = get_property('T4 FIRST LICK  relative time (s)')
    log_data['Reward_amount_turns'] = get_property('T5 REWARD DELIVERY relative time (s)')
    log_data['Hint_volume'] = get_property('T0 BLUE LED ON  absolute time (s)')
    log_data['Reward_sound_volume'] = get_property('T6 NEXT BLUE LED ON relative time (s)')

    # the delay that is applied after the trial has been finished and before
    # the new trial has begun; e.g. punishment delay in case of error
    # (read from the 'PRE STIMULUS DELAY (ms)' property)
    log_data['Delay_after_trial'] = get_property('PRE STIMULUS DELAY (ms)')

    # remaining remapped columns: column name on the left, the property that
    # holds its values in the affected files on the right
    log_data['Stim_duration_ms'] = get_property('POST STIMULUS DELAY (ms)')
    log_data['Stim_sampling_freq_Hz'] = get_property('PRE REWARD DELAY (ms)')
    log_data['Stim_modulation_phase_deg'] = get_property('REWARD PUMP')
    log_data['Manual_reward_bias'] = get_property('FINAL DELAY (ms)')
    log_data['Reward_side_alternation_protection_threshold'] = get_property('STIM DURATION (ms)')
    log_data['Reward_side_max_repetitions'] = get_property('STIM SAMPLING RATE (Sps)')
    log_data['Choice_side_alternation_protection_threshold'] = get_property('STIM PHASE')
    log_data['Choice_side_repetition_protection_threshold'] = get_property('STIM MOD FREQ')
    # same source property as 'Reward_amount_turns' above
    log_data['Reward_size'] = get_property('T5 REWARD DELIVERY relative time (s)')
    log_data['N_for_choice_protections'] = get_property('STIM GAIN')
    # the boolean flags and the string properties below are read from the
    # properties with their own names
    log_data['Training?'] = to_boolean(get_property('TRAINING MODE'))
    log_data['Water_barrier'] = to_boolean(get_property('BARRIER'))
    log_data['Pre_lick_reward'] = to_boolean(get_property('PRE-LICK REWARD'))
    log_data['Detection?'] = to_boolean(get_property('DETECTION ONLY'))
    log_data['Hint?'] = to_boolean(get_property('HINT'))
    log_data['Stimulus_type'] = get_property('STIM TYPE')
    log_data['Reward_rule'] = get_property('STIMULUS RULE')

    #### OVERRIDES TO FIX LOG
    # a trial without a choice can never be correct
    log_data.loc[log_data["Rat's Choice"].isnull(),'Correct?'] = False
    
    return log_data

In [16]:
# do sanity check on choices, reward sides and correctness of trials
# returns correct trials vector, changed in case of mismatch

def reconstruct_correct(reward,choice,correct):
    """
    Check how logged correctness of the trials corresponds to the choice and reward sides.
    Mismatch is a reason for alarm: normally they should correspond. There was a bug in the
    setup from 06/06/2016 to 24/06/16 which caused some trials to give incorrect reward, so
    these mismatches are not a cause for alarm.
    
    Trials rewarded on 'both' sides and trials without a choice (NaN) are
    left out of the check.
    
    Parameters
    ----------
    reward : numpy.ndarray or pandas.Series
        Array with reward sides: 'Right', 'Left' or 'both' (e.g. for mixed trials)
    
    choice : numpy.ndarray or pandas.Series
        Array with choice sides: 'Right', 'Left' (NaN when there was no choice)
        
    correct : numpy.ndarray or pandas.Series
        Boolean array with correctness of the trial
    
    Returns
    ----------
    out : numpy.ndarray
        If everything is coherent, the values of *correct*. If there was a
        mismatch, correctness recomputed for every trial: True when the choice
        equals the reward side or the reward side is 'both'.
    
    mismatched : numpy.ndarray or pandas.Index
        Empty array if there was no mismatch, otherwise the index labels of
        the mismatched trials
    """
    # accept plain arrays as well as Series; Series keep their own index, so
    # that reported mismatches use the caller's labels
    if not isinstance(reward, pd.Series):
        reward = pd.Series(reward)
    if not isinstance(choice, pd.Series):
        choice = pd.Series(choice)
    if not isinstance(correct, pd.Series):
        correct = pd.Series(correct)
    
    df = pd.DataFrame({'reward': reward, 'choice': choice, 'correct': correct})
    df = df.loc[df['reward'] != 'both'].dropna(subset=['choice'])
    
    # a trial is mismatched when "choice equals reward side" disagrees with
    # the logged correctness
    is_mismatch = (df['reward'] == df['choice']) != df['correct']
    
    if not is_mismatch.any():
        return correct.values, np.array([])
    
    mismatched = df.loc[is_mismatch].index
    print("Mismatch! Returning corrected vector")
    corrected = (reward == choice) | (reward == 'both')
    return corrected.values, mismatched

In [None]:
def compile_trials(ld):
    """
    Takes all sessions in the ld (log data frame generated by *load_behavioral_data* function)
    and puts the trials all together, leaving only some columns and giving them short and 
    convenient names.
    
    Session names are expected in the form 'YYYYMMDD' + rat name
    (e.g. '20160516S8'): the first 8 characters become the date, the rest
    becomes the rat name.
    
    Parameters
    ----------
    ld : Series (or dict) of DataFrames
        Session name -> log of that session. Every log needs a "Rat's Choice"
        column; any other missing column (e.g. 'Mixed_trial_info' in logs
        without mixed trials) is filled with NaN.
    
    Returns
    ----------
    trials : pandas.DataFrame
        One row per trial with a choice, with columns 'trial_num' (index of
        the trial inside its session), 'correct', 'reward', 'choice', 'gain',
        'mixed_sequence', 'date' and 'rat'
    
    20160926 by Sergey Antopolskiy
    """
    
    date_len = 8
    source_columns = ['Correct?','Rewarded Side',"Rat's Choice",'Stim_gain','Mixed_trial_info']
    trial_columns = ['trial_num','correct','reward','choice','gain','mixed_sequence','date','rat']
    
    # sessions are collected in a list and concatenated once (growing a
    # DataFrame with append in a loop is quadratic and append is gone from
    # recent pandas versions)
    session_frames = []
    for s, sd in ld.items():
        # drop invalid trials 
        sd = sd.dropna(subset=["Rat's Choice"])
        # take relevant columns (missing ones are created and filled with NaN)
        sd = sd.reindex(columns=source_columns)
        # add categorical columns: date and rat; assign returns a new frame,
        # so the session stored in ld is not modified
        sd = sd.assign(date=pd.to_datetime(s[:date_len]), rat=s[date_len:])
        session_frames.append(sd)
    
    if not session_frames:
        return pd.DataFrame(columns=trial_columns)

    # compile trials from different sessions together
    trials = pd.concat(session_frames).reset_index()
    trials.columns = trial_columns
    
    # correct gain values (some changes in the labview code were made to make the 3 gains I used
    # to be 0.8, 1.0 and 1.2; the old values need to be corrected, but physically these are the same
    # stimuli gains)
    old_gain = trials['gain'].isin([1.38, 1.15, 0.92])
    trials.loc[old_gain,'gain'] = trials.loc[old_gain,'gain'] / 1.15
    
    return trials