In [1]:
import numpy as np
import pandas as pd
import os
import warnings

# ephys extraction utilities
import ephys_extractor as efex
import ephys_features as ft

# # ephys extraction utilities
from ephys_utils import get_time_voltage_current_currindex0, calculate_summary_statistics, syn_current_start

# import saver utility
import pickle

# load and read nwb files
from pynwb import NWBHDF5IO

# regression utility
from sklearn import linear_model
# Robust linear regressor shared by the feature-extraction code below.
# RANSAC draws random subsets of the data, so the seed is fixed to make the
# fitted slope (and everything derived from it) reproducible across runs.
ransac = linear_model.RANSACRegressor(random_state=0)

In this notebook we preprocess the data. For all raw electrophysiological recordings of mouse motor cortex (M1) cells from the [Scala et al. 2020 study](https://www.nature.com/articles/s41586-020-2907-3) that were measured at 25 degrees Celsius, we extract the relevant electrophysiological features and pickle them for further use.
Follow the instructions to download the data from [here](https://dandiarchive.org/dandiset/000008/draft) and put it in `../data/raw_data/`.

Ephys feature names.

In [2]:
# Ordered names of the electrophysiological features. The order matters: the
# feature values computed per cell are paired with these names positionally.
ephys_features = [
    # first action potential
    'AP threshold', 'AP amplitude', 'AP width', 'AHP',
    # third action potential
    '3rd AP threshold', '3rd AP amplitude', '3rd AP width', '3rd AHP',
    # action potential counts
    'AP count', 'AP count 1st 8th', 'AP count 1st quarter',
    'AP count 1st half', 'AP count 2nd half',
    # adaptation and variability
    'AP amp adapt', 'AP average amp adapt', 'AP CV', 'ISI adapt', 'ISI CV',
    'latency',
    # membrane voltage statistics
    'rest $V_{m}$ mean', '$V_{m}$ mean', '$V_{m}$ std', '$V_{m}$ skewness',
    # the last four entries are appended separately by the extraction function
    'Vi', '1-comp area', 'R_input', 'tau',
]

Main function to extract ephys properties from raw membrane voltage traces.

In [3]:
def _filter_cutoff(time_obs):
    """Return the `filter` value handed to the sweep feature extractor.

    The default is 10. When the sampling rate derived from the first two time
    points is below 20 kHz, the value is lowered to half the sampling rate
    (in kHz) minus 0.5.
    """
    # Parenthesised on purpose: 1/t[1] - t[0] only equals the sampling rate
    # when the trace happens to start at exactly t = 0.
    sampling_rate = 1.0 / (time_obs[1] - time_obs[0])
    if sampling_rate < 20e3:
        return sampling_rate / (1e3 * 2) - 0.5
    return 10


def _step_current(time_obs, amplitude, start_index, end_index):
    """Return an array shaped like `time_obs`: `amplitude` on [start_index, end_index), 0 elsewhere."""
    current_array = amplitude * np.ones_like(time_obs)
    current_array[:start_index] = 0
    current_array[end_index:len(current_array)] = 0
    return current_array


def cell_features(data_tuple, names, ephys_features, current_val=300, liquid_junction_potential=15.4,
                  el_num=2, current_step=20, start=0.1, end=0.7):
    """Extract the ephys features of every cell and return them as one DataFrame.

    Parameters
    ----------
    data_tuple : tuple of dictionaries of data full of voltage (V) and time (s) traces for different cells
    names : tuple of names of the samples (must have one entry per cell)
    ephys_features : list of ephys feature names being extracted. The last four names must correspond to
        Vi, the 1-compartment area, the input resistance and the membrane time constant (in that order);
        the names before them are paired positionally with the summary statistics.
    current_val : int, current value for the trace you'd like to extract features from (optional, 300 pA by default)
    liquid_junction_potential : float, potential to be subtracted from all traces (optional, 15.4 mV by default)
    el_num : integer, from which electrode number has been measured (optional, 2 by default).
        Currently not used by this function.
    current_step : float, which current step (pA) has been used between consecutive experiments
        (optional, 20 by default). Currently not used by this function.
    start : float (s), start of the analysis window passed to the sweep extractor (optional, 0.1 by default)
    end : float (s), end of the analysis window passed to the sweep extractor (optional, 0.7 by default)

    Returns
    -------
    All_Cells_Features : DataFrame with one row per cell. The first column is 'name sample' (taken from
        `names`), followed by one column per entry of `ephys_features`, in that order. Cells for which
        `current_val` was not injected get NaN for the summary statistics, Vi, tau and the area, but keep
        their input resistance.
    """
    cell_records = []
    print('Extracting ephys properties cell by cell:')
    for data, _name in zip(data_tuple, names):
        print('.', end='')

        # Extract relevant time, voltage and current information for this cell
        time_obs, voltage_obs, current_obs, curr_index_0_obs = get_time_voltage_current_currindex0(data)
        voltage_obs -= liquid_junction_potential
        # NOTE(review): the current step is assumed to span 0.1 s - 0.7 s regardless of `start`/`end`;
        # those two parameters only reach the sweep extractor. Confirm this is intended.
        start_index = (np.abs(time_obs - 0.1)).argmin()  # Closest index to where the injection current starts
        end_index = (np.abs(time_obs - 0.7)).argmin()  # Closest index to where the injection current ends

        filter_ = _filter_cutoff(time_obs)

        ####################################
        # Input resistance
        ####################################

        # Use the sweeps around the zero-current one (4 below, 3 above). The lower bound is clamped so
        # that a zero-current index below 4 does not turn into a negative (wrap-around) slice start.
        first_sweep = max(curr_index_0_obs - 4, 0)
        sweep_records = []
        for offset, curr in enumerate(current_obs[first_sweep:curr_index_0_obs + 4]):
            EphysObject = efex.EphysSweepFeatureExtractor(
                t=time_obs, v=voltage_obs[:, first_sweep + offset],
                i=_step_current(time_obs, curr, start_index, end_index),
                start=start, end=end, filter=filter_)

            # Some easily found features
            df_features = EphysObject._sweep_features

            # Adding current (pA)
            df_features.update({'current': curr})

            # Adding minimal/maximal voltage deflection
            if curr < 0:
                v_peak_, peak_index = EphysObject.voltage_deflection("min")
                df_features.update({'deflection': v_peak_})
            elif curr == 0:
                df_features.update({'deflection': np.average(voltage_obs[:, curr_index_0_obs]
                                                             [start_index:ft.find_time_index(time_obs, 0.2)])})
            elif curr > 0:
                # Only sub-threshold sweeps contribute; a spiking sweep gets no 'deflection'
                # entry and therefore shows up as NaN and is masked out below.
                EphysObject.process_spikes()
                if EphysObject._spikes_df.empty:
                    v_peak_, peak_index = EphysObject.voltage_deflection("max")
                    df_features.update({'deflection': v_peak_})

            sweep_records.append(df_features)

        # Build the table once instead of concatenating inside the loop
        df_related_features = pd.DataFrame(sweep_records)

        indices = ~np.isnan(df_related_features['deflection'].values)
        ransac.fit(df_related_features['current'].values[indices].reshape(-1, 1),
                   df_related_features['deflection'].values[indices].reshape(-1, 1))
        slope = ransac.estimator_.coef_[0][0]
        R_input = slope * 1000

        ####################################
        # Extract firing trace related features and (pre-spike) membrane time constant
        ####################################

        tau = np.nan
        Vi = np.nan

        if current_val not in current_obs:
            # Vi, area, R_input and tau are appended below, so the placeholder covers every
            # feature except those four. (A longer placeholder would shift the appended values
            # out of their columns when they are zipped with the feature names.)
            x_o = np.full(len(ephys_features) - 4, np.nan)
        else:
            curr_index = np.where(current_obs == current_val)[0][0]
            Vi = voltage_obs[0, curr_index]
            EphysObject = efex.EphysSweepFeatureExtractor(
                t=time_obs, v=voltage_obs[:, curr_index],
                i=_step_current(time_obs, current_val, start_index, end_index),
                start=start, end=end, filter=filter_)
            EphysObject.process_spikes()

            spikes = EphysObject._spikes_df
            # There must be APs, and at least one of them must not be clipped
            if not spikes.empty and False in list(spikes['clipped']):
                time_first_spike = spikes['threshold_t'].values[0]
                if time_first_spike > start:
                    try:
                        tau = ft.fit_prespike_time_constant(voltage_obs[:, curr_index],
                                                            time_obs,
                                                            0.1,
                                                            time_first_spike) * 1000
                    except ValueError:
                        # Pre-spike time constant could not be reliably estimated: keep tau = NaN
                        pass

            # NOTE(review): curr_level, the 20001-sample window and dt are hard-coded and do not
            # follow `current_val` or the actual sampling of the trace - confirm against the helpers.
            I, t_on, t_off, dt, t, A_soma = syn_current_start(exp_input_res=R_input, exp_tau=tau, curr_level=3e-4)
            observation = {'data': voltage_obs[:20001, curr_index][np.newaxis, :],
                           'time': time_obs[:20001]*1e3, 'dt': 4*1e-5*1e3, 'I': I}

            # calculate summary statistics from the observation
            x_o = calculate_summary_statistics(observation)[0, :]

        # Calculate a tentative membrane time constant and input resistance derived 1-comp area
        # provided membrane capacitance would be 1
        area = tau*1e3/(R_input*1e6)*1e8

        # Pair the values with their names; this relies on x_o having len(ephys_features) entries
        x_o = np.concatenate([x_o, np.array([Vi, area, R_input, tau])], axis=0)
        cell_records.append(dict(zip(ephys_features, list(x_o))))

    # One construction for all cells; `columns` fixes the column order and fills missing names with NaN
    All_Cells_Features = pd.DataFrame(cell_records, columns=ephys_features)
    All_Cells_Features.insert(0, 'name sample', names)
    return All_Cells_Features

In [4]:
# Collect the M1 recordings stored as .nwb files.
# os.walk visits the top directory and every subdirectory below it; in each visited directory the files
# whose name ends in .nwb are read, and both the loaded object and its path are kept (in matching order).
M1 = []
M1_names = []
with warnings.catch_warnings():
    # Reading triggers a warning every time because some subfields are absent from the .nwb objects,
    # so warnings are silenced while loading.
    warnings.simplefilter("ignore")
    for root, dirs, files in os.walk('../data/raw_data/000008'):
        for file in files:
            if not file.endswith('.nwb'):
                continue
            nwb_path = root + '/' + file
            M1.append(NWBHDF5IO(nwb_path, 'r', load_namespaces=True).read())
            M1_names.append(nwb_path)

In [None]:
# Extract the features of every loaded recording. The analysis window handed to the
# sweep extractor starts at 0.10009 s instead of the default 0.1 s.
All_Cells_Features = cell_features(
    data_tuple=M1,
    names=M1_names,
    ephys_features=ephys_features,
    start=0.10009,
)

In [None]:
# Build a short id per cell from its file path: take the text after the last 'cell', split it on '-',
# and combine the second field with the leading part (up to the first '_') of the fourth field.
cells_w_ephys = []
for sample_path in All_Cells_Features['name sample']:
    path_fields = sample_path.split('cell')[-1].split('-')
    cells_w_ephys.append('{}{}{}'.format(path_fields[1], '_sample_', path_fields[3].split('_')[0]))

# Index the table by cell id, drop the raw path column and put the features in their canonical order
All_Cells_Features['cell id'] = cells_w_ephys
All_Cells_Features = (
    All_Cells_Features
    .set_index('cell id')
    .drop('name sample', axis=1)
)
All_Cells_Features = All_Cells_Features[ephys_features]

# A row whose feature sum is NaN contains at least one NaN feature: keep only the other rows,
# and filter the list of file names with the same mask so both stay aligned.
row_sums = np.sum(All_Cells_Features.values, axis=1)
ind = ~np.isnan(row_sums)
All_Cells_Features = All_Cells_Features.loc[ind, :]
M1_names = list(np.array(M1_names)[ind])

In [9]:
# Display the current feature table (one row per cell, one column per ephys feature).
All_Cells_Features

Unnamed: 0_level_0,AP threshold,AP amplitude,AP width,AHP,3rd AP threshold,3rd AP amplitude,3rd AP width,3rd AHP,AP count,AP count 1st 8th,...,ISI CV,latency,rest $V_{m}$ mean,$V_{m}$ mean,$V_{m}$ std,$V_{m}$ skewness,Vi,1-comp area,R_input,tau
cell id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20190418_sample_8,-58.351621,76.058190,0.80,-6.186172,-48.654582,59.153114,0.84,-13.776202,3.784190,2.197225,...,-2.375144,0.970779,-78.395460,-50.947560,193.539314,2.893263,-78.926407,488.503122,186.342199,0.910287
20190418_sample_5,-54.743048,92.948910,2.32,10.885356,-36.477409,48.268432,4.96,-2.014672,2.772589,1.791759,...,-1.220111,1.472472,-79.374609,-39.309321,266.845980,2.420717,-81.652969,643.034147,243.795551,1.567689
20190418_sample_2,-50.611403,76.264057,1.40,-5.597327,-38.860969,43.194041,2.04,-9.111572,2.079442,2.079442,...,-1.773187,1.057790,-76.148038,-33.792001,46.843200,1.697684,-77.498745,304.764562,329.739791,1.004930
20190418_sample_9,-54.258905,103.987038,1.92,11.428908,-35.036411,56.167506,2.96,-2.832884,3.091042,1.945910,...,-1.600690,1.922788,-83.796163,-30.079619,205.671070,1.907002,-88.939615,895.597871,289.272419,2.590718
20190418_sample_10,-49.691711,91.925863,1.52,-4.834145,-35.067843,51.103311,1.76,-8.606639,2.890372,1.791759,...,-1.431359,1.934416,-85.955350,-37.927538,111.080152,2.946604,-88.104941,988.326652,321.246431,3.174964
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200106_sample_1,-50.120790,71.778428,1.16,-4.276332,-40.072732,42.635692,2.04,-10.632437,2.944439,1.945910,...,-1.065157,1.962908,-78.582381,-45.480072,131.544287,3.167725,-79.124734,1065.449347,272.345385,2.901702
20200106_sample_5,-45.211889,91.272198,1.16,-0.675235,-37.472135,83.423840,1.36,-5.802974,2.484907,1.609438,...,-1.494626,2.967333,-71.020771,-41.747111,123.898218,5.657296,-72.378505,8547.879279,110.563555,9.450839
20190704_sample_12,-48.158880,86.425547,1.04,-1.319963,-26.116159,26.218272,1.36,-14.701048,2.302585,2.079442,...,-0.961530,1.568616,-93.045710,-31.872964,43.939352,0.767009,-96.440621,888.498343,235.303044,2.090664
20190704_sample_14,-40.615320,89.330398,1.04,-3.442490,-26.963890,37.558291,1.60,-8.270035,2.302585,2.079442,...,-1.100922,1.663926,-85.083524,-25.952040,45.243331,1.287647,-89.420185,729.826902,280.193104,2.044925


Save

In [None]:
# Bundle everything needed downstream:
#   'exclude'  : the boolean mask `ind` (note: True marks the cells that were KEPT, despite the key name)
#   'M1_names' : file paths of the kept cells
#   'X_o'      : the feature table of the kept cells
M1_25degree = {'exclude': ind, 'M1_names': M1_names, 'X_o': All_Cells_Features}

# Make sure the output directory exists, and use a context manager so the file is
# flushed and closed even if pickling fails.
os.makedirs('pickles', exist_ok=True)
with open('pickles/M1_features.pickle', 'wb') as pickle_file:
    pickle.dump(M1_25degree, pickle_file)