In [2]:
import numpy as np
import pickle
import os

from api_neurotask import *

import matplotlib.pyplot as plt
import seaborn as sns
from plot_utils import adjust_spines

## Datasets

In [3]:
def find_parquet_files(data_dir='./data'):
    """Return the paths of all ``.parquet`` files found under ``data_dir``.

    The directory tree is walked recursively and every path is built as
    ``root + '/' + file``. The result is sorted because ``os.walk`` yields
    directory entries in arbitrary, filesystem-dependent order; without the
    sort, positional choices such as ``dataset_names[0]`` could select a
    different file from one machine (or run) to the next.

    Parameters
    ----------
    data_dir : str
        Directory to search. A missing directory yields an empty list.

    Returns
    -------
    list of str
        Sorted paths of the parquet files.
    """
    paths = []
    for root, _dirs, files in os.walk(data_dir):
        for file in files:
            if file.endswith('.parquet'):
                paths.append(root + '/' + file)
    return sorted(paths)


dataset_names = find_parquet_files('./data')

In [5]:
# Display the datasets in reverse order (the order used by the all-datasets run).
list(reversed(dataset_names))

['./data/6_1_Churchland9_Maze.parquet',
 './data/6_1_Churchland8_Maze.parquet',
 './data/6_1_Churchland7_Maze.parquet',
 './data/6_1_Churchland6_Maze.parquet',
 './data/6_1_Churchland5_Maze.parquet',
 './data/6_1_Churchland4_Maze.parquet',
 './data/6_1_Churchland3_Maze.parquet',
 './data/6_1_Churchland2_Maze.parquet',
 './data/6_1_Churchland1_Maze.parquet',
 './data/5_1_Dyer_CO.parquet',
 './data/4_1_MaXuan_Key.parquet',
 './data/4_1_MaXuan_ISO.parquet',
 './data/4_1_MaXuan_CO.parquet',
 './data/3_30_GallegoCarracedo_CO.parquet',
 './data/2_10_Chowdhury_TRT.parquet',
 './data/2_10_Chowdhury_CObump.parquet',
 './data/1_4_Makin5_RT.parquet',
 './data/1_4_Makin4_RT.parquet',
 './data/1_4_Makin3_RT.parquet',
 './data/1_4_Makin2_RT.parquet']

Choose dataset:

In [3]:
# Pick one dataset, by position in dataset_names, for the single-dataset run below.
# NOTE(review): which file this is depends on the order of dataset_names — confirm
# that order is stable before relying on index 0.
dataset=dataset_names[0]
# Bare expression so the notebook displays the chosen path.
dataset

'./data/1_4_Makin2_RT.parquet'

Load dataset

In [4]:
# Load the chosen parquet file. The helper returns the data frame and the bin
# size of the recording (the cell output reports that bin size in ms).
# NOTE(review): the meaning of the ['A', 'I', 'F'] filter codes is defined by the
# helper (presumably in api_neurotask) — confirm against its documentation.
# NOTE(review): 'bin' shadows Python's builtin bin(); the name is kept because
# later cells read it.
df, bin = load_and_filter_parquet(dataset, ['A', 'I','F'])

Data loaded from ./data/1_4_Makin2_RT.parquet with bin size of 4 ms
Events columns: []
Covariates columns: ['target_pos_x', 'target_pos_y', 'cursor_pos_x', 'cursor_pos_y', 'finger_pos_z', 'finger_pos_x', 'finger_pos_y', 'cursor_vel_x', 'cursor_vel_y']


In [13]:
# Number of distinct trial ids in every (animal, session) pair.
(
    df
    .groupby(['animal', 'session'])['trial_id']
    .nunique()
    .reset_index(name='unique_trials_per_session')
)

Unnamed: 0,animal,session,unique_trials_per_session
0,1,11,1025
1,1,12,296
2,1,13,346
3,1,14,294
4,1,15,321
5,1,16,317
6,1,17,329
7,1,18,293
8,1,19,295
9,1,20,346


In [5]:
# Columns whose name starts with 'Neuron' are the ones passed on to GPFA.
neurons = [column for column in df.columns if column.startswith('Neuron')]

### Run GPFA on all sessions and animals in this dataset:

We run GPFA using the core implementation in [elephant](https://elephant.readthedocs.io/en/latest/reference/gpfa.html), but without elephant's Neo spike-train preprocessing.

In [6]:
from gpfa_utils import dataframe_to_spike_trains
from elephant.gpfa import gpfa_core, gpfa_util

Hyperparameters: 

In [7]:
# Number of latent dimensions extracted by GPFA.
latent_dimensionality = 6

# Private-variance floor, with its explanation kept alongside as a string.
min_var_frac = 0.01
min_var_frac_explanation = """fraction of overall data variance for each observed dimension to set as
        the private variance floor.  This is used to combat Heywood cases,
        where ML parameter learning returns one or more zero private variances.
        Default: 0.01
        (See Martin & McDonald, Psychometrika, Dec 1975.)"""

# Gaussian-process initialization.
tau_init = 100.0  # initial GP timescale, in ms
eps_init = 1.0e-3  # initial GP noise variance

# EM schedule.
em_tol = 1.0e-8  # stopping criterion for EM
em_max_iters = 500  # maximum number of EM iterations
freq_ll = 5  # the likelihood is computed every freq_ll EM steps
verbose = False  # whether to print feedback

We want bins of at least 10 ms in this analysis; datasets recorded with finer bins are rebinned to `new_bin`.

In [8]:
# Bin size (ms) that finer-binned data is rebinned to before running GPFA.
new_bin = 10.0

In [12]:
# Fit GPFA separately for every (animal, session) pair of the loaded dataset and
# pickle the estimated parameters together with the orthonormalized latents.

# open(..., 'wb') does not create missing directories; make sure the output
# folder exists before the (long) fits start instead of failing after the first.
os.makedirs('save_pickles', exist_ok=True)

for animal in df['animal'].unique():
    print('Animal id: ', animal)
    print('---------')
    for session in df[df['animal']==animal]['session'].unique():
        print('Session id: ', session)
        df_gpfa = df[(df['animal']==animal)&(df['session']==session)]
        bin_width = bin

        # Data binned more finely than the target width is rebinned to new_bin,
        # and the fit is then told about the new width.
        if bin < new_bin:
            df_gpfa = rebin(df_gpfa, prev_bin_size = bin, new_bin_size = new_bin)
            bin_width = new_bin

        seqs = dataframe_to_spike_trains(df_gpfa, neurons)

        # Check if training data covariance is full rank: stack the 'y' entries
        # of all sequences side by side, so rows are the observed dimensions.
        y_all = np.hstack(seqs["y"])
        y_dim = y_all.shape[0]

        if np.linalg.matrix_rank(np.cov(y_all)) < y_dim:
            print('Observation covariance matrix is rank deficient.')
            print('Maybe repeated units, not enough observations.')
            print('Skipping this session.')
            continue

        # Fit
        # NOTE(review): fit_info is not used or saved below.
        params_estimated, fit_info = gpfa_core.fit(
            seqs_train=seqs,
            x_dim=latent_dimensionality,
            bin_width=bin_width,
            min_var_frac=min_var_frac,
            em_max_iters=em_max_iters,
            em_tol=em_tol,
            tau_init=tau_init,
            eps_init=eps_init,
            freq_ll=freq_ll,
            verbose=verbose
        )

        # Transform
        # NOTE(review): transform_info is filled in but not written to the
        # pickle; it only stays available in the notebook namespace.
        transform_info = dict()
        returned_data=['latent_variable', 'VsmGP']

        seqs, ll = gpfa_core.exact_inference_with_ll(
            seqs, params_estimated, get_ll=True
        )
        transform_info["log_likelihood"] = ll
        transform_info["num_bins"] = seqs["T"]

        # Orthonormalize columns in C, update latents
        Corth, seqs = gpfa_core.orthonormalize(params_estimated, seqs)

        transform_info["Corth"] = Corth

        # A single requested field is stored bare, several as a dict keyed by
        # field name. The two cases must be exclusive; previously the dict
        # assignment always overwrote the single-field result.
        if len(returned_data) == 1:
            gpfa_val_result = seqs[returned_data[0]]
        else:
            gpfa_val_result = {x: seqs[x] for x in returned_data}

        # One pickle per (dataset, animal, session, latent dimensionality).
        with open('save_pickles/{}_animal_{}_session_{}_latent_dim_{}.pickle'.format(
            dataset.split('/')[-1].split('.')[0],
            animal,
            session,
            latent_dimensionality
        ), 'wb') as f:
            pickle.dump({
                'params':params_estimated,
                'latents':gpfa_val_result
            }, f)

Animal id:  1
---------
Session id:  11
Initializing parameters using factor analysis...

Fitting GPFA model...
Session id:  12
Initializing parameters using factor analysis...

Fitting GPFA model...
Session id:  13
Initializing parameters using factor analysis...

Fitting GPFA model...
Session id:  14
Initializing parameters using factor analysis...

Fitting GPFA model...
Session id:  15
Initializing parameters using factor analysis...

Fitting GPFA model...
Session id:  16
Initializing parameters using factor analysis...

Fitting GPFA model...
Session id:  17
Initializing parameters using factor analysis...

Fitting GPFA model...
Session id:  18
Initializing parameters using factor analysis...

Fitting GPFA model...
Session id:  19


ValueError: Observation covariance matrix is rank deficient.
Possible causes: repeated units, not enough observations.

### Run GPFA on all sessions and animals in all datasets:

In [None]:
# Fit GPFA for every (animal, session) pair of every dataset, visiting the
# datasets in reverse list order, and pickle the estimated parameters together
# with the orthonormalized latents.

# open(..., 'wb') does not create missing directories; make sure the output
# folder exists before the (long) fits start instead of failing after the first.
os.makedirs('save_pickles', exist_ok=True)

for dataset in dataset_names[::-1]:
    print('----------------------------------------------------------------')
    # NOTE: this rebinds the notebook-level df, bin and neurons for each dataset.
    df, bin = load_and_filter_parquet(dataset, ['A', 'I','F'])
    print('----------------------------------------------------------------')
    neurons = [neuron for neuron in df.columns if neuron.startswith('Neuron')]
    for animal in df['animal'].unique():
        print('Animal id: ', animal)
        print('---------')
        for session in df[df['animal']==animal]['session'].unique():
            print('\nSession id: ', session)
            df_gpfa = df[(df['animal']==animal)&(df['session']==session)]
            bin_width = bin

            # Data binned more finely than the target width is rebinned to
            # new_bin, and the fit is then told about the new width.
            if bin < new_bin:
                df_gpfa = rebin(df_gpfa, prev_bin_size = bin, new_bin_size = new_bin)
                bin_width = new_bin

            seqs = dataframe_to_spike_trains(df_gpfa, neurons)

            # Check if training data covariance is full rank: stack the 'y'
            # entries of all sequences side by side, so rows are the observed
            # dimensions.
            y_all = np.hstack(seqs["y"])
            y_dim = y_all.shape[0]

            if np.linalg.matrix_rank(np.cov(y_all)) < y_dim:
                print('Observation covariance matrix is rank deficient.')
                print('Maybe repeated units, not enough observations.')
                print('Skipping this session.')
                continue

            # Fit
            # NOTE(review): fit_info is not used or saved below.
            params_estimated, fit_info = gpfa_core.fit(
                seqs_train=seqs,
                x_dim=latent_dimensionality,
                bin_width=bin_width,
                min_var_frac=min_var_frac,
                em_max_iters=em_max_iters,
                em_tol=em_tol,
                tau_init=tau_init,
                eps_init=eps_init,
                freq_ll=freq_ll,
                verbose=verbose
            )

            # Transform
            # NOTE(review): transform_info is filled in but not written to the
            # pickle; it only stays available in the notebook namespace.
            transform_info = dict()
            returned_data=['latent_variable', 'VsmGP']

            seqs, ll = gpfa_core.exact_inference_with_ll(
                seqs, params_estimated, get_ll=True
            )
            transform_info["log_likelihood"] = ll
            transform_info["num_bins"] = seqs["T"]

            # Orthonormalize columns in C, update latents
            Corth, seqs = gpfa_core.orthonormalize(params_estimated, seqs)

            transform_info["Corth"] = Corth

            # A single requested field is stored bare, several as a dict keyed
            # by field name. The two cases must be exclusive; previously the
            # dict assignment always overwrote the single-field result.
            if len(returned_data) == 1:
                gpfa_val_result = seqs[returned_data[0]]
            else:
                gpfa_val_result = {x: seqs[x] for x in returned_data}

            # One pickle per (dataset, animal, session, latent dimensionality).
            with open('save_pickles/{}_animal_{}_session_{}_latent_dim_{}.pickle'.format(
                dataset.split('/')[-1].split('.')[0],
                animal,
                session,
                latent_dimensionality
            ), 'wb') as f:
                pickle.dump({
                    'params':params_estimated,
                    'latents':gpfa_val_result
                }, f)

----------------------------------------------------------------
Data loaded from ./data/6_1_Churchland9_Maze.parquet with bin size of 1 ms
Events columns: ['EventGo_cue', 'EventMovement_end', 'EventMovement_start']
Covariates columns: ['hand_pos_x', 'hand_pos_y', 'cursor_pos_x', 'cursor_pos_y', 'target_pos_x', 'target_pos_y', 'maze_num_target', 'maze_num_barriers', 'cursor_vel_x', 'cursor_vel_y']
----------------------------------------------------------------
Animal id:  2
---------

Session id:  5
Initializing parameters using factor analysis...

Fitting GPFA model...
----------------------------------------------------------------
Data loaded from ./data/6_1_Churchland8_Maze.parquet with bin size of 1 ms
Events columns: ['EventGo_cue', 'EventMovement_end', 'EventMovement_start']
Covariates columns: ['hand_pos_x', 'hand_pos_y', 'cursor_pos_x', 'cursor_pos_y', 'target_pos_x', 'target_pos_y', 'maze_num_target', 'maze_num_barriers', 'cursor_vel_x', 'cursor_vel_y']
---------------------