In [1]:
from pathlib import Path
import json

import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 

# from mat73 import loadmat as loadmat_mat73
# from mat4py import loadmat as loadmat_mat4py

import h5py
from scipy.io import loadmat as loadmat_scipy

# Session analysis
Let's start with a session to look at the files

## Files and their description. A first take

In [2]:
project_root = Path("/Volumes/neurodata/buzaki/HuszarR")
session_path = Path(project_root, "optotagCA1/e13/e13_16f1/e13_16f1_210302")

# Dump to a file in the same folder
json_directory = Path.cwd() / "_json_files"
json_directory.mkdir(exist_ok=True)

# Dump project-wide information into a nested folder
project_json_directory = json_directory / "project"
project_json_directory.mkdir(exist_ok=True)

session_files_path_list = list(session_path.iterdir())
session_files_path_list

[PosixPath('/Volumes/neurodata/buzaki/HuszarR/optotagCA1/e13/e13_16f1/e13_16f1_210302/e13_16f1_210302.cell_metrics.cellinfo.mat'),
 PosixPath('/Volumes/neurodata/buzaki/HuszarR/optotagCA1/e13/e13_16f1/e13_16f1_210302/._e13_16f1_210302.cell_metrics.cellinfo.mat'),
 PosixPath('/Volumes/neurodata/buzaki/HuszarR/optotagCA1/e13/e13_16f1/e13_16f1_210302/e13_16f1_210302.ripples.events.mat'),
 PosixPath('/Volumes/neurodata/buzaki/HuszarR/optotagCA1/e13/e13_16f1/e13_16f1_210302/._e13_16f1_210302.ripples.events.mat'),
 PosixPath('/Volumes/neurodata/buzaki/HuszarR/optotagCA1/e13/e13_16f1/e13_16f1_210302/e13_16f1_210302.spikes.cellinfo.mat'),
 PosixPath('/Volumes/neurodata/buzaki/HuszarR/optotagCA1/e13/e13_16f1/e13_16f1_210302/._e13_16f1_210302.spikes.cellinfo.mat'),
 PosixPath('/Volumes/neurodata/buzaki/HuszarR/optotagCA1/e13/e13_16f1/e13_16f1_210302/e13_16f1_210302.mono_res.cellinfo.mat'),
 PosixPath('/Volumes/neurodata/buzaki/HuszarR/optotagCA1/e13/e13_16f1/e13_16f1_210302/._e13_16f1_210302.mon

Now as Cody mention, we have seen some experimental data from this lab already and we are familiar with the names.

In a [previous conversion](https://github.com/catalystneuro/buzsaki-lab-to-nwb/blob/master/buzsaki_lab_to_nwb/yuta_visual_cortex/files_documentation.ipynb) I found out that the matlab files contain the following information:



* `SleepState.states` : This can be considered processed data involving up-down intervals. This can be include as process data.
* `chanMap` : This seems to be concerned with information of the channels in the electrode. For example we find both the x and y coordinates of each of the channels. The structure of the files here is (1, n_channels) where n_channels is 64 for this setup.
* `session` : Contains behavioral info and general information related to the session such as the experimenter, the species, the strain and timestamps for the creation of the session.

As you see, we have the following descriptions missing:
* `.ripples.events.mat`: 
* `Behavior` : 

Which is something that we would do below.

#### Spike sorting
The files related to spike sorting were the following in a previous conversion.

For the previous conversion those were the files related to **cell explorer format / interface**:
* `metric_cell_info`
* `mono_res_cellinfo`
* `spikes.cell_info`

They don't seem equivalent to the ones here. We need to confirm that they are equivalent to the following files in this conversion:
* `cell_metrics.cellinfo.mat`
* `cell_metrics.cellinfo`
* `spikes.cellinfo`

This is done below.


## Exploring some files



### `SleepState.states`

Let's now see now which matlab file opener works best

In [16]:
file_path = session_files_path_list[0]
file_path = file_path.parent / "e13_16f1_210302.SleepState.states.mat"
# Open file_path with loadmat_scipy from scipy
mat_file = loadmat_scipy(file_path, simplify_cells=True)
# Iterate over the keys and print the type of the values
for key in mat_file.keys():
    print(key, type(mat_file[key]))

__header__ <class 'bytes'>
__version__ <class 'str'>
__globals__ <class 'list'>
SleepState <class 'dict'>


This is a recursive structure. Let's print the keys, types and shapes (if numpy array) for exploration

In [17]:

def build_keys_and_types(dictionary):
    output_dict = {}
    for key, value in dictionary.items():
        if isinstance(value, dict):
            output_dict[key] = build_keys_and_types(value)
        elif isinstance(value, np.ndarray):
            if value.size > 10:
                output_dict[key] = {
                    'type': str(type(value)),
                    'shape': str(value.shape)
                }
            else:
                # Print small arrays
                output_dict[key] = {
                    'type': str(type(value)),
                    'value': str(value)
                }
        elif isinstance(value, list):
            if len(value) > 10:
                output_dict[key] = {
                    'type': str(type(value)),
                    'length': len(value)
                }
            else:
                # Print small lists
                output_dict[key] = {
                    'type': str(type(value)),
                    'value': str(value)
                }
        else:
            output_dict[key] = {
                "type": str(type(value)),
                "value": str(value),
            }
    return output_dict

# Define your sleep_state_dict here

result = build_keys_and_types(mat_file)
json_output = json.dumps(result, indent=2)

with open(json_directory / 'sleep_state_dict.json', 'w') as f:
    f.write(json_output)
    

If you have a matlab licence, you can also just explore the file there. There are two things to look for, large arrays that might
correspond to behavorial data and metadata from the experiment.

Because I know the data from this lab I will be looking for the REM state.



In [8]:
sleep_state_dict = mat_file["SleepState"]
json_dict = json.dumps(build_keys_and_types(sleep_state_dict), indent=4)

In [9]:
sleep_state_dict["ints"]


{'WAKEstate': array([[    1,  1806],
        [ 2947,  3015],
        [ 3314,  3345],
        [ 3995,  4025],
        [ 4525,  5819],
        [ 6221,  6272],
        [ 6834,  6876],
        [ 7481,  7508],
        [ 7614,  7652],
        [ 7879,  7897],
        [ 8111, 14739],
        [15887, 21006]], dtype=uint16),
 'NREMstate': array([[ 1807,  2946],
        [ 3016,  3241],
        [ 3346,  3854],
        [ 4026,  4524],
        [ 5820,  6220],
        [ 6273,  6747],
        [ 6877,  7439],
        [ 7509,  7593],
        [ 7653,  7860],
        [ 7898,  8110],
        [14740, 15810]], dtype=uint16),
 'REMstate': array([[ 3242,  3313],
        [ 3855,  3994],
        [ 6748,  6833],
        [ 7440,  7480],
        [ 7594,  7613],
        [ 7861,  7878],
        [15811, 15886]], dtype=uint16)}

In [10]:
wake_state = sleep_state_dict["ints"]["WAKEstate"]
wake_state  # This is the start time and the end time of the wake state. Probably in frames.

array([[    1,  1806],
       [ 2947,  3015],
       [ 3314,  3345],
       [ 3995,  4025],
       [ 4525,  5819],
       [ 6221,  6272],
       [ 6834,  6876],
       [ 7481,  7508],
       [ 7614,  7652],
       [ 7879,  7897],
       [ 8111, 14739],
       [15887, 21006]], dtype=uint16)

So, we need to confirm the units and this can go as a `TimeIntervals` in a processing module

### `Behavior`

In [11]:
file_path = file_path.parent / "e13_16f1_210302.Behavior.mat" 
file_path.is_file()

True

In [12]:
mat_file = loadmat_scipy(file_path, simplify_cells=True)

# Output to an external file
with open(json_directory / 'behavior_dict.json', 'w') as f:
    f.write(json.dumps(build_keys_and_types(mat_file), indent=4))

Importantly this contains a date in `date` and subject information in `animal`.
Surprisingly, this does not seem to contain any large vector as I expected

### `ripples.events.mat`

In [13]:
file_path = session_files_path_list[0]
file_path = file_path.parent / "e13_16f1_210302.ripples.events.mat"
file_path.is_file()

True

In [14]:
mat_file = loadmat_scipy(file_path, simplify_cells=True)

# Output to an external file
with open(json_directory / 'riples.json', 'w') as f:
    f.write(json.dumps(build_keys_and_types(mat_file), indent=4))

This is a large file. We need to look into the paper to see what should be stored from here

### `chanMap.mat`

In [15]:
file_path = session_files_path_list[0]
file_path = file_path.parent / "chanMap.mat'"
file_path.is_file()


False

In [16]:
# Write this to a file for visualization
with open(json_directory / 'chanMap.json', 'w') as f:
    f.write(json.dumps(build_keys_and_types(mat_file), indent=4))

There are some time series here but no information about the channels as I was expeting.

### `Session`

In [17]:
file_path = session_files_path_list[0]
file_path = file_path.parent / "e13_16f1_210302.session.mat"
file_path.is_file()


True

In [18]:
mat_file = loadmat_scipy(file_path, simplify_cells=True)

# Output to an external file
with open(json_directory / 'session.json', 'w') as f:
    f.write(json.dumps(build_keys_and_types(mat_file), indent=4))

This contains useful information like subject, animal and an epoch file.
Probably less relevant is information about other sources of data such as spikesorting and some of the analogous channels.

# Spikesorting 

## Testing CellExplorer

In [20]:
from spikeinterface.extractors import CellExplorerSortingExtractor

file_path = session_files_path_list[0].parent / "e13_16f1_210302.spikes.cellinfo.mat"
extractor = CellExplorerSortingExtractor(file_path)

  from .autonotebook import tqdm as notebook_tqdm
Exception ignored in: <function tqdm.__del__ at 0x10d84e7a0>
Traceback (most recent call last):
  File "/opt/anaconda3/envs/conversion/lib/python3.11/site-packages/tqdm/std.py", line 1145, in __del__
    self.close()
  File "/opt/anaconda3/envs/conversion/lib/python3.11/site-packages/tqdm/notebook.py", line 286, in close
    self.disp(bar_style='success', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm_notebook' object has no attribute 'disp'
Exception ignored in: <function tqdm.__del__ at 0x10d84e7a0>
Traceback (most recent call last):
  File "/opt/anaconda3/envs/conversion/lib/python3.11/site-packages/tqdm/std.py", line 1145, in __del__
    self.close()
  File "/opt/anaconda3/envs/conversion/lib/python3.11/site-packages/tqdm/notebook.py", line 286, in close
    self.disp(bar_style='success', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


AssertionError: To use the CellExplorerSortingExtractor install scipy and hdf5storage: 

 pip install scipy  hdf5storage

No `sessionInfo` for this file so the above throws an assertion. Let's see what each of the files.

The `CellExplorerSortingExtractor` uses the `sessionInfo` to extract the sampling frequency. But that might be somehwere else. 

Importantly, the files should contain the fields `UID` and `times` in a field called `spikes`. Let's see if any of the files contain this information and if it is consistent across them

Looking at the files below, they do seem to agree with the basic information. It should be straighforward to role a new sorting extractor for this dataset. Or use NumpySortingExtractor and then add the `sessionInfo` manually. Not sure at this point on what it would be easier. 

## Individual spike files

### `spikes.cellinfo.mat`

In [20]:
file_path = session_files_path_list[0].parent / "e13_16f1_210302.spikes.cellinfo.mat"

mat_file = loadmat_scipy(file_path, simplify_cells=True)

# Output to an external file
with open(json_directory / 'cellinfo.json', 'w') as f:
    f.write(json.dumps(build_keys_and_types(mat_file), indent=4))

In [21]:
spikes = mat_file["spikes"]
spikes.keys()

dict_keys(['ids', 'ts', 'times', 'cluID', 'maxWaveformCh', 'maxWaveformCh1', 'phy_amp', 'total', 'amplitudes', 'basename', 'numcells', 'UID', 'sr', 'shankID', 'rawWaveform', 'filtWaveform', 'rawWaveform_all', 'rawWaveform_std', 'filtWaveform_all', 'filtWaveform_std', 'timeWaveform', 'timeWaveform_all', 'peakVoltage', 'channels_all', 'peakVoltage_sorted', 'maxWaveform_all', 'peakVoltage_expFitLengthConstant', 'processinginfo'])

In [22]:
sampling_rate = spikes["sr"]
cluster_id = spikes["cluID"]
times = spikes["times"]
unit_ids = spikes["UID"]

sampling_rate, cluster_id.shape, times.shape, unit_ids.shape, unit_ids[:3], cluster_id[:3], times[:3][0][:3]


(30000,
 (135,),
 (135,),
 (135,),
 array([1, 2, 3], dtype=uint8),
 array([ 119, 1162, 1167], dtype=uint16),
 array([ 68.71136667, 255.81043333, 256.2958    ]))

### `cell_metrics.cellinfo.mat`

In [23]:
file_path = session_files_path_list[0].parent / "e13_16f1_210302.cell_metrics.cellinfo.mat"

# Ouput to an external file
with open(json_directory / 'cell_metrics.json', 'w') as f:
    f.write(json.dumps(build_keys_and_types(mat_file), indent=4))

In [24]:
spikes = mat_file["spikes"]
spikes.keys()

dict_keys(['ids', 'ts', 'times', 'cluID', 'maxWaveformCh', 'maxWaveformCh1', 'phy_amp', 'total', 'amplitudes', 'basename', 'numcells', 'UID', 'sr', 'shankID', 'rawWaveform', 'filtWaveform', 'rawWaveform_all', 'rawWaveform_std', 'filtWaveform_all', 'filtWaveform_std', 'timeWaveform', 'timeWaveform_all', 'peakVoltage', 'channels_all', 'peakVoltage_sorted', 'maxWaveform_all', 'peakVoltage_expFitLengthConstant', 'processinginfo'])

In [25]:
sampling_rate = spikes["sr"]
cluster_id = spikes["cluID"]
times = spikes["times"]
unit_ids = spikes["UID"]

sampling_rate, cluster_id.shape, times.shape, unit_ids.shape, unit_ids[:3], cluster_id[:3], times[:3][0][:3]


(30000,
 (135,),
 (135,),
 (135,),
 array([1, 2, 3], dtype=uint8),
 array([ 119, 1162, 1167], dtype=uint16),
 array([ 68.71136667, 255.81043333, 256.2958    ]))

### `mono_ress.cellinfo.mat`

In [26]:
file_path = session_files_path_list[0].parent / "e13_16f1_210302.mono_res.cellinfo.mat"

# Output to an external file
with open(json_directory / 'mono_res.json', 'w') as f:
    f.write(json.dumps(build_keys_and_types(mat_file), indent=4))
    


In [27]:
spikes = mat_file["spikes"]
spikes.keys()

dict_keys(['ids', 'ts', 'times', 'cluID', 'maxWaveformCh', 'maxWaveformCh1', 'phy_amp', 'total', 'amplitudes', 'basename', 'numcells', 'UID', 'sr', 'shankID', 'rawWaveform', 'filtWaveform', 'rawWaveform_all', 'rawWaveform_std', 'filtWaveform_all', 'filtWaveform_std', 'timeWaveform', 'timeWaveform_all', 'peakVoltage', 'channels_all', 'peakVoltage_sorted', 'maxWaveform_all', 'peakVoltage_expFitLengthConstant', 'processinginfo'])

In [28]:
sampling_rate = spikes["sr"]
cluster_id = spikes["cluID"]
times = spikes["times"]
unit_ids = spikes["UID"]

sampling_rate, cluster_id.shape, times.shape, unit_ids.shape, unit_ids[:3], cluster_id[:3], times[:3][0][:3]


(30000,
 (135,),
 (135,),
 (135,),
 array([1, 2, 3], dtype=uint8),
 array([ 119, 1162, 1167], dtype=uint16),
 array([ 68.71136667, 255.81043333, 256.2958    ]))

## Check for Consistency across Files
The following code will loop over all the folders contained in a given directory and track where they occur. This will give us an indication of the file *structure* consistency—which we can also use to detect whether the data in those files is homogeneous as well.

In [32]:
import os
from tqdm.notebook import tqdm

def aggregate_file_info(parent_folder, func, aggregator=None):
    
    if aggregator is None:
       aggregator = []


    naming_conventions = {}
    # for root, dirs, files in tqdm(os.walk(parent_folder)):
    #     for file in tqdm(files):
    for root, dirs, files in os.walk(parent_folder):
        for file in files:
            filename, ext = os.path.splitext(file)
            # if filename and filename[0] != '.':
            if filename:
                if (filename[0] == '.'): 
                    filename = filename[1:]
                split_path = filename.split('.')
                naming_convention = '.'.join(split_path[1:]) if len(split_path) > 1 else filename # Remove the '.' character
                if naming_convention not in naming_conventions:
                    naming_conventions[naming_convention] = [] if (isinstance(aggregator, list)) else {} # Use aggregator type as a base
                
                info = func(
                    file = file,
                    naming_convention = naming_convention,
                    root = root,
                    parent_folder = parent_folder
                )

                if isinstance(naming_conventions[naming_convention], list):
                    naming_conventions[naming_convention].append(info)

                else: 
                    naming_conventions[naming_convention][file] = info
                    
    return naming_conventions

def get_file_name(root, parent_folder, **kwargs):
    return os.path.relpath(root, parent_folder)

def count_file_naming_conventions(parent_folder):
    return aggregate_file_info(parent_folder, get_file_name)

### Organize Project Structure by File Types

In [34]:
structure = count_file_naming_conventions(project_root)
with open(project_json_directory / 'structure.json', 'w') as f:
    f.write(json.dumps(structure, indent=4))

### Check for Missing File Types

In [35]:
def add_unique_entries(arr1, arr2):
    for item in arr1:
        if item not in arr2:
            arr2.append(item)

def filter_list(original_list, filter_list):
    return [value for value in original_list if value not in filter_list]

def check_missing_files(dictionary):
    lengths = {}
    files = []
    for key, value in dictionary.items():
        lengths[key] = len(value)
        add_unique_entries(value, files)
    
    missing = dict()
    for filetype, length in lengths.items():
        if (len(files) != length):
            original_list = dictionary[filetype]
            missing[filetype] = filter_list(files, original_list)

    if len(missing.values()):
        print('This dataset has some missing files')
        
    return missing

In [36]:
missing = check_missing_files(structure)
with open(project_json_directory / 'missing_files.json', 'w') as f:
    f.write(json.dumps(missing, indent=4))

This dataset has some missing files


### Aggregate Data from Files

In [39]:
def get_file_data(file, root, **kwargs):
    filename, ext = os.path.splitext(file_path)
    if (ext == '.mat'):
        try:
            mat_file = loadmat_scipy(Path(root, file), simplify_cells=True)
            return build_keys_and_types(mat_file)
        except: 
            print(f'{file} is not readable by loadmat_scipy')

    else:
        print(f'Cannot handle {file} file type')
        
    return {}
        
def aggregate_data(parent_folder):
    return aggregate_file_info(parent_folder, get_file_data, {})

#### Register all the data associated with a project

In [40]:
data = aggregate_data(project_root)
with open(project_json_directory / 'data.json', 'w') as f:
    f.write(json.dumps(data, indent=4))

#### Use the saved data to compare across sessions

In [42]:
with open(project_json_directory / 'data.json') as user_file:
  file_contents = user_file.read()
  
project_data_json = json.loads(file_contents)


In [43]:
def check_consistency(data, user_check_function, base = ''):
    expected_props = set()
    inconsistencies = {}

    # Loop through objects to discover expected properties
    for k, obj in data.items():
        props = set(obj.keys())
        expected_props = expected_props.union(props)

    # Check objects for inconsistencies
    registered_nested_properties = set()
    for file, obj in data.items():

        inconsistent_props = user_check_function(obj, expected_props, base)
        if (inconsistent_props and len(inconsistent_props)):
            inconsistencies[file] = { f'{base}.{key}': message for key, message in inconsistent_props.items() }
            # inconsistencies[file] = [ f'{base}.{key}' if base else key for key, message in inconsistent_props.items() ]

        else:
            for key in obj.keys():
                if isinstance(obj[key], dict):
                    registered_nested_properties.add(key)

    if (len(registered_nested_properties)):
        properties = {}
        for key in registered_nested_properties:
            nested_property_object = { file: obj[key] for file, obj in data.items() if obj.get(key) }
            nested_inconsistencies = check_consistency(nested_property_object, user_check_function, f'{base}.{key}' if base else key)

            if (len(nested_inconsistencies)):
                properties[key] = nested_inconsistencies

        if (len(properties)): 
            
            for key, value in properties.items():
                for file, item in value.items():
                    if (file not in inconsistencies):
                        inconsistencies[file] = {}

                    inconsistencies[file].update(item) #f'{base}.{item_string}' if base else item_string)

    return inconsistencies



def check_missing_props(parent, expected_props, base):
        # Check if any properties are missing or have inconsistent values
        props = set(parent.keys())
        
        if props != expected_props:
            missing_props = expected_props - props

            if missing_props:
                return { key: 'Missing' for key in missing_props }


# inconsistencies = { key: check_consistency(data, check_missing_props) for key, data in project_data_json.items() }

first_key = next(iter(project_data_json))
print(project_data_json[first_key].keys())
inconsistencies = { key: check_consistency(project_data_json[first_key], check_missing_props) for key, data in project_data_json.items() }

with open(project_json_directory / 'data_inconsistencies.json', 'w') as f:
    f.write(json.dumps( 
        { key: data for key, data in inconsistencies.items() if len(data) } , indent=4))
    

dict_keys(['._optotagCA1'])


AttributeError: 'str' object has no attribute 'keys'