# Files documentation
The workflow here is to use this notebook to describe in more detail the files that are available. For version control
purposes this file should be committed without output and only run locally.

In [None]:
from pathlib import Path
from pprint import pprint

import numpy as np
import scipy as sp
import pandas as pd
import h5py
import mat73
from scipy.io import loadmat

# Data loading
Here we load our base path:

In [None]:
# Root folder of the data on the local machine.
data_location = '/home/heberto/globus_data'  # Change this with the right location
data_path = Path(data_location)

# Sub-folder appended to the data root to obtain the base path.
author_path = Path("SenzaiY")
base_path = data_path / author_path

Now, this data set is organized with one folder per subject. Let's peek inside `base_path`:

In [None]:
# Map the name of every directory directly under `base_path` to its path.
subject_path_dic = {
    subject_dir.stem: subject_dir
    for subject_dir in base_path.iterdir()
    if subject_dir.is_dir()
}
subject_path_dic.keys()

The output should be something like ['YMV01', 'YMV02', ...] indicating the different subjects

Inside each of the subjects we can find a folder per session:

In [None]:
subject = 'YMV01'

# Map the name of every directory inside the chosen subject folder to its path.
sessions_path_dic = {
    session_dir.stem: session_dir
    for session_dir in subject_path_dic[subject].iterdir()
    if session_dir.is_dir()
}
sessions_path_dic.keys()

The output of this should be: `YMV01_170818`.

The name of the sessions fits the following pattern `{subject}_{date}`.

Let's gather all the available sessions in one dic for convenience

In [None]:
# Build the base path from `data_location` (set in the data-loading cell) so
# that changing the location there is honoured here as well, instead of being
# overridden by a second hard-coded copy of the path.
data_path = Path(data_location)
author_path = Path("SenzaiY")
base_path = data_path / author_path

# Every entry found inside a subject folder, i.e. a directory under
# `base_path` whose name contains "YMV". Directory listings are sorted so the
# result does not depend on the order in which the filesystem returns entries.
session_list = [
    session
    for subject in sorted(base_path.iterdir())
    if subject.is_dir() and "YMV" in subject.name
    for session in sorted(subject.iterdir())
]

# Keep only the directories, keyed by their name.
session_path_dic = {session.stem: session for session in session_list if session.is_dir()}
session_path_dic

The output here should be a combination of session:path for all the sessions

# An overview of the available data
Let's find out which data types are available. Files with the formats `.jpg`, `.png`, `.fig`, `.pdf` and `.svg` are photos, vector graphics or documents; we are not concerned with them here, so we remove them (together with `.py` scripts). We focus here on a single session (`YMV01_170818`):

In [None]:
# Suffixes that are filtered out of the listing below.
not_data_formats = ['.jpg', '.png', '.pdf', '.svg', '.fig', '.py']

subject = 'YMV01'
date = '170818'
session = f"{subject}_{date}"
session_path = session_path_dic[session]

# Unique suffixes of every non-directory entry found (recursively) under the
# session folder, in sorted order ...
format_list = sorted(
    {entry.suffix for entry in session_path.rglob('*') if not entry.is_dir()}
)
# ... minus the suffixes listed in `not_data_formats`.
format_list = [suffix for suffix in format_list if suffix not in not_data_formats]
pprint(format_list, compact=True)

The output should be something like this:

    ['', '.1', '.dat', '.eeg', '.json', '.log', '.mat', '.npy', '.nrs',
    '.pkl', '.tsv', '.xml']

The goal of this document is to explore the data available in the rest of the formats, and we will do so in the following sections. Meanwhile, for orientation purposes, here is a brief description of the available formats and the files associated with them:

1. First we have the format `.1`, which actually covers two kinds of files: `.res.1` and `.clu.1`. These are plain text files related to the neuroscope sorting format.

2. Then we have the typical `.dat` and `.eeg` formats that account for the raw data and the local field potential respectively.

3. The `.json` files seem to be associated with hidden files corresponding to the `.phy` format. This is related to spike sorting.

4. The `.log` extension is the log file that corresponds to the `phy` program.

5. There is a variety of `.mat` files:

6. There is a variety of `.npy` files.

7. `.nrs`

8. `.pkl` pickled file

9. `.tsv` tab-separated data.

10. `.xml` an xml file



# Neuroscope res and clu
These files have names of the format `{session}.res` and `{session}.clu`. Those should be the keys of the
following dictionary:

In [None]:
# Collect (recursively) every path ending in ".1", keyed by its stem, i.e. the
# file name without that final ".1".
sorting_files_dic = {
    sorting_file.stem: sorting_file
    for sorting_file in session_path.rglob('*')
    if sorting_file.suffix == '.1'
}
sorting_files_dic.keys()

These are plain text files and can be opened with pandas as a data frame

In [None]:
# Keys of `sorting_files_dic` are file stems, so the trailing ".1" is absent.
clu_file_name = f"{session}.clu"
res_file_name = f"{session}.res"

# Read each file as a single, headerless column.
# NOTE(review): Neuroscope `.clu` files may begin with a cluster-count line,
# which would be read here as the first unit; confirm against the data.
clu_df = pd.read_csv(sorting_files_dic[clu_file_name], names=['unit'], header=None)
res_df = pd.read_csv(sorting_files_dic[res_file_name], names=['times'], header=None)
(res_df.shape, clu_df.shape)

The files should have the same shape. As mentioned, those are related to spike sorting: `.clu` contains the units and `.res` the times.
We can concatenate them to have each spike time next to its associated unit:

In [None]:
# Put the unit and time columns side by side and display five random rows.
pd.concat([clu_df, res_df], axis="columns").sample(n=5)

# Json files

In [None]:
# Collect (recursively) every `.json` file under the session folder, keyed by stem.
json_files_dic = {
    json_file.stem: json_file
    for json_file in session_path.rglob('*')
    if json_file.suffix == '.json'
}
json_files_dic

These files correspond to some meta data of the `phy` software

# Mat files
Let's gather all the mat files

In [None]:
# Only the top level of the session folder is scanned here (`iterdir`, not `rglob`).
mat_files_dic = {
    mat_path.stem: mat_path
    for mat_path in session_path.iterdir()
    if mat_path.suffix == '.mat'
}
mat_files_dic.keys()

In [None]:
# Stem of the `.mat` file that the following cell looks up in `mat_files_dic`.
'cell_metrics'

In [None]:
file_name = 'cell_metrics'
mat_file_path = mat_files_dic[file_name]

# Try scipy's reader first; when it raises NotImplementedError, read the file
# with mat73 instead.
try:
    mat_file = loadmat(mat_file_path)
except NotImplementedError:
    mat_file = mat73.loadmat(mat_file_path, use_attrdict=True)

# Show which file was read, the type of the loaded object and its keys.
print(f"{mat_file_path.name} {type(mat_file)}")
print(mat_file.keys())

In [None]:
# Load every collected `.mat` file and report its name, loaded type and keys.
for file_path in mat_files_dic.values():
    try:
        mat_file = loadmat(file_path)
    except NotImplementedError:
        # scipy could not read this file; fall back to mat73.
        mat_file = mat73.loadmat(file_path, use_attrdict=True)
    print(f"{file_path.name} {type(mat_file)}")
    print(mat_file.keys())

# Numpy files

In [None]:
# Collect (recursively) every `.npy` file under the session folder, keyed by stem.
# The search must start from `session_path`: `session` is only the session
# name (a str) and has no `rglob` method.
numpy_files_dic = {p.stem: p for p in session_path.rglob('*') if p.suffix == '.npy'}
numpy_files_dic.keys()

Let's explore the `spike_times` file:

In [None]:
# Load the `spike_times` array and display its shape.
numpy_file = np.load(numpy_files_dic['spike_times'])
np.shape(numpy_file)

In [None]:
# Load the `amplitudes` array and display its shape.
numpy_file = np.load(numpy_files_dic['amplitudes'])
np.shape(numpy_file)

In [None]:
# Load the `channel_map` array and display its shape.
numpy_file = np.load(numpy_files_dic['channel_map'])
np.shape(numpy_file)

In [None]:
# Load the `spike_clusters` array and display its distinct values.
numpy_file = np.load(file=numpy_files_dic['spike_clusters'])
np.unique(numpy_file)

In [None]:
# Load the `templates` array and display its shape.
numpy_file = np.load(numpy_files_dic['templates'])
np.shape(numpy_file)

# NRS

# Pickled

# TSV - Tab-separated values file

# XML