In [34]:
# Standard libraries
import h5py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ipywidgets import IntProgress
from IPython.display import display
import datetime

# Append base directory
import os,sys,inspect
rootname = "pub-2020-exploratory-analysis"
thispath = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
rootpath = os.path.join(thispath[:thispath.index(rootname)], rootname)
sys.path.append(rootpath)
print("Appended root directory", rootpath)

from mesostat.utils.qt_helper import gui_fnames, gui_fpath
from mesostat.utils.hdf5_io import DataStorage
from mesostat.utils.system import getfiles_walk
from mesostat.utils.matlab_helper import loadmat, matstruct2dict
from mesostat.utils.dictionaries import merge_dicts

from lib.sych.data_read import read_neuro_perf

%load_ext autoreload
%autoreload 2

Appended root directory /media/aleksejs/DataHDD/work/codes/comp-neuro/analysis-mesoscopic/pub-2020-exploratory-analysis
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Pooling Data From LVM

In [None]:
def find_all_substring_locations(s, subs):
    start = 0
    while True:
        start = s.find(subs, start)
        if start == -1: return
        yield start
        start += len(subs) # use start += 1 to find overlapping matches
        
def find_all_substring_rows(l, subs):
    return [i for i in range(len(l)) if subs in l[i]]

def get_files_raw_df(fpathData):
    fileswalk = getfiles_walk(fpathData, ['.lvm'])

    # Convert to pandas
    df = pd.DataFrame(fileswalk, columns=['path', 'fname'])

    # Drop all LVM files that are not of correct format
    df = df[df['fname'].str.contains("mvg")]

    df['session'] = [os.path.basename(path) for path in df['path']]
    df['path'] = [os.path.join(path, fname) for path, fname in zip(df['path'], df['fname'])]
    df['mousename'] = [os.path.splitext(fname)[0] for fname in df['fname']]

    return df.drop('fname', axis=1)

def parse_raw_header(headerText):
    dataHeader = headerText.replace("\t", " ").split('\n')
    dataHeader = [d.strip() for d in dataHeader]
    idxsDate = find_all_substring_rows(dataHeader, "Date ")
    idxsTime = find_all_substring_rows(dataHeader, "Time ")

    dateThis = dataHeader[idxsDate[-1]].split(' ')[-1]
    timeThis = dataHeader[idxsTime[-1]].split(' ')[-1][:15] # Too many decimal points in seconds bad
    return datetime.datetime.strptime(dateThis + ' ' + timeThis, '%Y/%m/%d %H:%M:%S.%f')

def parse_raw_main(mainText):
    dataMain = mainText.replace("\t", " ").split('\n')[2:]
    arrValue = np.array([s.split(' ')[-1] for s in dataMain if len(s) > 0], dtype=float)
    
    nChannel = 49
    nEntry = len(arrValue)
    nTimestep = len(arrValue) // 49
    nRemainder = len(arrValue) % 49

    if nRemainder != 0:
        raise IOError("Unexpected array length", nEntry)

    return arrValue.reshape((nTimestep, nChannel))    

In [None]:
#fpathData = gui_fpath("Root directory for raw data", "./")
fpathData = '/mnt/neurophys-storage2/Sych/Yaro/data_raw/'

In [None]:
dfFiles = get_files_raw_df(fpathData)
dfFiles

In [None]:
mice = set(dfFiles['mousename'])
mice

In [None]:
for mousename in mice:
    ds = DataStorage('raw_'+mousename+'.h5')
    rows = dfFiles[dfFiles['mousename'] == mousename]
    
    for idx, row in rows.iterrows():
        print('Processing', mousename, row['session'])
        
        with open(row['path'], 'r') as f:
            data = f.read()

            headerEndKey = "***End_of_Header***"
            splitIdx = list(find_all_substring_locations(data, headerEndKey))[-1] + len(headerEndKey)

            dateTimeThis = parse_raw_header(data[:splitIdx])

            caIndMat = parse_raw_main(data[splitIdx:])
            
            attrsDict = {
                'mousename': mousename,
                'metric': "raw",
                'target_dim': "(timesteps, channels)",
                'datetime': dateTimeThis
            }

            ds.save_data(row['session'], caIndMat, attrsDict)

# Moving all datasets to data group

In [None]:
def get_files_raw_h5_df(fpath):
    fileswalk = getfiles_walk(fpath, ['raw', '.h5'])
    df = pd.DataFrame(fileswalk, columns=['path', 'fname'])
    df['mousename'] = [os.path.splitext(f)[0][4:] for f in df['fname']]
    df['path'] = [os.path.join(path, fname) for path, fname in zip(df['path'], df['fname'])]
    return df.drop('fname', axis=1)

In [None]:
# fpathDataH5 = gui_fpath("Directory for data files", "./")
fpathDataH5 = '/media/aleksejs/DataHDD/work/codes/comp-neuro/analysis-mesoscopic/pub-2020-exploratory-analysis/analysis-sych'

In [None]:
dfRawH5 = get_files_raw_h5_df(fpathDataH5)

In [None]:
for idx, row in dfRawH5.iterrows():
    with h5py.File(row['path'], 'a') as h5file:
        if 'data' not in h5file.keys():
            grp = h5file.create_group("data")
        
        for key in h5file.keys():
            if key != 'data':
                print(key)
                session = ''.join(list(h5file[key].attrs['name']))
                h5file.move(key, 'data/'+session)

In [None]:
dfRawH5

In [None]:
# Sanity check
for idx, row in dfRawH5.iterrows():
    with h5py.File(row['path'], 'r') as h5file:
        for session in h5file['data'].keys():
            print(session, h5file['data'][session].shape)

# Mark Starts and ends of Trials

In [None]:
for idx, row in dfRawH5.iterrows():
    print(row['mousename'])
    
    with h5py.File(row['path'], 'a') as h5file:
        if 'trialStartIdxs' not in h5file.keys():
            grp = h5file.create_group('trialStartIdxs')
        if 'interTrialStartIdxs' not in h5file.keys():
            grp = h5file.create_group('interTrialStartIdxs')
            
        for session in list(h5file['data'].keys()):
            print(session)

            traceThis = h5file['data'][session][:, -1]
            traceBin = (traceThis > 2).astype(int)
            traceDT = traceBin[1:] - traceBin[:-1]

            idxTrialStart = np.where(traceDT==1)[0] + 1
            idxIntervStart = np.hstack(([0], np.where(traceDT==-1)[0] + 1))

            nTrial = len(idxTrialStart)
            nInterv = len(idxIntervStart)
            
            if nTrial == nInterv:
                idxIntervStart = np.hstack((idxIntervStart, [len(traceThis)]))
                nInterv += 1

            tTrial = idxIntervStart[1:] - idxTrialStart

            FPS = 20 if np.median(tTrial) < 200 else 40
            
            h5file['trialStartIdxs'].create_dataset(session, data=idxTrialStart)
            h5file['interTrialStartIdxs'].create_dataset(session, data=idxIntervStart)
            h5file['data'][session].attrs['FPS'] = FPS
            
#             print(nTrial, nInterv, FPS)
#             print('low', tTrial[tTrial < 8 * FPS] / FPS)
#             print('high', tTrial[tTrial > 12 * FPS] / FPS)

# Appending Channel Labels

In [None]:
def get_files_channel_labels_df(fpath):
    fileswalk = getfiles_walk(fpathDataOrig, ['channel_labels.mat'])
    df = pd.DataFrame(fileswalk, columns=['path', 'fname'])

    df['mousename'] = [os.path.basename(p) for p in df['path']]
    df['path'] = [os.path.join(path, fname) for path, fname in zip(df['path'], df['fname'])]
    return df.drop('fname', axis=1)

In [None]:
# fpathDataOrig = gui_fpath("Directory for original data tree", "./")
fpathDataOrig = '/media/aleksejs/DataHDD/work/data/yaro/neuronal/mvg48'

In [None]:
dfLabels = get_files_channel_labels_df(fpathDataOrig)
dfLabels

In [None]:
for idx, row in dfLabels.iterrows():
    print(row['mousename'])
    
    M = loadmat(row['path'])
    
    rowH5 = dfRawH5[dfRawH5['mousename'] == row['mousename']]
    pathH5 = list(rowH5['path'])[0]
    
    with h5py.File(pathH5, 'a') as h5file:
        if 'channelLabels' not in h5file.keys():
            h5file.create_dataset('channelLabels', data=M['channel_labels'].astype('S'))

# Adding behaviour

**Problems/Bugs**:
1. [early_go_trials, iGO_inhibition] overlap - within this framework could be solved by multiplexing enum
2. mvg_8_2018_11_22_a has 406 trials in behaviour but only 142 in neuro - crop to neuro

In [31]:
def get_files_orig_neuro_df(fpath):
    fileswalk = getfiles_walk(fpathDataOrig, ['data.mat'])
    df = pd.DataFrame(fileswalk, columns=['path', 'fname'])

    df['session'] = [os.path.basename(p) for p in df['path']]
    df['mousename'] = [os.path.basename(os.path.dirname(p)) for p in df['path']]
    return df.drop('fname', axis=1)

In [None]:
dfNeuro = get_files_orig_neuro_df(fpathDataOrig)
dfNeuro

In [73]:
keysNeeded = ['iGO', 'iNOGO', 'iFA', 'iMISS']

for mousename in set(dfNeuro['mousename']):
    rows = dfNeuro[dfNeuro['mousename'] == mousename]
    
    rowH5 = dfRawH5[dfRawH5['mousename'] == mousename]
    pathH5 = list(rowH5['path'])[0]
    
    with h5py.File(pathH5, 'a') as h5file:
        if 'trialTypeNames' not in h5file.keys():
            h5file.create_dataset('trialTypeNames', data=np.array(keysNeeded).astype('S'))
        if 'trialTypes' not in h5file.keys():
            grp = h5file.create_group('trialTypes')
        
        for idx, row in rows.iterrows():
            session = row['session']

            pwd = os.path.join(row['path'], 'behaviorvar.mat')

            behavior = loadmat(pwd)
    #         behavior['trials'] = merge_dicts([matstruct2dict(obj) for obj in behavior['trials']])
            fixint = lambda v: v if not isinstance(v, int) else np.array([v])
            behavior = {k : fixint(v) for k, v in behavior.items()}
    
            keysLst = set([key for key in behavior.keys() if len(behavior[key]) > 0])
            keysLst -= set(['trials'])
            keysLst = list(sorted(keysLst))
            
#             for keyA in keysLst:
#                 for keyB in keysLst:
#                     if keyA != keyB:
#                         inter = set(behavior[keyA]).intersection(set(behavior[keyB]))
#                         if len(inter) > 0:
#                             print(keyA, keyB, len(inter))

            minTrial = np.min([np.min(behavior[k]) for k in keysLst if len(behavior[k]) > 0])
            maxTrial = np.max([np.max(behavior[k]) for k in keysLst if len(behavior[k]) > 0])

            nTrialsExp = len(h5file['trialStartIdxs'][session])

            enumArr = np.full(maxTrial, -1, dtype=int)
            for i, key in enumerate(keysNeeded):
                idxs = (behavior[key] - 1).astype(int)
                assert np.all(enumArr[idxs] == -1)
                enumArr[idxs] = i
            
            print(session, len(enumArr==-1), minTrial, maxTrial, nTrialsExp, maxTrial==nTrialsExp)
            
            if maxTrial > nTrialsExp:
                print('--cropping to neuronal', nTrialsExp)
                enumArr = enumArr[:nTrialsExp]
            
            del h5file['trialTypes'][session]
            
            h5file['trialTypes'].create_dataset(session, data=enumArr)

mvg_4_2017_11_10_a 1000 1 1000 1000 True
mvg_4_2017_11_13_a 896 1 896 896 True
mvg_4_2017_11_14_a 810 1 810 810 True
mvg_4_2017_11_15_a 370 1 370 370 True
mvg_4_2017_11_16_a 640 1 640 640 True
mvg_4_2017_11_17_a 540 1 540 540 True
mvg_4_2017_11_20_a 682 1 682 682 True
mvg_4_2017_11_21_a 700 1 700 700 True
mvg_4_2017_11_22_a 892 1 892 892 True
mvg_4_2017_11_23_a 715 1 715 715 True
mvg_4_2017_11_24_a 700 1 700 700 True
mvg_8_2018_11_12_a 354 1 354 354 True
mvg_8_2018_11_13_a 282 1 282 282 True
mvg_8_2018_11_14_a 430 1 430 430 True
mvg_8_2018_11_15_a 692 1 692 692 True
mvg_8_2018_11_16_a 458 1 458 458 True
mvg_8_2018_11_17_a 554 1 554 554 True
mvg_8_2018_11_18_a 310 1 310 310 True
mvg_8_2018_11_19_a 242 1 242 242 True
mvg_8_2018_11_20_a 385 1 385 385 True
mvg_8_2018_11_21_a 334 1 334 334 True
mvg_8_2018_11_22_a 406 1 406 142 False
--cropping to neuronal 142
mvg_8_2018_11_23_a 358 1 358 358 True
mvg_8_2018_11_27_a 527 1 527 527 True
mvg_8_2018_11_28_a 456 1 456 456 True
mvg_8_2018_11_29_a 