In [2]:
# Standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ipywidgets import IntProgress
from IPython.display import display

import statsmodels.api as sm
from statsmodels.formula.api import ols

# Append base directory
import os,sys,inspect
rootname = "pub-2020-exploratory-analysis"
# Locate the repository root from the working directory. Inside a notebook
# kernel the current frame's "file" is a pseudo/temporary name that need not
# live inside the repository, so it is not a reliable anchor for this lookup.
thispath = os.path.abspath(os.getcwd())
if rootname not in thispath:
    raise RuntimeError(
        "Cannot locate project root '{}' in working directory '{}'; "
        "start the notebook from inside the repository".format(rootname, thispath))
# Keep everything up to and including the first occurrence of the root name.
rootpath = os.path.join(thispath[:thispath.index(rootname)], rootname)
# Avoid stacking duplicate entries when the cell is re-executed.
if rootpath not in sys.path:
    sys.path.append(rootpath)
print("Appended root directory", rootpath)

from mesostat.utils.qt_helper import gui_fnames, gui_fpath
from mesostat.metric.metric import MetricCalculator
from mesostat.utils.hdf5_io import DataStorage
from mesostat.stat.anova import as_pandas, as_pandas_lst, anova_homebrew

from lib.sych.data_fc_db_raw import DataFCDatabase
from lib.sych.plot_helper import imshow_dataset_by_mouse, imshow_dataset_by_session
from lib.common.metric_helper import metric_by_session

# Load IPython's autoreload extension; mode 2 reloads imported modules before
# each cell runs, so edits to the local library code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

Appended root directory /home/alyosha/work/git/pub-2020-exploratory-analysis


In [3]:
# Configuration handed to DataFCDatabase below. Only 'root_path_data' is set here.
# tmp_path = root_path_data if 'root_path_data' in locals() else "./"
params = {}
# Alternative: use the current directory as the data root.
# params['root_path_data'] = './'
# NOTE(review): machine-specific absolute path; the notebook will not run
# elsewhere unless this is edited or one of the alternatives is enabled.
params['root_path_data'] = '/media/alyosha/Data/TE_data/yarodata/sych_preprocessed'
# Alternative: pick the data root interactively through a file dialog.
# params['root_path_data'] = gui_fpath('h5path', './')

In [4]:
# Build the database handle from the configured data root; the constructor
# prints its progress while it scans for data files, trial types and data types.
dataDB = DataFCDatabase(params)

Searching for data files
Extracting trial type names
Extracting data types
Reading area color map


In [5]:
# Results storage backed by the given .h5 file name (relative to the working
# directory). NOTE(review): `ds` is not used by the cells visible in this section.
ds = DataStorage('sych_result_individual_region.h5')

In [6]:
# Metric calculator configured for serial, non-verbose operation.
# NOTE(review): `mc` is not used by the cells visible in this section.
mc = MetricCalculator(serial=True, verbose=False)

In [7]:
# Show what the database found: mouse names, data types and trial type names.
for dbSummary in (dataDB.mice, dataDB.dataTypes, dataDB.trialTypeNames):
    print(dbSummary)

{'mvg_7', 'mvg_9', 'mvg_4', 'mvg_8'}
{'bn_trial', 'raw', 'bn_session'}
{'iGO', 'iFA', 'iMISS', 'iNOGO'}


# Analysis of Variance

* Across sessions
    - Explained by performance
* Across channels, trials, timesteps
    - Explained by trial type

Things to understand:
* How to compare different rows?
* What models make sense?
* Try linear mixed models?

In [8]:
# Build one long-format DataFrame per mouse for the ANOVA cells below.
# Each row is a timestep-averaged value for one (trial, channel) pair, tagged
# with the trial type, interval and session it came from.
#trialTypeNames = dataDB.get_trial_type_names()
trialTypeNames = ['iGO', 'iNOGO']
intervNames = dataDB.get_interval_names()

dfDict = {}
for mousename in dataDB.mice:
    sessions = dataDB.get_sessions(mousename)

    # Collect the per-condition frames and concatenate once at the end.
    # Growing a DataFrame with .append inside the loop copies all rows on every
    # iteration, and DataFrame.append no longer exists in pandas >= 2.0.
    dfLst = []
    for session in sessions:
        for trialType in trialTypeNames:
            for intervName in intervNames:
                data = dataDB.get_neuro_data({'session' : session}, datatype='bn_session',
                                             trialType=trialType, intervName=intervName)[0]
                # assumes data is (trials, timesteps, channels) - TODO confirm
                data = np.mean(data, axis=1)  # Average over timesteps
                dataDF = as_pandas(data, ('trials', 'channels'))
                dataDF['trialType'] = trialType
                dataDF['interval'] = intervName
                dataDF['session'] = session
                dfLst.append(dataDF)

    if not dfLst:
        # Fail with a message that names the mouse instead of an opaque pandas error.
        raise ValueError("No data found for mouse '{}'".format(mousename))

    # The trial index is not used as a factor in the models below, so drop it.
    dfThis = pd.concat(dfLst, ignore_index=True).drop('trials', axis=1)
    dfDict[mousename] = dfThis

In [None]:
# Formula for the session-wide ANOVA. The `a*b` terms expand to main effects
# plus interaction; the main effects are already listed explicitly, so the
# resulting table only gains the `a:b` interaction rows.
model = '''
    rez ~ C(channels)
    + C(trialType)
    + C(interval)
    + C(session)
    + C(trialType)*C(session)
    + C(trialType)*C(channels)
    + C(interval)*C(channels)
    + C(interval)*C(trialType)
'''

# Session-wide: one OLS fit per mouse over all of its sessions, followed by a
# type-1 ANOVA table.
for mousename in sorted(dataDB.mice):
    print(mousename)
    mouseDF = dfDict[mousename]
    linModel = ols(model, data=mouseDF).fit()
    anovaTable = sm.stats.anova_lm(linModel, typ=1)
    display(anovaTable)

mvg_4


Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(channels),47.0,0.741083,0.015768,109.946004,0.0
C(trialType),1.0,1.951072,1.951072,13604.547946,0.0
C(interval),2.0,11.313084,5.656542,39442.256137,0.0
C(session),10.0,0.033239,0.003324,23.177013,3.7298599999999996e-44
C(trialType):C(session),10.0,0.141273,0.014127,98.507539,4.534434e-205
C(trialType):C(channels),47.0,5.733095,0.121981,850.554231,0.0
C(interval):C(channels),94.0,13.674308,0.145471,1014.350939,0.0
C(interval):C(trialType),2.0,2.661126,1.330563,9277.825163,0.0
Residual,609626.0,87.428447,0.000143,,


mvg_7


In [None]:
# Across-sessions: fit a separate ANOVA for every session of every mouse and
# scatter the mean squares of each factor against the session's performance.
nMice = len(dataDB.mice)
# squeeze=False always returns a 2D axes array, so indexing by iMouse also
# works when there is only one mouse (the default returns a bare Axes then).
fig, ax = plt.subplots(ncols=nMice, figsize=(5*nMice, 5), squeeze=False)
ax = ax[0]

model = '''
rez ~ C(channels)+C(trialType)+C(interval)
'''

# Legend labels for the ANOVA rows; listed in the same order as the formula terms.
names = ['channels', 'trialType', 'interval']

for iMouse, mousename in enumerate(sorted(dataDB.mice)):
    dfThis = dfDict[mousename]
    print(mousename)
    sessions = dataDB.get_sessions(mousename)
    # assumes one performance value per session, in the same order as
    # get_sessions - TODO confirm
    performances = dataDB.get_performance_mouse(mousename)
    plotData = []

    for session in sessions:
        dfSession = dfThis[dfThis['session'] == session]
        linModel = ols(model, data=dfSession).fit()
        rezStat = sm.stats.anova_lm(linModel, typ=1)
        # Keep only the factor rows; the residual is not plotted.
        rezStat = rezStat.drop('Residual')
        plotData.append(np.array(rezStat['mean_sq']))

    # Transpose to (factor, session) so each row becomes one scatter series.
    plotData = np.array(plotData).T

    ax[iMouse].set_xlabel('performance')
    ax[iMouse].set_ylabel('mean_sq')
    for name, x in zip(names, plotData):
        ax[iMouse].semilogy(performances, x, '.', label=name)

    ax[iMouse].set_title(mousename)
    ax[iMouse].legend()

# savefig does not create missing directories; make sure the target exists.
os.makedirs('pics', exist_ok=True)
plt.savefig('pics/ANOVA_bn_session_scatter.png')
plt.show()
plt.close()