In [None]:
import os, random, sys, warnings, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import dask
import h5py
import joblib
# from skimage.external import tifffile as tff
import tifffile as tff
from sklearn.mixture import GaussianMixture as GMM
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

codeDir = r'\\dm11\koyamalab/code/python/code'
sys.path.append(codeDir)
import apCode.FileTools as ft
import apCode.volTools as volt
from apCode.machineLearning import ml as mlearn
import apCode.behavior.FreeSwimBehavior as fsb
import apCode.behavior.headFixed as hf
import apCode.SignalProcessingTools as spt

from apCode import util as util

from apCode.behavior import gmm as my_gmm

# Use Type 42 (TrueType) fonts in PDF/PS output instead of matplotlib's default Type 3
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42


# Enable autoreload when running under IPython; a plain Python interpreter has
# no __IPYTHON__ / get_ipython, which raises NameError and is skipped on purpose.
try:
    if __IPYTHON__:
        _ipy = get_ipython()
        _ipy.run_line_magic('load_ext', 'autoreload')
        _ipy.run_line_magic('autoreload', '2')
except NameError:
    pass

# Setting seed for reproducibility.
# The seed functions must be *called*; assigning to `random.seed` would replace
# the function and leave the generator unseeded.
seed = 143
random.seed(seed)
np.random.seed(seed)


plt.style.use(('fivethirtyeight', 'seaborn-talk'))
# Create and immediately close a throw-away figure
plt.figure()
plt.close()

print(time.ctime())

In [None]:
#%% Path to excel sheet storing paths to data and other relevant info
# Directory holding the summary spreadsheet, and the directory for group-level outputs
dir_xls = r'\\Koyama-S2\Data3\Avinash\Projects\RS recruitment\GCaMP imaging'
dir_group = r'\\Koyama-S2\Data3\Avinash\Projects\RS recruitment\GCaMP imaging\Group'
file_xls = 'GCaMP volumetric imaging summary.xlsx'

# Read the first sheet of the summary workbook into a DataFrame
path_xls = os.path.join(dir_xls, file_xls)
xls = pd.read_excel(path_xls, sheet_name='Sheet1')

In [None]:
#%% Get all fish paths and check for dataframes in each path
def _last_sorted_match(folder, ext, search_str):
    """Return the full path of the last file reported by
    ft.findAndSortFilesInDir for `folder`, or None when nothing matches."""
    matches = ft.findAndSortFilesInDir(folder, ext = ext, search_str = search_str)
    if len(matches) > 0:
        return os.path.join(folder, matches[-1])
    return None


# One path per non-missing fish index: the first row of the sheet with that index
inds_fish = np.array(xls['FishIdx'].dropna())
pathList = np.array([xls.loc[xls['FishIdx'] == ind, 'Path'].iloc[0] for ind in inds_fish])

paths_df, paths_hFile = [], []
for path_ in pathList:
    path_pickle = _last_sorted_match(path_, 'pickle', 'dataFrame')
    if path_pickle is not None:
        paths_df.append(path_pickle)
    path_h5 = _last_sorted_match(path_, 'h5', 'procData')
    if path_h5 is not None:
        paths_hFile.append(path_h5)


### *Load available tail angles and clean with wavelet*

In [None]:
#%% Read tail angles from all the available HDF files
%time dic_ta = hf.tailAngles_from_hdf_concatenated_by_trials(pathList)
ta = np.concatenate(dic_ta['tailAngles'], axis=1)
%time ta, _, svd  = hf.cleanTailAngles(ta, dt=1/500)

In [None]:
#%$ Save data for later access

dic_ta['tailAngles_clean'] = ta.copy()
%time np.save(os.path.join(dir_group, 'tailAngles_clean.npy'), dic_ta)
%time joblib.dump(svd, os.path.join(dir_group,'svd_object_tailAngles.pkl'));


In [None]:
%%time
#%% Reload data to continue from here
fName = 'tailAngles_clean.npy'
dic_ta = np.load(os.path.join(dir_group, fName), allow_pickle = True)[()]
ta_clean = dic_ta['tailAngles_clean']

In [None]:
#%% Extract features (for GMM) from tail angle timeseries
hpf = 1
dt_behav = 1/500

%time df_features = hf.swimEnvelopes_multiLoc(ta_clean)
arr_feat = np.array(df_features)
labels_feat = df_features.columns
ta_tot = ta_clean[-1]
ta_tot = spt.chebFilt(ta_clean[-1], dt_behav, hpf, btype = 'highpass')

In [None]:
%%time 

onOffThr = 3 # Threshold for swim onset of offset in maximum of envelopes
t_smooth = 30e-3 # Smoothing kernel length in ms
nKer = int(np.round(t_smooth*(1/dt_behav)))

scaler = StandardScaler(with_mean=False).fit(arr_feat)
polarity = np.zeros_like(ta_tot)
envelopes = spt.emd.envelopesAndImf(ta_tot, interp_kind = 'quadratic')['env']
polarity[np.where(envelopes['crests']>onOffThr)]=1
polarity[np.where(envelopes['troughs']<-onOffThr)]=-1
maxEnv = envelopes['max']
arr_feat_scaled = scaler.transform(arr_feat)

arr_feat_scaled = np.array(dask.compute(*[dask.delayed(spt.causalConvWithSemiGauss1d)(x, nKer*2)\
                   for x in arr_feat_scaled.T])).T

# arr_feat_scaled = np.c_[polarity, arr_feat_scaled] # Hoping that incorporating the polarity vector
#                                                    # will help GMM distinguish by turn direction
    
inds_supraThresh = np.where(maxEnv > onOffThr)[0]
arr_feat_supra = arr_feat_scaled[inds_supraThresh,:]

path_ = os.path.join(dir_group, 'standardScaler.pkl')
joblib.dump(scaler,path_);
print(f'Saved standard scaler at\n {path_}')

In [None]:
subSample = 5
comps = np.arange(1,36)

X = arr_feat_supra[::subSample,:]

%time out = mlearn.gmm_information_vs_nComponents(X,comps = comps)

plt.figure(figsize = (10,5))
x = np.arange(len(out['aic'])) + 1
plt.plot(comps, out['aic'],'o-', alpha = 0.5, label = 'AIC')
plt.plot(comps, out['bic'],'o-', alpha = 0.5, label = 'BIC')
plt.xticks(comps)
plt.xlabel('Number of components')
plt.grid(True)
plt.legend()

In [None]:
#%% Try computing GMM metrics for dimensionality reduced feature array

pca = PCA(n_components = 0.95, random_state = 143)
%time X_pca = pca.fit_transform(X)
print(f'Number of reduced features = {X_pca.shape[1]}')

%time out = mlearn.gmm_information_vs_nComponents(X_pca,comps = comps)

plt.figure(figsize = (10,5))
plt.plot(comps, spt.standardize(out['aic']),'o-', alpha = 0.5, label = 'AIC')
plt.plot(comps, spt.standardize(out['bic']),'o-', alpha = 0.5, label = 'BIC')
plt.xticks(comps)
plt.yticks(np.arange(0,1.1,0.1))
plt.xlabel('Number of components')
plt.grid(True)
plt.legend()

In [None]:
#%% Fit GMM model with specified number of components, predict labels for data, 
## and plot in low dimensions with PCA using labels as colors
n_comps = 10
subSample = 4
alpha = 0.5

X = arr_feat_supra[::subSample,:]

%time gmm = mlearn.GMM(n_components= n_comps, covariance_type='full', random_state=143,\
                       n_init=3, verbose = 0).fit(X_pca)
%time labels = gmm.predict(X_pca)
# orderedLabels = gmm.sorted_labels() # This will be used to orer labels in subsequent uses
# labels_sorted = gmm.relabel_by_norm(labels)

pca = PCA(n_components = 3, random_state = 143)
%time x_pca = pca.fit_transform(X_pca)

fh,ax = plt.subplots(2,2,figsize = (15,10))
ax = ax.flatten()

# clrs = [plt.cm.tab10(_) for _ in labels_sorted]
# clrs = plt.cm.tab20(labels_sorted)
inds = np.linspace(0,1,n_comps)
clrs = plt.cm.nipy_spectral(inds)[labels_sorted]

ax[0].scatter(x_pca[:,0], x_pca[:,1], s = 10, c = clrs, alpha = alpha)
ax[0].set_xlabel('pca 1')
ax[0].set_ylabel('pca 2')

ax[1].scatter(x_pca[:,0], x_pca[:,2], s = 10, c = clrs, alpha = alpha)
ax[1].set_xlabel('pca 1')
ax[1].set_ylabel('pca 3')

ax[2].scatter(x_pca[:,1], x_pca[:,2], s = 10, c = clrs, alpha = alpha)
ax[2].set_xlabel('pca 2')
ax[2].set_ylabel('pca 3')
fh.tight_layout()

clrs = plt.cm.nipy_spectral(inds)
x = np.arange(len(orderedLabels))
y = np.ones_like(orderedLabels)
plt.figure(figsize = (20,5))
plt.scatter(x,y, c= clrs,s =2000, marker = 's')
plt.yticks([])
plt.xticks(x, fontsize = 20);
plt.title('Norm-ordered colors', fontsize = 20)
None


In [None]:
#%% Save fitted gmm model
# The file name carries a timestamp, so every run writes a new file
file_gmm = f'group_fitted_gmm_{util.timestamp()}.pkl'
joblib.dump(gmm, os.path.join(dir_group, file_gmm));

In [None]:
#%% Fit GMM on tail angles from multiple fish and save the fitter
n_gmm = 15

%time fitter = my_gmm.train_on_tailAngles(ta_clean, n_gmm=n_gmm)
n_pca = fitter['pca'].n_components_
n_gmm = fitter['gmm'].n_components

joblib.dump(fitter, os.path.join(dir_group, f'gmm_fitter_object_pca-{n_pca}_gmm-{n_gmm}_{util.timestamp()}.pkl'));

In [None]:
trlLen = 4000
iTrl = 132

%matplotlib qt
inds = np.arange(trlLen*(iTrl-1), trlLen*iTrl)
ta_sub = ta_clean[:,inds]
labels, arr_feat = my_gmm.predict_on_tailAngles(ta_sub, fitter)

y = ta_sub[-1]
x = np.arange(len(y))
clrs = labels/fitter['gmm'].n_components
plt.figure(figsize = (20,10))
plt.plot(x,y, lw = 0.5, c= 'k')
plt.scatter(x,y,c = plt.cm.tab20(clrs), s= 10)
# plt.xlim(0, 1000)
plt.ylim(-200,200)

In [None]:
# Reload the object that was dumped to 'svd_object_tailAngles.pkl' in the group directory
file_svd = 'svd_object_tailAngles.pkl'
path_svd = os.path.join(dir_group, file_svd)
foo = joblib.load(filename=path_svd)

In [None]:
#%% Try computing GMM metrics for dimensionality reduced feature array

comps = np.arange(1,31)

%time out = mlearn.gmm_information_vs_nComponents(X_svd, comps = comps)

%matplotlib inline
plt.figure(figsize = (10,5))
plt.plot(comps, spt.standardize(out['aic']),'o-', alpha = 0.5, label = 'AIC')
plt.plot(comps, spt.standardize(out['bic']),'o-', alpha = 0.5, label = 'BIC')
plt.xticks(comps)
plt.yticks(np.arange(0,1.1,0.1))
plt.xlabel('Number of components')
plt.grid(True)
plt.legend()

# SVD based approach

In [None]:
#%% How many components to use
n_svd = 3
comps = np.arange(5, 26)
%matplotlib inline
%time X_svd, svd = my_gmm.tailAngles_to_svd_featureArray(ta_clean,n_svd= n_svd, use_envelopes=True)

%time out = mlearn.gmm_information_vs_nComponents(X_svd,comps = comps, warm_start = True)

plt.figure(figsize = (10,10))
plt.plot(comps, spt.standardize(out['aic']),'o-', alpha = 0.5, label = 'AIC')
plt.plot(comps, spt.standardize(out['bic']),'o-', alpha = 0.5, label = 'BIC')
plt.xticks(comps)
plt.yticks(np.arange(0,1.1,0.1))
plt.xlabel('Number of components')
plt.grid(True)
plt.legend()

In [None]:
#%% SVD based fitter
fName = f'gmm_fitter_svd-{fitter["svd"].n_components}_gmm-{fitter["gmm"].n_components}.pkl'

%time fitter = my_gmm.train_on_tailAngles_svd(ta_clean, n_gmm = 22, n_svd =3, use_envelopes=True)

### Save the fitter
%time joblib.dump(fitter, os.path.join(dir_group, fName))


In [None]:
#%% How many components to use
n_svd = 3
comps = np.arange(20, 31)
%matplotlib inline
%time X_svd, svd = my_gmm.tailAngles_to_svd_featureArray(ta_clean,n_svd= n_svd, use_envelopes=True)

%time out = mlearn.gmm_information_vs_nComponents(X_svd,comps = comps, warm_start = True)

plt.figure(figsize = (10,10))
plt.plot(comps, spt.standardize(out['aic']),'o-', alpha = 0.5, label = 'AIC')
plt.plot(comps, spt.standardize(out['bic']),'o-', alpha = 0.5, label = 'BIC')
plt.xticks(comps)
plt.yticks(np.arange(0,1.1,0.1))
plt.xlabel('Number of components')
plt.grid(True)
plt.legend()

## Try the above, but use PCA to reduce dimensionality beforehand

In [None]:
#%% Demo that PCA does not influence GMM components
# x, y = make_blobs(n_samples=1000, centers=[(4,3,0,0), (10,-1, 2,0),(11, -17, 1,0), (-13, 27, 0,0)])
# x += np.random.randn(*x.shape)*0.1
# x_pca = PCA(n_components=0.99).fit_transform(x)
# print(x.shape, x_pca.shape)
# print(np.unique(y))

# comps = np.arange(1,11)
# %time foo = mlearn.gmm_information_vs_nComponents(x, comps= comps)
# %time foo_pca = mlearn.gmm_information_vs_nComponents(x_pca, comps= comps)

# plt.figure(figsize = (10,5))
# plt.subplot(121)
# plt.plot(comps,foo['aic'],'.-', label = 'aic')
# plt.plot(comps, foo['bic'],'.-', label = 'bic')
# plt.legend()
# plt.grid()

# plt.subplot(122)
# plt.plot(comps,foo_pca['aic'],'.-', label = 'aic')
# plt.plot(comps, foo_pca['bic'],'.-', label = 'bic')
# plt.legend()
# plt.grid()

In [None]:
#%% How many components to use
n_svd = 3
comps = np.arange(20, 31)
%matplotlib inline
%time X_svd, svd = my_gmm.tailAngles_to_svd_featureArray(ta_clean,n_svd= n_svd, use_envelopes=True)

%time out = mlearn.gmm_information_vs_nComponents(X_svd,comps = comps, warm_start = True)

plt.figure(figsize = (10,10))
plt.plot(comps, spt.standardize(out['aic']),'o-', alpha = 0.5, label = 'AIC')
plt.plot(comps, spt.standardize(out['bic']),'o-', alpha = 0.5, label = 'BIC')
plt.xticks(comps)
plt.yticks(np.arange(0,1.1,0.1))
plt.xlabel('Number of components')
plt.grid(True)
plt.legend()

## New and improved SVD based GMM model with PCA for dimensionality reduction

In [None]:
n_svd = 3
n_gmm = 20
use_envelopes = True
pca_percVar = 0.98
verbose = 1

gmm_env = my_gmm.SvdGmm(n_svd = n_svd,use_envelopes=use_envelopes,pca_percVar=pca_percVar,n_gmm=n_gmm,
                        verbose = verbose)
%time gmm_env = gmm_env.fit(ta_clean)
%time labels, features = gmm_env.predict(ta_clean)

#-- Save model
file_model = f'gmm_svd-{n_svd}_env_pca-{gmm_env.pca.n_components_}_gmm-{n_gmm}_{util.timestamp()}.pkl'
joblib.dump(gmm_env, os.path.join(dir_group, file_model))

In [None]:
#-- Save model
# NOTE(review): the preceding cell already dumps gmm_env; running this as well writes a
# second copy under a new timestamp and repoints file_model to it.
n_pca_fit = gmm_env.pca.n_components_
file_model = f'gmm_svd-{n_svd}_env_pca-{n_pca_fit}_gmm-{n_gmm}_{util.timestamp()}.pkl'
path_model = os.path.join(dir_group, file_model)
joblib.dump(gmm_env, path_model)

In [None]:
# Reload the most recently saved model file (file_model) from the group directory
path_model = os.path.join(dir_group, file_model)
model = joblib.load(path_model)

In [None]:
# IPython introspection: shows the docstring of model.predict (interactive aid only;
# this line is not valid plain Python outside IPython/Jupyter)
model.predict?

In [None]:
#%% How many components to use
n_svd = 3
comps = np.arange(17, 31)
pca_percVar = 0.99

%matplotlib inline
%time X_svd, svd = my_gmm.tailAngles_to_svd_featureArray(ta_clean,n_svd= n_svd, use_envelopes=True)

%time X_svd_pca = PCA(n_components=pca_percVar).fit_transform(X_svd)
print(f'Reduced to {X_svd_pca.shape[1]} pca components')

%time out = mlearn.gmm_information_vs_nComponents(X_svd_pca,comps = comps, warm_start = False)

plt.figure(figsize = (10,10))
plt.plot(comps, spt.standardize(out['aic']),'o-', alpha = 0.5, label = 'AIC')
plt.plot(comps, spt.standardize(out['bic']),'o-', alpha = 0.5, label = 'BIC')
plt.xticks(comps)
plt.yticks(np.arange(0,1.1,0.1))
plt.xlabel('Number of components')
plt.grid(True)
plt.legend()

In [None]:
#%% SVD based fitter
%time fitter_svd_pca = my_gmm.train_on_tailAngles_svd(ta_clean, n_gmm = 22, n_svd =3, use_envelopes=True)

In [None]:
# Mathtext symbol names that can be wrapped in "$...$" and used as matplotlib markers.
# Raw strings keep every backslash literal: a plain '\ast' would start with the BEL
# escape '\a', and most of the other names would be invalid escape sequences.
# Every entry is comma-separated, so '\oslash' and '\ast' stay two distinct markers.
markers = [r'\alpha', r'\beta', r'\gamma', r'\sigma', r'\infty',
           r'\spadesuit', r'\heartsuit', r'\diamondsuit', r'\clubsuit',
           r'\bigodot', r'\bigotimes', r'\bigoplus', r'\imath', r'\bowtie',
           r'\bigtriangleup', r'\bigtriangledown', r'\oslash',
           r'\ast', r'\times', r'\circ', r'\bullet', r'\star', '+',
           r'\Theta', r'\Xi', r'\Phi',
           r'\$', r'\#', r'\%', r'\S']
def getMarker(i):
    """Return the i-th marker as a mathtext string, e.g. '$\\alpha$'.

    Parameters
    ----------
    i : int
        Marker index; wrapped with modulus so any integer maps onto the list.

    Returns
    -------
    str
        The symbol name from `markers` enclosed in dollar signs.
    """
    # Use modulus in order not to have the index exceeding the length of the list (markers)
    return "$"+markers[i % len(markers)]+"$"

In [None]:
trlLen = 4000
iTrl = 99
cmap = plt.cm.nipy_spectral_r
plt.style.use(('seaborn-white'))

%matplotlib auto
inds = np.arange(trlLen*(iTrl-1), trlLen*iTrl)
ta_sub = ta_clean[:,inds]

labels, X = my_gmm.predict_on_tailAngles_svd(ta_sub, fitter)

y = ta_sub[-1]
x = np.arange(len(y))
clrs = labels/fitter['gmm'].n_components
plt.figure(figsize = (20,10))
plt.plot(x,y, lw = 0.5, c= 'k')
lbls_unique = np.unique(labels)
for lbl in lbls_unique:
    inds = np.where(labels==lbl)[0]
#     plt.scatter(x[inds],y[inds],c = 'k', s = 200,marker = getMarker(lbl), alpha = 0.5)
    plt.scatter(x[inds],y[inds],c = 'k', s = 200,marker = r"${}$".format(str(lbl)), alpha = 0.5)
# plt.scatter(x,y, s= 20, c = clrs)
# plt.xlim(0, 1000)
plt.ylim(-200,200)

In [None]:
# Predict labels (and the matching feature array) for the full cleaned tail-angle data
prediction = my_gmm.predict_on_tailAngles_svd(ta_clean, fitter)
labels, X = prediction

In [None]:
# Fit a PCA retaining 98% of the variance of the feature array X
pca = PCA(n_components=0.98)
pca.fit(X)


In [None]:
#%% State matrix
# Rows are states (labels 0..max), columns are time points.
n = len(labels)
m = labels.max()+1
t = np.arange(n)

# T: one-hot indicator of the active state at each time point
T = np.zeros((m,n))
T[labels,t] = 1

# F: same layout, but the active entry holds the label value itself
# (so label 0 is indistinguishable from the zero background)
F = np.zeros((m,n))
F[labels,t] = labels



In [None]:
# Show the state matrix F as an image (states x time), stretched to fill the axes
plt.imshow(F, cmap = 'viridis', aspect = 'auto')

In [None]:
# Project the rows of F onto 2 principal components.
# Note: this reuses (and overwrites) the name X_pca from the earlier feature-array cells.
pca_states = PCA(n_components=2)
X_pca = pca_states.fit_transform(F)

In [None]:
# First vs second principal component of the state matrix, points joined in row order
pc1, pc2 = X_pca[:,0], X_pca[:,1]
plt.plot(pc1, pc2, '.-')

In [None]:
# Plot the first column of foo (the object loaded from 'svd_object_tailAngles.pkl').
# NOTE(review): the indexing assumes foo is a 2-D array-like -- confirm what was pickled.
first_col = foo[:,0]
plt.plot(first_col)

In [None]:
# Scratch check: pseudo-inverse of the 3x3 identity matrix
np.linalg.pinv(np.identity(3))