In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from mpl_toolkits.axes_grid1 import make_axes_locatable

from scipy.spatial import distance
from scipy.stats import ttest_ind, chisquare
from scipy.stats import chi2_contingency

import scipy.cluster as cluster

from statsmodels.stats.multitest import multipletests
from statsmodels.formula.api import ols

from sklearn import metrics

In [None]:
# Load the phenotype table, index it by participant ID (EID) and cast every
# remaining column to float64, all in one chained expression.
pheno_et = (
    pd.read_csv('pheno_et.csv')
    .set_index(['EID'])
    .astype(np.float64)
)

In [None]:
# Load the gaze traces. Later cells index them as x[subject] and average over
# axis 0, so both arrays are presumably 2-D (subject, time point) -- TODO confirm.
x = np.load('et_time_series_x.npy')
y = np.load('et_time_series_y.npy')

In [None]:
# Clamp coordinates to [0, 800] x [0, 600]; presumably the screen size in
# pixels (a later cell treats (400, 300) as the center of the screen).
x = np.clip(x, 0, 800)
y = np.clip(y, 0, 600)

In [None]:
# Pair the coordinates on a new trailing axis: et[..., 0] is x, et[..., 1] is y.
et = np.stack((x, y), axis=2)

# Euclidean distance

In [None]:
# Pairwise dissimilarity between subjects: the point-by-point Euclidean gap
# between two gaze traces, averaged over all time points.
n_sub = et.shape[0]
dis = np.zeros((n_sub, n_sub))

min_e = np.inf  # smallest off-diagonal distance encountered
# np.triu_indices walks the strict upper triangle row by row, i.e. the same
# (i, j) order as a nested "j > i" double loop.
for i, j in zip(*np.triu_indices(n_sub, 1)):
    gap = np.sqrt((et[i,:,0] - et[j,:,0])**2 + (et[i,:,1] - et[j,:,1])**2)
    e = np.mean(gap)
    min_e = min(min_e, e)
    dis[i,j] = dis[j,i] = e

In [None]:
# Consensus clustering by subsampling: repeatedly draw 90% of the subjects
# without replacement, cluster the subsample for K = 2..20, and count how often
# each pair of subjects ends up in the same cluster.
niter = 1000
n_sub = et.shape[0]
chunk = int(n_sub * 0.9)

# Fixed seed so that the consensus matrices -- and everything derived from
# them (chosen K, final labels, figures) -- are identical on every
# Restart & Run All.
rng = np.random.RandomState(0)

# ind_mat[a, b] = number of iterations in which subjects a and b were both drawn
ind_mat = np.zeros((n_sub, n_sub))

# one co-clustering count matrix per K; consensus_mats[K - 2] belongs to K
consensus_mats = [np.zeros((n_sub, n_sub)) for _ in range(19)]

for i in range(niter):
    idx = rng.choice(n_sub, chunk, replace=False) # 90% resampling
    idx.sort()

    # update indicator matrix (outer product of the 0/1 selection vector)
    idx_a = np.zeros((n_sub, 1))
    idx_a[idx, 0] = 1
    ac = np.dot(idx_a, idx_a.T)
    ind_mat += ac

    # restrict the distance matrix to the drawn subjects; linkage() wants the
    # condensed (1-D) form of a dense distance matrix
    dis_boot = dis[idx][:,idx]
    cd = distance.squareform(dis_boot)
    # NOTE(review): SciPy's linkage docs describe Ward as correctly defined
    # only for Euclidean distances -- confirm this metric is acceptable.
    Z = cluster.hierarchy.linkage(cd, method='ward')
    for c in range(2, 21):
        cm = consensus_mats[c - 2]
        # 'maxclust' cuts the dendrogram into at most c flat clusters
        clusters = cluster.hierarchy.fcluster(Z, c, criterion='maxclust')

        # scatter labels back to full length; subjects that were not drawn
        # keep label 0 (real labels start at 1)
        ct = np.zeros((n_sub, 1))
        ct[idx, 0] = clusters

        # A pair is co-clustered iff both were drawn (label > 0) and their
        # labels match. Comparing labels directly avoids dividing by the zero
        # placeholders and does not depend on (1 / k) * k == 1 holding exactly
        # in floating point.
        cm += ((ct == ct.T) & (ct > 0)).astype(np.float64)

# Turn counts into consensus fractions. A pair that was never drawn together
# would give 0 / 0 = NaN here (very unlikely with 1000 draws of 90%).
for mat in consensus_mats:
    mat /= ind_mat

In [None]:
def CDF(X, resolution=100):
    """
    Empirical CDF of the pairwise values stored in a consensus matrix.

    Only the entries strictly above the diagonal of the square matrix ``X``
    are used (one value per unordered pair). The CDF is evaluated on
    ``resolution`` evenly spaced thresholds covering [0, 1].

    Returns a 1-D array of length ``resolution`` whose k-th element is the
    fraction of pairs with a value <= the k-th threshold. ``X`` is not modified.
    """
    n_items = X.shape[0]
    n_pairs = n_items * (n_items - 1) / 2

    # fancy indexing yields a fresh array holding the upper-triangle values
    rows, cols = np.triu_indices(n_items, 1)
    pair_vals = X[rows, cols]

    thresholds = np.linspace(0, 1, resolution)
    return np.array([np.sum(pair_vals <= t) / n_pairs for t in thresholds])

In [None]:
# Empirical CDF of every consensus matrix, kept in the same order (K = 2..20).
cdfs = [CDF(mat) for mat in consensus_mats]

In [None]:
# Overlay the empirical CDFs of the consensus values, one curve per K.
# Each curve is drawn against the consensus thresholds themselves (the same
# evenly spaced [0, 1] grid that CDF() evaluates on), so the tick labels line
# up exactly with the data instead of drifting by one sample at the right edge.
plt.figure(figsize=(10,10))
for cdf in cdfs:
    plt.plot(np.linspace(0, 1, len(cdf)), cdf)
plt.xticks(ticks=np.linspace(0,1,6), labels=[0,0.2,0.4,0.6,0.8,1])
plt.title('Empirical CDFs for K = 2:20', fontsize=18)
plt.xlabel('Consensus index value', fontsize=18)
plt.ylabel('CDF', fontsize=18)
plt.savefig('figs_clust/eucl_cdf.png', bbox_inches='tight', dpi=300)

In [None]:
# Area under each empirical CDF, via the trapezoid rule on a 100-point grid
# over [0, 1] (the default resolution of CDF()).
dx = np.mean(np.diff(np.linspace(0,1,100)))  # grid spacing
auc = []
for cdf in cdfs:
    auc.append(np.trapz(cdf, dx=dx))
# Duplicate the first AUC so that the diff taken in the next cell has one
# entry per K and its first entry (K = 2) is exactly 0.
auc = [auc[0]] + auc

In [None]:
# del_auc[k] = AUC(K = k + 2) - AUC(K = k + 1); del_auc[0] is 0 by construction.
del_auc = np.diff(np.array(auc))

In [None]:
# Change in CDF area between consecutive K; x position k corresponds to K = k + 2.
plt.figure(figsize=(10,10))
plt.plot(del_auc, 'bo-')
plt.title('Change in AUC in CDFs', fontsize=18)
plt.xlabel('Number of clusters', fontsize=18)
plt.ylabel('Change in AUC', fontsize=18)
_ = plt.xticks(np.arange(19), np.arange(2,21))
plt.savefig('figs_clust/eucl_del_cdf.png', bbox_inches='tight', dpi=300)

In [None]:
# Position of the largest AUC increase; the corresponding K is best + 2.
# NOTE(review): del_auc[0] is forced to 0 upstream, so K = 2 is only selected
# when no larger K increases the AUC -- confirm this is the intended rule.
best = np.argmax(del_auc)

In [None]:
# Final partition: convert the chosen consensus matrix into a dissimilarity
# (1 - consensus) and cut a Ward tree built on it into at most best + 2 clusters.
cm = np.copy(consensus_mats[best])
cm = 1 - cm
cd = distance.squareform(cm)  # dense square matrix -> condensed 1-D vector
Z = cluster.hierarchy.linkage(cd, method='ward')
clusters = cluster.hierarchy.fcluster(Z, best + 2, criterion='maxclust')

In [None]:
# Cluster labels and their sizes (shown as the cell output).
np.unique(clusters, return_counts=True)

In [None]:
# peuc = pheno_et.copy()
# peuc['clusters'] = clusters
# peuc.to_csv('et_subs_euc.csv')

In [None]:
# Dendrogram of the Ward tree built on (1 - consensus).
plt.figure(figsize=(10,10))
# NOTE(review): color_threshold=5.2 is hard-coded; presumably picked by eye for
# one particular run so that the colors match the K clusters -- re-check it
# whenever the consensus matrices change.
dn = cluster.hierarchy.dendrogram(Z, color_threshold=5.2, no_labels=True)
plt.title('Dendrogram of optimal consensus matrix (K=%d)' % (best + 2), fontsize=18)
plt.savefig('figs_clust/eucl_dendrogram.png', bbox_inches='tight', dpi=300)

In [None]:
# Heatmap of the chosen consensus matrix with rows and columns reordered so
# that subjects sharing a final cluster label sit next to each other.
order = np.argsort(clusters)
cm = consensus_mats[best][np.ix_(order, order)]

plt.figure(figsize=(10,10))
im = plt.imshow(cm, cmap='Reds')
plt.colorbar(im)
plt.title('Consensus matrix (K=%d)' % (best + 2), fontsize=18)
plt.savefig('figs_clust/eucl_consensus.png', bbox_inches='tight', dpi=300)

In [None]:
# u: sorted distinct cluster labels; c: number of subjects carrying each label.
u, c = np.unique(clusters, return_counts=True)

In [None]:
# Mean gaze trace of every cluster: x coordinate on the top panel, y on the
# bottom one (the legend carries the cluster label and its size).
fig, ax = plt.subplots(2, 1, figsize=(35,20))

for label, size in zip(u, c):
    members = clusters == label
    ax[0].plot(np.mean(x[members], axis=0))
    ax[1].plot(np.mean(y[members], axis=0), label='%d; N=%d' % (label, size))
ax[0].set_title('x-direction mean time series', fontsize=18, fontweight='bold')
ax[1].set_title('y-direction mean time series', fontsize=18, fontweight='bold')
ax[1].legend(loc=0, fontsize=18)
fig.savefig('figs_clust/eucl_mts.png', bbox_inches='tight', dpi=300)

In [None]:
# Attach the final cluster label to a copy of the phenotype table.
# NOTE(review): assumes the rows of pheno_et are in the same subject order as
# the rows of x / y -- confirm against how the files were produced.
peuc = pheno_et.copy()
peuc['clusters'] = clusters

In [None]:
# OLS of Age on cluster membership (treated as categorical): tests whether
# mean age differs between clusters.
results = ols('Age ~ C(clusters)', data=peuc).fit()
results.summary()

In [None]:
# For every cluster label, the fraction of its members whose Sex equals 0.
freq = [
    np.sum(peuc.loc[peuc['clusters'] == label, 'Sex'] == 0) / np.sum(peuc['clusters'] == label)
    for label in u
]

In [None]:
# Test whether the sex distribution differs between clusters, using the
# Sex x cluster table of subject counts. chisquare() expects observed counts,
# so applying it to per-cluster proportions does not yield a valid statistic
# or p-value; a contingency-table test on the raw counts does.
chi2_contingency(pd.crosstab(peuc['Sex'], peuc['clusters']))

In [None]:
# One violin plot per questionnaire score, split by cluster label.
measures = ['ASSQ_Total', 'SCQ_Total', 'SAS_Tot', 'SRS_Total_T']
fig, ax = plt.subplots(4, 1, figsize=(30,30))
for panel, measure in zip(ax, measures):
    sns.violinplot(x='clusters', y=measure, data=peuc, ax=panel)
    panel.set_title(measure, fontsize=18, fontweight='bold')
fig.savefig('figs_clust/eucl_violin.png', bbox_inches='tight', dpi=300)

In [None]:
# Phenotype rows of each cluster (first four columns only), one frame per
# label in ascending label order.
pheno_c = [pheno_et.iloc[clusters == i,:4] for i in np.unique(clusters)]

In [None]:
# Welch t-test (unequal variances) between every pair of clusters for each of
# the first four phenotype columns; pvals[a, b, k] is symmetric in (a, b) and
# its diagonal stays 0 because a cluster is never tested against itself.
n_clust = np.unique(clusters).size
pvals = np.zeros((n_clust, n_clust, 4))
for i, j in zip(*np.triu_indices(n_clust, 1)):
    for k in range(4):
        p = ttest_ind(pheno_c[i].iloc[:,k], pheno_c[j].iloc[:,k], equal_var=False)[1]
        pvals[i,j,k] = pvals[j,i,k] = p

In [None]:
# Benjamini-Hochberg FDR correction, applied separately to each phenotype
# column over its set of pairwise cluster comparisons.
p_corr = np.zeros(pvals.shape)
# pvals was allocated with one row/column per cluster, so a single set of
# upper-triangle indices serves both for reading and for writing
nmu = np.triu_indices(pvals.shape[0], 1)
for i in range(pvals.shape[2]):
    p_unc = pvals[nmu[0], nmu[1], i]
    res = multipletests(p_unc, method='fdr_bh')
    # res[1] holds the corrected p-values; mirror them into a symmetric layer
    layer = np.zeros(pvals.shape[:2])
    layer[nmu] = res[1]
    p_corr[...,i] = layer + layer.T

In [None]:
# Heatmaps of the FDR-corrected pairwise p-values, one panel per score, with
# the color scale capped at 0.05. The diagonal of p_corr is 0 (never tested),
# so it is drawn in the lowest color.
fig, ax = plt.subplots(2, 2, figsize=(11,11))
ax_flat = ax.flat
scores = pheno_et.columns.tolist()[:-1]
tick_pos = np.arange(p_corr.shape[0])
for i in range(p_corr.shape[2]):
    panel = ax_flat[i]
    im = panel.imshow(p_corr[...,i], cmap='jet', vmax=0.05)
    panel.set_title(scores[i], fontsize=18)
    panel.set_xticks(tick_pos)
    panel.set_yticks(tick_pos)
    # cluster labels start at 1
    panel.set_xticklabels(tick_pos + 1)
    panel.set_yticklabels(tick_pos + 1)
cbar = fig.colorbar(im, ax=ax.ravel().tolist(), shrink=0.95)
fig.savefig('figs_clust/eucl_pheno.png', bbox_inches='tight', dpi=300)

# Cosine distance

In [None]:
# Pairwise cosine distance between subjects, each subject being represented by
# one long vector: all x samples followed by all y samples.

# The raw gaze data has its origin at the top-left corner, which does not suit
# cosine distance, so coordinates are re-expressed relative to the center of
# the screen. The shift is applied to a new array rather than to `et` itself:
# re-running this cell must not shift the data a second time.
et_centered = et - np.array([400, 300])

# build each subject's flattened vector once instead of inside the pair loop
flat = np.concatenate((et_centered[...,0], et_centered[...,1]), axis=1)

n_sub = flat.shape[0]
dis_cos = np.zeros((n_sub, n_sub))

min_cos = np.inf  # smallest off-diagonal distance encountered
for i in range(n_sub - 1):
    for j in range(i + 1, n_sub):
        cos = distance.cosine(flat[i], flat[j])
        if cos < min_cos:
            min_cos = cos
        dis_cos[i,j] = cos
        dis_cos[j,i] = cos

In [None]:
# Consensus clustering by subsampling on the cosine distance matrix:
# repeatedly draw 90% of the subjects without replacement, cluster the
# subsample for K = 2..20, and count how often each pair of subjects ends up
# in the same cluster.
niter = 1000
n_sub = et.shape[0]
chunk = int(n_sub * 0.9)

# Fixed seed so that the consensus matrices -- and everything derived from
# them (chosen K, final labels, figures) -- are identical on every
# Restart & Run All.
rng = np.random.RandomState(0)

# ind_mat[a, b] = number of iterations in which subjects a and b were both drawn
ind_mat = np.zeros((n_sub, n_sub))

# one co-clustering count matrix per K; consensus_mats_cos[K - 2] belongs to K
consensus_mats_cos = [np.zeros((n_sub, n_sub)) for _ in range(19)]

for i in range(niter):
    idx = rng.choice(n_sub, chunk, replace=False) # 90% resampling
    idx.sort()

    # update indicator matrix (outer product of the 0/1 selection vector)
    idx_a = np.zeros((n_sub, 1))
    idx_a[idx, 0] = 1
    ac = np.dot(idx_a, idx_a.T)
    ind_mat += ac

    # restrict the distance matrix to the drawn subjects; linkage() wants the
    # condensed (1-D) form of a dense distance matrix
    dis_boot = dis_cos[idx][:,idx]
    cd = distance.squareform(dis_boot)
    # NOTE(review): SciPy's linkage docs describe Ward as correctly defined
    # only for Euclidean distances; cosine distance is not one -- confirm.
    Z = cluster.hierarchy.linkage(cd, method='ward')
    for c in range(2, 21):
        cm = consensus_mats_cos[c - 2]
        # 'maxclust' cuts the dendrogram into at most c flat clusters
        clusters = cluster.hierarchy.fcluster(Z, c, criterion='maxclust')

        # scatter labels back to full length; subjects that were not drawn
        # keep label 0 (real labels start at 1)
        ct = np.zeros((n_sub, 1))
        ct[idx, 0] = clusters

        # A pair is co-clustered iff both were drawn (label > 0) and their
        # labels match. Comparing labels directly avoids dividing by the zero
        # placeholders and does not depend on (1 / k) * k == 1 holding exactly
        # in floating point.
        cm += ((ct == ct.T) & (ct > 0)).astype(np.float64)

# Turn counts into consensus fractions. A pair that was never drawn together
# would give 0 / 0 = NaN here (very unlikely with 1000 draws of 90%).
for mat in consensus_mats_cos:
    mat /= ind_mat

In [None]:
# Empirical CDF of every cosine consensus matrix, in the same order (K = 2..20).
cdfs = [CDF(mat) for mat in consensus_mats_cos]

In [None]:
# Overlay the empirical CDFs of the consensus values, one curve per K.
# Each curve is drawn against the consensus thresholds themselves (the same
# evenly spaced [0, 1] grid that CDF() evaluates on), so the tick labels line
# up exactly with the data instead of drifting by one sample at the right edge.
plt.figure(figsize=(10,10))
for cdf in cdfs:
    plt.plot(np.linspace(0, 1, len(cdf)), cdf)
plt.xticks(ticks=np.linspace(0,1,6), labels=[0,0.2,0.4,0.6,0.8,1])
plt.title('Empirical CDFs for K = 2:20', fontsize=18)
plt.xlabel('Consensus index value', fontsize=18)
plt.ylabel('CDF', fontsize=18)
plt.savefig('figs_clust/cos_cdf.png', bbox_inches='tight', dpi=300)

In [None]:
# Area under each empirical CDF, via the trapezoid rule on a 100-point grid
# over [0, 1] (the default resolution of CDF()).
dx = np.mean(np.diff(np.linspace(0,1,100)))  # grid spacing
auc = []
for cdf in cdfs:
    auc.append(np.trapz(cdf, dx=dx))
# Duplicate the first AUC so that the diff taken in the next cell has one
# entry per K and its first entry (K = 2) is exactly 0.
auc = [auc[0]] + auc

In [None]:
# del_auc[k] = AUC(K = k + 2) - AUC(K = k + 1); del_auc[0] is 0 by construction.
del_auc = np.diff(np.array(auc))

In [None]:
# Change in CDF area between consecutive K; x position k corresponds to K = k + 2.
plt.figure(figsize=(10,10))
plt.plot(del_auc, 'bo-')
plt.title('Change in AUC in CDFs', fontsize=18)
plt.xlabel('Number of clusters', fontsize=18)
plt.ylabel('Change in AUC', fontsize=18)
_ = plt.xticks(np.arange(19), np.arange(2,21))
plt.savefig('figs_clust/cos_del_cdf.png', bbox_inches='tight', dpi=300)

In [None]:
# Position of the largest AUC increase; the corresponding K is best + 2.
# NOTE(review): del_auc[0] is forced to 0 upstream, so K = 2 is only selected
# when no larger K increases the AUC -- confirm this is the intended rule.
best = np.argmax(del_auc)

In [None]:
# Final partition: convert the chosen consensus matrix into a dissimilarity
# (1 - consensus) and cut a Ward tree built on it into at most best + 2 clusters.
cm = np.copy(consensus_mats_cos[best])
cm = 1 - cm
cd = distance.squareform(cm)  # dense square matrix -> condensed 1-D vector
Z = cluster.hierarchy.linkage(cd, method='ward')
clusters = cluster.hierarchy.fcluster(Z, best + 2, criterion='maxclust')

In [None]:
# Cluster labels and their sizes (shown as the cell output).
np.unique(clusters, return_counts=True)

In [None]:
# pcos = pheno_et.copy()
# pcos['clusters'] = clusters
# pcos.to_csv('et_subs_cos.csv')

In [None]:
# Dendrogram of the Ward tree built on (1 - consensus); no color_threshold is
# passed here, so the library default coloring is used.
plt.figure(figsize=(10,10))
dn = cluster.hierarchy.dendrogram(Z, no_labels=True)
plt.title('Dendrogram of optimal consensus matrix (K=%d)' % (best + 2), fontsize=18)
plt.savefig('figs_clust/cos_dendrogram.png', bbox_inches='tight', dpi=300)

In [None]:
# Heatmap of the chosen consensus matrix with rows and columns reordered so
# that subjects sharing a final cluster label sit next to each other.
order = np.argsort(clusters)
cm = consensus_mats_cos[best][np.ix_(order, order)]

plt.figure(figsize=(10,10))
im = plt.imshow(cm, cmap='Reds')
plt.colorbar(im)
plt.title('Consensus matrix (K=%d)' % (best + 2), fontsize=18)
plt.savefig('figs_clust/cos_consensus.png', bbox_inches='tight', dpi=300)

In [None]:
# u: sorted distinct cluster labels; c: number of subjects carrying each label.
u, c = np.unique(clusters, return_counts=True)

In [None]:
# Mean gaze trace of every cluster: x coordinate on the top panel, y on the
# bottom one (the legend carries the cluster label and its size).
fig, ax = plt.subplots(2, 1, figsize=(35,20))

for label, size in zip(u, c):
    members = clusters == label
    ax[0].plot(np.mean(x[members], axis=0))
    ax[1].plot(np.mean(y[members], axis=0), label='%d; N=%d' % (label, size))
ax[0].set_title('x-direction mean time series', fontsize=18, fontweight='bold')
ax[1].set_title('y-direction mean time series', fontsize=18, fontweight='bold')
ax[1].legend(loc=0, fontsize=18)
fig.savefig('figs_clust/cos_mts.png', bbox_inches='tight', dpi=300)

In [None]:
# Attach the final cluster label to a copy of the phenotype table.
# NOTE(review): assumes the rows of pheno_et are in the same subject order as
# the rows of x / y -- confirm against how the files were produced.
pcos = pheno_et.copy()
pcos['clusters'] = clusters

In [None]:
# OLS of Age on cluster membership (treated as categorical): tests whether
# mean age differs between clusters.
results = ols('Age ~ C(clusters)', data=pcos).fit()
results.summary()

In [None]:
# For every cluster label, the fraction of its members whose Sex equals 0.
freq = [
    np.sum(pcos.loc[pcos['clusters'] == label, 'Sex'] == 0) / np.sum(pcos['clusters'] == label)
    for label in u
]

In [None]:
# Test whether the sex distribution differs between clusters, using the
# Sex x cluster table of subject counts. chisquare() expects observed counts,
# so applying it to per-cluster proportions does not yield a valid statistic
# or p-value; a contingency-table test on the raw counts does.
chi2_contingency(pd.crosstab(pcos['Sex'], pcos['clusters']))

In [None]:
# One violin plot per questionnaire score, split by cluster label.
measures = ['ASSQ_Total', 'SCQ_Total', 'SAS_Tot', 'SRS_Total_T']
fig, ax = plt.subplots(4, 1, figsize=(30,30))
for panel, measure in zip(ax, measures):
    sns.violinplot(x='clusters', y=measure, data=pcos, ax=panel)
    panel.set_title(measure, fontsize=18, fontweight='bold')
fig.savefig('figs_clust/cos_violin.png', bbox_inches='tight', dpi=300)

In [None]:
# Phenotype rows of each cluster (every column except the last), one frame
# per label in ascending label order.
pheno_c = [pheno_et.iloc[clusters == i,:-1] for i in np.unique(clusters)]

In [None]:
# Welch t-test (unequal variances) between every pair of clusters for each of
# the first four phenotype columns; pvals[a, b, k] is symmetric in (a, b) and
# its diagonal stays 0 because a cluster is never tested against itself.
n_clust = np.unique(clusters).size
pvals = np.zeros((n_clust, n_clust, 4))
for i, j in zip(*np.triu_indices(n_clust, 1)):
    for k in range(4):
        p = ttest_ind(pheno_c[i].iloc[:,k], pheno_c[j].iloc[:,k], equal_var=False)[1]
        pvals[i,j,k] = pvals[j,i,k] = p

In [None]:
# Benjamini-Hochberg FDR correction, applied separately to each phenotype
# column over its set of pairwise cluster comparisons.
p_corr = np.zeros(pvals.shape)
# pvals was allocated with one row/column per cluster, so a single set of
# upper-triangle indices serves both for reading and for writing
nmu = np.triu_indices(pvals.shape[0], 1)
for i in range(pvals.shape[2]):
    p_unc = pvals[nmu[0], nmu[1], i]
    res = multipletests(p_unc, method='fdr_bh')
    # res[1] holds the corrected p-values; mirror them into a symmetric layer
    layer = np.zeros(pvals.shape[:2])
    layer[nmu] = res[1]
    p_corr[...,i] = layer + layer.T

In [None]:
# Heatmaps of the FDR-corrected pairwise p-values, one panel per score, with
# the color scale capped at 0.05. The diagonal of p_corr is 0 (never tested),
# so it is drawn in the lowest color.
fig, ax = plt.subplots(2, 2, figsize=(11,11))
ax_flat = ax.flat
scores = pheno_et.columns.tolist()[:-1]
tick_pos = np.arange(p_corr.shape[0])
for i in range(p_corr.shape[2]):
    panel = ax_flat[i]
    im = panel.imshow(p_corr[...,i], cmap='jet', vmax=0.05)
    panel.set_title(scores[i], fontsize=18)
    panel.set_xticks(tick_pos)
    panel.set_yticks(tick_pos)
    # cluster labels start at 1
    panel.set_xticklabels(tick_pos + 1)
    panel.set_yticklabels(tick_pos + 1)
cbar = fig.colorbar(im, ax=ax.ravel().tolist(), shrink=0.95)
fig.savefig('figs_clust/cos_pheno.png', bbox_inches='tight', dpi=300)

# Combination of Euclidean and cosine distances

In [None]:
# Euclidean distance on z-scored time series
# Each subject's x and y trace is standardized on its own (row-wise mean and
# sample standard deviation, ddof=1) before distances are taken. A constant
# trace has zero standard deviation and would turn into NaN/inf here.
xz = (x - np.mean(x, axis=1, keepdims=True)) / np.std(x, axis=1, dtype=np.float64, ddof=1, keepdims=True)
yz = (y - np.mean(y, axis=1, keepdims=True)) / np.std(y, axis=1, dtype=np.float64, ddof=1, keepdims=True)
et_euc = np.stack((xz, yz), axis=2)

n_euc = et_euc.shape[0]
eucz = np.zeros((n_euc, n_euc))
# strict upper triangle, visited row by row; each value is mirrored
for i, j in zip(*np.triu_indices(n_euc, 1)):
    gap = np.sqrt((et_euc[i,:,0] - et_euc[j,:,0])**2 + (et_euc[i,:,1] - et_euc[j,:,1])**2)
    e = np.mean(gap)
    eucz[i,j] = eucz[j,i] = e

In [None]:
# cosine distance on z-scored time series
# Work on copies so the in-place shift to the screen center leaves x / y alone.
# NOTE(review): the row-wise z-scoring that follows subtracts each row's mean,
# so for signed/float data the constant (400, 300) shift cancels out
# mathematically -- confirm the shift is still wanted here.
xz = np.copy(x)
yz = np.copy(y)
xz -= 400
yz -= 300
xz = (xz - xz.mean(axis=1, keepdims=True)) / xz.std(axis=1, dtype=np.float64, ddof=1, keepdims=True)
yz = (yz - yz.mean(axis=1, keepdims=True)) / yz.std(axis=1, dtype=np.float64, ddof=1, keepdims=True)
et_cos = np.stack((xz, yz), axis=2)

# one vector per subject: all x samples followed by all y samples
flat_z = np.concatenate((et_cos[...,0], et_cos[...,1]), axis=1)

n_cos = et_cos.shape[0]
cosz = np.zeros((n_cos, n_cos))
# strict upper triangle, visited row by row; each value is mirrored
for i, j in zip(*np.triu_indices(n_cos, 1)):
    cos = distance.cosine(flat_z[i], flat_z[j])
    cosz[i,j] = cosz[j,i] = cos

In [None]:
# Combined dissimilarity: unweighted average of the z-scored Euclidean and
# cosine distance matrices.
# NOTE(review): the two terms are on different scales (SciPy's cosine distance
# is bounded by 2, the mean Euclidean gap is not), so this is presumably not an
# equal-influence blend -- confirm this is intended.
dis_both = (eucz + cosz) / 2

In [None]:
# Consensus clustering by subsampling on the combined distance matrix:
# repeatedly draw 90% of the subjects without replacement, cluster the
# subsample for K = 2..20, and count how often each pair of subjects ends up
# in the same cluster.
niter = 1000
n_sub = et.shape[0]
chunk = int(n_sub * 0.9)

# Fixed seed so that the consensus matrices -- and everything derived from
# them (chosen K, final labels, figures) -- are identical on every
# Restart & Run All.
rng = np.random.RandomState(0)

# ind_mat[a, b] = number of iterations in which subjects a and b were both drawn
ind_mat = np.zeros((n_sub, n_sub))

# one co-clustering count matrix per K; consensus_mats_both[K - 2] belongs to K
consensus_mats_both = [np.zeros((n_sub, n_sub)) for _ in range(19)]

for i in range(niter):
    idx = rng.choice(n_sub, chunk, replace=False) # 90% resampling
    idx.sort()

    # update indicator matrix (outer product of the 0/1 selection vector)
    idx_a = np.zeros((n_sub, 1))
    idx_a[idx, 0] = 1
    ac = np.dot(idx_a, idx_a.T)
    ind_mat += ac

    # restrict the distance matrix to the drawn subjects; linkage() wants the
    # condensed (1-D) form of a dense distance matrix
    dis_boot = dis_both[idx][:,idx]
    cd = distance.squareform(dis_boot)
    # NOTE(review): SciPy's linkage docs describe Ward as correctly defined
    # only for Euclidean distances -- confirm this blended metric is acceptable.
    Z = cluster.hierarchy.linkage(cd, method='ward')
    for c in range(2, 21):
        cm = consensus_mats_both[c - 2]
        # 'maxclust' cuts the dendrogram into at most c flat clusters
        clusters = cluster.hierarchy.fcluster(Z, c, criterion='maxclust')

        # scatter labels back to full length; subjects that were not drawn
        # keep label 0 (real labels start at 1)
        ct = np.zeros((n_sub, 1))
        ct[idx, 0] = clusters

        # A pair is co-clustered iff both were drawn (label > 0) and their
        # labels match. Comparing labels directly avoids dividing by the zero
        # placeholders and does not depend on (1 / k) * k == 1 holding exactly
        # in floating point.
        cm += ((ct == ct.T) & (ct > 0)).astype(np.float64)

# Turn counts into consensus fractions. A pair that was never drawn together
# would give 0 / 0 = NaN here (very unlikely with 1000 draws of 90%).
for mat in consensus_mats_both:
    mat /= ind_mat

In [None]:
# Empirical CDF of every combined-distance consensus matrix, in the same
# order (K = 2..20).
cdfs = [CDF(mat) for mat in consensus_mats_both]

In [None]:
# Overlay the empirical CDFs of the consensus values, one curve per K.
# Each curve is drawn against the consensus thresholds themselves (the same
# evenly spaced [0, 1] grid that CDF() evaluates on), so the tick labels line
# up exactly with the data instead of drifting by one sample at the right edge.
plt.figure(figsize=(10,10))
for cdf in cdfs:
    plt.plot(np.linspace(0, 1, len(cdf)), cdf)
plt.xticks(ticks=np.linspace(0,1,6), labels=[0,0.2,0.4,0.6,0.8,1])
plt.title('Empirical CDFs for K = 2:20', fontsize=18)
plt.xlabel('Consensus index value', fontsize=18)
plt.ylabel('CDF', fontsize=18)
plt.savefig('figs_clust/both_cdf.png', bbox_inches='tight', dpi=300)

In [None]:
# Area under each empirical CDF, via the trapezoid rule on a 100-point grid
# over [0, 1] (the default resolution of CDF()).
dx = np.mean(np.diff(np.linspace(0,1,100)))  # grid spacing
auc = []
for cdf in cdfs:
    auc.append(np.trapz(cdf, dx=dx))
# Duplicate the first AUC so that the diff taken in the next cell has one
# entry per K and its first entry (K = 2) is exactly 0.
auc = [auc[0]] + auc

In [None]:
# del_auc[k] = AUC(K = k + 2) - AUC(K = k + 1); del_auc[0] is 0 by construction.
del_auc = np.diff(np.array(auc))

In [None]:
# Change in CDF area between consecutive K; x position k corresponds to K = k + 2.
plt.figure(figsize=(10,10))
plt.plot(del_auc, 'bo-')
plt.title('Change in AUC in CDFs', fontsize=18)
plt.xlabel('Number of clusters', fontsize=18)
plt.ylabel('Change in AUC', fontsize=18)
_ = plt.xticks(np.arange(19), np.arange(2,21))
plt.savefig('figs_clust/both_del_cdf.png', bbox_inches='tight', dpi=300)

In [None]:
# Position of the largest AUC increase; the corresponding K is best + 2.
# NOTE(review): del_auc[0] is forced to 0 upstream, so K = 2 is only selected
# when no larger K increases the AUC -- confirm this is the intended rule.
best = np.argmax(del_auc)

In [None]:
# Final partition: convert the chosen consensus matrix into a dissimilarity
# (1 - consensus) and cut a Ward tree built on it into at most best + 2 clusters.
cm = np.copy(consensus_mats_both[best])
cm = 1 - cm
cd = distance.squareform(cm)  # dense square matrix -> condensed 1-D vector
Z = cluster.hierarchy.linkage(cd, method='ward')
clusters = cluster.hierarchy.fcluster(Z, best + 2, criterion='maxclust')

In [None]:
# Cluster labels and their sizes (shown as the cell output).
np.unique(clusters, return_counts=True)

In [None]:
# pboth = pheno_et.copy()
# pboth['clusters'] = clusters
# pboth.to_csv('et_subs_both.csv')

In [None]:
# Dendrogram of the Ward tree built on (1 - consensus).
plt.figure(figsize=(10,10))
# NOTE(review): color_threshold=4.5 is hard-coded; presumably picked by eye for
# one particular run so that the colors match the K clusters -- re-check it
# whenever the consensus matrices change.
dn = cluster.hierarchy.dendrogram(Z, color_threshold=4.5, no_labels=True)
plt.title('Dendrogram of optimal consensus matrix (K=%d)' % (best + 2), fontsize=18)
plt.savefig('figs_clust/both_dendrogram.png', bbox_inches='tight', dpi=300)

In [None]:
# Heatmap of the chosen consensus matrix with rows and columns reordered so
# that subjects sharing a final cluster label sit next to each other.
order = np.argsort(clusters)
cm = consensus_mats_both[best][np.ix_(order, order)]

plt.figure(figsize=(10,10))
im = plt.imshow(cm, cmap='Reds')
plt.colorbar(im)
plt.title('Consensus matrix (K=%d)' % (best + 2), fontsize=18)
plt.savefig('figs_clust/both_consensus.png', bbox_inches='tight', dpi=300)

In [None]:
# u: sorted distinct cluster labels; c: number of subjects carrying each label.
u, c = np.unique(clusters, return_counts=True)

In [None]:
# Mean gaze trace of every cluster: x coordinate on the top panel, y on the
# bottom one (the legend carries the cluster label and its size).
fig, ax = plt.subplots(2, 1, figsize=(35,20))

for label, size in zip(u, c):
    members = clusters == label
    ax[0].plot(np.mean(x[members], axis=0))
    ax[1].plot(np.mean(y[members], axis=0), label='%d; N=%d' % (label, size))
ax[0].set_title('x-direction mean time series', fontsize=18, fontweight='bold')
ax[1].set_title('y-direction mean time series', fontsize=18, fontweight='bold')
ax[1].legend(loc=0, fontsize=18)
fig.savefig('figs_clust/both_mts.png', bbox_inches='tight', dpi=300)

In [None]:
# Attach the final cluster label to a copy of the phenotype table.
# NOTE(review): assumes the rows of pheno_et are in the same subject order as
# the rows of x / y -- confirm against how the files were produced.
pboth = pheno_et.copy()
pboth['clusters'] = clusters

In [None]:
# OLS of Age on cluster membership (treated as categorical): tests whether
# mean age differs between clusters.
results = ols('Age ~ C(clusters)', data=pboth).fit()
results.summary()

In [None]:
# For every cluster label, the fraction of its members whose Sex equals 0.
freq = [
    np.sum(pboth.loc[pboth['clusters'] == label, 'Sex'] == 0) / np.sum(pboth['clusters'] == label)
    for label in u
]

In [None]:
# Test whether the sex distribution differs between clusters, using the
# Sex x cluster table of subject counts. chisquare() expects observed counts,
# so applying it to per-cluster proportions does not yield a valid statistic
# or p-value; a contingency-table test on the raw counts does.
chi2_contingency(pd.crosstab(pboth['Sex'], pboth['clusters']))

In [None]:
# One violin plot per questionnaire score, split by cluster label.
measures = ['ASSQ_Total', 'SCQ_Total', 'SAS_Tot', 'SRS_Total_T']
fig, ax = plt.subplots(4, 1, figsize=(30,30))
for panel, measure in zip(ax, measures):
    sns.violinplot(x='clusters', y=measure, data=pboth, ax=panel)
    panel.set_title(measure, fontsize=18, fontweight='bold')
fig.savefig('figs_clust/both_violin.png', bbox_inches='tight', dpi=300)

In [None]:
# Phenotype rows of each cluster (every column except the last), one frame
# per label in ascending label order.
pheno_c = [pheno_et.iloc[clusters == i,:-1] for i in np.unique(clusters)]

In [None]:
# Welch t-test (unequal variances) between every pair of clusters for each of
# the first four phenotype columns; pvals[a, b, k] is symmetric in (a, b) and
# its diagonal stays 0 because a cluster is never tested against itself.
n_clust = np.unique(clusters).size
pvals = np.zeros((n_clust, n_clust, 4))
for i, j in zip(*np.triu_indices(n_clust, 1)):
    for k in range(4):
        p = ttest_ind(pheno_c[i].iloc[:,k], pheno_c[j].iloc[:,k], equal_var=False)[1]
        pvals[i,j,k] = pvals[j,i,k] = p

In [None]:
# Benjamini-Hochberg FDR correction, applied separately to each phenotype
# column over its set of pairwise cluster comparisons.
p_corr = np.zeros(pvals.shape)
# pvals was allocated with one row/column per cluster, so a single set of
# upper-triangle indices serves both for reading and for writing
nmu = np.triu_indices(pvals.shape[0], 1)
for i in range(pvals.shape[2]):
    p_unc = pvals[nmu[0], nmu[1], i]
    res = multipletests(p_unc, method='fdr_bh')
    # res[1] holds the corrected p-values; mirror them into a symmetric layer
    layer = np.zeros(pvals.shape[:2])
    layer[nmu] = res[1]
    p_corr[...,i] = layer + layer.T

In [None]:
# Heatmaps of the FDR-corrected pairwise p-values, one panel per score, with
# the color scale capped at 0.05. The diagonal of p_corr is 0 (never tested),
# so it is drawn in the lowest color.
fig, ax = plt.subplots(2, 2, figsize=(11,11))
ax_flat = ax.flat
scores = pheno_et.columns.tolist()[:-1]
tick_pos = np.arange(p_corr.shape[0])
for i in range(p_corr.shape[2]):
    panel = ax_flat[i]
    im = panel.imshow(p_corr[...,i], cmap='jet', vmax=0.05)
    panel.set_title(scores[i], fontsize=18)
    panel.set_xticks(tick_pos)
    panel.set_yticks(tick_pos)
    # cluster labels start at 1
    panel.set_xticklabels(tick_pos + 1)
    panel.set_yticklabels(tick_pos + 1)
cbar = fig.colorbar(im, ax=ax.ravel().tolist(), shrink=0.95)
fig.savefig('figs_clust/both_pheno.png', bbox_inches='tight', dpi=300)