In [1]:
import pandas, numpy, seaborn, umap
import scipy, scipy.signal
import sklearn, sklearn.cluster
import MulticoreTSNE

In [2]:
# matplotlib.patches is imported explicitly because a later cell draws
# matplotlib.patches.Arrow markers; without this line the submodule is only
# reachable as a side effect of importing matplotlib.pyplot.
import matplotlib, matplotlib.patches, matplotlib.pyplot

# Notebook-wide figure defaults: font family and sizes, tick/axis label sizes
# and a 16 x 9 default figure size.
matplotlib.rcParams.update({
    'font.family': 'sans-serif',
    'font.size': 20,
    'xtick.labelsize': 30,
    'ytick.labelsize': 30,
    'figure.figsize': (16, 9),
    'axes.labelsize': 40,
})

# user-defined variables

In [3]:
# Absolute, machine-specific locations: the scratch directory and the
# tab-separated z-score table read by the next section.
output_dir = '/home/adrian/scratch/'
expression_file = '/home/adrian/scratch/zscore.tsv'

In [4]:
# Transcript IDs of the two isoforms highlighted throughout the notebook
# (labelled 'ATG7-201' and 'ATG7-202', respectively, in the figure legends).
iso1, iso2 = 'ENST00000354449.7', 'ENST00000354956.9'

# read data

In [None]:
%%time
zscore_df = pandas.read_csv(expression_file, index_col=0, sep='\t')
print(zscore_df.shape)
zscore_df.head()

In [None]:
# Largest and smallest value of the whole table: reduce per column first,
# then across the per-column results.
for extreme in (zscore_df.max().max(), zscore_df.min().min()):
    print(extreme)

# exploratory visualization

## heatmap

In [None]:
# Label the row axis 'Transcripts' and the column axis 'Samples'.
# Done in place so the (potentially large) table is not copied.
for axis_title, axis_key in (('Transcripts', 'index'), ('Samples', 'columns')):
    zscore_df.rename_axis(axis_title, axis=axis_key, inplace=True)
print(zscore_df.shape)
zscore_df

In [None]:
# Timing notes kept from the original cell:
#   reduction of 10 runs for 68 CPU s
#   reduction of 3 runs for 7 CPU min

print(zscore_df.shape)

# `test` holds the first 1/reduction of the rows; reduction = 1 keeps every row.
reduction = 1
total_rows = zscore_df.shape[0]
kept_rows = int(total_rows / reduction)
test = zscore_df.iloc[:kept_rows, :]

print(test.shape)
test.tail()

In [None]:
# Disabled cell: when uncommented, it appends the rows of the two isoforms
# (iso1, iso2) taken from the full table to `test` and prints the shapes.
# NOTE(review): presumably intended for runs with reduction > 1, where the row
# subset may not contain the two isoforms -- confirm before re-enabling.
# nomiss = zscore_df.loc[[iso1, iso2], :]
# print(test.shape)
# print(nomiss.shape)
# test = pandas.concat([test, nomiss])
# print(test.shape)
# test.tail()

In [None]:
%%time
seaborn.clustermap(test, 
                   cmap='bwr', 
                   row_cluster=False, 
                   col_cluster=False, 
                   vmin=-20, vmax=20, 
                   cbar_kws={'label':'z-score'},
                   xticklabels=False, yticklabels=False, 
                  )
matplotlib.pyplot.show()

In [None]:
%%time
linkage_method = 'complete'
distance_metric = 'cosine'

seaborn.clustermap(test, 
                   cmap='bwr', 
                   row_cluster=True, 
                   col_cluster=True, 
                   vmin=-20, vmax=20, 
                   cbar_kws={'label':'z-score'},
                   xticklabels=False, yticklabels=False, 
                   method=linkage_method, metric=distance_metric
                  )
matplotlib.pyplot.show()

## dimensionality reduction

### UMAP

In [None]:
%%time
runUmap = umap.UMAP(metric='correlation',
                    n_neighbors=5,
                    min_dist=0,
                    learning_rate=0.5
                   ).fit_transform
dr_umap = runUmap(test)
positionsu = pandas.DataFrame(dr_umap, columns=['X', 'Y'], index=test.index)

matplotlib.pyplot.plot(positionsu['X'], positionsu['Y'], 'o', alpha=1/10, markeredgecolor='None', ms=20, markevery=5)

matplotlib.pyplot.plot(positionsu.loc[iso1, 'X'], positionsu.loc[iso1, 'Y'], 'o', alpha=0.8, markeredgecolor='None', ms=20, color='tab:red', label='ATG7-201')
matplotlib.pyplot.plot(positionsu.loc[iso2, 'X'], positionsu.loc[iso2, 'Y'], 'o', alpha=0.9, markeredgecolor='None', ms=20, color='tab:orange', label='ATG7-202')

matplotlib.pyplot.legend()
matplotlib.pyplot.grid(ls=':')
matplotlib.pyplot.tight_layout()
matplotlib.pyplot.show()

print(positionsu.shape)
positionsu

### tSNE

In [None]:
%%time
tsne = MulticoreTSNE.MulticoreTSNE(n_jobs=20)
dr_tsne = tsne.fit_transform(test)
positionst = pandas.DataFrame(dr_tsne, columns=['X', 'Y'], index=test.index)

matplotlib.pyplot.plot(positionst['X'], positionst['Y'], 'o', alpha=1/10, markeredgecolor='None', ms=20)

matplotlib.pyplot.plot(positionst.loc[iso1, 'X'], positionst.loc[iso1, 'Y'], 'o', alpha=0.8, markeredgecolor='None', ms=20, color='tab:red', label='ATG7-201')
matplotlib.pyplot.plot(positionst.loc[iso2, 'X'], positionst.loc[iso2, 'Y'], 'o', alpha=0.9, markeredgecolor='None', ms=20, color='tab:orange', label='ATG7-202')

matplotlib.pyplot.legend()
matplotlib.pyplot.grid(ls=':')
matplotlib.pyplot.tight_layout()
matplotlib.pyplot.show()

## heatmap with labels

In [None]:
# Positional row of each isoform within `test`.
for isoform in (iso1, iso2):
    print(test.index.get_loc(isoform))

In [None]:
# NOTE(review): `all_ks` and `groupings` are not defined in any cell visible
# above; presumably an earlier clustering step produced one label sequence per
# candidate k -- confirm it is present, otherwise this cell fails on a fresh
# kernel.
observed_best_k = 4
best_index = all_ks.index(observed_best_k)
labels = groupings[best_index]

print(set(labels))

# Look up each isoform's label through its positional row in `test`.
iso1_position = test.index.get_loc(iso1)
print('ATG7-201 is in cluster {}'.format(labels[iso1_position]))
iso2_position = test.index.get_loc(iso2)
print('ATG7-202 is in cluster {}'.format(labels[iso2_position]))

In [None]:
%%time

cmap = matplotlib.pyplot.cm.get_cmap('tab20')
k_colors = cmap(labels)

linkage_method = 'complete'
distance_metric = 'correlation'

clustergrid = seaborn.clustermap(test, 
                   cmap='bwr', 
                   row_cluster=True, 
                   col_cluster=True, 
                   vmin=-20, vmax=20, 
                   cbar_kws={'label':'z-score'},
                   xticklabels=False, yticklabels=False, 
                   method=linkage_method, metric=distance_metric,
                   row_colors=k_colors
                  )

# adding where isoforms fall
loc1 = clustergrid.dendrogram_row.reordered_ind.index(test.index.get_loc(iso1))
loc2 = clustergrid.dendrogram_row.reordered_ind.index(test.index.get_loc(iso2))
ax = clustergrid.ax_heatmap
ax.add_patch(matplotlib.patches.Arrow(test.shape[1], loc1, dx=-100, dy=0, color='gold', width=100))
ax.add_patch(matplotlib.patches.Arrow(test.shape[1], loc2, dx=-100, dy=0, color='skyblue', width=100))
matplotlib.pyplot.show()

In [None]:
# Cluster label of each isoform, looked up by its positional row in `test`.
iso1clusterlabel, iso2clusterlabel = [
    labels[test.index.get_loc(isoform)] for isoform in (iso1, iso2)
]
print(iso1clusterlabel, iso2clusterlabel)

In [None]:
# only the two isoforms
print(iso1)
print(iso2)
# NOTE(review): seaborn.set changes global plotting defaults, so it presumably
# also affects figures drawn by later cells -- confirm this is intended.
seaborn.set(font_scale=1) 
# Rows: the two isoforms; columns: same order as in the clustered heatmap.
isoform_rows = [test.index.get_loc(iso1), test.index.get_loc(iso2)]
column_order = clustergrid.dendrogram_col.reordered_ind
isoform_colors = cmap([iso1clusterlabel, iso2clusterlabel])
seaborn.clustermap(
    test.iloc[isoform_rows, column_order],
    cmap='bwr',
    vmin=-20,
    vmax=20,
    row_cluster=False,
    col_cluster=False,
    cbar_kws={'label':'z-score'},
    xticklabels=False,
    yticklabels=True,
    row_colors=isoform_colors,
)
matplotlib.pyplot.show()

In [None]:
# only the two clusters
# Walk the rows in the order they appear in the clustered heatmap and keep
# those whose label matches the cluster of either isoform.
cluster_indexes = []
cluster_labels = []
iso1_cluster_names = []; iso2_cluster_names = []
for row in clustergrid.dendrogram_row.reordered_ind:
    row_label = labels[row]
    in_iso1_cluster = row_label == iso1clusterlabel
    in_iso2_cluster = row_label == iso2clusterlabel
    if not (in_iso1_cluster or in_iso2_cluster):
        continue
    # Each row is kept exactly once, even when both isoforms share a cluster;
    # appending it twice would duplicate heatmap rows and index entries.
    cluster_indexes.append(row); cluster_labels.append(row_label)
    if in_iso1_cluster:
        iso1_cluster_names.append(test.index[row])
    if in_iso2_cluster:
        iso2_cluster_names.append(test.index[row])

# Heatmap restricted to the selected rows, columns ordered as in the
# clustered heatmap, with the cluster label shown as a row-colour strip.
seaborn.clustermap(
    test.iloc[cluster_indexes, clustergrid.dendrogram_col.reordered_ind], 
    cmap='bwr', 
    vmin=-20, vmax=20,
    row_cluster=False, col_cluster=False, 
    cbar_kws={'label':'z-score'},
    xticklabels=False, yticklabels=False,  
    row_colors=cmap(cluster_labels)
)
matplotlib.pyplot.show()

# Sanity checks: cluster sizes and which name list each isoform is in.
print(len(iso1_cluster_names), len(iso2_cluster_names))
print(iso1 in iso1_cluster_names, iso1 in iso2_cluster_names)
print(iso2 in iso2_cluster_names, iso2 in iso1_cluster_names)

In [None]:
linkage_method = 'complete'
distance_metric = 'correlation'

# Rows of the two selected clusters, columns ordered as in the full clustered heatmap.
twoc = test.iloc[cluster_indexes, clustergrid.dendrogram_col.reordered_ind]

# Re-cluster the rows of the subset only (columns keep their order).
# The result gets its own name instead of rebinding `clustergrid`: this grid is
# built with col_cluster=False, so reusing the name would break any re-run of
# cells that read clustergrid.dendrogram_col.reordered_ind (including this one).
twoc_clustergrid = seaborn.clustermap(
    twoc, 
    cmap='bwr', 
    row_cluster=True, 
    col_cluster=False, 
    vmin=-20, vmax=20, 
    cbar_kws={'label':'z-score'},
    xticklabels=False, yticklabels=False, 
    method=linkage_method, metric=distance_metric,
    row_colors=cmap(cluster_labels)
)

# adding where isoforms fall
# reordered_ind maps plotted row position -> row number in `twoc`.
twoc_row_order = twoc_clustergrid.dendrogram_row.reordered_ind
loc1 = twoc_row_order.index(twoc.index.get_loc(iso1))
loc2 = twoc_row_order.index(twoc.index.get_loc(iso2))
ax = twoc_clustergrid.ax_heatmap
# Heatmap row p spans [p, p + 1] on the y axis; + 0.5 points at the row centre.
ax.add_patch(matplotlib.patches.Arrow(twoc.shape[1], loc1 + 0.5, dx=-100, dy=0, color='gold', width=100))
ax.add_patch(matplotlib.patches.Arrow(twoc.shape[1], loc2 + 0.5, dx=-100, dy=0, color='skyblue', width=100))
matplotlib.pyplot.show()