In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc


In [None]:
# !ls /project/johnston_retina/human/2_single_cell/HumanOrgData/Cellranger\ output/Control1
# !cp -r /project/johnston_retina/human/2_single_cell/HumanOrgData/Cellranger\ output/* ./renamed/

In [None]:
# Root directory holding one Cell Ranger output folder per sample. Change it here
# rather than in every path when the data moves.
DATA_ROOT = "/project/johnston_retina/human/2_single_cell/HumanOrgData/renamed"

# Samples to load. Only the two controls are active; the other runs stay listed
# (commented out) so they can be re-enabled without retyping the names.
SAMPLE_NAMES = [
    "Control1",
    "Control2",
#     "T3treat10day1",
#     "T3treat10day2",
#     "T3treat200day1",
#     "T3treat200day2",
#     "T3treat200day3",
#     "HROrg_Con_10d_200d_T3aggr083019",
#     "HROrg_Con_10d_200d_T3aggrnorm083019",
]

# The trailing slash is required: the loading cell concatenates file names directly
# onto each entry.
sample_paths = [
    f"{DATA_ROOT}/{sample_name}/outs/filtered_feature_bc_matrix/"
    for sample_name in SAMPLE_NAMES
]

In [None]:
# List the contents of Control1's filtered_feature_bc_matrix directory under the
# relative ./renamed tree (note that sample_paths uses the absolute location instead).
!ls ./renamed/Control1/outs/filtered_feature_bc_matrix/
# !unpigz ./renamed/Control1/outs/filtered_feature_bc_matrix/barcodes.tsv.gz
# !wc -l ./renamed/Control1/outs/filtered_feature_bc_matrix/g

In [None]:
# Strategies for integrating across different samples will depend on the lane setup. Hopefully they were smart enough
# to run different samples in each lane. For the initial pass it's probably best to avoid anything particularly fancy.

In [None]:
import os

# One AnnData object and one feature table per entry of sample_paths, in the same order.
objects = []
headers = []

# for sample_path in sample_paths:
#     !unpigz {sample_path}/*.tsv.gz

for sample_path in sample_paths:

    # The matrix is read as stored on disk and transposed before use.
    scp_object = sc.read(sample_path + "matrix.mtx.gz")
    scp_object = scp_object.transpose()

    # Prefer the decompressed feature table (what this notebook originally read), but
    # fall back to the gzipped file so the cell also works when the unpigz step above
    # has not been run. np.loadtxt decompresses .gz files transparently.
    features_file = sample_path + "features.tsv"
    if not os.path.exists(features_file):
        features_file += ".gz"

    # Split on tabs only: the default whitespace splitting breaks any field that
    # contains a space into several columns.
    header = np.loadtxt(features_file, dtype=str, delimiter="\t")

    # Column 0 of the feature table becomes the variable names; column 1 is used later
    # to translate those identifiers into display names.
    scp_object.var_names = header[:,0]
    objects.append(scp_object)
    headers.append(header)

In [None]:
# Two independent copies per control: "master*" is downsampled to a fixed depth below,
# "bias*" keeps the counts exactly as loaded.
master1, bias1 = objects[0].copy(), objects[0].copy()
master2, bias2 = objects[1].copy(), objects[1].copy()

for downsampled in (master1, master2):
    sc.pp.downsample_counts(downsampled, counts_per_cell=1200)

# Cast the downsampled counts to float in place on each object.
for downsampled in (master1, master2):
    downsampled.X = downsampled.X.astype(dtype=float)


In [None]:
# Dimensions (n_obs, n_vars) of the downsampled Control 1 object.
master1.shape

In [None]:
# sc.pp.filter_genes(master1,min_counts=100)
# sc.pp.filter_genes(master2,min_counts=100)
# sc.pp.filter_genes(bias1,min_counts=100)
# sc.pp.filter_genes(bias2,min_counts=100)

# We may alternatively want a uniform filter across all samples:
# keep the genes whose summed count in master1 exceeds 100, and apply that same
# mask to every object so all four share one gene set.
gene_totals_master1 = np.sum(master1.X, axis=0)
min_gene_mask = np.array(gene_totals_master1 > 100).ravel()

master1, master2, bias1, bias2 = (
    unfiltered[:, min_gene_mask]
    for unfiltered in (master1, master2, bias1, bias2)
)

In [None]:
# Work on copies so the per-cell normalisation used for gene selection does not touch
# master*/bias*.
filter1, filter2, filter_bias1, filter_bias2 = (
    source.copy() for source in (master1, master2, bias1, bias2)
)
for selection_copy in (filter1, filter2, filter_bias1, filter_bias2):
    sc.pp.normalize_per_cell(selection_copy)

# selecting variable genes: one dispersion-based result per dataset
filter_result1, filter_result2, filter_bias_result1, filter_bias_result2 = (
    sc.pp.filter_genes_dispersion(
        selection_copy.X, flavor='cell_ranger', n_top_genes=4000, log=False
    )
    for selection_copy in (filter1, filter2, filter_bias1, filter_bias2)
)

# Unfortunately here we need a unified set of features if we plan to do prediction
# later, so every dataset is subset to the genes selected for control 1.
variable_genes = filter_result1.gene_subset

# subsetting the genes: "umi*" objects are left as counts, "log*" objects are
# normalised, log-transformed and scaled below.
umi1, log1 = master1[:, variable_genes].copy(), master1[:, variable_genes].copy()
umi2, log2 = master2[:, variable_genes].copy(), master2[:, variable_genes].copy()
umi_bias1, log_bias1 = bias1[:, variable_genes].copy(), bias1[:, variable_genes].copy()
umi_bias2, log_bias2 = bias2[:, variable_genes].copy(), bias2[:, variable_genes].copy()

for log_view in (log1, log2, log_bias1, log_bias2):
    sc.pp.normalize_per_cell(log_view)
    sc.pp.log1p(log_view)
    sc.pp.scale(log_view)

In [None]:
# Here we sanity-check the counts of the two controls pre-normalization. They were run
# in mixed lanes so there shouldn't be anything drastic going on.

# Summing a sparse matrix yields a 2-D np.matrix; flatten it to a plain 1-D array before
# handing it to plt.hist (the size-factor cells further down do the same).
control1_cell_totals = np.asarray(np.sum(umi_bias1.X,axis=1)).ravel()
control2_cell_totals = np.asarray(np.sum(umi_bias2.X,axis=1)).ravel()
# Transposing first makes axis=1 run over genes instead of cells.
control2_gene_totals = np.asarray(np.sum(umi_bias2.X.T,axis=1)).ravel()

plt.figure()
plt.title("Control 1, Cell UMI Count Frequency")
plt.hist(control1_cell_totals,bins=np.arange(0,4000,200))
plt.xlabel("Total UMIs")
plt.ylabel("Frequency")
plt.show()

plt.figure()
plt.title("Control 2, Cell UMI Count Frequency")
plt.hist(control2_cell_totals,bins=np.arange(0,4000,200))
plt.xlabel("Total UMIs (Cell)")
plt.ylabel("Frequency")
plt.show()

plt.figure()
plt.title("Control 2, Total Gene Expression Frequency")
plt.hist(control2_gene_totals,bins=np.arange(0,2000,100))
plt.xlabel("Total UMIs (Gene)")
plt.ylabel("Frequency")
plt.show()

# And yet here we are. Nearly 2 fold change in mode expression

In [None]:
# NOTE(review): scratch arithmetic with no stated meaning — document what 757 and 11
# refer to, or remove this cell.
757/11

In [None]:
# Dense versions of the count matrices; the log matrices are taken as stored.
umi1_array, log1_array = umi1.X.todense(), log1.X
umi2_array, log2_array = umi2.X.todense(), log2.X
umi_bias1_array, log_bias1_array = umi_bias1.X.todense(), log_bias1.X
umi_bias2_array, log_bias2_array = umi_bias2.X.todense(), log_bias2.X

# The headers were unified for ease of use later: map the first feature-table column
# (used as var_names) to the second, using the first sample's table.
gene_dictionary = {
    feature_row[0]: feature_row[1]
    for feature_row in headers[0]
}
umi_header = [gene_dictionary[var_name] for var_name in umi1.var_names]
log_header = [gene_dictionary[var_name] for var_name in log1.var_names]
print(umi_header)
print(log_header)

In [None]:
import warnings

# Neighbour graph, PCA, UMAP and t-SNE for the count view and then the log view of
# the downsampled Control 1 data.
for embedded in (umi1, log1):
    sc.pp.neighbors(embedded)
    sc.pp.pca(embedded)
    sc.tl.umap(embedded)
    sc.tl.tsne(embedded)

# Louvain clustering, with warnings silenced for the duration of the calls.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for embedded in (umi1, log1):
        sc.tl.louvain(embedded,resolution=1)

# Both embeddings coloured by Louvain cluster, count view first.
for embedded in (umi1, log1):
    sc.pl.umap(embedded,color='louvain')
    sc.pl.tsne(embedded,color='louvain')

In [None]:
import warnings

# Neighbour graph, PCA, UMAP and t-SNE for the count view and then the log view of
# the downsampled Control 2 data.
for embedded in (umi2, log2):
    sc.pp.neighbors(embedded)
    sc.pp.pca(embedded)
    sc.tl.umap(embedded)
    sc.tl.tsne(embedded)

# Louvain clustering, with warnings silenced for the duration of the calls.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for embedded in (umi2, log2):
        sc.tl.louvain(embedded,resolution=1)

# Both embeddings coloured by Louvain cluster, count view first.
for embedded in (umi2, log2):
    sc.pl.umap(embedded,color='louvain')
    sc.pl.tsne(embedded,color='louvain')

In [None]:
import warnings

# Same embedding pipeline as for the downsampled data, applied to the Control 1
# objects that were not downsampled.
for embedded in (umi_bias1, log_bias1):
    sc.pp.neighbors(embedded)
    sc.pp.pca(embedded)
    sc.tl.umap(embedded)
    sc.tl.tsne(embedded)

# Louvain clustering, with warnings silenced for the duration of the calls.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for embedded in (umi_bias1, log_bias1):
        sc.tl.louvain(embedded,resolution=1)

# Both embeddings coloured by Louvain cluster, count view first.
for embedded in (umi_bias1, log_bias1):
    sc.pl.umap(embedded,color='louvain')
    sc.pl.tsne(embedded,color='louvain')

In [None]:
import warnings

# Same embedding pipeline as for the downsampled data, applied to the Control 2
# objects that were not downsampled.
for embedded in (umi_bias2, log_bias2):
    sc.pp.neighbors(embedded)
    sc.pp.pca(embedded)
    sc.tl.umap(embedded)
    sc.tl.tsne(embedded)

# Louvain clustering, with warnings silenced for the duration of the calls.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for embedded in (umi_bias2, log_bias2):
        sc.tl.louvain(embedded,resolution=1)

# Both embeddings coloured by Louvain cluster, count view first.
for embedded in (umi_bias2, log_bias2):
    sc.pl.umap(embedded,color='louvain')
    sc.pl.tsne(embedded,color='louvain')

In [None]:
# Per-cell total of the non-downsampled Control 1 counts, as a flat array.
size_factor1 = np.array(umi_bias1.X.sum(axis=1)).ravel()
scatter1 = log1.obsm['X_umap']
scatter_bias1 = log_bias1.obsm['X_umap']
# size_factor1

# The same colouring (log size factor) on the downsampled and then the
# non-downsampled UMAP coordinates.
for umap_coordinates in (scatter1, scatter_bias1):
    plt.figure()
    plt.scatter(*umap_coordinates.T,s=2,c=np.log(size_factor1))
    plt.colorbar(label="Log Size Factor (Total UMIs)")
    plt.show()

In [None]:
# Per-cell total of the non-downsampled Control 2 counts, as a flat array.
size_factor2 = np.array(umi_bias2.X.sum(axis=1)).ravel()
scatter2 = log2.obsm['X_umap']
scatter_bias2 = log_bias2.obsm['X_umap']
# size_factor2

# The same colouring (log size factor) on the downsampled and then the
# non-downsampled UMAP coordinates.
for umap_coordinates in (scatter2, scatter_bias2):
    plt.figure()
    plt.scatter(*umap_coordinates.T,s=2,c=np.log(size_factor2))
    plt.colorbar(label="Log Size Factor (Total UMIs)")
    plt.show()

# Forest Analysis

In [None]:
import sys

# Directory containing the tree_reader and lumberjack modules imported below.
RUSTY_FOREST_SRC = '/project/johnston_retina/human/2_single_cell/HumanOrgData/rusty_forest_v3/src'

# Guarded so that re-running the cell does not stack duplicate entries on sys.path.
if RUSTY_FOREST_SRC not in sys.path:
    sys.path.append(RUSTY_FOREST_SRC)
# sys.path.append('../src')
import tree_reader as tr
import lumberjack

In [None]:
# Fit of the Control 1 forest on log1_array (currently disabled), followed by a backup
# of the in-memory forest.
# NOTE(review): with the fit commented out, `forest1` must already exist in the kernel
# for the two calls at the bottom to work; on a fresh kernel this cell raises NameError.
# forest1 = lumberjack.fit(
#         log1_array,
#         header=log_header,
#         trees=100,
#         braids=3,
#         ifs=1000,
#         ofs=1000,
#         ss=200,
#         depth=8,
#         leaves=100,
#         sfr=.5,
#         norm="l2"
#     )

# Presumably writes the forest under the name 'control1_forest', which a later cell
# passes to tr.Forest.reconstitute — confirm against tree_reader.
forest1.set_cache(True)
forest1.backup('control1_forest')

In [None]:
# Fit of the Control 2 forest on log2_array (currently disabled), followed by a backup
# of the in-memory forest.
# NOTE(review): with the fit commented out, `forest2` must already exist in the kernel
# for the two calls at the bottom to work; on a fresh kernel this cell raises NameError.
# forest2 = lumberjack.fit(
#         log2_array,
#         header=log_header,
#         trees=100,
#         braids=3,
#         ifs=1000,
#         ofs=1000,
#         ss=200,
#         depth=8,
#         leaves=100,
#         sfr=.5,
#         norm="l2"
#     )

# Presumably writes the forest under the name 'control2_forest', which a later cell
# passes to tr.Forest.reconstitute — confirm against tree_reader.
forest2.set_cache(True)
forest2.backup('control2_forest')

In [None]:
import sys

# Directory containing the tree_reader and lumberjack modules imported below.
RUSTY_FOREST_SRC = '/project/johnston_retina/human/2_single_cell/HumanOrgData/rusty_forest_v3/src'

# Guarded so that re-running the cell does not stack duplicate entries on sys.path.
if RUSTY_FOREST_SRC not in sys.path:
    sys.path.append(RUSTY_FOREST_SRC)
# sys.path.append('../src')
import tree_reader as tr
import lumberjack

# Reload both forests from the names used by the backup calls in the fitting cells.
forest1 = tr.Forest.reconstitute('control1_forest')
forest2 = tr.Forest.reconstitute('control2_forest')

In [None]:
# Shape of the column-wise mean of the leaf mean matrix.
# NOTE(review): `forest` (without a 1/2 suffix) is not defined anywhere in the visible
# notebook; this presumably should be forest1 or forest2 — confirm.
np.mean(forest.mean_matrix(forest.leaves()),axis=0).shape

In [None]:
# Settings for the encoding-based sample clustering; the disabled forest1 call below
# uses the same values.
encoding_cluster_options = dict(metric='cosine',pca=100,sub=.5,k=20,depth=8)

# forest1.reset_sample_clusters()
forest2.reset_sample_clusters()

# forest2.reset_leaf_clusters()
# forest2.cluster_leaves_samples(metric='cos',pca=100,sub=.5,k=20)
# forest.cluster_leaves_predictions(metric='cos',pca=100,sub=.5,k=20)
# forest2.cluster_samples_leaf_cluster()

# NOTE(review): only forest2 is (re)clustered here; later cells also read
# forest1.sample_labels, so forest1 presumably carries clusters from its backup — confirm.
# forest1.cluster_samples_encoding(metric='cosine',pca=100,sub=.5,k=20,depth=8)
forest2.cluster_samples_encoding(**encoding_cluster_options)

# forest.cluster_samples_simple(metric='cos',pca=100,sub=.5,k=20)

In [None]:
# forest.tsne_coordinates=log_filtered.obsm['X_umap']
# Compute the t-SNE layout and draw the sample clusters, Control 1 forest first.
for clustered_forest in (forest1, forest2):
    clustered_forest.tsne(pca=100)
    clustered_forest.plot_sample_clusters()

In [None]:
# Identical split-interpretation settings for both forests.
split_interpretation_options = dict(
    mode='additive_mean',
    relatives=True,
    depth=6,
    sub=.8,
    k=20,
    pca=100,
    metric='cos',
)

# Clear any previous split clusters, then recompute them, Control 1 forest first.
for interpreted_forest in (forest1, forest2):
    interpreted_forest.reset_split_clusters()
    interpreted_forest.interpret_splits(**split_interpretation_options)

## Forest Representation Summaries



In [None]:
# Two views of the depth-5 nodes of the Control 2 forest: their sample encoding and
# their 'additive_mean' representation.
f2_node_sample = forest2.node_sample_encoding(
    forest2.nodes(depth=5)
)
f2_node_feature = forest2.node_representation(
    forest2.nodes(depth=5),
    mode='additive_mean',
)

for node_table in (f2_node_sample, f2_node_feature):
    print(node_table.shape)

In [None]:
from sklearn.decomposition import PCA

# 100-component PCA of each node table and of its transpose, so that both the rows and
# the columns of each table can be clustered in a reduced space.
f2_ns_pca, f2_nf_pca, f2_ns_pca_t, f2_nf_pca_t = (
    PCA(n_components=100).fit_transform(node_table)
    for node_table in (
        f2_node_sample,
        f2_node_feature,
        f2_node_sample.T,
        f2_node_feature.T,
    )
)



In [None]:
from scipy.cluster.hierarchy import dendrogram,linkage


def _average_cos_leaf_order(rows):
    """Return the dendrogram leaf order of `rows` under average linkage with metric 'cos'."""
    return dendrogram(linkage(rows,metric='cos',method='average'),no_plot=True)['leaves']


# Orderings derived from the transposed sample table and from the feature table.
node_sample_sort = _average_cos_leaf_order(f2_ns_pca_t)
node_feature_sort = _average_cos_leaf_order(f2_nf_pca)

print("Nodes Sorted")

# The complementary orderings for the other axis of each table.
sample_sort_node = _average_cos_leaf_order(f2_ns_pca)
feature_sort_node = _average_cos_leaf_order(f2_nf_pca_t)

In [None]:
# Reorder the rows, then (via the transpose) the columns of the membership table.
membership_sorted = f2_node_sample[sample_sort_node].T[node_sample_sort].T

plt.figure()
plt.title("Membership of Samples in Nodes")
plt.xlabel("Nodes")
plt.ylabel("Samples")
plt.imshow(membership_sorted,aspect='auto',cmap='binary')
plt.colorbar(label="Sample in Node?")
plt.show()

# Reorder rows, transpose, then reorder what are now the rows; the result is displayed
# transposed relative to f2_node_feature.
feature_change_sorted = f2_node_feature[node_feature_sort].T[feature_sort_node]

plt.figure()
plt.title("Change in Feature Mean For Node")
plt.xlabel("Nodes")
plt.ylabel("Features")
plt.imshow(feature_change_sorted,aspect='auto',cmap='seismic',vmin=-5,vmax=5)
plt.colorbar(label="Change in Mean vs Parent")
plt.show()


In [None]:
# The plot below needs f2_leaf_sample, leaf_sort and sample_sort. They were only
# defined in commented-out lines, so the cell failed on a fresh kernel; they are
# restored here. PCA, dendrogram and linkage are imported by earlier cells.
f2_leaf_sample = forest2.node_sample_encoding(forest2.leaves())
f2_ls_pca_t = PCA(n_components=100).fit_transform(f2_leaf_sample.T)

# Columns ordered by hierarchical clustering; rows ordered by sample cluster label.
leaf_sort = dendrogram(linkage(f2_ls_pca_t,metric='cos',method='average'),no_plot=True)['leaves']
sample_sort = np.argsort(forest2.sample_labels)

plt.figure()
plt.axes([0,0,.7,1])
plt.title("Sample Leaf Membership")
plt.imshow(f2_leaf_sample[sample_sort].T[leaf_sort].T,cmap='binary',interpolation='none',aspect='auto')
# Narrow side panel showing the cluster label of each (sorted) row.
plt.axes([.9,0,.1,1])
plt.title("Cluster")
plt.imshow(np.array([forest2.sample_labels[sample_sort],]).T,cmap='rainbow',aspect='auto')
plt.show()


# Calinski Evaluation

In [None]:
from sklearn.metrics import calinski_harabasz_score,silhouette_score,silhouette_samples

# print(calinski_harabasz_score(log_array,log_filtered.obs['louvain']))
# print(calinski_harabasz_score(log_array,forest.sample_labels))

# print(silhouette_score(log_array,log_filtered.obs['louvain'],metric='cosine'))
# print(silhouette_score(log_array,forest.sample_labels,metric='cosine'))

# louvain_silhouettes = silhouette_samples(log_array,log_filtered.obs['louvain'],metric='cosine')
# forest_silhouettes = silhouette_samples(log_array,forest.sample_labels,metric='cosine')

# plt.figure()
# plt.scatter(*log_filtered.obsm['X_umap'].T,c=louvain_silhouettes,cmap='seismic',vmin=-.2,vmax=.2)
# plt.colorbar()
# plt.show

# plt.figure()
# plt.scatter(*log_filtered.obsm['X_umap'].T,c=forest_silhouettes,cmap='seismic',vmin=-.2,vmax=.2)
# plt.colorbar()
# plt.show

# plt.figure()
# plt.scatter(*log_filtered.obsm['X_umap'].T,c=prerequisite_factor,s=1)
# plt.colorbar()
# plt.show


# plt.figure()
# plt.scatter(*log_filtered.obsm['X_umap'].T,c=forest.split_clusters[2].sister_scores()>.05,s=1)
# plt.colorbar()
# plt.show

# plt.figure()
# plt.scatter(*forest1.tsne(pca=100).T,c=forest1.sample_labels,s=1,cmap='rainbow')
# plt.colorbar()
# plt.show()

# plt.figure()
# plt.scatter(*forest1.tsne(pca=100).T,c=f1_self_predictions,s=1,cmap='rainbow')
# plt.colorbar()
# plt.show()

# plt.figure()
# plt.scatter(*forest1.tsne(pca=100).T,c=f2_predictions,s=1,cmap='rainbow')
# plt.colorbar()
# plt.show()


# plt.figure()
# plt.scatter(*forest2.tsne(pca=100).T,c=forest2.sample_labels,s=1,cmap='rainbow')
# plt.colorbar()
# plt.show()

# plt.figure()
# plt.scatter(*forest2.tsne(pca=100).T,c=f2_self_predictions,s=1,cmap='rainbow')
# plt.colorbar()
# plt.show()

# plt.figure()
# plt.scatter(*forest2.tsne(pca=100).T,c=f1_predictions,s=1,cmap='rainbow')
# plt.colorbar()
# plt.show()


# plt.figure()
# plt.scatter(*forest1.tsne(pca=100).T,c=f2_predictions==7,s=1,cmap='rainbow')
# plt.colorbar()
# plt.show()



# plt.figure()
# plt.scatter(*forest2.tsne(pca=100).T,c=forest2.sample_labels == 5,s=1,cmap='rainbow')
# plt.colorbar()
# plt.show()





In [None]:
def _print_f2_silhouette(labels):
    """Print the cosine silhouette score of forest2.output under `labels`."""
    print(silhouette_score(forest2.output,labels,metric='cosine'))


# NOTE(review): f2_self_predictions and f1_predictions are assigned in the
# "Sample Cross Training" cells further down; run those first.
_print_f2_silhouette(f2_self_predictions)
_print_f2_silhouette(forest2.sample_labels)
_print_f2_silhouette(f1_predictions)
# Random labels in [0, 15) as a baseline.
_print_f2_silhouette(np.random.randint(0,15,forest2.output.shape[0]))
_print_f2_silhouette(log2.obs['louvain'])

In [None]:
def _print_f1_silhouette(labels):
    """Print the cosine silhouette score of forest1.output under `labels`."""
    print(silhouette_score(forest1.output,labels,metric='cosine'))


# NOTE(review): f1_self_predictions and f2_predictions are assigned in the
# "Sample Cross Training" cells further down; run those first.
_print_f1_silhouette(f1_self_predictions)
_print_f1_silhouette(forest1.sample_labels)
_print_f1_silhouette(f2_predictions)
# Random labels in [0, 15) as a baseline.
_print_f1_silhouette(np.random.randint(0,15,forest1.output.shape[0]))
_print_f1_silhouette(log1.obs['louvain'])

In [None]:
# Five labelings of the Control 1 cells, stacked one per row; the row order matches
# the non-empty entries of c1_labels.
c1_predictions = np.array([
    log1.obs['louvain'],
    forest1.sample_labels,
    f1_self_predictions,
    f2_predictions,
    np.random.randint(0,15,forest1.output.shape[0]),
])

# The leading empty string is padding consumed by set_yticklabels in the plotting cell.
c1_labels = [""] + ["Louv","Encd","Self-Pred","Other-Pred","Rand"]

In [None]:

# Five labelings of the Control 2 cells, stacked one per row; the row order matches
# the non-empty entries of c2_labels.
c2_predictions = np.array([
    log2.obs['louvain'],
    forest2.sample_labels,
    f2_self_predictions,
    f1_predictions,
    np.random.randint(0,15,forest2.output.shape[0]),
])

# The leading empty string is padding consumed by set_yticklabels in the plotting cell.
c2_labels = [""] + ["Louv","Encd","Self-Pred","Other-Pred","Rand"]

In [None]:
# # len(f2_predictions)
# from scipy.cluster.hierarchy import dendrogram,linkage


def _show_truncated_expression(title, expression):
    """Draw `expression` with columns in feature_sort order on a fixed -1..3 colour scale."""
    plt.figure()
    plt.title(title)
    plt.imshow(expression.T[feature_sort].T,aspect='auto',vmin=-1,vmax=3)
    plt.colorbar(label="Truncated Log Expression")
    plt.show()


# Cells labelled 5 or 7 in each prediction vector, ordered by that label.
f1_mask = (f2_predictions == 5) | (f2_predictions == 7)
f1_sort = np.argsort(f2_predictions[f1_mask])
f2_mask = (f2_self_predictions == 5) | (f2_self_predictions == 7)
f2_sort = np.argsort(f2_self_predictions[f2_mask])

# feature_sort = dendrogram(linkage(forest1.output.T[:4000],metric='cos',method='average'),no_plot=True)['leaves']

f1_sub_selection = forest1.output[f1_mask][f1_sort]
f2_sub_selection = forest2.output[f2_mask][f2_sort]

# NOTE(review): feature_sort and f1_sample_agglomeration are not assigned anywhere
# before this cell in the visible notebook.
_show_truncated_expression("Control 1, Clusters 5/7", f1_sub_selection)
_show_truncated_expression(
    "Control 1, Clusters 5/7, Resorted",
    forest1.output[f1_mask][f1_sample_agglomeration],
)
_show_truncated_expression("Control 2, Clusters 5/7", f2_sub_selection)



In [None]:
# feature_sort = dendrogram(linkage(umi1.X.todense().T,metric='cos',method='average'),no_plot=True)['leaves']


def _show_sorted_expression(title, expression, labels):
    """Heatmap of `expression` with rows ordered by `labels` and columns by feature_sort."""
    plt.figure()
    plt.title(title)
    plt.imshow(expression[np.argsort(labels)].T[feature_sort].T,vmin=-1,vmax=3,aspect='auto')
    plt.colorbar(label="Truncated Log Expression")
    plt.show()


# Each control ordered by its own forest's labels, then by the other forest's predictions.
_show_sorted_expression("Control 1, Own Clustering", forest1.output, forest1.sample_labels)
_show_sorted_expression("Control 1, Cross Clustering", forest1.output, f2_predictions)
_show_sorted_expression("Control 2, Own Clustering", forest2.output, forest2.sample_labels)
_show_sorted_expression("Control 2, Cross Clustering", forest2.output, f1_predictions)



In [None]:
# Imported here because the only other import of this name sits in a later cell.
from sklearn.metrics import adjusted_mutual_info_score

# Pairwise adjusted mutual information between every pair of Control 1 labelings.
n_partitions = len(c1_predictions)
mutual_information_matrix = np.zeros((n_partitions,n_partitions))

for i,p1 in enumerate(c1_predictions):
    for j,p2 in enumerate(c1_predictions):
        mutual_information_matrix[i,j] = adjusted_mutual_info_score(p1,p2)

plt.figure()
plt.title("Adjusted Partition Mutual Information")
plt.imshow(mutual_information_matrix)
ax1 = plt.gca()
# Annotate each cell once with its rounded score (ndenumerate yields (row, col);
# text takes x=col, y=row).
for (j,i),label in np.ndenumerate(np.round(mutual_information_matrix,3)):
    ax1.text(i,j,label,ha='center',va='center')
# Pin one tick per row so the labels cannot drift with the automatic tick locator.
# c1_labels carries a leading "" that only existed to pad the automatic ticks; drop it.
row_labels = [row_label for row_label in c1_labels if row_label != ""]
ax1.set_yticks(np.arange(len(row_labels)))
ax1.set_yticklabels(row_labels)
plt.colorbar()

In [None]:
# Imported here because the only other import of this name sits in a later cell.
from sklearn.metrics import adjusted_mutual_info_score

# Pairwise adjusted mutual information between every pair of Control 2 labelings.
n_partitions = len(c2_predictions)
mutual_information_matrix = np.zeros((n_partitions,n_partitions))

for i,p1 in enumerate(c2_predictions):
    for j,p2 in enumerate(c2_predictions):
        mutual_information_matrix[i,j] = adjusted_mutual_info_score(p1,p2)

plt.figure()
plt.title("Adjusted Partition Mutual Information")
plt.imshow(mutual_information_matrix)
ax1 = plt.gca()
# Annotate each cell once with its rounded score (ndenumerate yields (row, col);
# text takes x=col, y=row).
for (j,i),label in np.ndenumerate(np.round(mutual_information_matrix,3)):
    ax1.text(i,j,label,ha='center',va='center')
# Pin one tick per row so the labels cannot drift with the automatic tick locator.
# c2_labels carries a leading "" that only existed to pad the automatic ticks; drop it.
row_labels = [row_label for row_label in c2_labels if row_label != ""]
ax1.set_yticks(np.arange(len(row_labels)))
ax1.set_yticklabels(row_labels)
plt.colorbar()

In [None]:
# Display the `shuffled` matrix as an image.
# NOTE(review): `shuffled` is only assigned in the commented-out line below, which in
# turn needs `weighted_adjacency` (not defined in the visible notebook).
# shuffled = tr.Forest.agglomerate_representation(weighted_adjacency)

plt.figure()
plt.imshow(shuffled)
plt.show()

In [None]:
# leaf_shuffled = tr.Forest.agglomerate_representation(representation)
# Hierarchical orderings of the rows and of the columns of `representation`.
# NOTE(review): `representation` is not assigned anywhere in the visible notebook.
row_linkage = linkage(representation,metric='cos',method='average')
sample_sort = dendrogram(row_linkage,no_plot=True)['leaves']
column_linkage = linkage(representation.T,metric='cos',method='average')
leaf_sort = dendrogram(column_linkage,no_plot=True)['leaves']


plt.figure()
plt.imshow(representation[sample_sort].T[leaf_sort].T)
plt.show()

In [None]:
from scipy.cluster.hierarchy import dendrogram,linkage


def _show_representation(row_order):
    """Display `representation` with rows in `row_order` and columns in leaf_sort order."""
    plt.figure()
    plt.imshow(representation[row_order].T[leaf_sort].T)
    plt.show()


# Rows ordered by forest cluster label, then by Louvain label.
# NOTE(review): `forest`, `log_filtered` and `representation` are not assigned in the
# visible notebook.
_show_representation(np.argsort(forest.sample_labels))
_show_representation(np.argsort(log_filtered.obs['louvain']))

In [None]:
from sklearn.neighbors import kneighbors_graph
# 10-nearest-neighbour graph over the rows of `representation`, using cosine distance.
# NOTE(review): `representation` is not assigned anywhere in the visible notebook.
neighbors = kneighbors_graph(representation,10,metric='cosine')

In [None]:
# Number of distinct values in `groups`.
# NOTE(review): `groups` is not assigned anywhere in the visible notebook.
len(set(groups))

In [None]:
# Scatter of output column 1004 against output column 2510.
# NOTE(review): the two column indices are unexplained magic numbers and `forest` is not
# assigned in the visible notebook — name the features being compared.
plt.figure()
plt.scatter(forest.output[:,1004],forest.output[:,2510])
plt.show()

In [None]:
# Inspect split cluster 2 (index chosen by hand).
# NOTE(review): `forest` is not assigned anywhere in the visible notebook.
forest.split_clusters[2].changed_absolute_sister()

In [None]:
# All-pairs scatter grid of the candidate genes.
# NOTE(review): `candidate_genes` and `forest` are not assigned in the visible notebook.

# Resolve each gene's column once instead of on every inner-loop iteration.
candidate_columns = [log_header.index(gene) for gene in candidate_genes]
# Size the grid from the gene list rather than assuming exactly 7 genes.
grid_size = len(candidate_columns)

plt.figure(figsize=(10,10))
for i,ii in enumerate(candidate_columns):
    for j,ji in enumerate(candidate_columns):
        # Subplot positions are 1-based and filled row by row.
        plt.subplot(grid_size,grid_size,(i*grid_size)+(j+1))
        plt.scatter(forest.output[:,ii],forest.output[:,ji],s=1)
plt.show()
    

In [None]:
# Candidate columns of the output for every row, and their pairwise correlation matrix.
# NOTE(review): `candidate_indices` and `forest` are not assigned in the visible notebook.
subselection1 = forest.output.T[candidate_indices].T
np.corrcoef(subselection1.T)

In [None]:
# Same candidate columns, restricted to rows whose split-cluster-2 sister score exceeds 0.05.
subselection2 = forest.output[forest.split_clusters[2].sister_scores() > 0.05].T[candidate_indices].T
# Only the last expression of a cell is displayed, so this shape is not shown.
subselection2.shape
np.corrcoef(subselection2.T)

In [None]:
# Difference between the candidate-gene correlation matrix over all rows (subselection1)
# and over the split-cluster-2 subset (subselection2).
plt.figure()
plt.imshow(np.corrcoef(subselection1.T) - np.corrcoef(subselection2.T),cmap='bwr')
plt.colorbar()
plt.show()

In [None]:
# Summary for split cluster 2 (presumably rendered as HTML, going by the method name).
forest.split_clusters[2].html_cluster_summary()

In [None]:
# All-pairs scatter grid of the candidate genes, restricted to rows whose split-cluster-2
# sister score exceeds 0.05.
# NOTE(review): `candidate_genes` and `forest` are not assigned in the visible notebook.

# Resolve gene columns and the row mask once; the original recomputed the mask twice
# per subplot.
candidate_columns = [log_header.index(gene) for gene in candidate_genes]
sister_mask = forest.split_clusters[2].sister_scores() > 0.05
# Size the grid from the gene list rather than assuming exactly 7 genes.
grid_size = len(candidate_columns)

plt.figure(figsize=(10,10))
for i,ii in enumerate(candidate_columns):
    for j,ji in enumerate(candidate_columns):
        # Subplot positions are 1-based and filled row by row.
        plt.subplot(grid_size,grid_size,(i*grid_size)+(j+1))
        plt.scatter(forest.output[:,ii][sister_mask],forest.output[:,ji][sister_mask],s=1)
plt.show()
    

In [None]:
# Number of entries in log_header (one per variable of log1).
len(log_header)

In [None]:
# Display the forest's output feature names.
# NOTE(review): `forest` is not assigned anywhere in the visible notebook.
forest.output_features

In [None]:
# Column-wise mean over the nodes of split cluster 2 and over their sister nodes. Each
# mean is computed once and reused (the original evaluated both reductions twice).
# NOTE(review): `forest` is not assigned anywhere in the visible notebook.
cluster_nodes = forest.split_clusters[2].nodes
cluster_mean = np.mean(forest.mean_matrix(cluster_nodes),axis=0)
sister_mean = np.mean(forest.mean_matrix([n.sister() for n in cluster_nodes]),axis=0)

difference = cluster_mean - sister_mean
# Log ratio; entries can be non-finite where a mean is zero or the ratio is negative.
ratio = np.log(cluster_mean / sister_mean)


In [None]:
# mask, difference_sort and ratio_sort were only defined in commented-out lines although
# this cell and the next one use them; they are restored here.
# np.sum(np.isfinite(ratio))
# Keep only features with a finite log ratio.
mask = np.isfinite(ratio)
# forest.output_features[mask]
# ratio[mask]
difference_sort = np.argsort(difference)
ratio_sort = np.argsort(ratio[mask])
# Feature names ordered by ascending log ratio.
forest.output_features[mask][ratio_sort]
# ratio[mask][ratio_sort]

In [None]:
# The last ten feature names in difference_sort order.
# NOTE(review): difference_sort is only assigned in a commented-out line of the previous
# cell (as np.argsort(difference)); if that is the definition used, these are the ten
# largest differences.
forest.output_features[difference_sort][-10:]

In [None]:
# Position of "CALR" among the output features (raises ValueError if absent).
list(forest.output_features).index("CALR")

In [None]:
# Same inspection of split cluster 2 as in an earlier cell.
forest.split_clusters[2].changed_absolute_sister()

In [None]:
# Cluster the forest's features; the result is used below to order heatmap columns.
# NOTE(review): `forest` is not assigned anywhere in the visible notebook.
feature_clusters = forest.cluster_features(sub=.8,k=20,depth=6,pca=100)

In [None]:
# Output matrix with rows ordered by sample cluster label and columns ordered by
# feature cluster.
plt.figure()
plt.imshow(forest.output[np.argsort(forest.sample_labels)].T[np.argsort(feature_clusters)].T,aspect='auto')
plt.colorbar()
plt.show()

In [None]:
# Shape of the forest output matrix.
forest.output.shape

In [None]:
# Shape of the feature-cluster assignment returned by cluster_features.
feature_clusters.shape

## Sample Cross Training

In [None]:
# Cluster predictions of the Control 1 forest on its own input matrix.
f1_self_predictions = forest1.predict_matrix_clusters(forest1.input)

In [None]:
# Cluster predictions of the Control 2 forest on its own input matrix.
f2_self_predictions = forest2.predict_matrix_clusters(forest2.input)

In [None]:
# Cross predictions. The name prefix identifies the forest doing the predicting, not the
# data being labelled:
#   f1_predictions: Control 1 forest applied to Control 2's input.
#   f2_predictions: Control 2 forest applied to Control 1's input.
f1_predictions = forest1.predict_matrix_clusters(forest2.input)
f2_predictions = forest2.predict_matrix_clusters(forest1.input)

In [None]:
# Sanity check: f2_predictions was computed on forest1.input, so the two numbers are
# presumably expected to match.
print(len(f2_predictions))
print(len(forest1.samples))

In [None]:
from sklearn.metrics import adjusted_mutual_info_score

# print(adjusted_mutual_info_score(forest1.sample_labels,f1_self_predictions))
# print(adjusted_mutual_info_score(forest2.sample_labels,f2_self_predictions))

# Agreement between each forest's own labels and the labels the other forest predicts
# for the same input.
print(adjusted_mutual_info_score(forest2.sample_labels,f1_predictions))
print(adjusted_mutual_info_score(forest1.sample_labels,f2_predictions))

In [None]:
# Control 1 counterpart (disabled). Its cells are labelled by f2_predictions, which is
# forest2 applied to forest1.input.
# tsne = forest1.tsne(pca=100)
# plt.figure()
# plt.scatter(*tsne.T,c=f2_predictions,s=1,cmap='rainbow')
# plt.show()
# plt.figure()
# plt.scatter(*tsne.T,c=forest1.sample_labels,s=1,cmap='rainbow')
# plt.show()

# t-SNE layout from the Control 2 forest.
tsne = forest2.tsne(pca=100)
plt.figure()
# Colour by the labels predicted for these cells. f1_predictions is forest1 applied to
# forest2.input; f2_predictions, used previously, labels forest1's input instead.
plt.scatter(*tsne.T,c=f1_predictions,s=1,cmap='rainbow')
plt.show()
plt.figure()
plt.scatter(*tsne.T,c=forest2.sample_labels,s=1,cmap='rainbow')
plt.show()

In [None]:
# Predict the leaf encoding of each input row individually and print the shape of every
# prediction.
# NOTE(review): `encoding` is only used by the commented-out comparison below, and this
# loop prints one line per row of forest1.input.
encoding = forest1.node_sample_encoding(forest1.leaves())
for i,sample in enumerate(forest1.input):
    # Wrap the single row in an outer array before predicting.
    predicted = forest1.predict_node_sample_encoding(np.array([sample,]))
    print(predicted.shape)
#     print(np.corrcoef(encoding[i],predicted[0]))[0,1]
    

In [None]:
# NOTE(review): f1_predictions is computed on forest2.input, while forest1.sample_labels
# presumably has one entry per forest1 sample, so the two vectors may differ in length.
# Correlation of arbitrary cluster ids is also not a meaningful agreement measure; the
# adjusted mutual information used elsewhere is the consistent choice.
np.corrcoef(forest1.sample_labels,f1_predictions)

In [None]:
# Run the forest's built-in self-prediction check (behaviour defined in tree_reader).
forest1.test_self_predictions()

In [None]:
# NOTE(review): prints every label; consider showing a slice or per-label counts instead.
list(f1_predictions)

In [None]:
# NOTE(review): prints every label; consider showing a slice or per-label counts instead.
list(forest1.sample_labels)

## Factor Cross-Training

In [None]:
# Factor matrix of each forest, followed by a shape report for both.
f1_factors, f2_factors = forest1.factor_matrix(), forest2.factor_matrix()

for factor_table in (f1_factors, f2_factors):
    print(factor_table.shape)

In [None]:
from scipy.cluster.hierarchy import dendrogram,linkage

# Hierarchical ordering of the columns of each factor matrix (the matrix is transposed
# before clustering), Control 1 first.
f1_f_sort, f2_f_sort = (
    dendrogram(linkage(factor_table.T,metric='cos',method='average'),no_plot=True)['leaves']
    for factor_table in (f1_factors, f2_factors)
)

# plt.figure()
# plt.imshow(np.corrcoef(f1_factors[f1_f_sort].T[f1_f_sort]))
# plt.show()

# plt.figure()
# plt.imshow(np.corrcoef(f2_factors[f2_f_sort].T[f2_f_sort]))
# plt.show()



In [None]:
# Compare the shape of Control 1's own factors with the projected factors.
# NOTE(review): f1_factor_prediction is assigned in the NEXT cell, so this cell fails on a
# top-to-bottom run.
print(f1_factors.shape)
print(f1_factor_prediction.shape)

In [None]:
# Factors of the Control 2 forest evaluated on Control 1's input matrix.
f1_factor_prediction = forest2.predict_factor_matrix(forest1.input)
f1_factor_prediction.shape

# f2_factor_self_prediction = forest2.predict_factor_matrix(forest2.input)
# f2_factor_self_prediction.shape

In [None]:
# DivergingNorm was renamed TwoSlopeNorm in newer Matplotlib releases; keep the old name
# available so the import does not fail there.
try:
    from matplotlib.colors import DivergingNorm
except ImportError:
    from matplotlib.colors import TwoSlopeNorm as DivergingNorm

from scipy.spatial.distance import cdist,squareform

# cdist's 'correlation' metric is a distance, 1 - r. Convert it back to the correlation r
# (the previous `distance - 1` produced -r, flipping every sign in the plot).
correlations = 1 - cdist(f1_factors.T,f1_factor_prediction.T,metric='correlation')

plt.figure()
plt.title("Correlations of Factor Predictions For Control 1")
# Rows: Control 1's own factors (f1_f_sort order); columns: Control 2 factors projected
# onto Control 1 (f2_f_sort order).
plt.imshow(correlations[f1_f_sort].T[f2_f_sort].T,cmap='bwr',interpolation='none')
plt.xlabel("Control 2 Factors")
plt.ylabel("Own Factors")
plt.colorbar(label="Correlation")
plt.show()


In [None]:
# DivergingNorm was renamed TwoSlopeNorm in newer Matplotlib releases; keep the old name
# available so the import does not fail there.
try:
    from matplotlib.colors import DivergingNorm
except ImportError:
    from matplotlib.colors import TwoSlopeNorm as DivergingNorm

# Order cells by their Control 1 cluster label (this rebinds the f1_sort name used by an
# earlier cell).
f1_sort = np.argsort(forest1.sample_labels)

plt.figure()
plt.title("Factors Learned From Control 1")
# The norm centres the colour map on zero.
plt.imshow(f1_factors[f1_sort].T[f1_f_sort].T,aspect='auto',cmap='bwr',norm=DivergingNorm(0),interpolation='none')
plt.colorbar(label="Factor Value (-1 to 1, arbitrary)")
plt.xlabel("Factors")
plt.ylabel("Cells")
plt.show()

plt.figure()
plt.title("Factors Learned From Control 2, Projected onto Control 1")
plt.imshow(f1_factor_prediction[f1_sort].T[f2_f_sort].T,aspect='auto',cmap='bwr',norm=DivergingNorm(0),interpolation='none')
plt.colorbar(label="Factor Value (-1 to 1, arbitrary)")
plt.xlabel("Factors")
plt.ylabel("Cells")
plt.show()


In [None]:
# DivergingNorm was renamed TwoSlopeNorm in newer Matplotlib releases; keep the old name
# available so the import does not fail there.
try:
    from matplotlib.colors import DivergingNorm
except ImportError:
    from matplotlib.colors import TwoSlopeNorm as DivergingNorm

# One figure per projected factor: Control 1 coordinates coloured by the factor's value,
# with the colour map centred on zero.
for factor in f1_factor_prediction.T:
    plt.figure()
    plt.scatter(*forest1.coordinates(no_plot=True).T,c=factor,cmap='bwr',norm=DivergingNorm(0))
    plt.colorbar()
    plt.show()

## FACTOR EXTRACTION

In [None]:
from scipy.cluster.hierarchy import dendrogram,linkage

# feature_sort and sample_sort were only defined in commented-out lines, so this cell
# depended on leftover kernel state; they are restored here. Columns are ordered by
# correlation distance between features, rows by cosine distance between samples.
feature_sort = dendrogram(linkage(forest2.output.T,metric='correlation',method='average'),no_plot=True)['leaves']
sample_sort = dendrogram(linkage(forest2.output,metric='cos',method='average'),no_plot=True)['leaves']

plt.figure()
plt.imshow(forest2.output[sample_sort].T[feature_sort].T,aspect='auto',interpolation='none',vmin=-1,vmax=3)
plt.colorbar()
plt.show()

In [None]:
# Feature-by-feature correlation of the Control 2 output, with both axes reordered by
# feature_sort.
unsorted_feature_correlations = np.corrcoef(forest2.output.T)
correlations = unsorted_feature_correlations[feature_sort].T[feature_sort]

plt.figure()
plt.title("Correlations of Features In Johnston Controls")
plt.imshow(correlations,cmap='seismic',vmin=-1,vmax=1,interpolation='none')
plt.colorbar()
plt.show()


In [None]:
# Colour settings shared by both panels of every figure.
seismic_panel_options = dict(cmap='seismic',aspect='auto',vmin=-1,vmax=1)

# For split clusters 15 and 16: the global feature correlations next to a one-column
# strip of each feature's correlation with the cluster's sister scores.
for cluster in forest2.split_clusters[15:17]:
    factor = cluster.sister_scores()
    # The factor is stacked as the last row, so the last row minus its final (self)
    # entry holds the factor-vs-feature correlations.
    factor_correlations = np.corrcoef(forest2.output.T,factor)[-1,:-1]
    factor_strip = np.array([factor_correlations[feature_sort],]).T

    plt.figure()
    plt.axes([0,0,.7,1])
    plt.title("Global Correlations")
    plt.imshow(correlations,**seismic_panel_options)
    plt.axes([.9,0,.1,1])
    plt.title(f"Cluster {cluster.id}")
    plt.ylabel("Correlations of Cluster Sister Scores to Features")
    plt.imshow(factor_strip,**seismic_panel_options)
    plt.show()

In [None]:
from sklearn.decomposition import PCA

# First 10 principal components of the Control 2 output, and the sister scores of
# split cluster 16.
pcs = PCA(n_components=10).fit_transform(forest2.output)
factor_16 = forest2.split_clusters[16].sister_scores()

# The factor is stacked after the 10 PC rows, so it is row/column index 10 of the result.
pc_16_correlations = np.corrcoef(pcs.T,factor_16)

# Row 10: correlation of the factor with each PC (the final entry is its
# self-correlation).
print(pc_16_correlations[10])

# plt.figure()
# plt.scatter(pcs[:,0],factor_16,s=2)
# plt.xlabel("PC1")
# plt.ylabel("Factor 16")
# plt.plot()
# plt.show()

# from sklearn.linear_model import LinearRegression
# regr = LinearRegression()
# regr.fit(pcs[:,0].reshape(-1, 1), factor_16)

# pred = regr.predict(pcs[:,0].reshape(-1, 1))

# print('Coefficients: \n', regr.coef_)
# # print('Coefficient of determination: %.2f'
# #       % r2_score(diabetes_y_test, diabetes_y_pred))

# plt.figure()
# plt.title("PC1 vs Factor 16")
# plt.scatter(pcs[:,0],factor_16,s=2)
# plt.plot(pcs[:,0],pred,color='r',label="Linear Fit")
# plt.legend()
# plt.xlabel("PC1")
# plt.ylabel("Factor 16")
# plt.plot()
# plt.show()

In [None]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

# Number of principal components compared against the factor.
n_pcs = 10

pcs = PCA(n_components=n_pcs).fit_transform(forest2.output)
factor_20 = forest2.split_clusters[20].sister_scores()

# The factor is stacked after the PC rows, so it is row/column index n_pcs of the result.
pc_20_correlations = np.corrcoef(pcs.T,factor_20)

# print(pc_20_correlations[8:])

plt.figure()
plt.title("Corrrelations of Factor 20 to PCs")
plt.bar(np.arange(n_pcs),pc_20_correlations[n_pcs,:n_pcs])
plt.xlabel("PCs")
plt.ylabel("Correlation to Factor 20")
# One tick per bar, labelled PC 1..n_pcs (previously 11 tick positions were paired with
# 9 labels, which mislabels the bars and is rejected by recent Matplotlib).
plt.gca().set_xticks(np.arange(n_pcs))
plt.gca().set_xticklabels(np.arange(1,n_pcs+1))
plt.show()

# Linear fit of factor 20 on PC4 (column index 3).
regr = LinearRegression()
regr.fit(pcs[:,3].reshape(-1, 1), factor_20)

pred = regr.predict(pcs[:,3].reshape(-1, 1))

print('Coefficients: \n', regr.coef_)
# print('Coefficient of determination: %.2f'
#       % r2_score(diabetes_y_test, diabetes_y_pred))

plt.figure()
plt.title("PC4 vs Factor 20")
# Plot the factor that was actually fitted (factor_20); the scatter previously showed
# factor_16 left over from the preceding cell.
plt.scatter(pcs[:,3],factor_20,s=2)
plt.plot(pcs[:,3],pred,color='r',label="Linear Fit")
plt.legend()
plt.xlabel("PC4")
plt.ylabel("Factor 20")
plt.show()

In [None]:
# sister_correlation_matrix = np.zeros((len(forest2.split_clusters),len(forest2.output_features)))

# for i,cluster in enumerate(forest2.split_clusters):
#     factor = cluster.sister_scores()
#     factor_correlations = np.corrcoef(forest2.output.T,factor)[-1,:-1]
#     sister_correlation_matrix[i] = factor_correlations
    
# Two-panel figure: the `correlations` matrix on the left, and on the right the
# per-factor feature correlations with features taken in `feature_sort` order.
panel_style = dict(cmap='seismic', aspect='auto', vmin=-1, vmax=1)
factor_panel = sister_correlation_matrix.T[feature_sort]

plt.figure(figsize=(6, 4))

plt.axes([0, 0, .7, 1])
plt.title("Johnston Control 2, Feature Correlations")
plt.imshow(correlations, **panel_style)

plt.axes([.8, 0, .2, 1])
plt.title("Extracted Factors")
plt.imshow(factor_panel, interpolation='none', **panel_style)

plt.show()

In [None]:
# One row per row of forest2.output, one column per split cluster; each column
# is filled with that cluster's sister scores.
n_rows = forest2.output.shape[0]
n_clusters = len(forest2.split_clusters)
factor_matrix = np.zeros((n_rows, n_clusters))

for i, cluster in enumerate(forest2.split_clusters):
    scores = cluster.sister_scores()
    factor_matrix[:, i] = scores

In [None]:
# Factor matrix with its rows reordered by `sample_sort`.
factors_by_sample_sort = factor_matrix[sample_sort]

plt.figure()
plt.imshow(
    factors_by_sample_sort,
    aspect='auto',
    interpolation='none',
    cmap='seismic',
    vmin=-1,
    vmax=1,
)
plt.show()

In [None]:
# Factor matrix with its rows ordered by ascending forest2.sample_labels.
label_order = np.argsort(forest2.sample_labels)
factors_by_label = factor_matrix[label_order]

plt.figure()
plt.imshow(
    factors_by_label,
    aspect='auto',
    interpolation='none',
    cmap='seismic',
    vmin=-1,
    vmax=1,
)
plt.show()

In [None]:
# Factor matrix with its rows ordered by ascending f2_self_predictions.
prediction_order = np.argsort(f2_self_predictions)
factors_by_prediction = factor_matrix[prediction_order]

plt.figure()
plt.imshow(
    factors_by_prediction,
    aspect='auto',
    interpolation='none',
    cmap='seismic',
    vmin=-1,
    vmax=1,
)
plt.show()

In [None]:
# Cluster the factors (columns of factor_matrix) with average linkage under the
# 'cos' metric, then keep the dendrogram's leaf order as the factor ordering.
# NOTE(review): linkage/dendrogram are presumably scipy.cluster.hierarchy - confirm.
factor_linkage = linkage(factor_matrix.T, method='average', metric='cos')
factor_sort = dendrogram(factor_linkage, no_plot=True)['leaves']

In [None]:
# Heatmap of factor values: rows ordered by ascending f2_self_predictions,
# columns (factors) in `factor_sort` order.
# The tick count is taken from factor_matrix so the labels stay aligned for any
# number of factors, not just 36.
n_factors = factor_matrix.shape[1]
cell_order = np.argsort(f2_self_predictions)

plt.figure()
plt.title("Johnston Control 2, Factor Values")
plt.imshow(factor_matrix[cell_order][:, factor_sort],cmap='seismic',vmin=-1,vmax=1,interpolation='none',aspect='auto')
plt.gca().set_xticks(np.arange(n_factors))
# Label each column with the original index of the factor shown there.
plt.gca().set_xticklabels(np.arange(n_factors)[factor_sort],rotation=90)
plt.xlabel("Factors")
plt.ylabel("Cells")
plt.show()

In [None]:
# Print the column index of each gene of interest within forest2.output_features.
# The list is materialised once; .index raises ValueError if a gene is absent.
feature_names = list(forest2.output_features)
for gene in ("ARR3", "DDIT3", "PIK3R1", "CKB"):
    print(f"{gene}:{feature_names.index(gene)}")


In [None]:
# Compare pairwise correlations of four features across all rows of
# forest2.output with the same correlations restricted to rows whose
# cluster-16 sister score is positive.
mask = forest2.split_clusters[16].sister_scores() > 0
# NOTE(review): hard-coded column positions, presumably those of `genes` in
# forest2.output_features - verify they still match before trusting the labels.
indices = [1711,2546,1112,2882]
genes = ["ARR3","DDIT3","PIK3R1","CKB"]

selection = forest2.output[:,indices]
subselection = forest2.output[mask][:,indices]

# Each correlation matrix is computed once and reused for printing and plotting.
global_correlations = np.corrcoef(selection.T)
local_correlations = np.corrcoef(subselection.T)

print(global_correlations)
print(local_correlations)

for title, gene_correlations in (
    ("Feature Correlations Globally", global_correlations),
    ("Feature Correlations Locally", local_correlations),
):
    plt.figure(figsize=(3,3))
    plt.title(title)
    plt.imshow(gene_correlations,cmap='bwr',vmin=-.5,vmax=.5)
    # Annotate every matrix entry once with its rounded value; ndenumerate
    # yields (row, col), while text() takes (x, y) = (col, row).
    for (j,i),label in np.ndenumerate(np.round(gene_correlations,3)):
        plt.gca().text(i,j,label,ha='center',va='center')
    plt.gca().set_yticks(np.arange(len(genes)))
    plt.gca().set_yticklabels(genes)
    plt.gca().set_xticks(np.arange(len(genes)))
    plt.gca().set_xticklabels(genes)
    plt.show()


In [None]:
# fuzz = np.random.random(size=(forest2.output.shape[0]))

# plt.figure()
# plt.scatter(forest2.output[:,1711]+fuzz,forest2.output[:,1112]+fuzz,s=2,alpha=.5)
# plt.show()

# plt.figure()
# plt.scatter((forest2.output[:,1711]+fuzz)[mask],(forest2.output[:,1112] + fuzz)[mask],s=2,alpha=.5)
# plt.show()

In [None]:
# Compare pairwise correlations of four named features across all rows of
# forest2.output with the same correlations restricted to rows whose
# cluster-16 sister score is positive.
mask = forest2.split_clusters[16].sister_scores() > 0
genes = ["ARR3","MYL4","GNGT1","NRL"]
# Build the feature list once instead of once per gene.
feature_names = list(forest2.output_features)
indices = [feature_names.index(gene) for gene in genes]

selection = forest2.output[:,indices]
subselection = forest2.output[mask][:,indices]

# Each correlation matrix is computed once and reused for printing and plotting.
global_correlations = np.corrcoef(selection.T)
local_correlations = np.corrcoef(subselection.T)

print(global_correlations)
print(local_correlations)

for title, gene_correlations in (
    ("Feature Correlations Globally", global_correlations),
    ("Feature Correlations Locally", local_correlations),
):
    plt.figure(figsize=(3,3))
    plt.title(title)
    plt.imshow(gene_correlations,cmap='bwr',vmin=-1,vmax=1)
    # Annotate every matrix entry once with its rounded value; ndenumerate
    # yields (row, col), while text() takes (x, y) = (col, row).
    for (j,i),label in np.ndenumerate(np.round(gene_correlations,3)):
        plt.gca().text(i,j,label,ha='center',va='center')
    plt.gca().set_yticks(np.arange(len(genes)))
    plt.gca().set_yticklabels(genes)
    plt.gca().set_xticks(np.arange(len(genes)))
    plt.gca().set_xticklabels(genes)
    plt.show()

In [None]:
# Column 1711 of the output matrix.
# NOTE(review): presumably ARR3, per the hard-coded index list used earlier - confirm.
feature_1711 = forest2.output[:, 1711]

# Row 10 of the stacked matrix: this feature against each column of `pcs`
# (assumes pcs has ten columns, so index 10 is the appended feature).
pcs_and_feature = np.corrcoef(pcs.T, feature_1711)
print(pcs_and_feature[10])

# 2x2 correlation matrix between factor_20 and the same feature.
factor_and_feature = np.corrcoef(factor_20, feature_1711)
print(factor_and_feature)

In [None]:
# representation = forest2.node_representation(forest2.nodes(),mode='additive_mean')

# in_factor_correlations = np.corrcoef(representation.T)
# in_factor_sort = dendrogram(linkage(in_factor_correlations,metric='cos',method='average'),no_plot=True)['leaves']

# Pairwise Pearson correlations between the columns of forest2.output.
unsorted_correlations = np.corrcoef(forest2.output, rowvar=False)

# Element-wise gap between the in-factor correlations (computed elsewhere in
# the notebook) and the correlations over the whole output matrix.
correlation_difference = np.subtract(in_factor_correlations, unsorted_correlations)


In [None]:
# Draw the in-factor correlation matrix twice: first with both axes in
# `feature_sort` order, then with both axes in `in_factor_sort` order.
for ordering in (feature_sort, in_factor_sort):
    reordered = in_factor_correlations[ordering].T[ordering]
    plt.figure()
    plt.title("Correlations of Feature Gains Per Node")
    plt.imshow(reordered, cmap='seismic', vmin=-1, vmax=1)
    plt.show()

In [None]:

# plt.figure()
# plt.imshow(correlation_difference[feature_sort].T[feature_sort],cmap='seismic',vmin=-1,vmax=1)
# plt.colorbar()
# plt.show()

# Correlation difference with both axes reordered by `in_factor_sort`.
locality_sorted = correlation_difference[in_factor_sort].T[in_factor_sort]

plt.figure()
plt.title("Feature Locality (Node Correlation - Global Correlation)")
plt.imshow(locality_sorted, vmin=-1, vmax=1, cmap='seismic')
plt.show()


In [None]:
# Table returned by forest2.sample_split_class_dependence(). The plotting cell
# that follows divides it by its first row and drops that row before display.
# NOTE(review): presumably co-occurrence counts with totals in row 0 - confirm
# against the forest implementation.
paths = forest2.sample_split_class_dependence()

In [None]:
# Turn the first row of `paths` into a column vector so that row i of `paths`
# is divided by the i-th entry of that first row; the first row itself is then
# dropped from the displayed matrix.
first_row_as_column = np.array([paths[0],]).T
normalised_paths = paths / first_row_as_column
conditional = normalised_paths[1:]

plt.figure()
plt.title("Conditional Probability of Factor X Given Factor Y")
plt.imshow(conditional)
plt.colorbar()
plt.show()