In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc

In [None]:
# This notebook will go through a conventional scanpy analysis of citeseq data, so that we can compare it to an 
# analysis performed by a random forest

In [None]:
# The nesterowa data comes pre-normalized (I don't want to go digging for the unnormalized counts for now)

# Directory holding the input text files.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this directory exists.
data_location = '/Users/bbrener1/taylor/raw_data/nesterowa/'

# Expression matrix read through scanpy, plus a plain-text gene header loaded as strings.
nesterowa = sc.read(data_location+'nesterowa_counts.txt')
header = np.loadtxt(data_location+'nesterowa_gene_header.txt',dtype=str)
# NOTE(review): the two loads below are disabled, yet later cells read `cell_type_matrix` and
# `cell_type_header`; on a fresh kernel those cells presumably fail with NameError until these are restored.
# cell_type_matrix = np.loadtxt('nesterowa_cell_type_membership.txt').astype(dtype=bool)
# cell_type_header = np.loadtxt('nesterowa_cell_type_header.txt',dtype=str)
# Show the dimensions of the loaded matrix as the cell output.
nesterowa.shape

In [None]:
# It also does not need to be transposed or filtered in its current form

In [None]:
# plt.figure()
# plt.hist(np.sum(forest.output,axis=1))
# plt.show()

In [None]:
# Fit a random forest on the loaded expression matrix with the project's `lumberjack` module.
import sys
# sys.path.append('/localscratch/bbrener1/rusty_forest_v3/src')
# Make the project sources importable relative to the notebook's working directory.
sys.path.append('../src')
import tree_reader as tr 
import lumberjack

# NOTE(review): the meaning of each keyword is defined by `lumberjack.fit`, which is not visible
# from this notebook — confirm against its documentation before tuning any of them.
# NOTE(review): `reduce_input` is passed as the string 'true', not the boolean True — confirm this is intended.
forest = lumberjack.fit(
    np.array(nesterowa.X),
    header=header,
    trees=100,
    braids=1,
    ifs=1000,
    ofs=1000,
    ss=500,
    depth=8,
    leaves=100,
    sfr=.5,
    norm='l1',
    reduce_input='true',
)


# NOTE(review): presumably turns on caching of computed results on the forest — confirm in tree_reader.
forest.set_cache(True)
# forest.backup("scanpy_cmp_nesterowa")


In [None]:
# Alternative to fitting: restore a forest from the backup named 'scanpy_cmp_nesterowa'.
# NOTE(review): the `forest.backup("scanpy_cmp_nesterowa")` call in the fitting cell is commented out,
# so this presumably depends on a backup written in an earlier session.
# NOTE(review): this cell overwrites the `forest` produced by the fitting cell and uses a different,
# absolute source path than that cell does.
import sys
sys.path.append('/localscratch/bbrener1/rusty_forest_v3/src')
# sys.path.append('../src')
import tree_reader as tr 
import lumberjack

forest = tr.Forest.reconstitute('scanpy_cmp_nesterowa')
# Display the arguments stored on the restored forest.
forest.arguments

In [None]:
# Clear any existing sample clustering on the forest.
# NOTE(review): every clustering call below is disabled, so after this cell no sample clustering
# has been recomputed; later cells read `forest.sample_labels` — confirm what it holds at that point.
forest.reset_sample_clusters()
# forest.cluster_samples_encoding(sub=.8,k=10,depth=8,metric='jaccard')
# forest.cluster_samples_encoding(sub=.5,k=20,depth=8,metric='cosine',pca=100)

# forest.reset_leaf_clusters()
# forest.cluster_leaves_samples(sub=.5,k=20,depth=6,metric="jaccard")
# forest.cluster_leaves_samples(sub=.8,k=20,metric="cosine",pca=100)
# forest.cluster_leaves_predictions(sub=.8,k=20,metric="cosine",pca=100,mode="mean")
# forest.cluster_samples_leaf_cluster()

In [None]:
# Compute the forest's own t-SNE layout (with pca=100) and plot the sample clusters.
# NOTE(review): presumably `plot_sample_clusters` draws on the coordinates produced by `tsne` — confirm.
forest.tsne(pca=100)
forest.plot_sample_clusters()
# forest.trees[0].plot()

In [None]:
# for cluster in forest.leaf_clusters:
#     cluster.plot_sample_counts()

In [None]:
# Discard any previous split clustering, then re-run split interpretation with the settings below.
# NOTE(review): parameter semantics (relatives, pca, depth, mode, metric, k) are defined by
# `Forest.interpret_splits`, which is not visible here.
forest.reset_split_clusters()
forest.interpret_splits(
    relatives=True,
    pca=100,
    depth=6,
    mode='additive_mean',
    metric='cosine',
    k=100)

In [None]:
# Number of split clusters currently stored on the forest.
len(forest.split_clusters)

In [None]:
# Lay the forest summaries out on scanpy's UMAP embedding instead of the forest's own t-SNE.
# The embedding is only produced by `sc.tl.umap`, which this notebook otherwise runs in a later
# cell; reading `obsm["X_umap"]` on a fresh kernel would raise KeyError, so compute it on demand.
if "X_umap" not in nesterowa.obsm:
    # UMAP needs the neighbor graph; build it first if it is missing as well.
    if "neighbors" not in nesterowa.uns:
        sc.pp.neighbors(nesterowa)
    sc.tl.umap(nesterowa)

forest.tsne_coordinates = nesterowa.obsm["X_umap"]
# forest.tsne(pca=100)
# forest.most_likely_tree(depth=6,mode='sample')
forest.maximum_spanning_tree(depth=6,mode='samples')
forest.html_tree_summary()

In [None]:
# We now establish the neighbor graph because several methods rely on it
# (called with no extra arguments, so scanpy's defaults apply).

sc.pp.neighbors(nesterowa)

In [None]:
# Compute the UMAP embedding; later cells read it from nesterowa.obsm['X_umap'].
sc.tl.umap(nesterowa)

In [None]:
# Plot the UMAP embedding without any coloring.
sc.pl.umap(nesterowa)

In [None]:
# We want to do clustering via Louvain as one of the gold standards.
# All warnings raised during the call are suppressed; the labels are read later from
# nesterowa.obs['louvain'].
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    sc.tl.louvain(nesterowa,resolution=1)

In [None]:
# UMAP embedding colored by Louvain cluster.
sc.pl.umap(nesterowa,color='louvain')

In [None]:
# Compute scanpy's t-SNE embedding; later cells read it from nesterowa.obsm['X_tsne'].
sc.tl.tsne(nesterowa)

In [None]:
# t-SNE embedding colored by Louvain cluster.
sc.pl.tsne(nesterowa,color='louvain')

In [None]:
# Scatter every cell at its UMAP position, colored by the forest's sample label.
umap_x, umap_y = nesterowa.obsm['X_umap'].T

plt.figure(figsize=(10,10))
plt.scatter(umap_x, umap_y, c=forest.sample_labels, s=4, cmap='rainbow')
plt.show()

In [None]:
# Point the forest's plotting coordinates at the scanpy UMAP embedding
# (the same assignment is also made in an earlier cell of this notebook).
forest.tsne_coordinates = nesterowa.obsm['X_umap']

In [None]:
# Re-draw the forest's sample clusters after the coordinates were replaced.
forest.plot_sample_clusters()

In [None]:
# Second split interpretation with different settings (depth=5, sub=.5, k=20, no pca argument);
# the reset discards the split clusters produced by the earlier run.
forest.reset_split_clusters()
forest.interpret_splits(mode='additive_mean',depth=5,sub=.5,k=20,metric='cosine',relatives=True)

In [None]:
# Call the HTML sister-score report on every split cluster.
# NOTE(review): presumably writes or opens one HTML report per cluster — confirm in tree_reader.
for split_cluster in forest.split_clusters:
    split_cluster.html_sister_scores()

In [None]:
# forest.maximum_spanning_tree(depth=5)
# forest.most_likely_tree(depth=5)
# NOTE(review): both tree-construction calls above are disabled, so this summary presumably reuses
# the tree built in an earlier cell, from before the split clusters were reset — confirm that is intended.
forest.html_tree_summary(n=10)

In [None]:
# Number of nodes returned by forest.nodes with root=True and depth=5.
len(forest.nodes(root=True,depth=5))

## Marker Comparison

In [None]:
# We have annotations of cell type based on fluorescence values for these cells, currently one-hot encoded:
# NOTE(review): `cell_type_header` and `cell_type_matrix` are only assigned in commented-out lines of
# the data-loading cell, so this cell needs those loads restored to run on a fresh kernel.
print(cell_type_header)
print(cell_type_matrix.shape)

In [None]:
# Annotation calls per cell, separately for the broad peaks (first 11 columns)
# and the narrow peaks (remaining columns).
broad_calls = np.sum(cell_type_matrix[:,:11].astype(dtype=int),axis=1)
narrow_calls = np.sum(cell_type_matrix[:,11:].astype(dtype=int),axis=1)

# For each group print, in order: how many cells have overlapping calls (more than one),
# then how many cells are no-calls (none). Broad peaks first, narrow peaks second.
for calls_per_cell in (broad_calls, narrow_calls):
    print(np.sum(calls_per_cell > 1))
    print(np.sum(calls_per_cell < 1))



In [None]:
# One small t-SNE scatter per annotated cell type, colored by that type's membership column.
tsne_x, tsne_y = nesterowa.obsm['X_tsne'].T

for column, cell_type in enumerate(cell_type_header):
    plt.figure(figsize=(3,3))
    plt.title(cell_type)
    plt.scatter(tsne_x, tsne_y, c=cell_type_matrix[:,column], s=4)
    plt.show()

In [None]:
# Finding consensus calls may not be deeply informative regardless, so let's try this:
# order the cells by cluster label and show the first 11 cell-type membership columns as an
# image, once for the Louvain ordering and once for the forest ordering.

louvain_sort = np.argsort(nesterowa.obs['louvain'])
forest_sort = np.argsort(forest.sample_labels)

# pyplot has no `set_xticklabels` (that is an Axes method); `plt.xticks` sets both the tick
# positions and their labels on the current axes. Labels are rotated so they do not overlap.
plt.figure()
plt.title("Cell Types Vs Louvain")
plt.imshow(cell_type_matrix[louvain_sort][:,:11],aspect='auto',interpolation='none')
plt.xticks(np.arange(11),cell_type_header[:11],rotation=90)
plt.show()

plt.figure()
plt.title("Cell Types Vs Forest")
plt.imshow(cell_type_matrix[forest_sort][:,:11],aspect='auto',interpolation='none')
plt.xticks(np.arange(11),cell_type_header[:11],rotation=90)
plt.show()

# plt.figure()
# plt.imshow(cell_type_matrix[louvain_sort][:,11:],aspect='auto',interpolation='none')
# plt.show()

# plt.figure()
# plt.imshow(cell_type_matrix[forest_sort][:,:11:],aspect='auto',interpolation='none')
# plt.show()

# Comparisons: Paired membership

# Cluster set 1 vs Cluster set 2, % of pairings conserved 

# Homogeneity, Completeness

In [None]:
# For Nesterowa Data it appears difficult to separate:

    

## KS Analysis

In [None]:
# Now we wish to examine the behavior of distributions within clusters as defined by louvain and RF

In [None]:
# Per-feature (column-wise) mean and variance of `umis`, flattened to 1-D arrays.
# NOTE(review): `umis` is not assigned anywhere in this notebook; this section presumably came from
# another analysis and needs a raw-count matrix to be loaded before it can run.
umi_means = np.array(np.mean(umis,axis=0)).ravel()
umi_vars = np.array(np.var(umis,axis=0)).ravel()

In [None]:
# Number of features for which a mean was computed.
umi_means.shape

In [None]:
# Global per-feature mean/variance diagnostics. In each mean-vs-variance panel the red
# diagonal is the line where variance equals mean.
log_means, log_vars = np.log(umi_means), np.log(umi_vars)

fig, ax = plt.subplots(figsize=(10,10))
ax.set_title("UMI Mean vs Variance per feature, Log/Log")
ax.scatter(log_means, log_vars, s=1)
ax.plot([-8,4],[-8,4],c='red')
plt.show()

# Same scatter on linear axes, first over [0, 10] and then zoomed to [0, 1].
for panel_title, upper in (
    ("UMI Mean vs Variance per feature, Linear", 10),
    ("UMI Mean vs Variance per feature, Linear, low range", 1),
):
    fig, ax = plt.subplots(figsize=(10,10))
    ax.set_title(panel_title)
    ax.scatter(umi_means, umi_vars, s=1)
    ax.plot([0,upper],[0,upper],c='red')
    ax.set_xlim(0,upper)
    ax.set_ylim(0,upper)
    plt.show()

# Mean/variance ratio of every feature, with features ordered by increasing mean.
mean_order = np.argsort(umi_means)
fig, ax = plt.subplots(figsize=(10,10))
ax.set_title("UMI Mean/Variance Ratio per feature, Linear")
ax.scatter(np.arange(umi_means.shape[0]), (umi_means/umi_vars)[mean_order], s=1)
plt.show()


In [None]:
# Log/log mean-vs-variance scatter of the features, one figure per Louvain cluster.
# NOTE(review): neither `johnston_working` nor `umis` is assigned anywhere in this notebook; this
# loop presumably came from another analysis — confirm which object should supply the Louvain
# labels (the only AnnData loaded here is `nesterowa`).
louvain_labels = johnston_working.obs['louvain']

for cluster in set(louvain_labels):
    members = np.array(umis[louvain_labels == cluster])
    member_means = np.mean(members,axis=0)
    member_vars = np.var(members,axis=0)

    fig, ax = plt.subplots(figsize=(10,10))
    ax.scatter(np.log(member_means), np.log(member_vars), s=1)
    ax.plot([-8,4],[-8,4],c='red')
    plt.show()

In [None]:
# Now we wish to examine the same per-feature mean/variance relationship within each forest cluster

In [None]:
# For every forest sample cluster draw two figures: the log/log mean-vs-variance scatter of
# the features, and the mean/variance ratio with features ordered by increasing mean.
for cluster in set(forest.sample_labels):
    members = np.array(umis[forest.sample_labels == cluster])
    member_means = np.mean(members,axis=0)
    member_vars = np.var(members,axis=0)

    fig, ax = plt.subplots(figsize=(10,10))
    ax.set_title(f"Cluster:{cluster}")
    ax.scatter(np.log(member_means), np.log(member_vars), s=1)
    ax.plot([-8,4],[-8,4],c='red')
    plt.show()

    mean_order = np.argsort(member_means)
    fig, ax = plt.subplots(figsize=(10,10))
    ax.set_title(f"UMI Mean/Variance Ratio per feature, Cluster:{cluster}")
    ax.scatter(np.arange(member_means.shape[0]), (member_means/member_vars)[mean_order], s=1)
    plt.show()

In [None]:
# Index of the feature at position 1000 when features are ordered by increasing mean.
# NOTE(review): this reads `umis.X`, while the other cells index and average `umis` directly as an
# array — confirm which kind of object `umis` is meant to be.
np.argsort(np.mean(umis.X,axis=0))[1000]

In [None]:
from scipy.stats import kstest,poisson,nbinom

def poisson_ks_check(x):
    """Run a Kolmogorov-Smirnov test of `x` against a Poisson with matching mean.

    The Poisson rate is set to the sample mean of `x`, and the CDF of that fitted
    distribution is handed to `scipy.stats.kstest`.

    Returns the `kstest` result unchanged; callers in this notebook read element 0
    (the K-S statistic).
    """
    fitted = poisson(np.mean(x))
    return kstest(x, fitted.cdf)

# Correctly fitting a negative binomial is a pain, here is a hacked out version for now:
def nb_pr_estimation(x):
    """Method-of-moments estimate of negative binomial parameters for `x`.

    Uses the sample mean and (population) variance of `x`:

        p = 1 - mean / var
        r = mean * (1 - p) / p

    Relative to `scipy.stats.nbinom(n, p_scipy)`, whose mean is n*(1-p_scipy)/p_scipy and
    whose variance is mean/p_scipy, `r` estimates `n` and `p` estimates `1 - p_scipy`.

    Returns:
        (p, r). When the sample is not overdispersed (variance <= mean, which includes
        constant and empty input) no valid negative binomial matches the moments, and
        (nan, nan) is returned instead of a non-positive p and a negative or infinite r.
    """
    mean = np.mean(x)
    var = np.var(x)

    # `not var > mean` is also true when either moment is nan, e.g. for empty input.
    if not var > mean:
        return np.nan, np.nan

    counter = mean/var

    p = 1 - counter
    r = mean * ((1-p)/p)

    return p,r

In [None]:
# Sanity check of the estimator on 1000 draws from nbinom(4, .5): show their histogram and
# the estimated (p, r) as the cell output.
# NOTE(review): no random seed is set, so the draws and the estimate differ between runs.
nb_draws = nbinom.rvs(4,.5,size=1000)

plt.figure()
plt.hist(nb_draws)
plt.show()

nb_pr_estimation(nb_draws)

In [None]:
# K-S statistic against a mean-matched Poisson for every column of forest.output.
# NOTE(review): the print below dumps the whole list, one value per column.
ks_values = [poisson_ks_check(x)[0] for x in forest.output.T]
print(ks_values)

In [None]:
# Scatter of the K-S statistic against the per-feature mean, x axis limited to [0, 40].
# NOTE(review): the means come from `umis` while `ks_values` was computed from `forest.output`,
# and only the first 2000 K-S values are used — confirm both arrays describe the same features
# in the same order and that `umi_means` has 2000 entries, otherwise the scatter fails or misaligns.
plt.figure()
plt.title("Kolomogorov-Smirnov Test Statistic Vs ML Estimated Poisson Distribution vs Mean, Global")
plt.scatter(umi_means,ks_values[:2000],s=1)
plt.xlabel("Mean")
plt.ylabel("K-S Statistic")
plt.xlim(0,40)
plt.show()

In [None]:
# Per forest cluster: K-S statistic of every feature against a mean-matched Poisson,
# plotted against that feature's mean within the cluster.
for cluster in set(forest.sample_labels):
    members = np.array(umis[forest.sample_labels == cluster])

    ks_values = []
    for feature_values in members.T:
        ks_values.append(poisson_ks_check(feature_values)[0])
    means = np.mean(members,axis=0)

    fig, ax = plt.subplots()
    ax.set_title(f"Kolomogorov-Smirnov Test Statistic Vs ML Estimated Poisson Distribution vs Mean, Cluster {cluster}")
    # Only the first 2000 statistics are plotted, as in the global version of this figure.
    ax.scatter(means, ks_values[:2000], s=1)
    ax.set_xlabel("Mean")
    ax.set_ylabel("K-S Statistic")
    ax.set_xlim(0,30)
    plt.show()

In [None]:
# Histogram (log-scaled counts) of per-cell row sums within each forest cluster,
# using a common binning of width 200 from 0 up to 50000.
size_factor_bins = np.arange(0,50000,200)

for cluster in set(forest.sample_labels):
    members = np.array(umis[forest.sample_labels == cluster])
    size_factors = np.sum(members,axis=1)

    fig, ax = plt.subplots()
    ax.set_title(f"Size factor distributions, Cluster {cluster}")
    ax.hist(np.array(size_factors), log=True, bins=size_factor_bins)
    plt.show()

In [None]:
# Row sums of `umis` over all cells (replaces the per-cluster `size_factors` left by the loop above).
size_factors = np.array(np.sum(np.array(umis),axis=1))

In [None]:
# Histogram of the global size factors with bins of width 200 from 0 up to 50000.
plt.hist(size_factors,bins=np.arange(0,50000,200))

In [None]:
from sklearn.decomposition import PCA

# `fit_transform` is an instance method that needs the data matrix; calling it on the class
# with no arguments raises TypeError. Instantiate the estimator and pass the samples.
# NOTE(review): the original call named no input; the expression matrix that was also fed to
# the forest is used here — confirm this is the matrix the projection was meant for.
pca = PCA().fit_transform(np.array(nesterowa.X))

## Clustering/Partition Analysis

In [None]:
from sklearn.metrics import calinski_harabasz_score,silhouette_score,silhouette_samples,mutual_info_score,adjusted_mutual_info_score

In [None]:
# Calinski-Harabasz score of each partition (Louvain first, forest second), evaluated first in
# the cell-type annotation space and then in the expression space.
print(calinski_harabasz_score(cell_type_matrix,nesterowa.obs['louvain']))
print(calinski_harabasz_score(cell_type_matrix,forest.sample_labels))

print(calinski_harabasz_score(nesterowa.X,nesterowa.obs['louvain']))
print(calinski_harabasz_score(nesterowa.X,forest.sample_labels))

In [None]:
# Silhouette score of each partition (Louvain first, forest second). The annotation-space
# scores use the function's default metric; the expression-space scores use cosine distance.
print(silhouette_score(cell_type_matrix,nesterowa.obs['louvain']))
print(silhouette_score(cell_type_matrix,forest.sample_labels))

print(silhouette_score(nesterowa.X,nesterowa.obs['louvain'],metric='cosine'))
print(silhouette_score(nesterowa.X,forest.sample_labels,metric='cosine'))

In [None]:
# Agreement between the Louvain and forest partitions: raw and adjusted mutual information.
print(mutual_info_score(nesterowa.obs['louvain'],forest.sample_labels))
print(adjusted_mutual_info_score(nesterowa.obs['louvain'],forest.sample_labels))


## Factor Discovery Analysis

In [None]:
# Display the split clusters currently stored on the forest.
forest.split_clusters

In [None]:
from scipy.cluster.hierarchy import dendrogram,linkage

# Average-linkage hierarchical clustering of the features (correlation distance) and of the
# samples (cosine distance); the dendrogram leaf order of each is kept as a display ordering.
feature_linkage = linkage(forest.output.T, metric='correlation', method='average')
feature_sort = dendrogram(feature_linkage, no_plot=True)['leaves']

sample_linkage = linkage(forest.output, metric='cos', method='average')
sample_sort = dendrogram(sample_linkage, no_plot=True)['leaves']

# Show forest.output with rows in sample order and columns in feature order.
ordered_output = forest.output[sample_sort].T[feature_sort].T
plt.figure()
plt.imshow(ordered_output, aspect='auto')
plt.show()

In [None]:
# forest.output with rows ordered by forest sample label and columns in the dendrogram
# feature order (`feature_sort`).
plt.figure()
plt.title("Agglomerated Dataset")
plt.imshow(forest.output[np.argsort(forest.sample_labels)].T[feature_sort].T,aspect='auto')
plt.show()

In [None]:
# For each split cluster, order the rows of forest.output by that cluster's sister scores and
# show the reordered matrix (left) beside the sorted scores as a one-column strip (right).
# NOTE(review): presumably `sister_scores()` returns one score per sample — it is used to index
# rows of forest.output; confirm in tree_reader.
for cluster in forest.split_clusters:
    factor = cluster.sister_scores()
    factor_sort = np.argsort(factor)
    # NOTE(review): prints the full score and order arrays for every cluster.
    print(factor)
    print(factor_sort)
    plt.figure()
    # Main panel occupies the left 80% of the figure.
    plt.axes([0,0,.8,1])
    plt.title("Agglomerated Dataset")
    plt.imshow(forest.output[factor_sort].T[feature_sort].T,aspect='auto')
    # Narrow panel on the right holds the scores, in the same row order as the main panel.
    plt.axes([.9,0,.1,1])
    plt.imshow(np.array([factor,]).T[factor_sort],cmap='bwr',aspect='auto')
    plt.show()

In [None]:
# Same two-panel figure per split cluster, but with rows kept in the dendrogram sample order
# (`sample_sort`) so every figure shares one row ordering; the strip shows the scores in that order.
for cluster in forest.split_clusters:
    factor = cluster.sister_scores()
    plt.figure()
    plt.axes([0,0,.8,1])
    plt.title("Agglomerated Dataset")
    plt.imshow(forest.output[sample_sort].T[feature_sort].T,aspect='auto')
    plt.axes([.9,0,.1,1])
    plt.imshow(np.array([factor,]).T[sample_sort],cmap='bwr',aspect='auto')
    plt.show()

In [None]:
# Feature-by-feature correlation matrix of forest.output (features are its columns).
correlations = np.corrcoef(forest.output.T)
# Reorder both axes by the dendrogram feature order.
# NOTE(review): this overwrites `correlations` with its reordered version, so re-running the
# second line on its own would apply the ordering twice.
correlations = correlations[feature_sort].T[feature_sort]

plt.figure()
plt.title("Correlations of Features In Nestorowa")
plt.imshow(correlations,cmap='seismic',vmin=-1,vmax=1)
plt.colorbar()
plt.show()


In [None]:
# For each split cluster, show the ordered feature correlation matrix (left) beside the
# correlation of that cluster's sister scores with every feature (right strip).
for cluster in forest.split_clusters:
    factor = cluster.sister_scores()
    # corrcoef stacks `factor` as an extra last row; the last row without its final entry is
    # the correlation of the factor with each feature.
    factor_correlations = np.corrcoef(forest.output.T,factor)[-1,:-1]
    plt.figure()
    plt.axes([0,0,.8,1])
    plt.title("Agglomerated Dataset")
    plt.imshow(correlations,cmap='seismic',aspect='auto',vmin=-1,vmax=1)
    plt.axes([.9,0,.1,1])
    # The strip is put into the same feature order as the matrix on the left.
    plt.imshow(np.array([factor_correlations[feature_sort],]).T,cmap='seismic',aspect='auto',vmin=-1,vmax=1)
    plt.show()

In [None]:
# Collect the factor-vs-feature correlations of every split cluster into one matrix:
# one row per split cluster, one column per output feature.
sister_correlation_matrix = np.zeros((len(forest.split_clusters),len(forest.output_features)))

for i,cluster in enumerate(forest.split_clusters):
    factor = cluster.sister_scores()
    # Last row of the stacked correlation matrix, minus the factor's self-correlation.
    factor_correlations = np.corrcoef(forest.output.T,factor)[-1,:-1]
    sister_correlation_matrix[i] = factor_correlations
    
# Left: ordered feature correlation matrix. Right: all factors side by side, with features
# (rows of the transposed matrix) in the same dendrogram order.
plt.figure(figsize=(14.2,10))
plt.axes([0,0,.7,1])
plt.title("Agglomerated Dataset")
plt.imshow(correlations,cmap='seismic',aspect='auto',vmin=-1,vmax=1)
plt.axes([.8,0,.2,1])
plt.imshow(sister_correlation_matrix.T[feature_sort],interpolation='none',cmap='seismic',aspect='auto',vmin=-1,vmax=1)
plt.show()

In [None]:
# Number of entries in the gene header loaded at the top of the notebook.
len(header)