In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc

In [None]:
# This notebook runs a conventional scanpy analysis of several single-cell datasets (CITE-seq among them),
# so that we can compare it with an analysis performed by a random forest.

In [None]:
# Load every dataset and keep two artefacts per dataset:
#   * scanpy_objects: AnnData restricted to the selected genes, then normalised, log1p'd and scaled
#   * umi_objects:    the count matrix for the same genes, copied *before* normalisation

# Paths of the count matrices, one entry per dataset.
datasets = [
    '/localscratch/bbrener1/johnston_sc/Retina2/outs/filtered_gene_bc_matrices/dmel_r6.20/matrix.mtx',
    '/home/bbrener1/transfer/all_raw/raw_data/nelmari/ctrl/matrix.mtx',
    '/home/bbrener1/transfer/all_raw/raw_data/fan_tendon/TH1/matrix.mtx',
    # This dataset had spiked-in mouse cells, but this is a filtered matrix.
    '/home/bbrener1/transfer/all_raw/raw_data/citeseq/GSE100866_CBMC_human_umis.tsv',
    # These are the unfiltered UMIs of this dataset. The actual paper analysis was done on
    # roughly 1600 cells, so this needs to be checked out.
    '/home/bbrener1/transfer/all_raw/raw_data/vision_sc/raw_counts.txt',
]
# Parallel to `datasets`: whether the matrix is transposed right after reading.
transpose = [True, True, True, True, False]

scanpy_objects = []
umi_objects = []

for dataset, d_transpose in zip(datasets, transpose):

    print(dataset)

    large_scanpy_object = sc.read(dataset)
    if d_transpose:
        large_scanpy_object = large_scanpy_object.T

    # Downsample to 1200 counts per cell, then store X as floating point.
    sc.pp.downsample_counts(large_scanpy_object, counts_per_cell=1200)
    large_scanpy_object.X = large_scanpy_object.X.astype(dtype=float)

    # Cell filters: first on detected genes, then on total counts.
    sc.pp.filter_cells(large_scanpy_object, min_genes=100)
    sc.pp.filter_cells(large_scanpy_object, min_counts=100)

    print("Read in")

    # The filtration below is roughly analogous to the standard zheng recipe,
    # but it retains the UMI object.
    sc.pp.filter_genes(large_scanpy_object, min_counts=10)

    # Gene selection runs on a normalised copy, so `large_scanpy_object` is not normalised here.
    scpy_copy = large_scanpy_object.copy()
    sc.pp.normalize_per_cell(scpy_copy)
    filter_result = sc.pp.filter_genes_dispersion(
        scpy_copy.X,
        flavor='cell_ranger',
        n_top_genes=2000,
        log=False,
    )
    scpy_filtered = large_scanpy_object[:, filter_result.gene_subset].copy()

    del large_scanpy_object

    # Take the count matrix from a copy before the object below is transformed in place.
    umis = scpy_filtered.copy().X

    sc.pp.normalize_per_cell(scpy_filtered)   # renormalize after filtering
    sc.pp.log1p(scpy_filtered)                # adata.X = log(adata.X + 1)
    sc.pp.scale(scpy_filtered)

    scanpy_objects.append(scpy_filtered)
    umi_objects.append(umis)

In [None]:
# Replace every entry of umi_objects that is not a plain numpy array with a dense ndarray.
for idx, umi in enumerate(umi_objects):
    print(type(umi))
    needs_densifying = type(umi) is not np.ndarray
    if needs_densifying:
        umi_objects[idx] = np.array(umi.todense())
    # `umi` still refers to the pre-conversion object here.
    print(umi.shape)

In [None]:
# Per-dataset diagnostics: type, shape, number of features (axis 0 reduced) whose mean or
# variance is exactly zero, and the log of the per-feature mean and variance.
for i,umi in enumerate(umi_objects):
    feature_means = np.mean(umi,axis=0)
    feature_vars = np.var(umi,axis=0)

    print(type(umi))
    print(umi.shape)
    # The denominator is taken from this matrix, not from the `umis` variable left over
    # from the loading loop, which always refers to the last dataset.
    print(f"zero mean:{np.sum(feature_means == 0)}/{feature_means.shape}")
    print(f"zero var:{np.sum(feature_vars == 0)}")
    # Features with zero mean or variance show up as -inf here.
    print(np.log(feature_means))
    print(np.log(feature_vars))


In [None]:
import sys
sys.path.append('/localscratch/bbrener1/rusty_forest_v3/src')
# sys.path.append('../src')
import tree_reader as tr 
import lumberjack

# Fit one forest per UMI matrix with a shared set of lumberjack.fit keyword arguments,
# switch on its cache, and collect the fitted forests in dataset order.
forest_fit_kwargs = dict(
    trees=100,
    braids=3,
    ifs=500,
    ofs=500,
    ss=200,
    depth=8,
    leaves=50,
    sfr=0,
)

forest_objects = []

for umi in umi_objects:
    forest = lumberjack.fit(umi, **forest_fit_kwargs)
    forest.set_cache(True)
    forest_objects.append(forest)

In [None]:
import dill
# Serialise the current interpreter session to "scanpy_poisson_session.db" (relative to the
# working directory) so the state built above can be restored without re-running the cells.
dill.dump_session("scanpy_poisson_session.db")


In [None]:
# For each fitted forest: cluster its samples, draw the clustering, and compare the per-feature
# mean/variance relationship over all samples with the same relationship inside each cluster.
for dataset, forest in zip(datasets, forest_objects):

    # Use a larger k when the forest holds more than 3000 samples.
    k = 20 if len(forest.samples) > 3000 else 10

    forest.cluster_samples_encoding(sub=.8, k=k, depth=8, metric='cosine', pca=100)
    forest.tsne(pca=100)
    forest.plot_sample_clusters()

    # Per-feature mean and variance over every sample, flattened to 1-D arrays.
    umi_means = np.array(np.mean(forest.output, axis=0)).ravel()
    umi_vars = np.array(np.var(forest.output, axis=0)).ravel()

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_title("UMI Mean vs Variance per feature, Log/Log, Global")
    ax.scatter(np.log(umi_means), np.log(umi_vars), s=1)
    ax.plot([-8, 4], [-8, 4], c='red')   # reference line y = x
    plt.show()

    # One (per-feature means, per-feature variances) pair for each cluster label.
    forest_clusters = []
    for cluster in set(forest.sample_labels):
        mask = forest.sample_labels == cluster
        filtered_cells = forest.output[mask]
        forest_clusters.append((np.mean(filtered_cells, axis=0), np.var(filtered_cells, axis=0)))

    # Pool the per-cluster values of all features into flat lists for one scatter call.
    forest_cluster_means = [value for means, _ in forest_clusters for value in means]
    forest_cluster_var = [value for _, variances in forest_clusters for value in variances]

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_title("Clustered vs Global")
    ax.scatter(np.log(forest_cluster_means), np.log(forest_cluster_var), c='blue', s=1, alpha=.5, label="Cluster")
    ax.scatter(np.log(umi_means), np.log(umi_vars), c='red', s=1, alpha=.5, label="Global")
    ax.plot([0, 10], [0, 10], c='red')
    ax.legend()
    ax.set_xlabel("Mean")
    ax.set_ylabel("Variance")
    plt.show()

    # Order features by their global mean so that x is the feature's rank.
    mean_sort = np.argsort(umi_means)

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_title(f"UMI Mean/Variance Ratio per feature, Dataset:{dataset}")
    for cluster_means, cluster_var in forest_clusters:
        cluster_ratio = np.array(cluster_means) / np.array(cluster_var)
        ax.scatter(np.arange(len(cluster_means)), cluster_ratio[mean_sort], s=1, c='blue')
    ax.set_ylim(0, 2)
    ax.scatter(np.arange(len(umi_means)), (umi_means / umi_vars)[mean_sort], s=3, c='red')
    plt.show()

In [None]:
# Print how many samples each fitted forest holds.
# Side effect: after this loop the global name `forest` is bound to the last element of
# forest_objects; later cells that reference a bare `forest` pick up that object.
for forest in forest_objects:
    print(len(forest.samples))

In [None]:
# We now establish the neighbor graph because several methods rely on it
import warnings

# For every dataset: build the neighbour graph, compute UMAP / t-SNE embeddings and a Louvain
# clustering, then compare the per-feature mean and variance of the UMI counts globally with
# the same statistics inside each Louvain cluster.
# The loop variables `umis`, `umi_means` and `umi_vars` remain bound to the last dataset afterwards.
for dataset,scanpy_object,umis in zip(datasets,scanpy_objects,umi_objects):

    # The neighbour graph is established first because several methods rely on it.
    sc.pp.neighbors(scanpy_object)
    sc.tl.umap(scanpy_object)
    sc.tl.tsne(scanpy_object)
    # Silence warnings raised while computing the Louvain clustering.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        sc.tl.louvain(scanpy_object,resolution=1)

    sc.pl.umap(scanpy_object,color='louvain')
    sc.pl.tsne(scanpy_object,color='louvain')

    # Per-feature statistics over all rows of the UMI matrix.
    umi_means = np.mean(umis,axis=0)
    umi_vars = np.var(umis,axis=0)

    # One (per-feature means, per-feature variances) pair for each Louvain label.
    louvain_clusters = []

    for cluster in set(scanpy_object.obs['louvain']):
        mask = scanpy_object.obs['louvain'] == cluster
        filtered_cells = np.array(umis[mask])
        cluster_means = np.mean(filtered_cells,axis=0)
        cluster_vars = np.var(filtered_cells,axis=0)
        louvain_clusters.append((cluster_means,cluster_vars))

    # Pool the per-cluster values of all features into flat lists for one scatter call.
    louvain_cluster_means = [m for c in louvain_clusters for m in c[0]]
    louvain_cluster_vars = [v for c in louvain_clusters for v in c[1]]

    plt.figure(figsize=(10,10))
    plt.title("Clustered vs Global")
    plt.scatter(np.log(louvain_cluster_means),np.log(louvain_cluster_vars),c='blue',s=1,alpha=.5,label="Cluster")
    plt.scatter(np.log(umi_means),np.log(umi_vars),c='red',s=1,alpha=.5,label="Global")
    plt.plot([0,10],[0,10],c='red')
    plt.legend()
    plt.xlabel("Mean")
    plt.ylabel("Variance")
    plt.show()

    # Order features by their global mean so that x is the feature's rank.
    mean_sort = np.argsort(umi_means)

    plt.figure(figsize=(10,10))
    plt.title(f"UMI Mean/Variance Ratio per feature, Dataset:{dataset}")
    for cluster in louvain_clusters:
        cluster_means,cluster_vars = cluster
        cluster_ratios = np.array(cluster_means)/np.array(cluster_vars)
        # The length-debugging prints that used to sit here emitted three lines per cluster
        # per dataset and have been dropped.
        plt.scatter(np.arange(len(cluster_means)),cluster_ratios[mean_sort],s=1,c='blue')
    plt.ylim(0,2)
    plt.scatter(np.arange(len(umi_means)),(umi_means/umi_vars)[mean_sort],s=3,c='red')
    plt.show()

In [None]:
# plt.figure(figsize=(10,10))
# plt.scatter(*citeseq_scaled.obsm['X_tsne'].T,c=forest.sample_labels,s=4,cmap='rainbow')
# plt.show()

# Scatter the UMAP embedding stored on `johnston_working`, coloured by the forest's sample labels.
# NOTE(review): `johnston_working` is not assigned in any cell visible here, and `forest` is
# whatever an earlier `for forest in ...` loop left bound. Confirm both describe the same
# dataset (same number and order of samples) before relying on this plot.
plt.figure(figsize=(10,10))
plt.scatter(*johnston_working.obsm['X_umap'].T,c=forest.sample_labels,s=4,cmap='rainbow')
plt.show()

In [None]:
# Store the UMAP embedding in the forest's `tsne_coordinates` attribute (despite the attribute's
# name, the values assigned are UMAP coordinates).
# NOTE(review): `johnston_working` is not assigned in any cell visible here — confirm its origin.
forest.tsne_coordinates = johnston_working.obsm['X_umap']

In [None]:
# Draw the forest's sample clusters again.
# NOTE(review): presumably this reads forest.tsne_coordinates for the layout — confirm in the
# forest implementation.
forest.plot_sample_clusters()

In [None]:
# Reset the forest's split clusters, then run split interpretation with the settings below.
# NOTE(review): the meaning of mode/depth/sub/k/metric/relatives is defined by the forest
# implementation, which is not visible in this notebook.
forest.reset_split_clusters()
forest.interpret_splits(mode='additive_mean',depth=5,sub=.5,k=20,metric='cosine',relatives=True)

In [None]:
# Call html_sister_scores() on every split cluster of the forest.
# NOTE(review): presumably this emits one HTML report per cluster — confirm where output goes.
for split_cluster in forest.split_clusters:
    split_cluster.html_sister_scores()

In [None]:
# forest.maximum_spanning_tree(depth=5)
# forest.most_likely_tree(depth=5)
# Produce the forest's HTML tree summary with n=10.
# NOTE(review): the two calls commented out above may be prerequisites for this summary —
# confirm against the forest implementation.
forest.html_tree_summary(n=10)

In [None]:
# Number of nodes returned by forest.nodes(root=True, depth=5).
len(forest.nodes(root=True,depth=5))

In [None]:
# Now we wish to examine the behavior of distributions within clusters as defined by louvain and RF

In [None]:
# Shape of `umi_means` as left behind by whichever earlier loop assigned it last; when the
# notebook is run top to bottom that is the final iteration of the Louvain loop.
umi_means.shape

In [None]:
# Now we wish to examine

In [None]:
# Column index of the feature at position 1000 when features are ordered by ascending mean.
# `umis` is a plain count matrix in every other cell of this notebook (not an AnnData), so it
# has no `.X` attribute; the mean is taken on the matrix itself and flattened to 1-D before
# sorting.
np.argsort(np.asarray(np.mean(umis,axis=0)).ravel())[1000]

In [None]:
from scipy.stats import kstest,poisson,nbinom

def poisson_ks_check(x):
    """Kolmogorov-Smirnov test of a sample against a Poisson fitted to that sample.

    The Poisson rate is set to the sample mean of ``x``, and ``x`` is then tested against
    that distribution's CDF.

    Args:
        x: one-dimensional sample of counts.

    Returns:
        The result of ``scipy.stats.kstest``; index 0 is the K-S statistic and index 1 the
        p-value.

    Note:
        The K-S test is designed for continuous distributions and the rate is estimated from
        the same sample, so treat the p-value as approximate.
    """
    rate = np.mean(x)

    def fitted_cdf(values):
        # CDF of the Poisson distribution whose rate equals the sample mean.
        return poisson.cdf(values, rate)

    return kstest(x, fitted_cdf)

# Correctly fitting a negative binomial is a pain, so this is a quick moment-matching stand-in for now.
def nb_pr_estimation(x):
    """Estimate negative-binomial parameters (p, r) from the sample mean and variance.

    The estimates are ``p = 1 - mean/var`` and ``r = mean * (1 - p) / p``.

    Args:
        x: sample of counts.

    Returns:
        Tuple ``(p, r)``.

    Note:
        Under scipy's ``nbinom(n, p)`` parametrisation mean/var equals ``p``, so the ``p``
        returned here corresponds to ``1 - p_scipy`` while ``r`` corresponds to ``n``.
        No guard is applied when the variance is zero or does not exceed the mean; the
        results are then non-finite or negative.
    """
    sample_mean = np.mean(x)
    sample_var = np.var(x)
    mean_over_var = sample_mean / sample_var

    p = -(mean_over_var - 1)
    r = sample_mean * ((1 - p) / p)

    return p, r

In [None]:
# Sanity check of nb_pr_estimation on synthetic data: draw 1000 values from nbinom(4, .5),
# look at their histogram, and display the recovered (p, r).
# A fixed random_state makes the draw, and therefore the displayed estimate, reproducible.
nb_draws = nbinom.rvs(4,.5,size=1000,random_state=0)

plt.figure()
plt.hist(nb_draws)
plt.show()

nb_pr_estimation(nb_draws)

In [None]:
# K-S statistic (index 0 of the test result) for each row of forest.output.T.
ks_values = []
for feature_counts in forest.output.T:
    ks_statistic = poisson_ks_check(feature_counts)[0]
    ks_values.append(ks_statistic)
print(ks_values)

In [None]:
# Global view: K-S statistic of each feature plotted against `umi_means`.
# Only the first 2000 K-S values are used.
fig, ax = plt.subplots()
ax.set_title("Kolomogorov-Smirnov Test Statistic Vs ML Estimated Poisson Distribution vs Mean, Global")
ax.scatter(umi_means, ks_values[:2000], s=1)
ax.set_xlabel("Mean")
ax.set_ylabel("K-S Statistic")
ax.set_xlim(0, 40)
plt.show()

In [None]:
# Same K-S-vs-mean plot, restricted in turn to the rows of `umis` carrying each forest label.
for cluster in set(forest.sample_labels):
    mask = forest.sample_labels == cluster
    filtered_cells = np.array(umis[mask])

    # Per-feature mean and K-S statistic inside this cluster.
    means = np.mean(filtered_cells, axis=0)
    ks_values = [poisson_ks_check(feature_counts)[0] for feature_counts in filtered_cells.T]

    fig, ax = plt.subplots()
    ax.set_title(f"Kolomogorov-Smirnov Test Statistic Vs ML Estimated Poisson Distribution vs Mean, Cluster {cluster}")
    ax.scatter(means, ks_values[:2000], s=1)
    ax.set_xlabel("Mean")
    ax.set_ylabel("K-S Statistic")
    ax.set_xlim(0, 30)
    plt.show()

In [None]:
# Histogram (log-scaled counts) of the per-row totals of `umis` within each forest label.
size_factor_bins = np.arange(0, 50000, 200)

for cluster in set(forest.sample_labels):
    mask = forest.sample_labels == cluster
    filtered_cells = np.array(umis[mask])

    # Row sums: one total per selected row.
    size_factors = np.sum(filtered_cells, axis=1)

    fig, ax = plt.subplots()
    ax.set_title(f"Size factor distributions, Cluster {cluster}")
    ax.hist(np.array(size_factors), log=True, bins=size_factor_bins)
    plt.show()

In [None]:
# Per-row totals of `umis` (sum along axis 1) as a numpy array; this overwrites the
# `size_factors` left over from the per-cluster loop.
size_factors = np.array(np.sum(np.array(umis),axis=1))

In [None]:
# Histogram of the per-row totals, with bin edges every 200 from 0 up to (excluding) 50000.
plt.hist(size_factors,bins=np.arange(0,50000,200))

In [None]:
from sklearn.decomposition import PCA

# Project the UMI count matrix onto its principal components.
# fit_transform has to be called on a PCA *instance* and be given the data matrix; calling it
# on the class with no arguments raises a TypeError.
# NOTE(review): using `umis` as the input and keeping the default number of components are
# assumptions — confirm the intended matrix and n_components.
pca = PCA().fit_transform(np.asarray(umis))