In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc

In [None]:
# This notebook will go through a conventional scanpy analysis of citeseq data, so that we can compare it to an 
# analysis performed by a random forest

In [None]:
# First we load the data, fortunately the facilities for this are pretty nice
# sc.read is handed the CSV of RNA UMI counts; the result is bound to `citeseq`, which the next cell transposes.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.

citeseq = sc.read('/localscratch/bbrener1/citeseq_raw_data/GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv')


In [None]:
# Our data is transposed from how it would appear in R 🙄
# After this swap the later cells index rows with a per-cell mask and columns by gene position,
# so rows are treated as cells and columns as genes from here on.

citeseq = citeseq.transpose()

In [None]:
# We want to filter out mouse cells and mouse genes from further analysis.
# All mouse genes appear after index 20401, so only the first N_HUMAN_GENES columns are human.
N_HUMAN_GENES = 20401
# We retain cells where >90% of counts are human
MIN_HUMAN_FRACTION = .9

# Per-cell UMI totals over every gene, and over the human genes only
umi_totals = np.sum(citeseq.X,axis=1)
human_count_totals = np.sum(citeseq.X[:,:N_HUMAN_GENES],axis=1)

# Share of each cell's counts that is human. A cell with no counts at all yields NaN here,
# which compares False against the threshold and is therefore dropped.
human_fraction = human_count_totals / umi_totals
human_mask = human_fraction > MIN_HUMAN_FRACTION

# Keep the human cells (rows) and the human genes (columns)
citeseq_filtered = citeseq[human_mask,:N_HUMAN_GENES]

In [None]:
# We will further filter per Zheng 17 (Ripped from scanpy site)

# min_counts=1 keeps genes with at least 1 count, i.e. it only drops genes that were never observed.
# NOTE: this call filters citeseq_filtered in place.
sc.pp.filter_genes(citeseq_filtered, min_counts=1)

# The highly-variable-gene selection runs on a normalized copy so that citeseq_filtered keeps raw counts
citeseq_copy = citeseq_filtered.copy()
sc.pp.normalize_per_cell(citeseq_copy)
filter_result = sc.pp.filter_genes_dispersion(  # select highly-variable genes
    citeseq_copy.X, flavor='cell_ranger', n_top_genes=2000, log=False
)

# Subset the genes. Slicing an AnnData gives a view; take an explicit copy so the in-place
# preprocessing below operates on a real object instead of relying on an implicit copy of the view.
citeseq_second_filter = citeseq_filtered[:, filter_result.gene_subset].copy()

# Raw UMI counts of the selected genes, captured before any normalization
umis = citeseq_second_filter.copy()

sc.pp.normalize_per_cell(citeseq_second_filter)                 # renormalize after filtering
sc.pp.log1p(citeseq_second_filter)                      # log transform: adata.X = log(adata.X + 1)
sc.pp.scale(citeseq_second_filter)

# Independent copy of the normalized, log-transformed, scaled data
citeseq_working = citeseq_second_filter.copy()

In [None]:
# Show the summary of the working (normalized, log-transformed, scaled) object as a sanity check
citeseq_working

In [None]:
import sys
# Make the rusty_forest_v3 sources importable.
# NOTE(review): absolute, machine-specific path; the commented line below is the relative alternative.
sys.path.append('/localscratch/bbrener1/rusty_forest_v3/src')
# sys.path.append('../src')
import tree_reader as tr 
import lumberjack

# Earlier variant kept for reference: fit on the scaled matrix (citeseq_working.X) with depth=10.
# forest = lumberjack.fit(
#     citeseq_working.X,
#     header=citeseq_working.var_names,
#     trees=100,
#     braids=3,
#     ifs=1000,
#     ofs=1000,
#     ss=1000,
#     depth=10,
#     leaves=100,
#     sfr=0
# )


# Active variant: fit on the raw UMI counts of the selected genes (umis.X) instead of the scaled values.
# The header is taken from citeseq_working, which carries the same gene subset as umis
# (both are copies of citeseq_second_filter), so the names match the columns of umis.X.
# NOTE(review): the meaning of trees/braids/ifs/ofs/ss/leaves/sfr is defined by lumberjack.fit and is not visible here.
forest = lumberjack.fit(
    umis.X,
    header=citeseq_working.var_names,
    trees=100,
    braids=3,
    ifs=1000,
    ofs=1000,
    ss=1000,
    depth=8,
    leaves=100,
    sfr=0
)

# presumably turns on result caching inside the forest object — confirm against the Forest implementation
forest.set_cache(True)

In [None]:
# Persist the fitted forest under the name "scanpy_cmp_new" (presumably written to disk — confirm).
# The commented lines are the counterpart for reloading it instead of refitting.
forest.backup("scanpy_cmp_new")
# forest = tr.Forest.reconstitute('scanpy_cmp_new')
# forest.arguments

In [None]:
# Drop any earlier sample clustering, then cluster the samples again through the forest API.
# NOTE(review): the semantics of sub/k/pca/depth/metric are defined by cluster_samples_encoding, not visible here.
forest.reset_sample_clusters()
forest.cluster_samples_encoding(sub=.5,k=20,pca=100,depth=5,metric='cosine')

In [None]:
# Ask the forest for a t-SNE embedding (pca=100), then plot the sample clusters
# (presumably drawn on that embedding — confirm).
forest.tsne(pca=100)
forest.plot_sample_clusters()
# forest.trees[0].plot()

In [None]:
# Drop any earlier split clustering, then cluster/interpret the splits through the forest API.
# NOTE(review): the semantics of sub/relatives/pca/depth/mode/metric/k are defined by interpret_splits, not visible here.
forest.reset_split_clusters()
forest.interpret_splits(sub=.8,relatives=True,pca=30,depth=5,mode='additive_mean',metric='cosine',k=20)

In [None]:
# forest.tsne_coordinates = 
# forest.tsne(pca=100)
# Build the "most likely tree" at depth 5 (the maximum-spanning-tree alternative is commented out),
# then emit the HTML summary of the forest.
forest.most_likely_tree(depth=5)
# forest.maximum_spanning_tree(depth=5)
forest.html_tree_summary()

In [None]:
# Independent copy for the scanpy side of the comparison. citeseq_second_filter was already
# normalized, log-transformed and scaled earlier, so this copy holds the scaled values.
citeseq_scaled = citeseq_second_filter.copy()

In [None]:
# Peek at the first 10 cells x 10 genes of the scaled matrix
citeseq_scaled[:10,:10].X

In [None]:
# sc.pp.normalize_per_cell(citeseq_scaled)                 # renormalize after filtering
# sc.pp.log1p(citeseq_scaled)                      # log transform: adata.X = log(adata.X + 1)
# sc.pp.scale(citeseq_scaled)


In [None]:
# We now establish the neighbor graph because several methods rely on it
# (no parameters are overridden, so scanpy's defaults apply)

sc.pp.neighbors(citeseq_scaled)

In [None]:
# Compute the UMAP embedding; later cells read it back from citeseq_scaled.obsm['X_umap']
sc.tl.umap(citeseq_scaled)

In [None]:
# Plot the UMAP embedding without any coloring
sc.pl.umap(citeseq_scaled)

In [None]:
# We want to do clustering via Louvain as one of the gold standards.
# Every warning raised during the call is silenced; the filter only applies inside the `with` block.
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # Later cells read the resulting labels from citeseq_scaled.obs['louvain']
    sc.tl.louvain(citeseq_scaled,resolution=5)

In [None]:
# UMAP embedding colored by the Louvain cluster label
sc.pl.umap(citeseq_scaled,color='louvain')

In [None]:
# Compute scanpy's t-SNE embedding of the scaled data (default parameters)
sc.tl.tsne(citeseq_scaled)

In [None]:
# t-SNE embedding colored by the Louvain cluster label
sc.pl.tsne(citeseq_scaled,color='louvain')

In [None]:
# Same plot on scanpy's t-SNE coordinates, currently disabled:
# plt.figure(figsize=(10,10))
# plt.scatter(*citeseq_scaled.obsm['X_tsne'].T,c=forest.sample_labels,s=4,cmap='rainbow')
# plt.show()

# scanpy's UMAP coordinates, colored by the random-forest sample labels.
# The transpose is unpacked into the x and y arguments of scatter.
# NOTE(review): assumes forest.sample_labels is in the same cell order as citeseq_scaled — confirm.
plt.figure(figsize=(10,10))
plt.scatter(*citeseq_scaled.obsm['X_umap'].T,c=forest.sample_labels,s=4,cmap='rainbow')
plt.show()

In [None]:
# Despite the attribute name, this stores scanpy's UMAP coordinates on the forest
# (presumably so the forest's own plots are drawn on the same embedding — confirm).
forest.tsne_coordinates = citeseq_scaled.obsm['X_umap']

In [None]:
# Re-plot the forest's sample clusters after tsne_coordinates was replaced in the previous cell
forest.plot_sample_clusters()

In [None]:
# Second split-interpretation pass; compared with the earlier call it uses sub=.5 and passes no pca argument.
# NOTE(review): parameter semantics are defined by interpret_splits, not visible here.
forest.reset_split_clusters()
forest.interpret_splits(mode='additive_mean',depth=5,sub=.5,k=20,metric='cosine',relatives=True)

In [None]:
# Call html_sister_scores on every split cluster (presumably writes one HTML report each — confirm)
for split_cluster in forest.split_clusters:
    split_cluster.html_sister_scores()

In [None]:
# Tree construction is disabled here, so the summary relies on whatever tree the forest already holds
# forest.maximum_spanning_tree(depth=5)
# forest.most_likely_tree(depth=5)
forest.html_tree_summary(n=10)

In [None]:
# Number of nodes returned by forest.nodes for root=True, depth=5
len(forest.nodes(root=True,depth=5))

In [None]:
# Now we wish to examine the behavior of distributions within clusters as defined by louvain and RF

In [None]:
# Per-feature mean and variance of the raw UMI counts, taken down the rows (axis=0), i.e. across all cells.
# np.var is left at its default, so this is the population variance.
umi_means = np.mean(umis.X,axis=0)
umi_vars = np.var(umis.X,axis=0)

In [None]:
# Diagnostic figures relating each feature's UMI mean to its variance.
# The red diagonal in the first three figures marks variance == mean.

# Log/log view
plt.figure(figsize=(10,10))
plt.title("UMI Mean vs Variance per feature, Log/Log")
plt.scatter(np.log(umi_means),np.log(umi_vars),s=1)
plt.plot([-8,4],[-8,4],c='red')
plt.show()

# Linear views: the same scatter, cropped to [0, limit] on both axes
for view_title, limit in (
    ("UMI Mean vs Variance per feature, Linear", 10),
    ("UMI Mean vs Variance per feature, Linear, low range", 1),
):
    plt.figure(figsize=(10,10))
    plt.title(view_title)
    plt.scatter(umi_means,umi_vars,s=1)
    plt.plot([0,limit],[0,limit],c='red')
    plt.xlim(0,limit)
    plt.ylim(0,limit)
    plt.show()

# Mean/variance ratio, with the features ordered by ascending mean
mean_order = np.argsort(umi_means)
plt.figure(figsize=(10,10))
plt.title("UMI Mean/Variance Ratio per feature, Linear")
plt.scatter(np.arange(umi_means.shape[0]),(umi_means/umi_vars)[mean_order],s=1)
plt.show()


In [None]:
# Mean vs variance of the raw UMI counts (log/log), one figure per Louvain cluster.
for cluster in set(citeseq_scaled.obs['louvain']):
    # Select the cells scanpy assigned to this cluster. The mask is applied to `umis` (raw counts),
    # which has the same cells in the same order as citeseq_scaled.
    mask = citeseq_scaled.obs['louvain'] == cluster
    filtered_cells = umis[mask]
    cluster_means = np.mean(filtered_cells.X,axis=0)
    cluster_var = np.var(filtered_cells.X,axis=0)
    plt.figure(figsize=(10,10))
    # Label each figure with its cluster: the set is iterated in arbitrary order, so without
    # a title the figures cannot be matched to clusters.
    plt.title(f"Louvain Cluster:{cluster}")
    # Features with zero mean/variance inside the cluster become -inf under the log
    plt.scatter(np.log(cluster_means),np.log(cluster_var),s=1)
    plt.plot([-8,4],[-8,4],c='red')     # variance == mean reference line
    plt.show()

In [None]:
# Now we wish to examine the same per-cluster mean/variance behavior for the clusters defined by the random forest

In [None]:
# Mean vs variance of the raw UMI counts, two figures per random-forest sample cluster.
for cluster in set(forest.sample_labels):
    # Cells the forest assigned to this cluster, taken from the raw counts
    mask = forest.sample_labels == cluster
    filtered_cells = umis[mask]
    cluster_means = np.mean(filtered_cells.X,axis=0)
    cluster_var = np.var(filtered_cells.X,axis=0)

    # Figure 1: log/log mean vs variance; the red diagonal marks variance == mean.
    # Features with zero mean/variance inside the cluster become -inf under the log.
    plt.figure(figsize=(10,10))
    plt.title(f"Cluster:{cluster}")
    plt.scatter(np.log(cluster_means),np.log(cluster_var),s=1)
    plt.plot([-8,4],[-8,4],c='red')
    plt.show()

    # Figure 2: mean/variance ratio with the features ordered by ascending mean.
    # The replacement field must name `cluster`; an empty `{}` in an f-string is a SyntaxError.
    plt.figure(figsize=(10,10))
    plt.title(f"UMI Mean/Variance Ratio per feature, Cluster:{cluster}")
    plt.scatter(np.arange(cluster_means.shape[0]),(cluster_means/cluster_var)[np.argsort(cluster_means)],s=1)
    plt.show()

In [None]:
# Column index of the feature at position 1000 (0-based) when features are sorted by ascending mean UMI count
np.argsort(np.mean(umis.X,axis=0))[1000]

In [None]:
from scipy.stats import kstest,poisson,nbinom

def poisson_ks_check(x):
    """Kolmogorov-Smirnov test of ``x`` against a Poisson fitted to ``x``.

    The Poisson rate is set to the sample mean of ``x``, and the sample is then
    compared with that distribution's CDF.

    Parameters
    ----------
    x : array-like
        Observed values for one feature.

    Returns
    -------
    The result of ``scipy.stats.kstest``: the K-S statistic at index 0 and
    the p-value at index 1.
    """
    rate = np.mean(x)

    def fitted_cdf(values):
        # CDF of the Poisson whose rate equals the sample mean
        return poisson.cdf(values, rate)

    return kstest(x, fitted_cdf)

# Correctly fitting a negative binomial is a pain, here is a hacked out version for now:
def nb_pr_estimation(x):
    """Moment-based estimate of negative binomial parameters ``(p, r)`` from ``x``.

    Computes ``p = 1 - mean/var`` and ``r = mean * (1 - p) / p`` using the sample
    mean and the population variance (``np.var`` default). This matches the
    parametrisation with ``mean = r*p/(1-p)`` and ``var = r*p/(1-p)**2``; the
    ``p`` of ``scipy.stats.nbinom`` is the complement (``1 - p``) of the value
    returned here.

    There is no guard for degenerate input: with zero variance, or variance not
    larger than the mean, the returned values are non-finite or non-positive.

    Parameters
    ----------
    x : array-like
        Observed counts.

    Returns
    -------
    tuple
        ``(p, r)``.
    """
    sample_mean = np.mean(x)
    mean_over_var = sample_mean / np.var(x)

    # Written as a negation (not 1 - ratio) so the sign of a zero result is unchanged
    p = -(mean_over_var - 1)
    failure_odds = (1 - p) / p
    r = sample_mean * failure_odds

    return p, r

In [None]:
# Sanity check for nb_pr_estimation on synthetic negative binomial draws with known parameters (n=4, p=.5).
# The draw is seeded so the histogram and the estimates are identical on every run.
nb_draws = nbinom.rvs(4,.5,size=1000,random_state=0)

plt.figure()
plt.hist(nb_draws)
plt.show()

# The estimates should land near p=.5, r=4
nb_pr_estimation(nb_draws)

In [None]:
# K-S statistic (element [0] of the kstest result) for every column of forest.output.
# NOTE(review): forest.output is presumably the samples x features matrix the forest was fitted on — confirm.
ks_values = [poisson_ks_check(x)[0] for x in forest.output.T]
# NOTE(review): this prints the whole list, one value per column
print(ks_values)

In [None]:
# Global view: K-S statistic against a mean-fitted Poisson, plotted over the per-feature mean.
plt.figure()
plt.title("Kolmogorov-Smirnov Test Statistic Vs ML Estimated Poisson Distribution vs Mean, Global")
# Trim ks_values to the number of feature means instead of a hard-coded 1999, so the two
# arguments of scatter stay the same length whatever the number of selected genes is.
n_features = np.size(umi_means)
plt.scatter(umi_means,ks_values[:n_features],s=1)
plt.xlabel("Mean")
plt.ylabel("K-S Statistic")
plt.show()

In [None]:
# Per-cluster view: K-S statistic against a mean-fitted Poisson, plotted over the per-feature mean,
# one figure per random-forest sample cluster.
for cluster in set(forest.sample_labels):
    # Cells the forest assigned to this cluster, taken from the raw counts
    mask = forest.sample_labels == cluster
    filtered_cells = umis[mask]

    # One statistic and one mean per column of the same matrix, so both have the same length
    # and no slicing is needed. A separate name keeps the global `ks_values` list intact.
    cluster_ks_values = [poisson_ks_check(x)[0] for x in filtered_cells.X.T]
    means = np.mean(filtered_cells.X,axis=0)

    plt.figure()
    plt.title(f"Kolmogorov-Smirnov Test Statistic Vs ML Estimated Poisson Distribution vs Mean, Cluster {cluster}")
    plt.scatter(means,cluster_ks_values,s=1)
    plt.xlabel("Mean")
    plt.ylabel("K-S Statistic")
    plt.xlim(0,30)      # features with a cluster mean above 30 are cropped out of view
    plt.show()

In [None]:
# Histogram (log-scaled counts) of the per-cell UMI totals, one figure per random-forest sample cluster.
# The bin edges are the same for every cluster: 0 to 3800 in steps of 200.
size_factor_bins = np.arange(0,4000,200)

for cluster in set(forest.sample_labels):
    mask = forest.sample_labels == cluster
    filtered_cells = umis[mask]

    # "Size factor" here is simply each cell's total UMI count over the selected genes
    size_factors = np.sum(filtered_cells.X,axis=1)
    figure_title = f"Size factor distributions, Cluster {cluster}"

    plt.figure()
    plt.title(figure_title)
    plt.hist(np.array(size_factors),log=True,bins=size_factor_bins)
    plt.show()

In [None]:
# Per-cell total UMI count over the selected genes, for all cells (rebinds the name used inside the loop above)
size_factors = np.array(np.sum(umis.X,axis=1))

In [None]:
# Global size-factor histogram; bin edges run from 0 to 7800 in steps of 200
plt.hist(size_factors,bins=np.arange(0,8000,200))

In [None]:
from sklearn.decomposition import PCA

# fit_transform is an instance method and needs the data matrix, so the estimator is constructed first.
# NOTE(review): the original call passed no data; the scaled expression matrix is assumed to be
# the intended input — confirm. `pca` holds the transformed coordinates, not the estimator.
pca = PCA().fit_transform(citeseq_scaled.X)