In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc

In [None]:
# This notebook will go through a conventional scanpy analysis of fan data, so that we can compare it to an 
# analysis performed by a random forest

In [None]:
# Load the count matrix with scanpy's reader and the matching gene identifiers
# (read as plain strings) with numpy.
counts_path = '/home/bbrener1/transfer/all_raw/raw_data/aging_sc/primary_counts_gene_expression_array.txt'
gene_ids_path = '/home/bbrener1/transfer/all_raw/raw_data/aging_sc/primary_counts_gene_id_array.txt'

regev = sc.read(counts_path)
header = np.loadtxt(gene_ids_path, dtype=str)

# Shape exactly as loaded, before any transposition.
regev.shape


In [None]:
# Swap the two axes of the loaded object and label the variable axis with the
# gene identifiers read into `header`.
# NOTE: this cell is not idempotent - running it a second time flips the
# orientation again.
regev = regev.T
regev.var_names = header
regev.shape

In [None]:
# Remove genes that do not reach the min_counts=10 threshold (applied to
# `regev` directly), then show the resulting shape.
sc.pp.filter_genes(regev,min_counts=10)
regev.shape

In [None]:
# Every later cell refers to the dataset as `fan`, but only `regev` has been
# defined up to this point, so a fresh Restart & Run All stops here with a
# NameError. Bind the name explicitly.
# NOTE(review): assumes `fan` is meant to be the object loaded above - confirm.
fan = regev

# Downsample cells to a target of 2500 counts each (updates fan.X).
sc.pp.downsample_counts(fan, counts_per_cell=2500)

# Make sure X ends up as a dense float ndarray. The sparse -> dense conversion
# is only attempted when X really is sparse: a plain ndarray has no
# .todense(), so calling it unconditionally raises AttributeError when the
# matrix is already dense.
from scipy.sparse import issparse
if issparse(fan.X):
    fan.X = fan.X.toarray()
fan.X = np.asarray(fan.X, dtype=float)
type(fan.X)

In [None]:
# Sanity check: show the first 10 x 10 corner of the matrix.
regev.X[:10,:10]

# Optional diagnostics (disabled): histogram of the sums of regev.X along axis 1.
# plt.figure()
# plt.hist(
#     np.sum(regev.X,axis=1),
#     bins=np.arange(0,20000,500),
# )
# plt.show()

# Optional diagnostics (disabled): histogram of the sums of regev.X along axis 0.
# plt.figure()
# plt.hist(
#     np.sum(regev.X,axis=0),
#     bins=np.arange(0,2000,100),
# )
# plt.show()





In [None]:

# Preprocessing used by both the scanpy analysis and the forests:
#   1. drop genes without counts,
#   2. select the 2000 most variable genes on a normalized copy,
#   3. keep the un-normalized values of those genes in `umis`,
#   4. normalize, log-transform and scale `fan_filtered`.
sc.pp.filter_genes(fan, min_counts=1)         # keep genes with at least 1 count

# Gene selection is done on a normalized copy so that `fan` itself is left
# un-normalized.
fan_copy = fan.copy()
sc.pp.normalize_per_cell(fan_copy)
filter_result = sc.pp.filter_genes_dispersion(  # select highly-variable genes
    fan_copy.X, flavor='cell_ranger', n_top_genes=2000, log=False
)

# Indexing an AnnData yields a view of the parent. Take an explicit copy so the
# in-place steps below act on an independent object instead of relying on the
# view being converted implicitly when it is first modified.
fan_filtered = fan[:, filter_result.gene_subset].copy()

# Snapshot of the selected genes' values taken BEFORE the in-place transforms.
umis = fan_filtered.X.copy()

sc.pp.normalize_per_cell(fan_filtered)         # renormalize after filtering
sc.pp.log1p(fan_filtered)                      # log transform: adata.X = log(adata.X + 1)
sc.pp.scale(fan_filtered)

fan_working = fan_filtered.copy()

In [None]:
import sys

# Make the local forest checkout importable before importing from it.
sys.path.append('/localscratch/bbrener1/rusty_forest_v3/src')
# sys.path.append('../src')
import tree_reader as tr
import lumberjack

# Both fits use the same gene names and the same settings below; they differ
# only in the input matrix and in the `leaves` value.
gene_names = fan_filtered.var_names
shared_fit_settings = dict(
    trees=100,
    braids=3,
    ifs=700,
    ofs=700,
    ss=200,
    depth=8,
)

# Forest fitted on the normalized, log-transformed and scaled matrix.
forest_log = lumberjack.fit(
    fan_filtered.X,
    header=gene_names,
    **shared_fit_settings,
    leaves=100,
    sfr=.5
)

# Forest fitted on the values captured in `umis` before normalization.
forest = lumberjack.fit(
    umis,
    header=gene_names,
    **shared_fit_settings,
    leaves=50,
    sfr=.5
)

forest.set_cache(True)

In [None]:
# Display the names of the genes kept in fan_filtered.
list(fan_filtered.var_names)

In [None]:
# Back up the fitted forest under the name "scanpy_cmp_fan".
# NOTE(review): the disabled restore below refers to a different name
# ('scanpy_cmp_johnston') - confirm which backup is meant before re-enabling it.
forest.backup("scanpy_cmp_fan")
# forest = tr.Forest.reconstitute('scanpy_cmp_johnston')
# forest.arguments

In [None]:
# Discard any existing sample clustering, then cluster the samples again using
# the cosine metric with the settings below.
forest.reset_sample_clusters()
# forest.cluster_samples_encoding(sub=.8,k=20,depth=8,metric='jaccard')
encoding_cluster_args = dict(sub=.8, k=20, depth=8, metric='cosine', pca=100)
forest.cluster_samples_encoding(**encoding_cluster_args)

# Alternative leaf-based clustering routes, currently disabled:
# forest.reset_leaf_clusters()
# forest.cluster_leaves_samples(sub=.5,k=20,depth=6,metric="jaccard")
# forest.cluster_leaves_samples(sub=.8,k=20,metric="cosine",pca=100)
# forest.cluster_leaves_predictions(sub=.8,k=20,metric="cosine",pca=100,mode="mean")
# forest.cluster_samples_leaf_cluster()

In [None]:
# Use scanpy's UMAP embedding as the plotting coordinates of both forests.
# The embedding is only produced by sc.tl.umap further down the notebook, so on
# a top-to-bottom run obsm['X_umap'] does not exist yet and the lookup below
# would raise KeyError. Compute the neighbor graph and the embedding here when
# they are missing.
if 'X_umap' not in fan_filtered.obsm.keys():
    sc.pp.neighbors(fan_filtered)
    sc.tl.umap(fan_filtered)

# forest.tsne(pca=100)
forest.tsne_coordinates = fan_filtered.obsm['X_umap']
forest_log.tsne_coordinates = fan_filtered.obsm['X_umap']
# forest.plot_sample_clusters()
# forest.trees[0].plot()

In [None]:
# Plot the sample counts of each leaf cluster of `forest`.
# NOTE(review): the leaf-clustering calls in the clustering cell above are
# commented out - confirm forest.leaf_clusters is populated at this point.
for cluster in forest.leaf_clusters:
    cluster.plot_sample_counts()

In [None]:
# Identical split-interpretation settings are applied to both forests.
split_args = dict(
    sub=.8,
    k=20,
    relatives=True,
    pca=100,
    depth=6,
    mode='additive_mean',
    metric='cosine',
)

# Forest fitted on `umis`.
forest.reset_split_clusters()
forest.interpret_splits(**split_args)

# Forest fitted on the transformed matrix.
forest_log.reset_split_clusters()
forest_log.interpret_splits(**split_args)

In [None]:
# Build the maximum spanning tree for each forest with the same depth setting.
spanning_tree_depth = 6

# forest.most_likely_tree(depth=6)
forest.maximum_spanning_tree(depth=spanning_tree_depth)

# forest_log.most_likely_tree(depth=6)
forest_log.maximum_spanning_tree(depth=spanning_tree_depth)

In [None]:
# Generate the HTML tree summary (n=10) for `forest`; the equivalent call for
# `forest_log` is left disabled.
# forest.tsne(pca=100)
forest.html_tree_summary(n=10)
# forest_log.html_tree_summary(n=10)

In [None]:
# Build the neighbor graph first: several of the methods used below
# (UMAP, Louvain) rely on it.

sc.pp.neighbors(fan_filtered)

In [None]:
# Compute the UMAP embedding of fan_filtered.
sc.tl.umap(fan_filtered)

In [None]:
# Plot the UMAP embedding without any colouring.
sc.pl.umap(fan_filtered)

In [None]:
# We want to do clustering via Louvain as one of the gold standards.
# Warnings raised during the call are silenced inside this `with` block only.
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    sc.tl.louvain(fan_filtered,resolution=1)

In [None]:
# Plot the UMAP embedding coloured by the 'louvain' annotation.
sc.pl.umap(fan_filtered,color='louvain')

In [None]:
# Compute the t-SNE embedding of fan_filtered.
sc.tl.tsne(fan_filtered)

In [None]:
# Plot the t-SNE embedding coloured by the 'louvain' annotation.
sc.pl.tsne(fan_filtered,color='louvain')

In [None]:


# Scatter the UMAP coordinates, colouring every sample by forest.sample_labels.
umap_axes = fan_filtered.obsm['X_umap'].T
fig, ax = plt.subplots(figsize=(15, 10))
ax.scatter(*umap_axes, c=forest.sample_labels, s=4, cmap='rainbow')
plt.show()

In [None]:
from scipy.cluster.hierarchy import dendrogram,linkage

# Heatmap of a forest's output matrix with rows and columns reordered by
# average-linkage hierarchical clustering.
#
# The original cell referenced `forest2`, `sample_sort` and `feature_sort`,
# none of which is defined anywhere in the notebook (their definitions were
# commented out), so it failed with NameError. The disabled definitions also
# used metric='cos', which is not a metric name scipy accepts; 'cosine' is.
#
# NOTE(review): `forest_log` is used in place of the undefined `forest2`
# because the colour limits (vmin=-1, vmax=3) look like they are meant for the
# scaled data that forest was fitted on - confirm this is the intended forest.
forest_output = forest_log.output

# Column ordering: cluster the transposed matrix by correlation distance.
feature_sort = dendrogram(
    linkage(forest_output.T, metric='correlation', method='average'),
    no_plot=True,
)['leaves']
# Row ordering: cluster the matrix rows by cosine distance.
sample_sort = dendrogram(
    linkage(forest_output, metric='cosine', method='average'),
    no_plot=True,
)['leaves']

plt.figure()
plt.imshow(forest_output[sample_sort].T[feature_sort].T,aspect='auto',interpolation='none',vmin=-1,vmax=3)
plt.colorbar()
plt.show()

In [None]:
# Correlation matrix between the columns of fan_filtered.X, reordered on both
# axes with `feature_sort` (which has to be defined before this cell runs).
feature_corr = np.corrcoef(fan_filtered.X.T)
rows_reordered = feature_corr[feature_sort]
correlations = rows_reordered.T[feature_sort]

# Diverging colour map pinned to the [-1, 1] range.
fig, ax = plt.subplots()
ax.set_title("Correlations of Features In Fan Tendon Data")
heatmap = ax.imshow(correlations, cmap='seismic', vmin=-1, vmax=1)
fig.colorbar(heatmap, ax=ax)
plt.show()