In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc

In [None]:
# This notebook walks through a conventional scanpy analysis of the fan tendon data,
# so that we can compare it to an analysis performed by a random forest.

In [None]:
# First we load the data; fortunately scanpy's 10x reader makes this straightforward.

# Root directory holding one `<sample>_count` output folder per sample.
# Change this single line to run the notebook against data stored elsewhere.
DATA_ROOT = '/Users/bbrener1/raw_data/fan_tendon'


def load_10x_sample(sample, data_root=DATA_ROOT):
    """Read one sample's filtered 10x matrix (mm10) and print its shape.

    Parameters
    ----------
    sample : str
        Sample prefix of the `<sample>_count` directory, e.g. 'TH1' or 'LL6-12'.
    data_root : str
        Directory containing the `<sample>_count` folders.

    Returns
    -------
    The object returned by `sc.read_10x_mtx` for that sample.
    """
    adata = sc.read_10x_mtx(f'{data_root}/{sample}_count/outs/filtered_gene_bc_matrices/mm10/')
    print(adata.shape)
    return adata


th1_raw = load_10x_sample('TH1')

th2_raw = load_10x_sample('TH2')

ll1_raw = load_10x_sample('LL1')

ll4_raw = load_10x_sample('LL4')

# These are the old cells
ll2_raw = load_10x_sample('LL2')

# LL6-12 is not hitting thresholds
ll6_raw = load_10x_sample('LL6-12')


In [None]:
# One 50-bin histogram per LL sample of the row sums of its raw matrix
# (axis=1 sums across the columns of `.X`), drawn in the order ll1, ll4, ll2, ll6.
for label, raw in (("ll1", ll1_raw), ("ll4", ll4_raw), ("ll2", ll2_raw), ("ll6", ll6_raw)):
    row_totals = np.sum(raw.X, axis=1)[:, 0]
    plt.figure()
    plt.title(label)
    plt.hist(row_totals, bins=50)
    plt.show()

In [None]:
# Work on copies of the LL4 (young) and LL6-12 (aged) samples so that the
# raw objects loaded above are not modified by later in-place steps.
young, aged = ll4_raw.copy(), ll6_raw.copy()

In [None]:


for adata in (young, aged):
    # Downsample with a per-cell target of 1000 counts so both datasets share
    # the same setting.
    sc.pp.downsample_counts(adata, counts_per_cell=1000)
    # Replace the sparse matrix with a dense float ndarray.
    adata.X = adata.X.toarray().astype(float)

# Show the resulting container type as the cell output.
type(young.X)

In [None]:
# plt.figure()
# plt.hist(
#     np.sum(young.X,axis=1),
# )
# plt.show()

# plt.figure()
# plt.hist(
#     np.sum(aged.X,axis=1),
# )
# plt.show()




In [None]:
def _normalize_log_scale(adata):
    """Apply the final in-place transforms shared by both datasets.

    Calls, in order, `sc.pp.normalize_per_cell`, `sc.pp.log1p` and
    `sc.pp.scale` on `adata`.

    Parameters
    ----------
    adata : AnnData
        Gene-filtered dataset. It is modified in place, so pass a standalone
        object (a `.copy()`), not a view.
    """
    sc.pp.normalize_per_cell(adata)   # renormalize after filtering
    sc.pp.log1p(adata)                # log transform: adata.X = log(adata.X + 1)
    sc.pp.scale(adata)


# NOTE: this cell rebinds `young` and `aged` to their processed versions, so it
# is not idempotent; re-run the copy/downsample cells before re-running it.

# The gene masks computed on `young` are applied positionally to `aged`, which
# is only meaningful when both datasets list the same genes in the same order.
if not young.var_names.equals(aged.var_names):
    raise ValueError("young and aged must share identical, identically ordered var_names")

# --- young: choose the genes ---
first_filter, _ = sc.pp.filter_genes(young, min_counts=2, inplace=False)  # only consider genes with more than 1 count
# Slicing an AnnData yields a view; `.copy()` materializes it so the in-place
# calls below act on a real object instead of relying on implicit copy-on-modify.
young = young[:, first_filter].copy()
sc.pp.normalize_per_cell(young)
print(young.shape)
young_filter_result = sc.pp.filter_genes_dispersion(  # select highly-variable genes
    young.X, flavor='cell_ranger', n_top_genes=500, log=False
)
print(young_filter_result.shape)
young_filtered = young[:, young_filter_result.gene_subset].copy()  # subset the genes

_normalize_log_scale(young_filtered)

young = young_filtered


# --- aged: reuse the gene selection made on `young` so both share the same columns ---
aged = aged[:, first_filter].copy()
sc.pp.normalize_per_cell(aged)
aged_filtered = aged[:, young_filter_result.gene_subset].copy()  # subset the genes

_normalize_log_scale(aged_filtered)

aged = aged_filtered

In [None]:
import pickle

# Persist both processed datasets. The `with` blocks guarantee each file is
# flushed and closed even if pickling raises.
with open("aging_tendon_young.pickle", mode='bw') as young_file:
    pickle.dump(young, young_file)
with open("aging_tendon_aged.pickle", mode='bw') as aged_file:
    pickle.dump(aged, aged_file)

In [None]:
import pickle 

# Restore the processed datasets written by the dump cell. Only unpickle files
# you produced yourself: `pickle.load` can execute arbitrary code.
with open("aging_tendon_young.pickle", mode='rb') as young_file:
    young = pickle.load(young_file)
with open("aging_tendon_aged.pickle", mode='rb') as aged_file:
    aged = pickle.load(aged_file)

# The dataset was saved from `aged`; `old` is kept as an alias because this
# cell historically restored it under that name.
old = aged


In [None]:
# Build the neighbor graph on `young`, compute a UMAP embedding from it and plot
# the embedding without any coloring.
sc.pp.neighbors(young)
sc.tl.umap(young)
sc.pl.umap(young)

# Run Louvain clustering, then redraw the same embedding colored by the
# 'louvain' key.
sc.tl.louvain(young)
sc.pl.umap(young,color='louvain')



In [None]:
# from sklearn.manifold import TSNE

# tsne_coordinates = TSNE().fit_transform(young.X)

# plt.figure()
# plt.scatter(*tsne_coordinates.T)
# plt.show()

In [None]:
import sys
# sys.path.append('/localscratch/bbrener1/rusty_forest_v3/src')
sys.path.append('../src')
import tree_reader as tr 
import lumberjack

# Hyperparameters handed to `lumberjack.fit` as keyword arguments, unchanged
# and in the same order.
fit_options = dict(
    trees=300,
    braids=2,
    ifs=250,
    ofs=250,
    ss=500,
    depth=8,
    leaves=100,
    sfr=.5,
)

# Fit on the processed `young` matrix, labelling columns with its var_names.
forest = lumberjack.fit(young.X, header=young.var_names, **fit_options)


forest.set_cache(True)
# NOTE(review): the reload cell further down reconstitutes
# 'scanpy_cmp_young_tendon' (no "_double" suffix) - confirm which name is intended.
forest.backup("scanpy_cmp_young_tendon_double")


In [None]:
# Bare expression: displays the `arguments` attribute of the fitted forest as the cell output.
forest.arguments

In [None]:

import sys
# sys.path.append('/localscratch/bbrener1/rusty_forest_v3/src')
sys.path.append('../src')
import tree_reader as tr 
import lumberjack
# Reload a previously backed-up forest instead of refitting.
# NOTE(review): the fit cell backs up to "scanpy_cmp_young_tendon_double", but this
# loads 'scanpy_cmp_young_tendon' - confirm this is the forest you mean to analyze.
forest = tr.Forest.reconstitute('scanpy_cmp_young_tendon')
# Display the reloaded forest's `arguments` attribute as the cell output.
forest.arguments

In [None]:
# Presumably clears any existing sample clustering on the forest (verify in
# tree_reader); both re-clustering calls below are currently commented out.
forest.reset_sample_clusters()
# forest.cluster_samples_simple(k=50,resolution=1,metric='euclidean',pca=50)
# forest.cluster_samples_encoding(k=50,depth=6,metric='euclidean',pca=50)

In [None]:
# forest.tsne()
# Store scanpy's UMAP embedding of `young` in the forest's `tsne_coordinates`
# attribute rather than calling `forest.tsne()`; presumably these are the
# coordinates `plot_sample_clusters()` draws with - confirm in tree_reader.
forest.tsne_coordinates = young.obsm['X_umap']
forest.plot_sample_clusters()
# forest.trees[0].plot()

In [None]:
# Keyword arguments for split interpretation, collected in one place so they
# are easy to tweak; passed through unchanged.
split_options = dict(
    k=100,
    relatives=True,
    pca=100,
    depth=6,
    mode='additive_mean',
    metric='cosine',
)

forest.reset_split_clusters()
forest.interpret_splits(**split_options)

# forest_log.reset_split_clusters()
# forest_log.interpret_splits(sub=.8,k=20,relatives=True,pca=100,depth=6,mode='additive_mean',metric='cosine')

In [None]:
from scipy.cluster.hierarchy import dendrogram,linkage

def _dendrogram_order(observations, metric):
    """Return the dendrogram leaf order of the rows of |observations|.

    Rows are clustered by average linkage under `metric`, using absolute values.
    """
    tree = linkage(np.abs(observations), metric=metric, method='average')
    return dendrogram(tree, no_plot=True)['leaves']


factor_matrix = forest.factor_matrix()
# Columns are ordered by cosine distance, rows by correlation distance.
factor_sort = _dendrogram_order(factor_matrix.T, 'cosine')
sample_aggsort = _dendrogram_order(factor_matrix, 'correlation')
# sample_sort = np.argsort(forest.sample_labels)

# Reorder rows first, then columns.
reordered = factor_matrix[sample_aggsort][:, factor_sort]

plt.figure()
plt.imshow(reordered, aspect='auto', interpolation='none', cmap="seismic", vmin=-1, vmax=1)
plt.colorbar()
plt.show()

In [None]:
# forest.most_likely_tree(depth=6)
# Call the forest's maximum_spanning_tree method with mode='samples' and depth=6
# (the most_likely_tree alternative above is commented out).
forest.maximum_spanning_tree(mode='samples',depth=6)

# forest_log.most_likely_tree(depth=6)
# forest_log.maximum_spanning_tree(depth=6)

In [None]:
# forest.tsne(pca=100)
# Produce the forest's HTML tree summary with n=10; see tree_reader for what `n` controls.
forest.html_tree_summary(n=10)
# forest_log.html_tree_summary(n=10)

In [None]:
# Histogram of every entry of the processed `young` matrix, with log-scaled counts.
fig, ax = plt.subplots()
ax.hist(young.X.flatten(), log=True)

In [None]:
# Reset sample clusters, then have the forest predict on the same matrix
# (`young.X`) used elsewhere in this notebook.
forest.reset_sample_clusters()
young_prediction = forest.predict(young.X)
# Report on that prediction using the 'additive_mean' mode.
young_prediction.prediction_report(mode='additive_mean')

In [None]:
from sklearn.decomposition import PCA

# Baseline: how much of the squared deviation in `young.X` is left after
# projecting onto 10 principal components and reconstructing.
data = young.X

model = PCA(n_components=10).fit(data)
transformed = model.transform(data)
recovered = model.inverse_transform(transformed)

# Squared deviation of every entry from its column mean (the total to explain).
centered = data - data.mean(axis=0)
transformed_residual = np.power(centered, 2)

# Squared reconstruction error of every entry.
recovered_residual = np.power(data - recovered, 2)

# Per-row reconstruction error, in absolute terms and as a fraction of that row's total.
pca_recovered_per_sample = recovered_residual.sum(axis=1)
pca_recovered_fraction_per_sample = pca_recovered_per_sample / transformed_residual.sum(axis=1)

for squared_terms in (transformed_residual, recovered_residual):
    print(np.sum(squared_terms))

print(f"Remaining variance:{(np.sum(recovered_residual) / np.sum(transformed_residual))}")