In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc

In [None]:
# This notebook goes through a conventional scanpy analysis of CITE-seq data, so that we can compare it to an
# analysis performed by a random forest.

In [None]:
# First we load the data; fortunately the facilities for this are pretty nice.
#
# Every path in `datasets` is read with scanpy, optionally transposed, filtered, and cut down to
# the genes selected by `filter_genes_dispersion`. Two parallel lists are filled, one entry per dataset:
#   scanpy_objects : the gene-subset object after normalize_per_cell, log1p and scale
#   umi_objects    : the `.X` matrix of the same gene-subset object, captured before those three steps

datasets = [
    '/localscratch/bbrener1/johnston_sc/Retina2/outs/filtered_gene_bc_matrices/dmel_r6.20/matrix.mtx',
    '/home/bbrener1/transfer/all_raw/raw_data/nelmari/ctrl/matrix.mtx',
    '/home/bbrener1/transfer/all_raw/raw_data/fan_tendon/TH1/matrix.mtx',
    '/home/bbrener1/transfer/all_raw/raw_data/citeseq/GSE100866_CBMC_human_umis.tsv', # This dataset had spiked in mouse cells
                                                                                      # but this is a filtered matrix
        
    '/home/bbrener1/transfer/all_raw/raw_data/vision_sc/raw_counts.txt',   # These are the unfiltered UMIs of this dataset
                                                                           # Actual paper analysis was done on like 1600 cells, so needs to be checked out
]
# Whether each dataset needs to be transposed after reading.
# Entries are matched to `datasets` by position; `zip` below stops at the shorter list,
# so the two lists must be kept the same length.
transpose = [
    True,
    True,
    True,
    True,
    False,
]

scanpy_objects = []
umi_objects = []

for dataset,d_transpose in zip(datasets,transpose): 
    
    print(dataset)
    
    large_scanpy_object = sc.read(dataset)
    
    if d_transpose:
        large_scanpy_object = large_scanpy_object.T

    # Downsampling (counts_per_cell=1200) happens before X is cast to float.
    # NOTE(review): presumably because downsample_counts expects integer counts - confirm.
    sc.pp.downsample_counts(large_scanpy_object,counts_per_cell=1200)
    large_scanpy_object.X = large_scanpy_object.X.astype(dtype=float)
        
    # Cell filters are applied after downsampling, so the thresholds act on the downsampled counts.
    sc.pp.filter_cells(large_scanpy_object,min_genes=100)
    sc.pp.filter_cells(large_scanpy_object,min_counts=100)
    
    
    
    print("Read in")
#     This filtration is roughly analogous to the standard zheng, but it retains the UMI object. 

    sc.pp.filter_genes(large_scanpy_object, min_counts=10)         # only consider genes with more than 10 counts

    
    # Gene selection is computed on a normalized copy, so `large_scanpy_object` itself stays
    # unnormalized and the subset taken from it below still holds the filtered counts.
    scpy_copy = large_scanpy_object.copy()
    sc.pp.normalize_per_cell(scpy_copy)
    filter_result = sc.pp.filter_genes_dispersion(  # select highly-variable genes
        scpy_copy.X, flavor='cell_ranger', n_top_genes=2000, log=False
    )
    scpy_filtered = large_scanpy_object[:, filter_result.gene_subset].copy()     # subset the genes
    
    # Drop the reference to the full object before the next dataset is read.
    del(large_scanpy_object)
        
    # Take `.X` from a copy so the normalization calls below do not touch the stored matrix.
    # `umis` is rebound every iteration and stays bound to the last dataset's matrix after the loop.
    umis = scpy_filtered.copy().X

    sc.pp.normalize_per_cell(scpy_filtered)                 # renormalize after filtering
    sc.pp.log1p(scpy_filtered)                      # log transform: adata.X = log(adata.X + 1)
    sc.pp.scale(scpy_filtered)

#     print(f"zero mean:{np.sum(np.mean(umis,axis=0) == 0)}/{np.mean(umis,axis=0).shape}")
#     print(f"zero var:{np.sum(np.var(umis,axis=0) == 0)}")
    
    scanpy_objects.append(scpy_filtered)
    umi_objects.append(umis)

In [None]:
# Replace every entry of `umi_objects` that is not a plain numpy ndarray with a dense
# ndarray built from its `.todense()` result. The type and shape printed are those of
# the entry as it was before any conversion.
for i in range(len(umi_objects)):
    umi = umi_objects[i]
    print(type(umi))
    already_dense = type(umi) is np.ndarray
    if not already_dense:
        umi_objects[i] = np.array(umi.todense())
    print(umi.shape)

In [None]:
# Sanity check of every UMI matrix: print its type and shape, the number of columns whose
# mean / variance is exactly zero, and the log of the per-column mean and variance.
for i,umi in enumerate(umi_objects):
    # Per-column statistics (reduction over axis 0), computed once and reused below.
    column_means = np.mean(umi,axis=0)
    column_vars = np.var(umi,axis=0)

    print(type(umi))
    print(umi.shape)
    # The denominator must come from this dataset's own columns, not from the `umis`
    # variable left over from the loading cell (which is always the last dataset).
    print(f"zero mean:{np.sum(column_means == 0)}/{column_means.shape}")
    print(f"zero var:{np.sum(column_vars == 0)}")
    # Columns counted above as zero produce -inf here (numpy emits a divide-by-zero warning).
    print(np.log(column_means))
    print(np.log(column_vars))


In [None]:
# Here we lay out the parameter space to be explored.
#
# We set up certain default values, and then test how deviating from each of them, one
# parameter at a time, affects the clustering as judged by two agnostic measures:
# Silhouette and Calinski-Harabasz scores, per scikit-learn.

# Baseline value for every parameter used by the sweep cells.
defaults = dict(
    trees=100,
    braids=3,
    ifs=500,
    ofs=500,
    ss=200,
    depth=8,
    leaves=50,
    sfr=0.,
    forest_metric='ssme',
    regularization='l1',
    clustering_strategy='encoding',
    k=10,
    sub=.5,
    clustering_metric='cos',
)

# Values tried for each swept parameter; the key order is the order in which the sweeps run.
# Commented-out entries are parameters that are currently not swept.
alternatives = dict(
    trees=[30,100,300],
    braids=[1,3,5],
    ifs=[100,500,1000],
    ofs=[100,500,1000],
    ss=[50,200,1000],
#     depth=[4,8,10],
    leaves=[20,50,100,400],
    sfr=[0,0.5,1.],
#     forest_metric=['ssme','var'],
    clustering_strategy=['encoding','leaf'],
#     clustering_algorithm=['sdg','louvain'],
    k=[10,20,30],
    sub=[.3,.5,.8],
    clustering_metric=['cos','jaccard'],
)



In [None]:
import sys
sys.path.append('/localscratch/bbrener1/rusty_forest_v3/src')
# sys.path.append('../src')
import tree_reader as tr 
import lumberjack

from copy import deepcopy
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import calinski_harabasz_score,silhouette_score


In [None]:

# One-at-a-time parameter sweep on the first UMI matrix (index 0, the johnston_sc retina path).
# For every value in `alternatives`, a forest is fit with all other parameters at their
# defaults, its samples are clustered and plotted, and the forest plus both clustering
# scores are stored under the swept parameter's name.
umis = umi_objects[0]

# A single embedding (PCA to 100 components, then t-SNE) is computed once and handed to
# every forest so that all cluster plots share the same coordinates.
pca_scores = PCA(n_components=100).fit_transform(umis)
tsne = TSNE().fit_transform(pca_scores)

johnston_forests = {name: [] for name in alternatives}
johnston_calinski_harabasz = {name: [] for name in alternatives}
johnston_silhouette = {name: [] for name in alternatives}

for parameter, candidate_values in alternatives.items():
    print(f"Iterating over {parameter}")
    for alternative in candidate_values:
        print(f"Trying {alternative}")

        # Defaults with exactly one entry overridden.
        parameters = {**defaults, parameter: alternative}

        print(parameters)

        fit_options = {
            name: parameters[name]
            for name in ('trees', 'braids', 'ifs', 'ofs', 'ss', 'depth', 'leaves', 'sfr')
        }
        forest = lumberjack.fit(umis, **fit_options)

        strategy = parameters['clustering_strategy']
        metric_name = parameters['clustering_metric']
        cluster_options = {
            'k': parameters['k'],
            'sub': parameters['sub'],
            'metric': metric_name,
        }

        if strategy == 'encoding':
            if metric_name == 'cos':
                # Only the cosine variant passes a PCA dimension.
                forest.cluster_samples_encoding(pca=100, **cluster_options)
            elif metric_name == 'jaccard':
                forest.cluster_samples_encoding(**cluster_options)
            else:
                raise Exception
        elif strategy == 'leaf':
            forest.cluster_leaves_samples(**cluster_options)
            forest.cluster_samples_leaf_cluster()
        else:
            raise Exception

        forest.tsne_coordinates = tsne
        forest.plot_sample_clusters()

        # Scores are computed against the UMI matrix using the forest's sample labels.
        labels_ch = calinski_harabasz_score(umis, forest.sample_labels)
        johnston_forests[parameter].append(forest)
        johnston_calinski_harabasz[parameter].append(labels_ch)
        johnston_silhouette[parameter].append(silhouette_score(umis, forest.sample_labels))
        
        

In [None]:

# One-at-a-time parameter sweep on the fourth UMI matrix (index 3, the citeseq path).
# For every value in `alternatives`, a forest is fit with all other parameters at their
# defaults, its samples are clustered and plotted, and the forest plus both clustering
# scores are stored under the swept parameter's name.
umis = umi_objects[3]

# One shared embedding for all plots in this sweep.
# NOTE(review): TSNE() is created without a random_state, so the embedding may differ between runs.
tsne = TSNE().fit_transform(PCA(n_components=100).fit_transform(umis))

# The loop below appends to `citeseq_forests`, so the dict must be bound under that name
# (this also matches `johnston_forests`). `citeseq_forest` is kept as an alias of the same
# dict for any code that still uses the singular name.
citeseq_forests = {p:[] for p in alternatives.keys()}
citeseq_forest = citeseq_forests
citeseq_calinski_harabasz = {p:[] for p in alternatives.keys()}
citeseq_silhouette = {p:[] for p in alternatives.keys()}

for parameter in alternatives.keys():
    print(f"Iterating over {parameter}")
    for alternative in alternatives[parameter]:
        print(f"Trying {alternative}")
        
        # Defaults with exactly one entry overridden.
        parameters = deepcopy(defaults)
        parameters[parameter] = alternative
        
        print(parameters)

        forest = lumberjack.fit(
            umis,
            trees=parameters['trees'],
            braids=parameters['braids'],
            ifs=parameters['ifs'],
            ofs=parameters['ofs'],
            ss=parameters['ss'],
            depth=parameters['depth'],
            leaves=parameters['leaves'],
            sfr=parameters['sfr']
        )
        
        if parameters['clustering_strategy'] == 'encoding':
            if parameters['clustering_metric'] == 'cos':
                # Only the cosine variant passes a PCA dimension.
                forest.cluster_samples_encoding(
                    pca=100,
                    k=parameters['k'],
                    sub=parameters['sub'],
                    metric=parameters['clustering_metric'],
                )
            elif parameters['clustering_metric'] == 'jaccard':
                forest.cluster_samples_encoding(
                    k=parameters['k'],
                    sub=parameters['sub'],
                    metric=parameters['clustering_metric'],
                )
            else:
                # ValueError is still an Exception, but now says which value was rejected.
                raise ValueError(
                    f"Unknown clustering_metric: {parameters['clustering_metric']!r}"
                )
        elif parameters['clustering_strategy'] == 'leaf':
            forest.cluster_leaves_samples(
                k=parameters['k'],
                sub=parameters['sub'],
                metric=parameters['clustering_metric'],
            )
            forest.cluster_samples_leaf_cluster()
        else:
            raise ValueError(
                f"Unknown clustering_strategy: {parameters['clustering_strategy']!r}"
            )
        
        forest.tsne_coordinates = tsne
        forest.plot_sample_clusters()
        
    
        # Scores are computed against the UMI matrix using the forest's sample labels.
        citeseq_forests[parameter].append(forest)
        citeseq_calinski_harabasz[parameter].append(calinski_harabasz_score(umis,forest.sample_labels))
        citeseq_silhouette[parameter].append(silhouette_score(umis,forest.sample_labels))


In [None]:
# Write the current interpreter session to disk with dill.
import dill

session_file = "scanpy_calinski_silhouette.db"
dill.dump_session(session_file)


In [None]:
# Shell escape: long-format listing of the working directory with human-readable sizes
# (presumably to check the session dump written by the previous cell - confirm).
!ls -lh

In [None]:
# One bar chart per swept parameter: the Calinski-Harabasz score obtained on the johnston
# dataset for each alternative value of that parameter.
for parameter in alternatives.keys():
    
    jh_ch_results = johnston_calinski_harabasz[parameter]
    # One bar position per recorded score.
    jh_ar = np.arange(len(jh_ch_results))
    # Label each bar with the value that produced it. Labels are cut to the number of
    # recorded scores, so a partially completed sweep can still be plotted.
    jh_labels = [str(value) for value in alternatives[parameter][:len(jh_ch_results)]]
    plt.figure()
    plt.title(f"Calinski-Harabasz:{parameter}")
    # The keyword accepted by `bar` is `tick_label` (singular).
    plt.bar(jh_ar,jh_ch_results,tick_label=jh_labels)
    plt.show()

    

In [None]:
!mkdir forest_alternatives

In [None]:
# Back up every johnston forest to ./forest_alternatives/, naming each file after the
# swept parameter followed by the position of the alternative value that produced it.
for parameter in alternatives.keys():
    i = 0
    for forest in johnston_forests[parameter]:
        backup_path = f"./forest_alternatives/{parameter}{i}.forest"
        forest.backup(backup_path)
        i += 1

In [None]:
# Position 1 of the 'trees' sweep is trees=100, which equals defaults['trees'], so this
# forest was fit with every parameter at its default value.
trees_sweep_forests = johnston_forests['trees']
default_forest = trees_sweep_forests[1]