In [None]:
import dill
# Restores every variable from a previously dumped interpreter session, so the cells below may
# appear to work only because of state that was saved earlier.
# NOTE(review): load_session unpickles arbitrary objects - only load session files you created yourself.
# dill.dump_session("citeseq_session.db")
dill.load_session("citeseq_session.db")

In [None]:
import matplotlib as mpl
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram,linkage
from scipy.spatial.distance import pdist,squareform

# Directory holding the raw GEO csv files. Machine-specific absolute path: edit before running elsewhere.
raw_data_location = "/Users/bbrener1/taylor/raw_data/citeseq/"

In [None]:
# Shortcut: read back the processed matrices that the np.savetxt cell further down writes to the
# working directory, so the raw-data processing does not have to be repeated.
counts = np.loadtxt('citeseq_cbmc_counts.tsv')
fluorescence_counts = np.loadtxt('citeseq_cbmc_adt.tsv')
header = np.loadtxt('citeseq_cbmc_header.txt',dtype=str)
fluorescence_header = np.loadtxt('citeseq_cbmc_fluorescence_header.tsv',dtype=str)
# t_markers = np.loadtxt('cbmc_cell_markers.tsv')
# t_marker_header = np.loadtxt('cbmc_cell_marker_header.txt',dtype=str)
# fluorescence_coordinates = np.loadtxt('cbmc_fluorescence_coordiantes.tsv')
# gene_coordinates = np.loadtxt('cbmc_gene_coordinates.tsv')
umis = np.loadtxt('citeseq_cbmc_umis.tsv')
# fluorescence_umis = np.loadtxt('citeseq_cbmc_fluorescence_umis.tsv')

# Reprocessing CITESeq

## Table of Contents

* [Importing Data](#importing_data)
* [Paring Down Data To Relevant Elements](#feature_selection)
* [Processing Data](#data_processing)
* [Forest Analysis](#forest_analysis)

## Importing Data <a class="anchor" id="importing_data"></a>

In [None]:
# Source paper:
# https://www.nature.com/articles/nmeth.4380.pdf?origin=ppub

# Data is available on GEO, at GSE100866
# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE100866

In [None]:
# %cd ~/taylor/raw_data
# !wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE100nnn/GSE100866/suppl/GSE100866%5FCBMC%5F8K%5F13AB%5F10X%2DADT%5Fumi%2Ecsv%2Egz
# !wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE100nnn/GSE100866/suppl/GSE100866%5FCBMC%5F8K%5F13AB%5F10X%2DRNA%5Fumi%2Ecsv%2Egz
# !wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE100nnn/GSE100866/suppl/GSE100866%5FPBMC%5Fvs%5Fflow%5F10X%2DADT%5Fumi%2Ecsv%2Egz
# !wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE100nnn/GSE100866/suppl/GSE100866%5FPBMC%5Fvs%5Fflow%5F10X%2DRNA%5Fumi%2Ecsv%2Egz 

In [None]:
# %cd ~/taylor/raw_data
# %pwd
# !gunzip *.gz
# !ls -lh
# %cd ~/taylor/rusty_lumberjack/work

In [None]:
# Directory holding the raw GEO csv files (same value as assigned at the top of the notebook).
# Machine-specific absolute path: edit before running elsewhere.
raw_data_location = "/Users/bbrener1/taylor/raw_data/citeseq/"

In [None]:
# Loading 

# Numpy facilities for loading csvs are kind of dumb, so we count the number of columns in the target

# Number of commas in the first line of the RNA csv, i.e. the number of fields after the leading one.
columns = !head -n 1 {raw_data_location}/GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv | grep -o ',' | wc -l
columns = int(columns[0]) 

# Line count of the file minus one, so the first (header) line is not counted as a data row.
rows = !wc -l {raw_data_location}/GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv 
rows = int(rows[0].split()[0]) - 1


print(f"Rows: {rows}, Columns: {columns}")


In [None]:
# Integer matrix filled by the parsing cell below: one row per csv data line, one column per counted field.
umi_counts = np.zeros(shape=(rows, columns), dtype=int)


In [None]:
# Read the whole RNA UMI csv into memory as a list of raw text lines (header line first).
# readlines() stops at end of file just like the previous manual loop did, but an I/O or
# decoding error now surfaces instead of silently leaving `lines` truncated.
with open(raw_data_location+'GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv') as file:
    lines = file.readlines()

            


In [None]:
# Fill the count matrix row by row: skip the header line, drop the leading name field,
# and convert the remaining comma-separated fields to integers.
for row_index,text in enumerate(lines[1:]):
    fields = text.split(',')[1:]
    umi_counts[row_index] = np.array([int(value) for value in fields])

In [None]:
# The gene name is the first field of every data line, with the surrounding double quotes removed.
gene_header = np.array([text.split(',')[0].strip("\"") for text in lines[1:]])

In [None]:
# Cell identifiers are the header-line fields after the first, stripped of quotes and the trailing newline.
rna_cell_header = [cell_id.strip("\"\n") for cell_id in lines[0].split(',')[1:]]
rna_cell_header

In [None]:
# Saving the separated files in a numpy-friendly format
# (counts as whitespace-separated integers, gene names one per line)

np.savetxt(raw_data_location+'/GSE100866_CBMC_umi_counts.tsv',umi_counts,fmt='%i')
np.savetxt(raw_data_location+'/GSE100866_CBMC_gene_header.tsv',gene_header,fmt='%s')


In [None]:
# Read the whole ADT (antibody-derived tag) UMI csv into memory as raw text lines, header first.
# Note that this overwrites `lines` from the RNA file. readlines() stops at end of file just
# like the previous manual loop did, but an I/O or decoding error now surfaces instead of
# silently leaving `lines` truncated.
with open(raw_data_location+'/GSE100866_CBMC_8K_13AB_10X-ADT_umi.csv') as file:
    lines = file.readlines()


In [None]:
# Cell identifiers from the ADT header line: fields after the first, stripped of quotes and newline.
fluorescence_cell_header = [cell_id.strip("\"\n") for cell_id in lines[0].split(',')[1:]]
fluorescence_cell_header

In [None]:
# One row per data line of the ADT file; the column count is reused from the RNA file.
fluorescence_umis = np.zeros((len(lines)-1,columns))

# Skip the header line, drop the leading name field, convert the rest to integers.
for row_index,text in enumerate(lines[1:]):
    fields = text.split(',')[1:]
    fluorescence_umis[row_index] = np.array([int(value) for value in fields])

In [None]:
# The tag name is the first field of every ADT data line, with the surrounding double quotes removed.
fluorescence_header = np.array([text.split(',')[0].strip("\"") for text in lines[1:]])

In [None]:
# Display the parsed tag names.
fluorescence_header

In [None]:
# Persist the ADT counts and tag names in a numpy-friendly format.
# These files are written before the CCR7 truncation performed a few cells below.
np.savetxt(raw_data_location+'/GSE100866_CBMC_fluorescence_umis.tsv',fluorescence_umis,fmt='%i')
np.savetxt(raw_data_location+'/GSE100866_CBMC_fluorescence_header.tsv',fluorescence_header,fmt='%s')


In [None]:
## Sanity check: the RNA and ADT files must list the same cells in the same order.
## The expression evaluates to True when the two identifier lists are identical.

rna_cell_header == fluorescence_cell_header

In [None]:
# Persist the cell identifiers, one per line.
np.savetxt(raw_data_location+'/GSE100866_CBMC_cell_header.tsv',rna_cell_header,fmt='%s')

In [None]:
### Only applicable to CBMC

# Finally CCR7 is not actually present (all values 0), so let's truncate 
# Keeps the first 13 rows / names only.
# NOTE(review): assumes CCR7 is the row directly after index 12 - confirm against fluorescence_header.

fluorescence_umis = fluorescence_umis[:13]
fluorescence_header = fluorescence_header[:13]

In [None]:
## In case we need to reload, after clearing the string objects from memory:
# NOTE(review): the fluorescence files were written before the `[:13]` truncation in the previous
# cell, so running this cell restores the untruncated rows and names.

umi_counts = np.loadtxt(raw_data_location+'/GSE100866_CBMC_umi_counts.tsv')
fluorescence_umis = np.loadtxt(raw_data_location+'/GSE100866_CBMC_fluorescence_umis.tsv')

gene_header = np.loadtxt(raw_data_location+'/GSE100866_CBMC_gene_header.tsv',dtype=str)
fluorescence_header = np.loadtxt(raw_data_location+'/GSE100866_CBMC_fluorescence_header.tsv',dtype=str)

fluorescence_umis.shape

## Paring Down Data To Relevant Elements <a class="anchor" id="feature_selection"></a>

In [None]:
# UMI counts have to be interpreted as normalized expression values; many approaches exist.

# The source paper used a very basic normalization (no deconvolution or distribution compensation).
# That is fine for a preliminary analysis, so the same basic scheme is used here:
# scale every cell to 10,000 UMIs, take log10, and leave zero counts at exactly 0.

umi_sums = umi_counts.sum(axis=0)

# Broadcasting divides each column (cell) by that column's UMI total.
log_size_normalized = np.log10((umi_counts*10000) / umi_sums[np.newaxis,:])
# log10(0) is -inf; entries with no counts are reset to 0.
log_size_normalized[umi_counts == 0] = 0

In [None]:
# How many genes are meaningfully expressed in the data set, and how variable are they?
# Per-gene statistics are taken across cells (axis 1); the rankings are ascending argsorts.

means = log_size_normalized.mean(axis=1)
variance = log_size_normalized.var(axis=1)

mean_ranking = np.argsort(means)
variance_ranking = np.argsort(variance)

plt.figure()
plt.scatter(means,variance,s=1)
plt.title("Means Vs Variance For All Genes")
plt.xlabel("Means")
plt.ylabel("Variances")
plt.show()

# Variance of the 5000 genes with the highest mean, shaded by mean.
top_by_mean = mean_ranking[-5000:]
plt.figure()
plt.scatter(np.arange(5000),variance[top_by_mean],s=1,c=means[top_by_mean],cmap='binary')
plt.title("Mean-ranked Variance")
plt.colorbar()
plt.show()

# Fraction of cells with a nonzero count, for the 20000 genes with the highest mean.
detected_fraction = np.sum(umi_counts > 0,axis=1)/umi_counts.shape[1]
plt.figure()
plt.scatter(np.arange(20000),detected_fraction[mean_ranking[-20000:]],s=1)
plt.title("Sparsity")
plt.show()

Here we notice that there are two apparently distinct populations of genes. Recall that the source paper spiked in mouse cells, so the count matrix contains mouse genes as well; we would like to eliminate the mouse genes from further analysis:

In [None]:
### CBMC:
# Mouse genes appear after position 20401 in the gene header.
# Prints the first 20401 gene names in full (very long output).
print(list(gene_header[:20401]))

In [None]:
# Keep only the first 20401 rows (the cut-off named in the previous cell) of the normalized
# values, the gene names and the raw counts.
human_normalized = log_size_normalized[:20401]
human_gene_header = gene_header[:20401]
human_gene_umis = umi_counts[:20401]

In [None]:
# Mouse cells are removed as well (they can be examined separately later if needed).
# Following the source paper, a cell is considered human when more than 90% of its UMIs
# are aligned to human genes.

umi_sums = umi_counts.sum(axis=0)
umi_human_gene_sums = umi_counts[:20401].sum(axis=0)

human_fraction = umi_human_gene_sums/umi_sums
human_mask = human_fraction > .9

# Number of cells passing the threshold.
print(np.sum(human_mask))

# Histograms, in order: human-gene UMIs per cell, total UMIs per cell, human fraction per cell.
for per_cell_values in (umi_human_gene_sums,umi_sums,human_fraction):
    plt.figure()
    plt.hist(per_cell_values,bins=50)
    plt.show()


In [None]:
# Keep only the columns (cells) flagged as human.
# Running this cell twice fails or mis-filters, because two of the inputs are overwritten in place.
human_normalized = human_normalized[:,human_mask]
fluorescence_umis = fluorescence_umis[:,human_mask]
human_umi_counts = human_gene_umis[:,human_mask]

In [None]:
# Rerunning the earlier analysis, now restricted to human genes and human cells.
# Per-gene statistics are taken across cells (axis 1); the rankings are ascending argsorts.


means = np.mean(human_normalized,axis=1)
variance = np.var(human_normalized,axis=1)

mean_ranking = np.argsort(means)
variance_ranking = np.argsort(variance)

plt.figure()
plt.title("Means Vs Variance For All Genes")
plt.scatter(means,variance,s=1)
plt.xlabel("Means")
plt.ylabel("Variances")
plt.show()

plt.figure()
plt.title("Mean-ranked Variance")
plt.scatter(np.arange(5000),variance[mean_ranking[-5000:]],s=1,c=means[mean_ranking[-5000:]],cmap='binary')
plt.colorbar()
plt.show()

plt.figure()
plt.title("Variance-ranked Variance")
plt.scatter(np.arange(5000),variance[variance_ranking[-5000:]],s=1,c=means[variance_ranking[-5000:]],cmap='binary')
plt.colorbar()
plt.show()

# Fraction of cells with a nonzero count per gene. This uses the human-filtered count matrix so
# that the panel describes the same cells as the means and variances above (previously it was
# computed over every cell, including the ones removed by the human mask).
detected_fraction = np.sum(human_umi_counts > 0,axis=1)/human_umi_counts.shape[1]

plt.figure()
plt.title("Sparsity")
plt.scatter(np.arange(5000),detected_fraction[mean_ranking[-5000:]],s=1)
plt.show()

In [None]:
## We also want to examine the coefficient of variation to look at genes that may meaningfully
## contribute to differentiation

# Coefficient of variation = standard deviation / mean, for the 5000 highest-variance genes.
# (The previous expression divided the mean by the variance, which is not a CoV.)
top_variance_genes = variance_ranking[-5000:]
cov = np.sqrt(variance[top_variance_genes]) / means[top_variance_genes]

plt.figure()
plt.title("CoV for top genes")
plt.scatter(np.arange(5000),cov,s=1)
plt.show()

In all likelihood we are safe in paring down the output to only the 2000 most variable genes. This will probably leave out transcription factors and the like, but that is an acceptable trade-off for a preliminary analysis.

In [None]:
from sklearn.preprocessing import scale

# Row indices of the 2000 highest-variance human genes (argsort is ascending, so they sit at the tail).
top_variance_genes = variance_ranking[-2000:]

# Transposed so that rows are cells and columns are the selected genes.
truncated_counts = human_normalized[top_variance_genes].T
umis = human_umi_counts[top_variance_genes].T
header = human_gene_header[top_variance_genes]

# Alternative selection by mean expression, kept for reference:
# truncated_counts = human_normalized[mean_ranking[-7000:]].T
# header = human_gene_header[mean_ranking[-7000:]]

# Standardize every gene column.
counts = scale(truncated_counts)

for array in (counts,umis,fluorescence_umis,header):
    print(array.shape)

In [None]:
# The antibody-derived tags are normalized as well.
# The source paper calls for a Centered Log Ratio: log of each (count + 1) divided by the
# geometric mean of (count + 1) over all tags of the same cell.

from scipy.stats.mstats import gmean

# Geometric mean down each column, i.e. one value per cell.
f_gmeans = gmean(fluorescence_umis+1,axis=0)
# f_gmeans
# Broadcasting divides each column by that column's geometric mean.
fluorescence_counts = np.log((fluorescence_umis + 1) / f_gmeans[np.newaxis,:])



In [None]:
# Write the processed matrices and headers to the working directory.
# The reload cells in this notebook read these same file names.
np.savetxt("citeseq_cbmc_counts.tsv",counts)
np.savetxt("citeseq_cbmc_adt.tsv",fluorescence_counts)
np.savetxt("citeseq_cbmc_header.txt",header,fmt='%s')
np.savetxt("citeseq_cbmc_umis.tsv",umis)
np.savetxt("citeseq_cbmc_fluorescence_umis.tsv",fluorescence_umis)
np.savetxt("citeseq_cbmc_fluorescence_header.tsv",fluorescence_header,fmt='%s')

In [None]:
import numpy as np
import matplotlib.pyplot as plt


# Reload the processed matrices written by the previous cell (entry point for a fresh kernel).
counts = np.loadtxt('citeseq_cbmc_counts.tsv')
fluorescence_counts = np.loadtxt('citeseq_cbmc_adt.tsv')
header = np.loadtxt('citeseq_cbmc_header.txt',dtype=str)
fluorescence_header = np.loadtxt('citeseq_cbmc_fluorescence_header.tsv',dtype=str)
umis = np.loadtxt('citeseq_cbmc_umis.tsv')
fluorescence_umis = np.loadtxt('citeseq_cbmc_fluorescence_umis.tsv')

## Processing Data <a class="anchor" id="data_processing"></a>

In [None]:
# First let's take a quick look at the fluorescence and genetic expression data

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

In [None]:
# 2-D t-SNE embedding of the cells based on their normalized surface-marker values
# (transposed so that each row passed to TSNE is one cell).
# The seed is fixed so the embedding, and every plot drawn on it, is reproducible between runs.
fluorescence_coordinates = TSNE(random_state=0).fit_transform(fluorescence_counts.T)

plt.figure()
plt.title("TSNE Manifold Mapping of Fluorescence Values for Blood Cells")
plt.scatter(fluorescence_coordinates.T[0],fluorescence_coordinates.T[1],s=1)
plt.show()

In [None]:
# 2-D t-SNE embedding of the cells based on the first 20 principal components of gene expression.
# Both stochastic steps are seeded so the embedding is reproducible between runs.
gene_coordinates = TSNE(random_state=0).fit_transform(PCA(n_components=20,random_state=0).fit_transform(counts))
# gene_coordinates = TSNE(metric='cosine').fit_transform(counts)

plt.figure()
plt.title("TSNE Manifold Mapping of PCA Decomposed Gene Expression Values, Blood Cells ")
plt.scatter(gene_coordinates.T[0],gene_coordinates.T[1],s=1)
plt.show()

In [None]:
# Now we need to recreate the canonical labels based on antibody surface markers. We have markers as follows:
# (the row order printed here is what the hard-coded row indices in the gating cells below refer to)
print(fluorescence_header)

In [None]:
# Agglomerative clustering is also going to be a helpful way of looking at the data:

from scipy.cluster.hierarchy import dendrogram,linkage

# Leaf orders from average-linkage hierarchical clustering:
#   feature_sort      - columns of `counts` (genes), cosine distance
#   sample_sort       - rows of `counts` (cells), cosine distance
#   fluorescence_sort - rows of `fluorescence_counts` (markers), default metric
feature_sort = dendrogram(linkage(counts.T,metric='cosine',method='average'),no_plot=True)['leaves']
sample_sort = dendrogram(linkage(counts,metric='cosine',method='average'),no_plot=True)['leaves']
fluorescence_sort = dendrogram(linkage(fluorescence_counts,method='average'),no_plot=True)['leaves']

# Gene expression heatmap, cells and genes both in clustered order.
plt.figure(figsize=(20,20))
plt.imshow(counts[sample_sort].T[feature_sort].T,cmap='bwr',aspect='auto')
plt.show()

# Surface markers for the same cell order. The values are already centered-log-ratio
# transformed and can be negative, so they are plotted directly: wrapping them in
# log10(1 + x) turned every value at or below -1 into NaN/-inf.
plt.figure(figsize=(20,20))
plt.imshow(fluorescence_counts.T[sample_sort],aspect='auto')
plt.colorbar()
plt.show()

In [None]:

# We can map cell surface markers to cell identities using reference literature like the BD cell surface marker handbook:
# https://www.bdbiosciences.com/documents/cd_marker_handbook.pdf

# A basic mapping would be:

# CD3   + ::: T Cells
# CD2   + ::: T/B/NK (Adhesion)
# CD4   + ::: Granular cells (T, Macro, Granulo, early phase)

# CD8   + ::: T/NK (unclear since handbook lists 2 variants)
# CD45  + ::: T/B cell differentiation (not erythrocyte)

# CD56  + ::: T and NK (adhesion)
# CD57  + ::: T and NK (adhesion)
# CD16  + ::: T, Dendritic, NK, Macro, Granulo (IgG binding)
# CD10  + ::: NK, Endothelial 

# CD11c + ::: General marker (inflammatory response?)
# CD14  + ::: LPS response, Macrophage, Granulocyte


# CD19  + ::: B Cell, Dendritic, Stem marker
# CD34  + ::: Stem Cells, Endothelial 
# CCR5  + ::: Migration (T, Macro, Gran)
# CCR7  + ::: T/B/Dend


In [None]:
# We would like to cluster the cells, then examine the fluorescence value of each marker in each cluster
# (printed again so the row indices used in the following cells can be checked against the names)

print(fluorescence_header)

In [None]:
# Scatter of marker row 9 against marker row 0 for every cell.
# NOTE(review): the row indices are hard-coded; confirm rows 9 and 0 really are CD19 and CD3
# in fluorescence_header. The plotted values are the CLR-normalized counts, not TPM as the
# axis labels say.
plt.figure()
plt.title("All Cells")
plt.scatter(fluorescence_counts[9],fluorescence_counts[0],s=1)
plt.xlabel("CD19 (TPM)")
plt.ylabel("CD3 (TPM)")
plt.show()

In [None]:
# Positive gates on marker rows 0 and 9, with manually chosen thresholds.
cd3_p_mask = fluorescence_counts[0] > 1.2
cd19_p_mask = fluorescence_counts[9] > 1

# T cells: CD3 positive and CD19 negative. B cells: the reverse.
t_mask = cd3_p_mask & ~cd19_p_mask
b_mask = cd19_p_mask & ~cd3_p_mask

In [None]:
# Same scatter as above, coloured by the B-cell gate to check where the gate falls.
plt.figure()
plt.title("All Cells")
plt.scatter(fluorescence_counts[9],fluorescence_counts[0],s=1,c=b_mask)
plt.xlabel("CD19")
plt.ylabel("CD3")
plt.show()

In [None]:
# Next, separate CD4 from CD8 T cells using marker rows 1 and 2 (thresholds chosen by eye).

cd4_t = t_mask & ((fluorescence_counts[1] > 0) & (fluorescence_counts[2] < 1.5))
cd8_t = t_mask & ((fluorescence_counts[1] < 0) & (fluorescence_counts[2] > 3))

# Three views of the gated T cells: uncoloured, coloured by the CD4 gate, coloured by the CD8 gate.
for point_colors in (None,cd4_t[t_mask],cd8_t[t_mask]):
    plt.figure()
    plt.title("T Cells only (CD3+,CD19-)")
    plt.scatter(fluorescence_counts[2][t_mask],fluorescence_counts[1][t_mask],c=point_colors,s=1)
    plt.xlabel("CD8A")
    plt.ylabel("CD4")
    plt.show()


In [None]:
## Now let's examine how these are distributed in the TSNE:
# Gene-expression embedding, coloured by membership in the CD4 T-cell gate.

plt.figure()
plt.title("TSNE Manifold Mapping of PCA Decomposed Gene Expression Values, CD4+ T Cells Colored, Blood Cells ")
plt.scatter(gene_coordinates.T[0],gene_coordinates.T[1],c=cd4_t,s=1)
plt.show()

In [None]:
# Gene-expression embedding, coloured by membership in the CD8 T-cell gate.
plt.figure()
plt.title("TSNE Manifold Mapping of PCA Decomposed Gene Expression Values, CD8+ T Cells Colored, Blood Cells ")
plt.scatter(gene_coordinates.T[0],gene_coordinates.T[1],c=cd8_t,s=1)
plt.show()

In [None]:
## Last step: try to separate naive, memory and effector cells using marker rows 3 and 5.

for panel_title,subset in (("CD4 T Cells",cd4_t),("CD8 T Cells",cd8_t)):
    plt.figure()
    plt.title(panel_title)
    plt.scatter(fluorescence_counts[3][subset],fluorescence_counts[5][subset],s=1)
    plt.xlabel("CD2")
    plt.ylabel("CD57")
    plt.show()

# Gates (thresholds chosen by eye):
#   naive    - row 3 below 2.5 and row 5 below 1
#   effector - row 5 above 1
#   memory   - row 3 above 2.5 and row 5 below 1
cd4_t_naive = cd4_t & ((fluorescence_counts[3] < 2.5) & (fluorescence_counts[5] < 1))
cd4_t_57_effector = cd4_t & (fluorescence_counts[5] > 1)
cd4_t_memory = cd4_t & ((fluorescence_counts[3] > 2.5) & (fluorescence_counts[5] < 1))

cd8_t_naive = cd8_t & ((fluorescence_counts[3] < 2.5) & (fluorescence_counts[5] < 1))
cd8_t_57_effector = cd8_t & (fluorescence_counts[5] > 1)
cd8_t_memory = cd8_t & ((fluorescence_counts[3] > 2.5) & (fluorescence_counts[5] < 1))


In [None]:
# Same two scatters as in the previous cell, now coloured by the effector gate of each subset.
for panel_title,subset,effector_gate in (
    ("CD4 T Cells",cd4_t,cd4_t_57_effector),
    ("CD8 T Cells",cd8_t,cd8_t_57_effector),
):
    plt.figure()
    plt.title(panel_title)
    plt.scatter(fluorescence_counts[3][subset],fluorescence_counts[5][subset],c=effector_gate[subset],s=1)
    plt.xlabel("CD2")
    plt.ylabel("CD57")
    plt.show()


In [None]:
# Stack the boolean gates into one array, one row per label.
# Row order must match t_marker_header below.
t_markers = np.array([
    b_mask,
    t_mask,
    cd4_t,
    cd8_t,
    cd4_t_naive,
    cd4_t_memory,
    cd4_t_57_effector,
    cd8_t_naive,
    cd8_t_memory,
    cd8_t_57_effector])

# Human-readable name for each row of t_markers, in the same order.
t_marker_header = [
    "b_cell",
    "t_cell",
    "cd4_t_cell",
    "cd8_t_cell",
    "cd4_t_naive",
    "cd4_t_memory",
    "cd4_t_effector",
    "cd8_t_naive",
    "cd8_t_memory",
    "cd8_t_effector",
]

# t_markers.shape

In [None]:
# Persist the label masks, their names and both t-SNE embeddings to the working directory.
# The misspelled 'coordiantes' file name is also used by the reload cell, so keep the two in sync.
np.savetxt('cell_markers.tsv',t_markers)
np.savetxt('cell_marker_header.txt',t_marker_header,fmt="%s")
np.savetxt('fluorescence_coordiantes.tsv',fluorescence_coordinates)
np.savetxt('gene_coordinates.tsv', gene_coordinates)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Reload the files written by the previous cell.
# np.loadtxt returns floats by default, so the reloaded masks are 0.0/1.0 rather than booleans.
t_markers = np.loadtxt('cell_markers.tsv')
t_marker_header = np.loadtxt('cell_marker_header.txt',dtype=str)
fluorescence_coordinates = np.loadtxt('fluorescence_coordiantes.tsv')
gene_coordinates = np.loadtxt('gene_coordinates.tsv')

## Forest Analysis <a class="anchor" id="forest_analysis"></a>

In [None]:
## Here we begin the analysis of this data using random forest regression.

# First boilerplate imports of RFR
 
import sys
sys.path.append("../src/")
import lumberjack
import tree_reader as tr

In [None]:
# import dill
# # dill.dump_session("citeseq_session.db")
# dill.load_session("citeseq_session.db")

In [None]:
## Here we create a random forest using the rust software.

# Earlier configuration, fitted on raw UMI counts (kept for reference):
# forest = lumberjack.fit(
#     input_counts=umis,
#     output_counts=umis,
#     header=header,
#     ifs=500,
#     ofs=500,
#     ss=500,
#     trees=100,
#     depth=7,
#     leaves=100,
#     sfr=.5,
#     braids=3
# )

# Current configuration: the scaled expression matrix is used as both input and output.
# NOTE(review): the meaning of the keyword arguments is defined by the project-local
# lumberjack module and is not visible here - check its documentation before changing them.
forest = lumberjack.fit(
    input_counts=counts,
    output_counts=counts,
    header=header,
    ifs=700,
    ofs=700,
    ss=500,
    trees=100,
    depth=9,
    leaves=100,
    sfr=.5,
    norm='l1',
    braids=3,
    reduce_inputs="",
    reduce_outputs="",
)

forest.set_cache(True)

In [None]:
# Creating a random forest is a computationally expensive procedure, so we'll back up our results and generally won't 
# rerun them unless necessary
# The name passed here is the one the reconstitute cell below reads back.

# forest.backup('citeseq_cbmc_l2_plain')
forest.backup('citeseq_cbmc_l1_double_experimental')
# forest.backup('citeseq_cmbc_forest_l1_umi_cache')

!ls -lh

In [None]:
# Here we can reconstitute previously backed up forests we have created. 
# Entry point for a fresh kernel: repeats the project imports and restores the forest saved
# under the same name by the backup cell above.

import sys
sys.path.append("../src/")
import lumberjack
import tree_reader as tr

forest = tr.Forest.reconstitute('citeseq_cbmc_l1_double_experimental')
forest.arguments

In [None]:

# Representation of the nodes selected with depth=8, in 'additive_mean' mode with pca=100.
# NOTE(review): the exact meaning of these options is defined in tree_reader - confirm there.
rep = forest.node_representation(forest.nodes(depth=8), mode='additive_mean',pca=100)
rep.shape

In [None]:
# ts = forest.tsne(pca=100)

# left = forest.trees[1].root.children[0].children[0].children[0].samples
# right = forest.trees[1].root.children[0].children[0].children[1].samples

# left_mask = np.zeros(len(forest.samples),dtype=bool)
# left_mask[left] = True
# # left_mask = forest.trees[1].root.children[0].children[0].encoding()

# right_mask = np.zeros(len(forest.samples),dtype=bool)
# right_mask[right] = True

# plt.figure()
# plt.scatter(*ts.T,s=1,c=left_mask)
# plt.show()



In [None]:
# filter_reduction = forest.trees[1].root.children[1].children[0].filter
# # filter_reduction['reduction']['features']


# selected = forest.input.T[filter_reduction['reduction']['features']].T
# subtracted = selected.copy()
# for row in subtracted:
#     row -= filter_reduction['reduction']['means']
# # reduced = np.dot(subtracted,filter_reduction['reduction']['scores'])
# reduced /= np.sum(subtracted,axis=1)

# plt.figure()
# plt.scatter(*ts.T,s=1,c=reduced)
# plt.show()

# plt.hist(reduced,bins=20,log=True)

In [None]:
# Here we would like to produce a figure demonstrating the possibility of grouping nodes
# `dendrogram` and `linkage` are expected to be in scope from an earlier import cell.

from scipy.spatial.distance import squareform,pdist
representation = forest.node_representation(forest.nodes(depth=4,root=False),mode='additive')
# distance = pdist(representation,metric='cosine')
# np.isnan(distance).any()

# Leaf order of an average-linkage clustering of the node representations (Jaccard distance).
node_sort = dendrogram(linkage(representation,metric='jaccard',method='average'),no_plot=True)['leaves']

# np.sum(representation,axis=1).shape

# plt.figure()
# plt.hist(np.sum(representation,axis=1),log=True,bins=100)
# plt.show()

In [None]:
## Now we must find common marginal effects of splits in the random forest.
# We can do this by clustering vectors representing the marginal effect of each split
# Existing split clusters are cleared first, then recomputed with the options below.
# NOTE(review): option semantics are defined in tree_reader - confirm there.

forest.reset_split_clusters()
forest.interpret_splits(mode='additive_mean',relatives=True,pca=100,metric="cosine",depth=5,k=100,override=False,no_plot=True)



In [None]:
# forest.split_clusters[]

In [None]:
# Alternative that was tried earlier:
# forest.most_likely_tree(depth=5)
# NOTE(review): presumably relates the split clusters computed above to each other - confirm in tree_reader.
forest.maximum_spanning_tree(mode="samples",depth=5)


In [None]:
# Recompute the forest's t-SNE with pca=100; override=True presumably discards a cached embedding (confirm in tree_reader).
forest.tsne(override=True,pca=100)

In [None]:
## We want to examine the common splits in two ways: 

# Cell populations they affect and effects of each split
forest.html_tree_summary(n=10)

In [None]:
# Give split cluster 8 a descriptive name, then display the split clusters so that the new
# name can be checked. (The previous `forest.split_clusters[]` was an empty subscript, which
# is a syntax error and stopped the whole cell from running.)
forest.split_clusters[8].set_name("HLA+,S100-")
forest.split_clusters

In [None]:
# Display the error ratio reported by split cluster 8 (definition lives in tree_reader).
forest.split_clusters[8].error_ratio()

In [None]:
# Alternative clustering calls that were tried are kept commented out.
# forest.cluster_samples_simple(sub=.5,k=20,pca=20,metric='cosine',override=True)
# Clear any existing sample clusters before re-clustering.
forest.reset_sample_clusters()

# NOTE(review): option semantics (depth, k, pca) are defined in tree_reader - confirm there.
forest.cluster_samples_encoding(metric='euclidean',depth=8,k=50,override=True,pca=100)
# forest.cluster_samples_simple(sub=.5,metric='cosine',k=30,pca=20,override=True,no_plot=False)
# len(forest.sample_clusters)

In [None]:
# Output features beyond the first 2000, i.e. anything appended after the 2000 selected genes.
forest.output_features[2000:]

In [None]:
# Compute (or reuse) the forest's own t-SNE coordinates, then plot the sample clusters.
forest.tsne(pca=100)
# forest.tsne_coordinates = fluorescence_coordinates
forest.plot_sample_clusters()

In [None]:
# Surface-marker t-SNE embedding, coloured by the forest's sample cluster label of each cell.
plt.figure()
plt.title("TSNE Manifold Mapping of Sample Clusters onto Fluorescence Coordinates")
plt.scatter(fluorescence_coordinates.T[0],fluorescence_coordinates.T[1],s=1,c=forest.sample_labels,cmap='rainbow')

plt.show()

In [None]:
# Clear existing leaf clusters, then re-cluster the leaves (Jaccard metric).
# NOTE(review): `sub` and `k` semantics are defined in tree_reader - confirm there.
forest.reset_leaf_clusters()
forest.cluster_leaves_samples(sub=.5,k=20,metric="jaccard",override=True)


In [None]:
# forest.tsne(pca=100,override=True)
# Replace the forest's stored embedding with the surface-marker embedding, so later forest
# plots that read tsne_coordinates are drawn on the fluorescence layout.
forest.tsne_coordinates = fluorescence_coordinates

In [None]:
# One sample-count plot per leaf cluster.
for cluster in forest.leaf_clusters:
    cluster.plot_sample_counts()

In [None]:
# Discard the current sample clusters and derive new ones from the leaf clusters.
# This replaces the labels produced by cluster_samples_encoding above.
forest.reset_sample_clusters()
forest.cluster_samples_leaf_cluster()

In [None]:
###########################################################################
###########################################################################
###########################################################################
###########################################################################
###########################################################################
###########################################################################
###########################################################################
###########################################################################
###########################################################################
###########################################################################

In [None]:
# Surface-marker embedding coloured by sample cluster, with one large marker at the mean
# position (centroid) of every cluster.
plt.figure()
plt.title("TSNE Manifold Mapping of Sample Clusters onto Fluorescence Coordinates")
plt.scatter(fluorescence_coordinates.T[0],fluorescence_coordinates.T[1],s=1,c=forest.sample_labels,cmap='rainbow')
for cluster in set(forest.sample_labels):
    members = np.array(forest.sample_labels) == cluster
    centroid_x,centroid_y = np.mean(fluorescence_coordinates[members],axis=0)
    plt.scatter([centroid_x],[centroid_y],s=100)
plt.show()

In [None]:
# Surface-marker embedding coloured by the value of marker row 9.
# NOTE(review): the title still says "Sample Clusters" although the colour is a marker value.
plt.figure()
plt.title("TSNE Manifold Mapping of Sample Clusters onto Fluorescence Coordinates")
plt.scatter(fluorescence_coordinates.T[0],fluorescence_coordinates.T[1],s=1,c=fluorescence_counts[9])
plt.show()

In [None]:
# Forest coordinates coloured by the value of marker row 2.
# The coordinates are requested once and reused for both axes, so x and y are guaranteed to
# come from the same result and the call is not repeated.
forest_coordinates = forest.coordinates(no_plot=True)

plt.figure()
plt.title("TSNE Manifold Mapping of Sample Clusters onto Fluorescence Coordinates")
plt.scatter(forest_coordinates.T[0],forest_coordinates.T[1],s=1,c=fluorescence_counts[2])
plt.show()

In [None]:
# Display the marker names (row order of fluorescence_counts).
fluorescence_header

In [None]:
# from scipy.cluster.hierarchy import dendrogram,linkage

# feature_sort = dendrogram(linkage(counts.T,metric='cosine',method='average'),no_plot=True)['leaves']
# sample_sort = np.argsort(forest.sample_labels)

# Gene expression heatmap; uses the sample_sort / feature_sort orders currently in the kernel.
plt.figure(figsize=(20,20))
plt.imshow(counts[sample_sort].T[feature_sort].T,cmap='bwr',aspect='auto',interpolation='none')
plt.show()

# Surface markers in the same cell order. The tick positions are derived from the header
# length instead of a hard-coded 13, so ticks and labels cannot get out of step.
plt.figure(figsize=(20,20))
plt.imshow(fluorescence_counts.T[sample_sort],aspect='auto',interpolation='none')
plt.xticks(np.arange(len(fluorescence_header)),fluorescence_header)
plt.show()


In [None]:
# Display the sample-cluster by feature matrix computed by the forest.
forest.sample_cluster_feature_matrix()

In [None]:
# Sample-cluster by feature matrix, and the leaf order of an average-linkage clustering of
# its rows (cosine distance). These two assignments were commented out even though the plot
# below needs both names, so the cell failed on a fresh kernel.
sample_cluster_features = forest.sample_cluster_feature_matrix()
sample_cluster_sort = dendrogram(linkage(sample_cluster_features,metric='cosine',method='average'),no_plot=True)['leaves']
# sample_cluster_feature_sort = dendrogram(linkage(sample_cluster_features.T,metric='cosine',method='average'),no_plot=True)['leaves']

# Heatmap with clusters in clustered order and features in the gene order (feature_sort)
# computed in the agglomerative clustering cell.
plt.figure(figsize=(20,20))
# plt.imshow(np.log10(sample_cluster_features[sample_cluster_sort].T[feature_sort].T),cmap='bwr',aspect='auto')
plt.imshow(sample_cluster_features[sample_cluster_sort].T[feature_sort].T,cmap='bwr',aspect='auto',interpolation='none')
# Label each row with the original index of the cluster shown there.
plt.yticks(np.arange(len(forest.sample_clusters)),np.arange(len(forest.sample_clusters))[sample_cluster_sort])
plt.colorbar()
plt.show()



In [None]:
# focused = sample_cluster_features[sample_cluster_sort].T[feature_sort].T[-4:]
# feature_resort = dendrogram(linkage(focused.T,metric='cosine',method='average'),no_plot=True)['leaves']

# plt.figure(figsize=(20,5))
# plt.imshow(focused,cmap='bwr',aspect='auto')
# plt.show()

# plt.figure(figsize=(5,30))
# plt.imshow(focused.T[feature_resort],cmap='bwr',aspect='auto')
# plt.show()


In [None]:
# Surface markers for every cell, in sample_sort order.
# The title said "Mouse", but mouse cells were removed by the human-fraction mask earlier in
# the notebook, so the cells shown here are the human ones.
plt.figure(figsize=(5,5))
plt.title("Human Blood Surface Markers (Checker Pattern by Unsupervised Cluster)")
plt.imshow(fluorescence_counts.T[sample_sort],aspect='auto',interpolation='none')
plt.xticks(np.arange(len(fluorescence_header)),fluorescence_header,rotation=45)
plt.show()


In [None]:
## Next: look at how the fluorescence features distribute over forest splits and sample clusters.
# Every marker row is registered with the forest as an additional output feature.
# Re-running this cell calls add_output_feature again for every marker.

fluorescence_features = list(fluorescence_header) 
fluorescence_feature_values = np.array(list(fluorescence_counts))

print(fluorescence_features)
print(len(fluorescence_features))
print(fluorescence_feature_values.shape)

for marker_index,marker_name in enumerate(fluorescence_features):
    forest.add_output_feature(fluorescence_feature_values[marker_index],marker_name)

In [None]:
# Output features beyond the first 2000; after the previous cell this should list the added markers.
forest.output_features[2000:]

In [None]:
# sample_cluster_fluorescence = forest.sample_cluster_feature_matrix(fluorescence_features)
# print(sample_cluster_fluorescence.shape)

# plt.figure(figsize=(20,20))
# plt.imshow(sample_cluster_fluorescence,aspect='auto')
# plt.yticks(np.arange(len(forest.sample_clusters)),np.arange(len(forest.sample_clusters)))
# plt.xticks(np.arange(13),fluorescence_features)
# plt.ylim(-.5,len(sample_cluster_fluorescence)-.5)
# plt.xlabel("Surface markers")
# plt.ylabel("Cluster (Unannotated)")
# cb = plt.colorbar()
# cb.set_label("Expression (Log10 TPM)")
# plt.show()

# plt.figure(figsize=(20,20))
# plt.imshow(sample_cluster_fluorescence[sample_cluster_sort],aspect='auto')
# plt.yticks(np.arange(23),np.arange(23)[sample_cluster_sort])
# plt.xticks(np.arange(20),fluorescence_features)
# plt.colorbar()
# plt.show()

# plt.figure(figsize=(20,20))
# plt.imshow(sample_cluster_fluorescence[sample_cluster_sort][:,:10],aspect='auto')
# plt.yticks(np.arange(23),np.arange(23)[sample_cluster_sort])
# plt.xticks(np.arange(10),fluorescence_features[:10])
# plt.colorbar()
# plt.show()

# plt.figure(figsize=(10,10))
# plt.imshow(sample_cluster_fluorescence[sample_cluster_sort],aspect='auto')
# plt.yticks(np.arange(len(forest.sample_clusters)),np.arange(len(forest.sample_clusters))[sample_cluster_sort])
# plt.xticks(np.arange(13),fluorescence_features,rotation=45)
# plt.ylim(-.5,len(sample_cluster_fluorescence)-.5)
# plt.xlabel("Surface markers")
# plt.ylabel("Cluster (Unannotated)")
# cb = plt.colorbar()
# cb.set_label("Expression (Log10 TPM)")
# plt.show()


In [None]:
## We want to compare distances of the samples in tree space vs other distance metrics
# All three results are square (sample x sample) distance matrices.


from scipy.spatial.distance import squareform,pdist

# Expression space: only the first 2000 output columns are used.
cosine_distance = squareform(pdist(forest.output[:,:2000],metric='cosine'))
euclidean_distance = squareform(pdist(forest.output[:,:2000],metric='euclidean'))

# Tree space: Jaccard distance between the rows of the leaf sample encoding.
# NOTE(review): assumes node_sample_encoding returns one row per sample - confirm in tree_reader.
tree_jaccard_distance = squareform(pdist(forest.node_sample_encoding(forest.leaves()),metric='jaccard'))

In [None]:
# Leaf order of an average-linkage clustering for each distance matrix.
# linkage() only treats its input as precomputed distances when it is given in condensed
# (1-D) form; a square matrix is interpreted as a table of observation vectors and clustered
# by Euclidean distance between its rows. squareform() converts the square matrices back to
# condensed form so the intended cosine / Euclidean / Jaccard distances are actually used.
c_agg_sort = dendrogram(linkage(squareform(cosine_distance,checks=False),method='average'),no_plot=True)['leaves']
e_agg_sort = dendrogram(linkage(squareform(euclidean_distance,checks=False),method='average'),no_plot=True)['leaves']
j_agg_sort = dendrogram(linkage(squareform(tree_jaccard_distance,checks=False),method='average'),no_plot=True)['leaves']

In [None]:
# One heatmap per distance matrix, with rows and columns both reordered by that matrix's
# own clustering order (cosine, then Euclidean, then tree Jaccard).
for distance_matrix,leaf_order in (
    (cosine_distance,c_agg_sort),
    (euclidean_distance,e_agg_sort),
    (tree_jaccard_distance,j_agg_sort),
):
    plt.figure()
    plt.imshow(distance_matrix[leaf_order].T[leaf_order])
    plt.show()

In [None]:
# Annotations: 

# Based on Figure 3, see paper

# 13 & 2: Natural Killers, CD16+ & CD56+ & CD45++
# 8: CD 16 MONO CD16+ & CD14 & CD11
# 14: Precursors, bright CD34
# 11: B cells CD- & CD19+ & CD45+ & CD16-
# 16&17 Star Doublets?
# 3,10,4,9,0,: CD4 T? 
# 6: CD8 T?
# 5 & 1 CD14 Mono DC? 
# 15&7 CD14 Mono Plain (CD3+,CD8+)

In [None]:
# Inspect the encoding of the first node returned by forest.nodes().
forest.nodes()[0].encoding()

In [None]:
# We want to see how factors compare to various surface markers
# Later cells index this matrix as [cell, factor].

factor_matrix = forest.factor_matrix()
factor_matrix.shape

In [None]:
# Order the cells by average-linkage clustering of their factor values (cosine distance).
# This assignment was commented out although the plot below, and the surface-marker plot in
# the next cell, need sample_agg - the cells failed on a fresh kernel.
sample_agg = dendrogram(linkage(factor_matrix,metric='cosine',method='average'),no_plot=True)['leaves']
# factor_agg = dendrogram(linkage(factor_matrix.T,metric='cosine',method='average'),no_plot=True)['leaves']

# plt.figure()
# plt.subplot()
# plt.imshow(factor_matrix[sample_agg].T[factor_agg].T,aspect='auto',cmap='bwr',interpolation='none',vmin=-1,vmax=1)
# plt.show()

# Factor values per cell, cells in clustered order, colour scale clipped to [-1, 1].
# The title said "Mouse", but mouse cells were removed by the human-fraction mask earlier in
# the notebook, so the cells shown here are the human ones.
plt.figure()
plt.title("Human Blood Cells, Extracted Factors")
plt.imshow(factor_matrix[sample_agg],aspect='auto',cmap='bwr',interpolation='none',vmin=-1,vmax=1)
plt.xlabel("Factors")
plt.ylabel("Cells")
plt.colorbar()
plt.show()

In [None]:
# Surface markers with the cells in the same sample_agg order as the factor heatmap, so the
# two figures can be compared row by row.
plt.figure()
plt.title("Surface Markers, Identical Sort")
plt.imshow(fluorescence_counts.T[sample_agg],aspect='auto',interpolation='none')
plt.ylabel("Cells")
plt.xticks(np.arange(len(fluorescence_header)),fluorescence_header,rotation=45)
plt.show()

In [None]:
# One figure per surface marker: the forest's stored t-SNE coordinates coloured by that marker.
# `fluorescence_features` comes from the cell that registers the markers as output features.
for ff,fn in zip(fluorescence_counts,fluorescence_features):
    plt.figure()
    plt.title(fn)
    plt.scatter(*forest.tsne_coordinates.T,c=ff,s=2,alpha=.4)
    plt.show()

In [None]:
from scipy.spatial.distance import cdist,squareform

# Correlation *distance* (1 - Pearson r) between every marker and every factor column except
# the first, which is skipped by the [1:] slice. Column j therefore refers to
# factor_matrix[:, j+1].
correlations = cdist(fluorescence_counts,factor_matrix.T[1:],metric='correlation')
correlations.shape

In [None]:
# Heatmap of Pearson correlation between markers (rows) and factors (columns).
# `correlations` holds correlation distances (1 - r), so r is recovered as 1 - distance.
# The previous `correlations - 1` plotted -r, i.e. every colour had the wrong sign.
plt.figure()
plt.imshow(1 - correlations,cmap='bwr',interpolation='none',vmin=-1,vmax=1)
plt.colorbar()
plt.show()

In [None]:
# Largest value taken by factor column 16 (ad-hoc inspection).
np.max(factor_matrix[:,16])

In [None]:
# For every marker, find the factor with the largest absolute Pearson correlation and plot
# the marker against it. `correlations` holds distances (1 - r) and was computed without the
# first factor column, hence the +1 when indexing factor_matrix.
for i,fn in enumerate(fluorescence_features):
    ff = fluorescence_counts[i]
    pearson_row = 1 - correlations[i]
    fmax = np.argmax(np.abs(pearson_row))
    plt.figure()
    plt.title(f"{(fn,fmax)}, Pearson:{np.around(1 - correlations[i,fmax],3)}")
    plt.scatter(ff,factor_matrix[:,fmax+1])
    plt.xlabel(f"{fn}")
    plt.ylabel(f"Factor {fmax}")
    plt.show()

In [None]:
from sklearn.decomposition import PCA

# Baseline for comparison with the forest factors: 21 principal components of the forest output.
pcs = PCA(n_components=21).fit_transform(forest.output)

# Correlation *distance* (1 - Pearson r) between every marker and every principal component.
# `cdist` is imported in the factor-correlation cell above.
pc_correlations = cdist(fluorescence_counts,pcs.T,metric='correlation')
pc_correlations.shape


In [None]:
# Heatmap of Pearson correlation between markers (rows) and principal components (columns).
# `pc_correlations` holds correlation distances (1 - r), so r is recovered as 1 - distance.
# The previous `pc_correlations - 1` plotted -r, i.e. every colour had the wrong sign.
plt.figure()
plt.imshow(1 - pc_correlations,cmap='bwr',interpolation='none',vmin=-1,vmax=1)
plt.colorbar()
plt.show()

In [None]:
# For every marker, find the principal component with the largest absolute Pearson
# correlation and plot the marker against it.
# `pc_correlations` holds distances (1 - r); the title now reports r itself, matching the
# factor plots, instead of -r.
for i,(ff,fn) in enumerate(zip(fluorescence_counts,fluorescence_features)):
    fmax = np.argmax(np.abs(1 - pc_correlations[i]))
    plt.figure()
    plt.title(f"{(fn,fmax)},{1 - pc_correlations[i,fmax]}")
    plt.scatter(ff,pcs[:,fmax])
    plt.show()

In [None]:
# plt.figure()
# plt.scatter(factor_matrix[:,18],fluorescence_counts[9])
# plt.show()
# Correlation distance (1 - r) between marker row 9 and column 17 of `correlations`,
# which corresponds to factor_matrix[:, 18] because the first factor column was skipped.
correlations[9,17]

In [None]:
# One figure per principal component: the forest's stored t-SNE coordinates coloured by the
# cells' scores on that component.
for pc in pcs.T:
    plt.figure()
    plt.scatter(*forest.tsne_coordinates.T,c=pc,cmap='bwr')
    plt.colorbar()
    plt.show()
