In [1]:
import matplotlib as mpl
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram,linkage
from scipy.spatial.distance import pdist,squareform

# Directory used for the downloaded raw data and for the intermediate files this
# notebook writes and reloads. Hard-coded absolute path: adjust it when running on
# another machine.
raw_data_location = "/localscratch/bbrener1/c_elegans_raw_data/"

In [None]:
# Preprocessing of single cell RNAseq data for C. elegans. 

# Source paper: Cao, Junyue, Jonathan S. Packer, Vijay Ramani, Darren A. Cusanovich, Chau Huynh, Riza Daza, Xiaojie Qiu et al. "Comprehensive single-cell transcriptional profiling of a multicellular organism." Science 357, no. 6352 (2017): 661-667.

# URL: https://science.sciencemag.org/content/357/6352/661.abstract

In [None]:
# Download the gzipped .Rdata count matrix for GEO sample GSM2599701 into the raw
# data directory.

%cd {raw_data_location}

!wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2599nnn/GSM2599701/suppl/GSM2599701%5FGene%2Ecount%2Ematrix%2Ecelegans%2Ecell%2ERdata%2Egz

In [None]:
# The download is an .Rdata file, so R is needed to read it.
%cd {raw_data_location}
# Decompression step, currently disabled:
# !gunzip *.gz

# R was used to convert the (completely unfiltered) matrix to a sparse matrix format;
# that R step is not part of the cells shown here.

In [None]:
# mtx.mtx is read as a plain numeric table. Columns 0 and 1 are used further down as
# (row, column) indices and column 2 as the values, so the largest entry of each
# index column gives the extent of the matrix along that axis.

sparse_c_e = np.loadtxt('mtx.mtx')
rows, columns = (sparse_c_e[:, index_column].max() for index_column in (0, 1))

In [None]:
# Report the largest row and column index found in the triplet table.
print("rows:{},columns:{}".format(rows, columns))

In [None]:
# The triplet indices come from R and start at 1 (per the original note), so the
# matrix is allocated one row and one column larger than the largest index.
from scipy.sparse import coo_matrix

row_index = sparse_c_e[:, 0].astype(int)
column_index = sparse_c_e[:, 1].astype(int)
matrix_shape = (int(rows + 1), int(columns + 1))
sparse_c_e_np = coo_matrix(
    (sparse_c_e[:, 2], (row_index, column_index)),
    shape=matrix_shape,
    dtype=float,
)
sparse_c_e_np.shape

In [None]:
# raw_header.txt still contains the header line of the R row-name table, which
# (per the original note) makes its length line up with the padded matrix above.
# Only the second column of the table is kept.
header = np.loadtxt('raw_header.txt', dtype='str')[:, 1]

In [None]:
# Display the shape of the header array loaded above.
header.shape

In [None]:
# Remove leading and trailing double quotes from every entry, writing the cleaned
# names back into the same array.
header[:] = [entry.strip('""') for entry in header]

In [None]:
# NOTE(review): bare reference to the function. It computes nothing and only displays
# the function object; looks like a leftover cell that can be removed.
np.sum

In [None]:
# Number of columns (cells) whose total read count is positive. The author's note
# for this cell records that all cells have at least one read.
(sparse_c_e_np.sum(axis=0) > 0).sum()

In [None]:
# Keep the features (rows) whose total read count across all cells is greater than 10.
feature_totals = np.array(sparse_c_e_np.sum(axis=1)).ravel()
feature_mask = feature_totals > 10

In [None]:
# Number of features that pass the read-count filter.
np.sum(feature_mask)

In [None]:
# Boolean row selection needs a row-oriented format, so convert the COO matrix to CSR.
#
# feature_mask was computed over every row of sparse_c_e_np, including the empty
# padding row 0 (whose total of 0 never passes the filter), so it has to be applied
# unshifted. Slicing it with [1:] made the mask one element shorter than the matrix
# and the header and moved every flag up by one position, so row i was kept or
# dropped according to the read total of row i + 1.
assert feature_mask.shape[0] == sparse_c_e_np.shape[0] == header.shape[0], \
    "feature mask, count matrix and header must describe the same rows"
feature_filtered = sparse_c_e_np.tocsr()[feature_mask]
filtered_header = header[feature_mask]

In [None]:
# Distribution of total reads per cell (column sums) after feature filtering,
# using unit-width bins with edges from 0 to 99.

per_cell_reads = np.array(feature_filtered.sum(axis=0)).flatten()
plt.figure()
plt.hist(per_cell_reads, bins=np.arange(0, 100))

In [None]:
# Number of cells whose total read count over the retained features exceeds 1000.
np.sum(np.sum(feature_filtered,axis=0) > 1000)

In [None]:
# A substantial share of cells (~30% per the original note) shows more than 1000
# UMIs, so the analysis is restricted to those cells, at least for now.
cell_totals = np.array(feature_filtered.sum(axis=0)).ravel()
cell_filter = cell_totals > 1000

In [None]:
# Select the retained cells and orient the result as cells x features in ONE
# assignment. This used to be a select-and-transpose-back cell followed by a separate
# `double_filtered = double_filtered.T` cell; that second cell flipped the matrix
# again every time it was re-run, so the orientation depended on execution history.
double_filtered = feature_filtered.T[cell_filter]

In [None]:
# Shape of the matrix after both filters (cells x features once transposed).
double_filtered.shape

In [None]:
# Our matrix is still too large to operate on directly. Before we begin filtering by variance, however, we should normalize by size (total reads per cell).

In [None]:
# Histogram (50 bins) of total reads per retained cell, i.e. the row sums of the
# doubly filtered matrix.
reads_per_cell = np.array(double_filtered.sum(axis=1)).flatten()
plt.figure()
plt.hist(reads_per_cell, bins=50)

In [None]:
# Total reads per retained cell as a flat array; the shape is displayed as a check.
cell_sums = np.array(double_filtered.sum(axis=1)).ravel()
cell_sums.shape

In [None]:
# Scale every cell (row) so that its values sum to one million.
#
# The counts are densified explicitly and divided by a broadcast column of per-cell
# totals. The earlier form built a complete dense divisor with np.tile (a second
# full-size array) and relied on `sparse / dense` returning a dense matrix; SciPy
# releases that keep that result sparse make the later `1 + size_corrected` fail.
size_corrected = double_filtered.toarray() / cell_sums[:, np.newaxis] * 1000000


In [None]:
# plt.figure()
# plt.hist(np.array(np.sum(size_corrected,axis=1)).flatten(),bins=50)
# plt.show()

In [None]:
# plt.figure()
# plt.hist(np.array(size_corrected).flatten(),bins=np.arange(0,1000,50),log=True)
# plt.show()

In [None]:
# log10(x + 1) transform of the size-normalised values.
log_size_corrected = np.log10(size_corrected + 1)

In [None]:
# Per-feature (column-wise) mean and variance of the log-transformed values, plus
# the index orders that sort the features by each statistic in ascending order.
means = np.array(log_size_corrected.mean(axis=0)).ravel()
variances = np.array(log_size_corrected.var(axis=0)).ravel()

mean_sort, var_sort = means.argsort(), variances.argsort()

In [None]:
# Scatter of per-feature variance against per-feature mean.
fig, ax = plt.subplots()
ax.set_title("Mean Vs Variance, Log10(n+1) TPM")
ax.scatter(means, variances, s=1)
ax.set_xlabel("Mean")
ax.set_ylabel("Variance")
plt.show()

In [None]:
def plot_stats_by_mean_rank(means, variances, mean_sort, start=0):
    """Plot per-feature variance (blue) and mean (red) against ascending-mean rank.

    Parameters
    ----------
    means, variances : 1-D arrays
        Per-feature statistics, indexed by feature.
    mean_sort : 1-D integer array
        Feature indices ordered by ascending mean.
    start : int, optional
        First rank to draw; features ranked below it are skipped.
    """
    ranks = np.arange(start, len(mean_sort))
    order = mean_sort[start:]
    plt.figure()
    plt.title("Variance by mean rank, Log10(n+1) TPM")
    plt.scatter(ranks, variances[order], s=1, c='blue', label="Variance")
    plt.scatter(ranks, means[order], s=1, c='red', label="Mean")
    # The x axis is a rank, not a mean, and the y axis carries both statistics, so
    # the former "Mean"/"Variance" axis labels were misleading.
    plt.xlabel("Feature rank by mean")
    plt.ylabel("Variance (blue) / mean (red)")
    plt.legend(markerscale=5)
    plt.show()


# All features, then only the features from rank 10000 upward.
plot_stats_by_mean_rank(means, variances, mean_sort)
plot_stats_by_mean_rank(means, variances, mean_sort, start=10000)

In [None]:
# Variance (first series) and mean (second series) of every feature, with the
# features ordered by ascending variance.
variance_rank = np.arange(len(var_sort))
plt.figure()
plt.title("Ranked Variance, Log10(n+1) TPM")
for statistic in (variances, means):
    plt.scatter(variance_rank, statistic[var_sort], s=1)
plt.show()

In [None]:
# Per-feature variance divided by mean (a dispersion ratio).
# NOTE(review): the name `cov` is easy to misread as covariance or as coefficient of
# variation (std / mean); this quantity is neither.
cov = variances/means

In [None]:
# Variance-to-mean ratio of the features from position 10000 onward in
# ascending-mean order; the x axis restarts at 0.
high_mean_order = mean_sort[10000:]
plt.figure()
plt.title("Cov by ranked mean, Log10(n+1) TPM")
plt.scatter(np.arange(high_mean_order.size), cov[high_mean_order], s=1)
plt.show()

In [None]:
# Variance-to-mean ratio of the features from position 10000 onward in
# ascending-VARIANCE order. The title used to read "ranked mean", carried over from
# the preceding plot, although the ordering used here is var_sort.
plt.figure()
plt.title("Cov by ranked variance, Log10(n+1) TPM")
plt.scatter(np.arange(len(var_sort[10000:])),cov[var_sort[10000:]],s=1)
plt.show()

In [None]:
# Keep the 5000 highest-variance genes (the dataset is diverse, hence the generous
# cut-off). The same selection is applied to the raw UMI counts, the log-normalised
# values and the gene names so the three stay aligned.
top_variance = var_sort[-5000:]

umis = double_filtered.T[top_variance].T.todense()
counts = log_size_corrected.T[top_variance].T
header = filtered_header[top_variance]


In [None]:
# Convert both selections to plain NumPy arrays.
umis, counts = np.array(umis), np.array(counts)

In [None]:
# Write the two matrices and the gene names into the raw data directory. The header
# holds strings and therefore needs an explicit string format.
np.savetxt(f"{raw_data_location}umis.tsv", umis)
np.savetxt(f"{raw_data_location}counts.tsv", counts)
np.savetxt(f"{raw_data_location}header.txt", header, fmt="%s")

In [None]:
# List the raw data directory with human-readable sizes to check the written files.
!ls -lh {raw_data_location}

In [2]:
# Reload the saved matrices and gene names from disk, so the notebook can be resumed
# from this cell without repeating the preprocessing above.
umis, counts = (
    np.loadtxt(f"{raw_data_location}{file_name}")
    for file_name in ("umis.tsv", "counts.tsv")
)
header = np.loadtxt(f"{raw_data_location}header.txt", dtype=str)

In [None]:
# Both matrices should report the same shape.
print(counts.shape, umis.shape, sep="\n")

In [3]:
# Switch to the rusty_forest_v3 work directory and put its src/ directory on the
# import path so the two project modules below can be imported.
# Hard-coded absolute paths: adjust them when running on another machine.
%cd /localscratch/bbrener1/rusty_forest_v3/work/
import sys
sys.path.append("/localscratch/bbrener1/rusty_forest_v3/src/")
import lumberjack
import tree_reader as tr

/localscratch/bbrener1/rusty_forest_v3/work


In [None]:
# Shape of the matrix that is passed to the forest below.
counts.shape

In [None]:
# 50-component PCA projection of the log-normalised counts.
# random_state is pinned because PCA may select a randomized SVD solver for a matrix
# of this size, in which case the projection would otherwise vary between runs.
from sklearn.decomposition import PCA
pca_counts = PCA(n_components=50, random_state=0).fit_transform(counts)
print(pca_counts.shape)

In [None]:
# Fit a forest on the log-normalised counts with the gene names as header.
# The run log for this cell shows the keyword arguments being forwarded as
# command-line flags of the rusty_lumberjack_v3 binary (-ifs, -ofs, -ss, -sfr,
# -trees, -leaves, -depth) and input and output matrices of identical shape.
# NOTE(review): the exact meaning of each flag is defined by that tool — confirm
# against its documentation before changing values.
forest = lumberjack.fit(
    counts,
    header=header,
    ifs=1500,
    ofs=1500,
    ss=1000,
#     dispersion_mode='ssme',
    sfr=0.5,
#     norm='l2',
    trees=100,
    leaves=100,
    depth=10,
#     lrg_mem=True
)

# Disabled alternative: fit on the raw UMI matrix with explicit input/output
# arguments instead of the log-normalised counts.
# forest = lumberjack.fit(
#     input_counts=umis,
#     output_counts=umis,
#     ifh=None,
#     ofh=None,
#     ifs=2000,
#     ofs=2000,
#     ss=1000,
# #     dispersion_mode='ssme',
#     sfr=.5,
# #     norm='l2',
#     trees=100,
#     depth=10,
#     leaves=100,
# )

Setting context
Input:(10566, 5000)
Output:(10566, 5000)
Generating trees
Running /localscratch/bbrener1/rusty_forest_v3/target/release/rusty_lumberjack_v3
Command: /localscratch/bbrener1/rusty_forest_v3/target/release/rusty_lumberjack_v3 -ic /tmp/tmp2r4w0c9b/input.counts -oc /tmp/tmp2r4w0c9b/output.counts -o /tmp/tmp2r4w0c9b/tmp -auto -ifh /tmp/tmp2r4w0c9b/tmp.ifh -ofh /tmp/tmp2r4w0c9b/tmp.ofh -ifs 1500 -ofs 1500 -ss 1000 -sfr 0.5 -trees 100 -leaves 100 -depth 10
Read matrix:(10566, 5000)
Read matrix:(10566, 5000)
Reading header: /tmp/tmp2r4w0c9b/tmp.ifh
Read 5000 lines
Reading header: /tmp/tmp2r4w0c9b/tmp.ofh
Read 5000 lines
Read parameters
Starting loop
Computing tree 93
Computing tree 4
Computing tree 38
Computing tree 45
Computing tree 15
Computing tree 0
Computing tree 80
Computing tree 76
Computing tree 63
Computing tree 84
Computing tree 69
Computing tree 70
Computing tree 98
Computing tree 24
Computing tree 26
Computing tree 65
Computing tree 82
Computing tree 78
Computing tre

In [None]:
# Save the fitted forest under the active name. The commented-out lines are
# alternative backup names (presumably for other forest variants — TODO confirm);
# only one should be active at a time.
# forest.backup("c_elegans_forest")
forest.backup("c_elegans_forest_reduced_cache")
# forest.backup("c_elegans_forest_pca")
# forest.backup("c_elegans_forest_pca_cache")
# forest.backup("c_elegans_forest_double_pca")

In [None]:
# Load a previously saved forest by name, replacing the in-memory `forest`.
# NOTE(review): this name is not the one passed to forest.backup in the preceding
# cell ("c_elegans_forest_reduced_cache"), so on a linear run the forest loaded here
# is not the one that was just fitted and saved — confirm this is intended.
forest = tr.Forest.reconstitute('c_elegans_forest_cache')

In [None]:
# forest.reset_sample_clusters()
# forest.cluster_samples_encoding(sub=.8,k=30,metric="cosine",pca=50)

In [None]:
# Reset the leaf clusters, then cluster the leaves again with the settings below.
# NOTE(review): the meaning of sub, k, metric and pca is defined in tree_reader —
# confirm there before changing them.
forest.reset_leaf_clusters()
forest.cluster_leaves_samples(sub=.5,k=30,metric="cosine",pca=50)

In [None]:
# Presumably assigns sample clusters based on the leaf clusters computed above
# (inferred from the method name — TODO confirm in tree_reader).
forest.cluster_samples_leaf_cluster()

In [None]:
# Compute the forest's t-SNE embedding (with pca=100) and plot the sample clusters.
# NOTE(review): whether the plot uses the embedding computed on the previous line is
# determined inside tree_reader — confirm there.
forest.tsne(pca=100)
forest.plot_sample_clusters()

In [None]:
# Fetch the forest's coordinates; the next cell uses columns 0 and 1 as x and y.
coordinates = forest.coordinates()

In [None]:
# Highlight the samples labelled 38: colour every point by whether its label is 38.
in_cluster_38 = forest.sample_labels == 38
plt.figure()
plt.scatter(coordinates[:, 0], coordinates[:, 1], c=in_cluster_38, s=1)

In [None]:
# Display the per-sample labels stored on the forest.
forest.sample_labels

In [None]:
# Switch the forest's cache setting on (what is cached is defined in tree_reader —
# TODO confirm).
forest.set_cache(True)

In [None]:
# Display the arguments stored on the forest object.
forest.arguments