In [1]:
import numpy as np
import pandas as pd
#!pip install anndata --target D:/usr/local/lib/python3.8/dist-packages
import anndata as ad
from collections import defaultdict
from rnasieve.preprocessing import model_from_raw_counts
print(ad.__version__)

0.7.8


This notebook follows the AnnData getting-started tutorial (https://github.com/theislab/anndata-tutorials/blob/master/getting-started.ipynb)
to load the jcoffman007 data into AnnData structures so that it can be used with RNA-Sieve, as in https://github.com/songlab-cal/rna-sieve


In [2]:
##-----loading in jcoffman007 expression and design data -----
# Raw bulk counts in wide format: a "genes" column plus the remaining columns,
# which become the rows (observations) of the bulk AnnData after transposing.
widejcoffman007_count = pd.read_csv("bulk_count_0dpa.csv.gz", compression="gzip")
# Design table, indexed by its first column. It is passed as `obs` for the bulk
# AnnData, so its rows have to line up with the count columns above.
jcoffman007_design = pd.read_csv("bulk_design_0dpa.csv.gz", compression="gzip", index_col=0)


##-----loading in atlas expression and design data-----
# Raw counts for the reference (atlas), same wide layout with a "genes" column.
wideatlas_count = pd.read_csv("single_count_0dpa.csv.gz", compression="gzip")
# Metadata ("obs") for the reference, indexed by its first column.
# NOTE: an earlier variant of this cell read "meta.tsv" and kept only the samples
# '5a_5dpf' and '5b_5dpf'; that code was disabled and is no longer kept here as a
# bare string literal. The design file below is what is actually loaded.
atlas_design = pd.read_csv("single_design_0dpa.csv.gz", compression="gzip", index_col=0)


In [3]:
# Restrict both count tables to the genes they share and put them in the same
# (sorted) gene order, so that row i refers to the same gene in both tables.
jcoffman007_genes = widejcoffman007_count["genes"].tolist()
atlas_genes = wideatlas_count["genes"].tolist()
# sorted() makes the list order reproducible; set iteration order is arbitrary.
genes_in_both = sorted(set(jcoffman007_genes) & set(atlas_genes))
print("there are",len(genes_in_both),"genes that are in both the atlas and jcoffman007 count data")

widejcoffman007_count = (
    widejcoffman007_count[widejcoffman007_count["genes"].isin(genes_in_both)]
    .sort_values(by=["genes"])
)
wideatlas_count = (
    wideatlas_count[wideatlas_count["genes"].isin(genes_in_both)]
    .sort_values(by=["genes"])
)

# Filtering + sorting only aligns the tables when gene ids are unique in each of
# them; a duplicated id would shift every following row. Fail loudly instead.
assert widejcoffman007_count["genes"].tolist() == wideatlas_count["genes"].tolist(), (
    "bulk and atlas gene lists differ after filtering/sorting "
    "(duplicate gene ids in one of the count tables?)"
)

there are 24324 genes that are in both the atlas and jcoffman007 count data


In [4]:
#------making annData structure that holds the counts and meta for jcoffman007-----
# AnnData wants observations (samples) as rows and variables (genes) as columns,
# so the gene id becomes the row label first and the table is then transposed.
# The guard keeps this cell re-runnable: once "genes" has been moved into the
# index, a second set_index("genes") would raise a KeyError.
if "genes" in widejcoffman007_count.columns:
    widejcoffman007_count = widejcoffman007_count.set_index("genes")
jcoffman007_count = widejcoffman007_count.transpose()
jcoffman007 = ad.AnnData(jcoffman007_count, obs=jcoffman007_design)

#-----making annData structure that holds the counts and meta for atlas-----
# Same reshaping for the reference: cells as rows, genes as columns.
if "genes" in wideatlas_count.columns:
    wideatlas_count = wideatlas_count.set_index("genes")
atlas_count = wideatlas_count.transpose()
atlas = ad.AnnData(atlas_count, obs=atlas_design)

In [5]:
#modeling off of the raw counts prep section from example.ipynb from song lab github

def _to_dense(matrix):
    """Return ``matrix`` as a dense NumPy array.

    Objects that expose ``toarray()`` (e.g. SciPy sparse matrices) are converted
    through it; anything else goes through ``np.asarray``.
    """
    if hasattr(matrix, "toarray"):
        return matrix.toarray()
    return np.asarray(matrix)


# grouping the cells in the reference data by cluster
# Result: counts_by_cluster[cluster] is a (genes x cells-in-cluster) matrix whose
# columns follow the order of the cells in `atlas`.
print('Aggregating by cluster...')
atlas_counts = _to_dense(atlas.X)  # one row per cell, one column per gene
# Stacking columns onto a float32 seed array promotes the dtype; keep that result dtype.
cluster_dtype = np.result_type(np.float32, atlas_counts.dtype)
# Collect the row positions of each cluster once and slice once per cluster,
# instead of re-copying a growing matrix with np.hstack for every single cell.
cell_rows_by_cluster = defaultdict(list)
for row, cell_cluster in enumerate(atlas.obs['major.cl'].to_numpy()):
    cell_rows_by_cluster[cell_cluster].append(row)
counts_by_cluster = {
    cell_cluster: np.ascontiguousarray(atlas_counts[rows].T, dtype=cluster_dtype)
    for cell_cluster, rows in cell_rows_by_cluster.items()
}


# Bulk prep
# bulk_by_time[name] holds one (genes x 1) column per bulk sample carrying that name.
print('Aggregating bulks by name...')
G = jcoffman007.n_vars
bulk_counts = _to_dense(jcoffman007.X)  # one row per bulk sample
bulk_by_time = defaultdict(list)
for bulk_name, bulk_row in zip(jcoffman007.obs['name'].to_numpy(), bulk_counts):
    bulk_by_time[bulk_name].append(bulk_row.reshape(-1, 1).copy())

# Lay the bulk columns out name by name (sorted); bulk_labels[k] describes column k of psis.
bulk_labels = []
psi_columns = [np.empty((G, 0), dtype=np.float32)]  # seed keeps the (G, 0) shape if there are no bulks
for name in sorted(bulk_by_time.keys()):
    for subject, column in enumerate(bulk_by_time[name]):
        bulk_labels.append("{} name, subject {}".format(name, subject))
        psi_columns.append(column)
psis = np.hstack(psi_columns)

print('Done!')

Aggregating by cluster...
Aggregating bulks by name...
Done!


In [6]:
# Sanity check on one cluster's matrix: the shape is (number of genes, number of cells in that cluster).
counts_by_cluster['Intermediate Epithelial'].shape

(24324, 1166)

In [7]:
# Build the RNA-Sieve model from the per-cluster reference counts and the bulk matrix.
# NOTE(review): `[:, :18]` keeps at most the first 18 bulk columns; the table displayed
# in the next cell has only 6 bulk columns, so the slice looks like a no-op here — confirm
# whether the limit of 18 is still wanted.
model, cleaned_psis = model_from_raw_counts(counts_by_cluster, psis[:, :18])

In [8]:
# Display the bulk table returned by model_from_raw_counts (one column per bulk).
# NOTE(review): the saved output has 3881 rows versus 24324 shared genes, so the
# preprocessing presumably filters genes — confirm against the rnasieve documentation.
cleaned_psis

Unnamed: 0,Bulk 0,Bulk 1,Bulk 2,Bulk 3,Bulk 4,Bulk 5
0,1677.378296,2553.272461,2512.293945,4270.180176,4251.043945,3804.370605
1,755.000000,845.000000,807.000000,632.000000,616.000000,585.000000
2,1970.340454,2339.308350,2001.430786,1795.925537,1695.094971,1565.693604
3,56411.500000,52280.015625,64635.925781,145650.125000,146573.468750,128260.828125
4,562.000000,602.000000,584.000000,1187.000000,1059.000000,1012.249939
...,...,...,...,...,...,...
3877,352.111328,307.359314,368.413605,902.443115,852.180664,939.801208
3878,2512.000000,1996.000000,3208.000000,1697.000000,2005.000000,1695.000000
3879,530.809448,500.286652,729.000000,6539.441895,5886.029785,5865.681641
3880,269.000000,295.000000,330.000000,162.000000,156.000000,191.000000


In [9]:
# Run model.predict on the cleaned bulk table and write the returned table to a CSV
# file in the current working directory.
output_table= model.predict(cleaned_psis) #this takes a minute ...
output_table.to_csv('rnasieve_jcoffman007_0dpa.csv')

In [10]:
# Marginal confidence intervals for the model's estimates, computed with sig=0.2.
# NOTE(review): the previous comment here said "We set sig=0.9999 for the sake of
# visualization", which does not match the value actually passed below (0.2) —
# confirm which value is intended.
# In the saved output the first three entries have narrow intervals, while the
# following ones are (0.0, 1.0), i.e. they span the whole range.
model.compute_marginal_confidence_intervals(sig=0.2)

[[(0.14817625693657605, 0.2522257776969925),
  (0.04421673270254172, 0.14156653365788358),
  (0.05120504291769731, 0.2061147834158816),
  (0.020639718878636457, 0.07017400146019934),
  (0.05197381118038216, 0.15510349768865153),
  (0.383230088098422, 0.47537375536613574)],
 [(0.17872401482629008, 0.2614287809676977),
  (0.06432421407511865, 0.17644793882487103),
  (0.05577699950633644, 0.2121001377708042),
  (0.031073935567842992, 0.0700020201285564),
  (0.049299109716623365, 0.13182444176720404),
  (0.33883156033149336, 0.4301668465171616)],
 [(0.16421407066108018, 0.2513309108158178),
  (0.0414339273975909, 0.13352855742351166),
  (0.03381002104199164, 0.14741611736915172),
  (0.024078668064715553, 0.06571041831268526),
  (0.042914132407006235, 0.13501582848856172),
  (0.43871552850431683, 0.5218318195135705)],
 [(0.0, 1.0), (0.0, 1.0), (0.0, 1.0), (0.0, 1.0), (0.0, 1.0), (0.0, 1.0)],
 [(0.0, 1.0), (0.0, 1.0), (0.0, 1.0), (0.0, 1.0), (0.0, 1.0), (0.0, 1.0)],
 [(0.0, 1.0), (0.0, 1.0),

In [11]:
# Chart of the model's proportions in the 'bar' layout, with a title attached via .properties().
model.plot_proportions('bar').properties(title="Bar visualization ")

In [12]:
# Same proportions in the 'stacked' layout, with a title attached via .properties().
model.plot_proportions('stacked').properties(title="Stacked visualization")