barcodes_samples now accept files like:

AAAAAAA,sample1 That will be used to named the cells from that sample in the final tagcounts.mtx.metadata
bcbio · Feb 14, 2019 · c3b057d · c3b057d
1 parent b481da8
commit c3b057d
Show file tree

Hide file tree

Showing 8 changed files with 176 additions and 16 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,7 @@
 *.pyc
 *.idx
+*.swo
+*.swp
 bcbio/pipeline/version.py
 bcbio_nextgen.egg-info/
 build/

diff --git a/bcbio/pipeline/datadict.py b/bcbio/pipeline/datadict.py
@@ -86,6 +86,7 @@
     "novel_mirna_counts": {"keys": ["novel_mirna_counts"]},
     "novel_isomir_counts": {"keys": ["novel_isomir_counts"]},
     "combined_counts": {"keys": ["combined_counts"]},
+    "combined_histogram": {"keys": ["combined_histogram"]},
     "annotated_combined_counts": {"keys": ["annotated_combined_counts"]},
     "genome_context_files": {"keys": ["reference", "genome_context"], "default": [], "always_list": True},
     "viral_files": {"keys": ["reference", "viral"], "default": [], "always_list": True},
@@ -96,6 +97,7 @@
     "express_fpkm": {"keys": ['express_fpkm']},
     "express_tpm": {"keys": ['express_tpm']},
     "express_counts": {"keys": ['express_counts']},
+    "histogram_counts": {"keys": ['histogram_counts']},
     "isoform_to_gene": {"keys": ['isoform_to_gene']},
     "fusion_mode": {"keys": ['config', 'algorithm', 'fusion_mode']},
     "fusion_caller": {"keys": ['config', 'algorithm', 'fusion_caller']},

diff --git a/bcbio/pipeline/rnaseq.py b/bcbio/pipeline/rnaseq.py
@@ -2,14 +2,13 @@
 import sys
 from bcbio.rnaseq import (featureCounts, cufflinks, oncofuse, count, dexseq,
                           express, variation, stringtie, sailfish, spikein, pizzly, ericscript,
-                          kallisto, salmon)
+                          kallisto, salmon, singlecellexperiment)
 from bcbio.ngsalign import bowtie2, alignprep
 from bcbio.variation import effects, joint, multi, population, vardict
 import bcbio.pipeline.datadict as dd
-from bcbio.utils import filter_missing, flatten, to_single_data
+from bcbio.utils import filter_missing, flatten, to_single_data, file_exists
 from bcbio.log import logger
 
-
 def fast_rnaseq(samples, run_parallel):
     samples = run_parallel("run_salmon_index", [samples])
     samples = run_parallel("run_salmon_reads", samples)
@@ -40,6 +39,43 @@ def singlecell_rnaseq(samples, run_parallel):
         logger.error(("%s is not supported for singlecell RNA-seq "
                       "quantification." % quantifier))
         sys.exit(1)
+    samples = scrnaseq_concatenate_metadata(samples)
+    singlecellexperiment.make_scrnaseq_object(samples)
+    return samples
+
+def scrnaseq_concatenate_metadata(samples):
+    """
+    Create file same dimension than mtx.colnames
+    with metadata and sample name to help in the
+    creation of the SC object.
+    """
+    barcodes = {}
+    counts =  ""
+    metadata = {}
+    for sample in dd.sample_data_iterator(samples):
+        with open(dd.get_sample_barcodes(sample)) as inh:
+            for line in inh:
+                cols = line.strip().split(",")
+                if len(cols) == 1:
+                    # Assign sample name in case of missing in barcodes
+                    cols.append("NaN")
+                barcodes[cols[0]] = cols[1:]
+
+        counts = dd.get_combined_counts(sample)
+        meta = map(str, list(sample["metadata"].values()))
+        meta_cols = list(sample["metadata"].keys())
+        meta = ["NaN" if not v else v for v in meta]
+        metadata[dd.get_sample_name(sample)] = meta
+
+    metadata_fn = counts + ".metadata"
+    if not file_exists(metadata_fn):
+        with open(metadata_fn, 'w') as outh:
+            outh.write(",".join(["sample"] + meta_cols) + '\n')
+            with open(counts + ".colnames") as inh:
+                for line in inh:
+                    sample = line.split(":")[0]
+                    barcode = sample.split("-")[1]
+                    outh.write(",".join(barcodes[barcode] + metadata[sample]) + '\n')
     return samples
 
 def rnaseq_variant_calling(samples, run_parallel):
@@ -415,4 +451,3 @@ def combine_files(samples):
             data = dd.set_tx2gene(data, tx2gene_file)
         updated_samples.append([data])
     return updated_samples
-
diff --git a/bcbio/rnaseq/bcbiornaseq.py b/bcbio/rnaseq/bcbiornaseq.py
@@ -1,9 +1,8 @@
 import os
-import shutil
 import toolz as tz
 from string import Template
 from bcbio.utils import file_exists, Rscript_cmd, safe_makedir, chdir
-from bcbio.distributed.transaction import file_transaction, tx_tmpdir
+from bcbio.distributed.transaction import file_transaction
 from bcbio.provenance import do
 from bcbio.pipeline import datadict as dd
 

diff --git a/bcbio/rnaseq/singlecellexperiment.py b/bcbio/rnaseq/singlecellexperiment.py
@@ -0,0 +1,79 @@
+from __future__ import print_function
+
+import os
+import subprocess
+
+from bcbio.rnaseq import gtf
+from bcbio.utils import file_exists, Rscript_cmd, R_sitelib
+from bcbio.distributed.transaction import file_transaction
+from bcbio.provenance import do
+from bcbio.pipeline import datadict as dd
+from bcbio.log import logger
+
+def make_scrnaseq_object(samples):
+    """
+    load the initial se.rda object using sinclecell-experiment
+    """
+    local_sitelib = R_sitelib()
+    counts_dir = os.path.dirname(dd.get_in_samples(samples, dd.get_combined_counts))
+    gtf_file = dd.get_in_samples(samples, dd.get_transcriptome_gtf)
+    if not gtf_file:
+        gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
+    rda_file = os.path.join(counts_dir, "se.rda")
+    if not file_exists(rda_file):
+        with file_transaction(rda_file) as tx_out_file:
+            rcode = "%s-run.R" % os.path.splitext(rda_file)[0]
+            rrna_file = "%s-rrna.txt" % os.path.splitext(rda_file)[0]
+            rrna_file = _find_rRNA_genes(gtf_file, rrna_file)
+            with open(rcode, "w") as out_handle:
+                out_handle.write(_script.format(**locals()))
+            rscript = Rscript_cmd()
+            try:
+                # do.run([rscript, "--no-environ", rcode],
+                #        "SingleCellExperiment",
+                #        log_error=False)
+                print(rcode)
+                rda_file = rcode
+            except subprocess.CalledProcessError as msg:
+                logger.exception()
+
+
+def _find_rRNA_genes(gtf_file, rrna_file):
+    print(gtf_file)
+    rrna_features = gtf.get_rRNA(gtf_file)
+    transcripts = set([x[0] for x in rrna_features if x])
+    with open(rrna_file, 'w') as outh:
+        outh.write("\n".join(transcripts))
+    return rrna_file
+
+
+_script = """
+library(SingleCellExperiment)
+library(Matrix)
+
+counts = readMM("{counts_dir}/tagcounts.mtx")
+rownames = read.csv("{counts_dir}/tagcounts.mtx.rownames", header = F)[["V1"]]
+rownames = as.character(rownames)
+colnames = read.csv("{counts_dir}/tagcounts.mtx.colnames", header = F)[["V1"]]
+colnames = make.names(as.character(colnames))
+reads = read.csv("{counts_dir}/cb-histogram.txt", header = F, sep="\t", row.names = 1)
+rownames(reads) = make.names(rownames(reads))
+
+counts =  as(counts, "dgCMatrix")
+rownames(counts) = rownames
+colnames(counts) = colnames
+metadata = read.csv("{counts_dir}/tagcounts.mtx.metadata")
+rownames(metadata) = colnames
+metadata[["nUMI"]] = colSums(counts)
+metadata[["nGenes"]] = colSums(counts>0)
+metadata[["log10GenesPerUMI"]] = log10(metadata$nGene) / log10(metadata$nUMI)
+metadata[["nReads"]] = reads[colnames,]
+
+rrna = read.csv("{rrna_file}", header=F, stringsAsFactors = F)
+metadata[["mtUMI"]] = colSums(counts[rrna[["V1"]],], na.rm = T)
+metadata[["mtUMI"]][is.na(metadata[["mtUMI"]])] = 0
+metadata[["mitoRatio"]] = metadata$mtUMI/metadata$nUMI
+
+se = SingleCellExperiment(assays=list(raw=counts), colData = metadata)
+save(se, file = "{counts_dir}/se.rda")
+"""
diff --git a/bcbio/rnaseq/umi.py b/bcbio/rnaseq/umi.py
@@ -103,6 +103,20 @@ def get_cellular_barcodes(data):
     else:
         return []
 
+def get_sample_barcodes(fn, out_dir):
+    if not fn:
+        logger.error("Sample demultiplexing needs a list of known indexes provided "
+                     "with via the sample_barcodes option in the algorithm section.")
+        sys.exit(1)
+    # import pdb; pdb.set_trace()
+    utils.safe_makedir(out_dir)
+    out_fn = os.path.join(out_dir, "barcodes.csv")
+    with open(fn) as inh:
+        with open(out_fn, 'w') as outh:
+            for line in inh:
+                outh.write("%s\n" % (line.strip().split(",")[0]))
+    return out_fn
+
 def umi_transform(data):
     """
     transform each read by identifying the barcode and UMI for each read
@@ -242,7 +256,8 @@ def barcode_histogram(data):
             do.run(cmd.format(**locals()), message)
     cutoff = dd.get_minimum_barcode_depth(data)
     filter_barcode_histogram(filtered_out_file, out_file, cutoff)
-    return [[data]]
+    newdata = dd.set_histogram_counts(data, filtered_out_file)
+    return [[newdata]]
 
 def tagcount(data):
     bam = dd.get_transcriptome_bam(data)
@@ -339,6 +354,10 @@ def demultiplex_samples(data):
     """
     demultiplex a fastqtransformed FASTQ file into separate sample barcode files
     """
+    work_dir = os.path.join(dd.get_work_dir(data), "umis")
+    sample_dir = os.path.join(work_dir, dd.get_sample_name(data))
+    demulti_dir = os.path.join(sample_dir, "demultiplexed")
+
     files = data["files"]
     if len(files) == 2:
         logger.error("Sample demultiplexing doesn't handle paired-end reads, but "
@@ -351,14 +370,8 @@ def demultiplex_samples(data):
         read = next(in_handle)
         if "SAMPLE_" not in read:
             return [[data]]
-    bcfile = dd.get_sample_barcodes(data)
-    if not bcfile:
-        logger.error("Sample demultiplexing needs a list of known indexes provided "
-                     "with via the sample_barcodes option in the algorithm section.")
-        sys.exit(1)
-    work_dir = os.path.join(dd.get_work_dir(data), "umis")
-    sample_dir = os.path.join(work_dir, dd.get_sample_name(data))
-    demulti_dir = os.path.join(sample_dir, "demultiplexed")
+
+    bcfile = get_sample_barcodes(dd.get_sample_barcodes(data), sample_dir)
     demultiplexed = glob.glob(os.path.join(demulti_dir, "*.fq*"))
     if demultiplexed:
         return [split_demultiplexed_sampledata(data, demultiplexed)]
@@ -392,6 +405,7 @@ def split_demultiplexed_sampledata(data, demultiplexed):
 def concatenate_sparse_counts(*samples):
     samples = concatenate_sparse_matrices(samples, deduped=True)
     samples = concatenate_sparse_matrices(samples, deduped=False)
+    samples = concatenate_cb_histograms(samples)
     return samples
 
 def concatenate_sparse_matrices(samples, deduped=True):
@@ -434,6 +448,25 @@ def concatenate_sparse_matrices(samples, deduped=True):
         return newsamples
     return samples
 
+def concatenate_cb_histograms(samples):
+    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
+    umi_dir = os.path.join(work_dir, "umis")
+    out_file = os.path.join(umi_dir, "cb-histogram.txt")
+
+    files = [dd.get_histogram_counts(data) for data in
+            dd.sample_data_iterator(samples)
+            if dd.get_histogram_counts(data)]
+    files = " ".join(files)
+    cmd = "cat {files} > {out_file}"
+    if not file_exists(out_file):
+        with file_transaction(out_file) as tx_out_file:
+            message = "Concay cb histograms."
+            do.run(cmd.format(**locals()), message)
+    newsamples = []
+    for data in dd.sample_data_iterator(samples):
+        newsamples.append([dd.set_combined_histogram(data, out_file)])
+    return newsamples
+
 def version(data):
     umis_cmd = config_utils.get_program("umis", data, default="umis")
     version_cmd = [umis_cmd, "version"]

diff --git a/bcbio/srna/sample.py b/bcbio/srna/sample.py
@@ -66,7 +66,8 @@ def trim_srna_sample(data):
             adapter_cmd = "-N %s" % adapter_cmd
         out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
         out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
-        atropos = _get_atropos()
+        # atropos = _get_atropos()
+        atropos = config_utils.get_program("atropos", data, default="atropos")
         options = " ".join(data.get('resources', {}).get('atropos', {}).get("options", ""))
         if options.strip() == "-u 4 -u -4":
             options = ""

diff --git a/bcbio/upload/__init__.py b/bcbio/upload/__init__.py
@@ -753,6 +753,8 @@ def _get_files_project(sample, upload_config):
                         "type": "rownames"})
             out.append({"path": count_file + ".colnames",
                         "type": "colnames"})
+            out.append({"path": count_file + ".metadata",
+                        "type": "metadata"})
             umi_file = os.path.splitext(count_file)[0] + "-dupes.mtx"
             if utils.file_exists(umi_file):
                 out.append({"path": umi_file,
@@ -761,6 +763,13 @@ def _get_files_project(sample, upload_config):
                             "type": "rownames"})
                 out.append({"path": umi_file + ".colnames",
                             "type": "colnames"})
+            if dd.get_combined_histogram(sample):
+                out.append({"path": dd.get_combined_histogram(sample),
+                            "type": "txt"})
+            rda = os.path.join(os.path.dirname(count_file), "se.rda")
+            if utils.file_exists(rda):
+                out.append({"path": rda,
+                            "type": "rda"})
         else:
             out.append({"path": dd.get_combined_counts(sample)})
     if dd.get_annotated_combined_counts(sample):