Skip to content

Commit

Permalink
barcodes_samples now accept files like:
Browse files Browse the repository at this point in the history
AAAAAAA,sample1

That will be used to named the cells from that sample in the final
tagcounts.mtx.metadata
  • Loading branch information
lpantano committed Feb 14, 2019
1 parent b481da8 commit c3b057d
Show file tree
Hide file tree
Showing 8 changed files with 176 additions and 16 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,5 +1,7 @@
*.pyc
*.idx
*.swo
*.swp
bcbio/pipeline/version.py
bcbio_nextgen.egg-info/
build/
Expand Down
2 changes: 2 additions & 0 deletions bcbio/pipeline/datadict.py
Expand Up @@ -86,6 +86,7 @@
"novel_mirna_counts": {"keys": ["novel_mirna_counts"]},
"novel_isomir_counts": {"keys": ["novel_isomir_counts"]},
"combined_counts": {"keys": ["combined_counts"]},
"combined_histogram": {"keys": ["combined_histogram"]},
"annotated_combined_counts": {"keys": ["annotated_combined_counts"]},
"genome_context_files": {"keys": ["reference", "genome_context"], "default": [], "always_list": True},
"viral_files": {"keys": ["reference", "viral"], "default": [], "always_list": True},
Expand All @@ -96,6 +97,7 @@
"express_fpkm": {"keys": ['express_fpkm']},
"express_tpm": {"keys": ['express_tpm']},
"express_counts": {"keys": ['express_counts']},
"histogram_counts": {"keys": ['histogram_counts']},
"isoform_to_gene": {"keys": ['isoform_to_gene']},
"fusion_mode": {"keys": ['config', 'algorithm', 'fusion_mode']},
"fusion_caller": {"keys": ['config', 'algorithm', 'fusion_caller']},
Expand Down
43 changes: 39 additions & 4 deletions bcbio/pipeline/rnaseq.py
Expand Up @@ -2,14 +2,13 @@
import sys
from bcbio.rnaseq import (featureCounts, cufflinks, oncofuse, count, dexseq,
express, variation, stringtie, sailfish, spikein, pizzly, ericscript,
kallisto, salmon)
kallisto, salmon, singlecellexperiment)
from bcbio.ngsalign import bowtie2, alignprep
from bcbio.variation import effects, joint, multi, population, vardict
import bcbio.pipeline.datadict as dd
from bcbio.utils import filter_missing, flatten, to_single_data
from bcbio.utils import filter_missing, flatten, to_single_data, file_exists
from bcbio.log import logger


def fast_rnaseq(samples, run_parallel):
samples = run_parallel("run_salmon_index", [samples])
samples = run_parallel("run_salmon_reads", samples)
Expand Down Expand Up @@ -40,6 +39,43 @@ def singlecell_rnaseq(samples, run_parallel):
logger.error(("%s is not supported for singlecell RNA-seq "
"quantification." % quantifier))
sys.exit(1)
samples = scrnaseq_concatenate_metadata(samples)
singlecellexperiment.make_scrnaseq_object(samples)
return samples

def scrnaseq_concatenate_metadata(samples):
"""
Create file same dimension than mtx.colnames
with metadata and sample name to help in the
creation of the SC object.
"""
barcodes = {}
counts = ""
metadata = {}
for sample in dd.sample_data_iterator(samples):
with open(dd.get_sample_barcodes(sample)) as inh:
for line in inh:
cols = line.strip().split(",")
if len(cols) == 1:
# Assign sample name in case of missing in barcodes
cols.append("NaN")
barcodes[cols[0]] = cols[1:]

counts = dd.get_combined_counts(sample)
meta = map(str, list(sample["metadata"].values()))
meta_cols = list(sample["metadata"].keys())
meta = ["NaN" if not v else v for v in meta]
metadata[dd.get_sample_name(sample)] = meta

metadata_fn = counts + ".metadata"
if not file_exists(metadata_fn):
with open(metadata_fn, 'w') as outh:
outh.write(",".join(["sample"] + meta_cols) + '\n')
with open(counts + ".colnames") as inh:
for line in inh:
sample = line.split(":")[0]
barcode = sample.split("-")[1]
outh.write(",".join(barcodes[barcode] + metadata[sample]) + '\n')
return samples

def rnaseq_variant_calling(samples, run_parallel):
Expand Down Expand Up @@ -415,4 +451,3 @@ def combine_files(samples):
data = dd.set_tx2gene(data, tx2gene_file)
updated_samples.append([data])
return updated_samples

3 changes: 1 addition & 2 deletions bcbio/rnaseq/bcbiornaseq.py
@@ -1,9 +1,8 @@
import os
import shutil
import toolz as tz
from string import Template
from bcbio.utils import file_exists, Rscript_cmd, safe_makedir, chdir
from bcbio.distributed.transaction import file_transaction, tx_tmpdir
from bcbio.distributed.transaction import file_transaction
from bcbio.provenance import do
from bcbio.pipeline import datadict as dd

Expand Down
79 changes: 79 additions & 0 deletions bcbio/rnaseq/singlecellexperiment.py
@@ -0,0 +1,79 @@
from __future__ import print_function

import os
import subprocess

from bcbio.rnaseq import gtf
from bcbio.utils import file_exists, Rscript_cmd, R_sitelib
from bcbio.distributed.transaction import file_transaction
from bcbio.provenance import do
from bcbio.pipeline import datadict as dd
from bcbio.log import logger

def make_scrnaseq_object(samples):
"""
load the initial se.rda object using sinclecell-experiment
"""
local_sitelib = R_sitelib()
counts_dir = os.path.dirname(dd.get_in_samples(samples, dd.get_combined_counts))
gtf_file = dd.get_in_samples(samples, dd.get_transcriptome_gtf)
if not gtf_file:
gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
rda_file = os.path.join(counts_dir, "se.rda")
if not file_exists(rda_file):
with file_transaction(rda_file) as tx_out_file:
rcode = "%s-run.R" % os.path.splitext(rda_file)[0]
rrna_file = "%s-rrna.txt" % os.path.splitext(rda_file)[0]
rrna_file = _find_rRNA_genes(gtf_file, rrna_file)
with open(rcode, "w") as out_handle:
out_handle.write(_script.format(**locals()))
rscript = Rscript_cmd()
try:
# do.run([rscript, "--no-environ", rcode],
# "SingleCellExperiment",
# log_error=False)
print(rcode)
rda_file = rcode
except subprocess.CalledProcessError as msg:
logger.exception()


def _find_rRNA_genes(gtf_file, rrna_file):
print(gtf_file)
rrna_features = gtf.get_rRNA(gtf_file)
transcripts = set([x[0] for x in rrna_features if x])
with open(rrna_file, 'w') as outh:
outh.write("\n".join(transcripts))
return rrna_file


_script = """
library(SingleCellExperiment)
library(Matrix)
counts = readMM("{counts_dir}/tagcounts.mtx")
rownames = read.csv("{counts_dir}/tagcounts.mtx.rownames", header = F)[["V1"]]
rownames = as.character(rownames)
colnames = read.csv("{counts_dir}/tagcounts.mtx.colnames", header = F)[["V1"]]
colnames = make.names(as.character(colnames))
reads = read.csv("{counts_dir}/cb-histogram.txt", header = F, sep="\t", row.names = 1)
rownames(reads) = make.names(rownames(reads))
counts = as(counts, "dgCMatrix")
rownames(counts) = rownames
colnames(counts) = colnames
metadata = read.csv("{counts_dir}/tagcounts.mtx.metadata")
rownames(metadata) = colnames
metadata[["nUMI"]] = colSums(counts)
metadata[["nGenes"]] = colSums(counts>0)
metadata[["log10GenesPerUMI"]] = log10(metadata$nGene) / log10(metadata$nUMI)
metadata[["nReads"]] = reads[colnames,]
rrna = read.csv("{rrna_file}", header=F, stringsAsFactors = F)
metadata[["mtUMI"]] = colSums(counts[rrna[["V1"]],], na.rm = T)
metadata[["mtUMI"]][is.na(metadata[["mtUMI"]])] = 0
metadata[["mitoRatio"]] = metadata$mtUMI/metadata$nUMI
se = SingleCellExperiment(assays=list(raw=counts), colData = metadata)
save(se, file = "{counts_dir}/se.rda")
"""
51 changes: 42 additions & 9 deletions bcbio/rnaseq/umi.py
Expand Up @@ -103,6 +103,20 @@ def get_cellular_barcodes(data):
else:
return []

def get_sample_barcodes(fn, out_dir):
if not fn:
logger.error("Sample demultiplexing needs a list of known indexes provided "
"with via the sample_barcodes option in the algorithm section.")
sys.exit(1)
# import pdb; pdb.set_trace()
utils.safe_makedir(out_dir)
out_fn = os.path.join(out_dir, "barcodes.csv")
with open(fn) as inh:
with open(out_fn, 'w') as outh:
for line in inh:
outh.write("%s\n" % (line.strip().split(",")[0]))
return out_fn

def umi_transform(data):
"""
transform each read by identifying the barcode and UMI for each read
Expand Down Expand Up @@ -242,7 +256,8 @@ def barcode_histogram(data):
do.run(cmd.format(**locals()), message)
cutoff = dd.get_minimum_barcode_depth(data)
filter_barcode_histogram(filtered_out_file, out_file, cutoff)
return [[data]]
newdata = dd.set_histogram_counts(data, filtered_out_file)
return [[newdata]]

def tagcount(data):
bam = dd.get_transcriptome_bam(data)
Expand Down Expand Up @@ -339,6 +354,10 @@ def demultiplex_samples(data):
"""
demultiplex a fastqtransformed FASTQ file into separate sample barcode files
"""
work_dir = os.path.join(dd.get_work_dir(data), "umis")
sample_dir = os.path.join(work_dir, dd.get_sample_name(data))
demulti_dir = os.path.join(sample_dir, "demultiplexed")

files = data["files"]
if len(files) == 2:
logger.error("Sample demultiplexing doesn't handle paired-end reads, but "
Expand All @@ -351,14 +370,8 @@ def demultiplex_samples(data):
read = next(in_handle)
if "SAMPLE_" not in read:
return [[data]]
bcfile = dd.get_sample_barcodes(data)
if not bcfile:
logger.error("Sample demultiplexing needs a list of known indexes provided "
"with via the sample_barcodes option in the algorithm section.")
sys.exit(1)
work_dir = os.path.join(dd.get_work_dir(data), "umis")
sample_dir = os.path.join(work_dir, dd.get_sample_name(data))
demulti_dir = os.path.join(sample_dir, "demultiplexed")

bcfile = get_sample_barcodes(dd.get_sample_barcodes(data), sample_dir)
demultiplexed = glob.glob(os.path.join(demulti_dir, "*.fq*"))
if demultiplexed:
return [split_demultiplexed_sampledata(data, demultiplexed)]
Expand Down Expand Up @@ -392,6 +405,7 @@ def split_demultiplexed_sampledata(data, demultiplexed):
def concatenate_sparse_counts(*samples):
samples = concatenate_sparse_matrices(samples, deduped=True)
samples = concatenate_sparse_matrices(samples, deduped=False)
samples = concatenate_cb_histograms(samples)
return samples

def concatenate_sparse_matrices(samples, deduped=True):
Expand Down Expand Up @@ -434,6 +448,25 @@ def concatenate_sparse_matrices(samples, deduped=True):
return newsamples
return samples

def concatenate_cb_histograms(samples):
work_dir = dd.get_in_samples(samples, dd.get_work_dir)
umi_dir = os.path.join(work_dir, "umis")
out_file = os.path.join(umi_dir, "cb-histogram.txt")

files = [dd.get_histogram_counts(data) for data in
dd.sample_data_iterator(samples)
if dd.get_histogram_counts(data)]
files = " ".join(files)
cmd = "cat {files} > {out_file}"
if not file_exists(out_file):
with file_transaction(out_file) as tx_out_file:
message = "Concay cb histograms."
do.run(cmd.format(**locals()), message)
newsamples = []
for data in dd.sample_data_iterator(samples):
newsamples.append([dd.set_combined_histogram(data, out_file)])
return newsamples

def version(data):
umis_cmd = config_utils.get_program("umis", data, default="umis")
version_cmd = [umis_cmd, "version"]
Expand Down
3 changes: 2 additions & 1 deletion bcbio/srna/sample.py
Expand Up @@ -66,7 +66,8 @@ def trim_srna_sample(data):
adapter_cmd = "-N %s" % adapter_cmd
out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
atropos = _get_atropos()
# atropos = _get_atropos()
atropos = config_utils.get_program("atropos", data, default="atropos")
options = " ".join(data.get('resources', {}).get('atropos', {}).get("options", ""))
if options.strip() == "-u 4 -u -4":
options = ""
Expand Down
9 changes: 9 additions & 0 deletions bcbio/upload/__init__.py
Expand Up @@ -753,6 +753,8 @@ def _get_files_project(sample, upload_config):
"type": "rownames"})
out.append({"path": count_file + ".colnames",
"type": "colnames"})
out.append({"path": count_file + ".metadata",
"type": "metadata"})
umi_file = os.path.splitext(count_file)[0] + "-dupes.mtx"
if utils.file_exists(umi_file):
out.append({"path": umi_file,
Expand All @@ -761,6 +763,13 @@ def _get_files_project(sample, upload_config):
"type": "rownames"})
out.append({"path": umi_file + ".colnames",
"type": "colnames"})
if dd.get_combined_histogram(sample):
out.append({"path": dd.get_combined_histogram(sample),
"type": "txt"})
rda = os.path.join(os.path.dirname(count_file), "se.rda")
if utils.file_exists(rda):
out.append({"path": rda,
"type": "rda"})
else:
out.append({"path": dd.get_combined_counts(sample)})
if dd.get_annotated_combined_counts(sample):
Expand Down

0 comments on commit c3b057d

Please sign in to comment.