Skip to content

Commit

Permalink
Merge cc5744b into ed106c3
Browse files Browse the repository at this point in the history
  • Loading branch information
tomkinsc committed Jan 30, 2019
2 parents ed106c3 + cc5744b commit f67f556
Show file tree
Hide file tree
Showing 14 changed files with 6,520 additions and 4 deletions.
92 changes: 91 additions & 1 deletion illumina.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
import util.file
import util.misc
import tools.picard
from util.illumina_indices import IlluminaIndexReference
from util.illumina_indices import IlluminaIndexReference, IlluminaBarcodeHelper

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -438,6 +438,96 @@ def sum_reducer(accumulator, element):

log.info("done")

# ======================================
# *** guess_low-abundance_barcodes ***
# ======================================

def parser_guess_barcodes(parser=argparse.ArgumentParser()):
parser.add_argument('in_barcodes', help='The barcode counts file produced by common_barcodes.')
parser.add_argument('in_picard_metrics', help='The demultiplexing read metrics produced by Picard.')
parser.add_argument('out_summary_tsv', help='''Path to the summary file (.tsv format). It includes several columns:
(sample_name, expected_barcode_1, expected_barcode_2,
expected_barcode_1_name, expected_barcode_2_name,
expected_barcodes_read_count, guessed_barcode_1,
guessed_barcode_2, guessed_barcode_1_name,
guessed_barcode_2_name, guessed_barcodes_read_count,
match_type),
where the expected values are those used by Picard during demultiplexing
and the guessed values are based on the barcodes seen among the data.''')

group = parser.add_mutually_exclusive_group()
group.add_argument('--readcount_threshold',
default=None,
type=int,
help='''If specified, guess barcodes for samples with fewer than this many reads.''')
group.add_argument('--sample_names',
nargs='*',
help='If specified, only guess barcodes for these sample names.',
type=str,
default=None)
parser.add_argument('--outlier_threshold',
help='threshold of how far from unbalanced a sample must be to be considered an outlier.',
type=float,
default=0.675)
parser.add_argument('--expected_assigned_fraction',
help='The fraction of reads expected to be assigned. An exception is raised if fewer than this fraction are assigned.',
type=float,
default=0.7)
parser.add_argument('--number_of_negative_controls',
help='The number of negative controls in the pool, for calculating expected number of reads in the rest of the pool.',
type=int,
default=1)

parser.add_argument('--rows_limit',
default=1000,
type=int,
help='''The number of rows to use from the in_barcodes.''')


util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, main_guess_barcodes, split_args=True)
return parser

def main_guess_barcodes(in_barcodes,
in_picard_metrics,
out_summary_tsv,
sample_names,
outlier_threshold,
expected_assigned_fraction,
number_of_negative_controls,
readcount_threshold,
rows_limit):
"""
Guess the barcode value for a sample name,
based on the following:
- a list is made of novel barcode pairs seen in the data, but not in the picard metrics
- for the sample in question, get the most abundant novel barcode pair where one of the
barcodes seen in the data matches one of the barcodes in the picard metrics (partial match)
- if there are no partial matches, get the most abundant novel barcode pair
Limitations:
- If multiple samples share a barcode with multiple novel barcodes, disentangling them
is difficult or impossible
The names of samples to guess are selected:
- explicitly by name, passed via argument, OR
- explicitly by read count threshold, OR
- automatically (if names or count threshold are omitted)
based on basic outlier detection of deviation from an assumed-balanced pool with
some number of negative controls
"""

bh = util.illumina_indices.IlluminaBarcodeHelper(in_barcodes, in_picard_metrics, rows_limit)
guessed_barcodes = bh.find_uncertain_barcodes(sample_names=sample_names,
outlier_threshold=outlier_threshold,
expected_assigned_fraction=expected_assigned_fraction,
number_of_negative_controls=number_of_negative_controls,
readcount_threshold=readcount_threshold)
bh.write_guessed_barcodes(out_summary_tsv, guessed_barcodes)

__commands__.append(('guess_barcodes', parser_guess_barcodes))


# ============================
# *** IlluminaDirectory ***
# ============================
Expand Down
3 changes: 3 additions & 0 deletions pipes/WDL/workflows/tasks/tasks_demux.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,8 @@ task illumina_demux {
--compression_level=5 \
--loglevel=DEBUG

illumina.py guess_barcodes --expected_assigned_fraction=0 barcodes.txt metrics.txt barcodes_outliers.txt

mkdir -p unmatched
mv Unmatched.bam unmatched/

Expand All @@ -172,6 +174,7 @@ task illumina_demux {
output {
File metrics = "metrics.txt"
File commonBarcodes = "barcodes.txt"
File outlierBarcodes = "barcodes_outliers.txt"
Array[File] raw_reads_unaligned_bams = glob("*.bam")
File unmatched_reads_bam = "unmatched/Unmatched.bam"
Array[File] raw_reads_fastqc = glob("*_fastqc.html")
Expand Down
4 changes: 3 additions & 1 deletion pipes/rules/demux.rules
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ rule illumina_demux:
output:
config['tmp_dir']+'/'+config['subdirs']['demux']+'/bams_per_lane/{flowcell}.{lane}/Unmatched.bam',
config['reports_dir']+'/barcodes/barcodes-metrics-{flowcell}.{lane}.txt',
config['reports_dir']+'/barcodes/common-barcodes-{flowcell}.{lane}.txt'
config['reports_dir']+'/barcodes/common-barcodes-{flowcell}.{lane}.txt',
config['reports_dir']+'/barcodes/outlier-barcodes-{flowcell}.{lane}.txt'
resources:
mem_mb = 8*1000,
threads = 16
Expand All @@ -100,6 +101,7 @@ rule illumina_demux:
if lane.get(opt):
opts += ' --%s=%s' % (opt, lane[opt])
shell("{config[bin_dir]}/illumina.py illumina_demux {input[0]} {wildcards.lane} {outdir} --sampleSheet={input[1]} --sequencing_center={params.center} --outMetrics={output[1]} --commonBarcodes={output[2]} --flowcell={wildcards.flowcell} {opts}")
shell("{config[bin_dir]}/illumina.py guess_barcodes --expected_assigned_fraction=0 {output[2]} {output[1]} {output[3]}")


def demux_move_bams_inputs(wildcards):
Expand Down
Loading

0 comments on commit f67f556

Please sign in to comment.