diff --git a/src/bis_seq/merge_bismark_coverage.py b/src/bis_seq/merge_bismark_coverage.py new file mode 100755 index 0000000..7f51c7b --- /dev/null +++ b/src/bis_seq/merge_bismark_coverage.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python + +""" +Combine Bismark methylation coverage files, either by aggregating coverage +or by appending columns +""" + +import os +import sys +import re + +from collections import defaultdict + + +def read_file(filename, region=None, region_re=re.compile('^\s*(\w+)\s*:\s*(\d+)\s*-\s*(\d+)\s*$')): + if region: + region_match = region_re.match(region) + assert region_match, 'Invalid region: {}'.format(region) + region_chrom, region_start, region_end = region_match.groups() + region_start = int(region_start) + region_end = int(region_end) + + counts = {} + with open(filename) as ifp: + for line in ifp: + line = line.rstrip('\n') + if not line or line.startswith('#'): continue + + tokens = line.split('\t') + if len(tokens) == 5: + chrom, start, cpg, n_mc, n_c = tokens + else: + raise NotImplementedError() + + start = int(start) + n_mc = int(n_mc) + n_c = int(n_c) + + if region and (chrom != region_chrom or + start < region_start or + start > region_end): + continue + + locus = (chrom, start, cpg) + assert locus not in counts + counts[locus] = (n_mc, n_c) + + return counts + +def read_many_files(filenames, aggregate=False, region=None): + all_loci = set() + all_counts = [] + for filename in filenames: + counts = read_file(filename, region=region) + all_loci.update(counts) + all_counts.append(counts) + + merged_counts = {} + for locus in all_loci: + if aggregate: + row = [0, 0] + else: + row = [] + + for counts in all_counts: + count = counts.get(locus, (0, 0)) + if aggregate: + row[0] += count[0] + row[1] += count[1] + else: + row.extend(count) + + merged_counts[locus] = row + + return merged_counts + +def script(filenames, aggregate=False, add_header=False, region=None): + counts = read_many_files(filenames, aggregate=aggregate, region=region) + loci = sorted(counts) + + if add_header: + colnames = ['chrom', 'pos', 'cpg'] + if aggregate: + colnames.extend(['methyl', 'unmethyl']) + else: + for filename in filenames: + base = os.path.dirname(filename) + colnames.extend([base + '_methyl', base + '_unmethyl']) + print('#{}'.format('\t'.join(colnames))) + for locus in loci: + row = counts[locus] + line = list(locus) + line.extend(row) + print('\t'.join(map(str, line))) + +def parse_args(args): + from argparse import ArgumentParser + description = __doc__.strip() + + parser = ArgumentParser(description=description) + parser.add_argument('filenames', metavar='BISMARK.COV', nargs='+') + parser.add_argument('--add-header', default=False, action='store_true') + parser.add_argument('--region', help='Filter to only sites within given' + ' chr:start-end region.') + parser.add_argument('--aggregate', default=False, action='store_true', + help='Add together coverage counts, rather than' + ' appending columns') + + return parser.parse_args(args) + +def main(args=sys.argv[1:]): + args = parse_args(args) + script(**vars(args)) + +if __name__ == '__main__': + sys.exit(main())