Permalink
Please sign in to comment.
Showing
with
116 additions
and 0 deletions.
| @@ -0,0 +1,116 @@ | ||
| +#!/usr/bin/env python | ||
| + | ||
| +""" | ||
| +Combine Bismark methylation coverage files, either by aggregating coverage | ||
| +or by appending columns | ||
| +""" | ||
| + | ||
| +import os | ||
| +import sys | ||
| +import re | ||
| + | ||
| +from collections import defaultdict | ||
| + | ||
| + | ||
| +def read_file(filename, region=None, region_re=re.compile('^\s*(\w+)\s*:\s*(\d+)\s*-\s*(\d+)\s*$')): | ||
| + if region: | ||
| + region_match = region_re.match(region) | ||
| + assert region_match, 'Invalid region: {}'.format(region) | ||
| + region_chrom, region_start, region_end = region_match.groups() | ||
| + region_start = int(region_start) | ||
| + region_end = int(region_end) | ||
| + | ||
| + counts = {} | ||
| + with open(filename) as ifp: | ||
| + for line in ifp: | ||
| + line = line.rstrip('\n') | ||
| + if not line or line.startswith('#'): continue | ||
| + | ||
| + tokens = line.split('\t') | ||
| + if len(tokens) == 5: | ||
| + chrom, start, cpg, n_mc, n_c = tokens | ||
| + else: | ||
| + raise NotImplementedError() | ||
| + | ||
| + start = int(start) | ||
| + n_mc = int(n_mc) | ||
| + n_c = int(n_c) | ||
| + | ||
| + if region and (chrom != region_chrom or | ||
| + start < region_start or | ||
| + start > region_end): | ||
| + continue | ||
| + | ||
| + locus = (chrom, start, cpg) | ||
| + assert locus not in counts | ||
| + counts[locus] = (n_mc, n_c) | ||
| + | ||
| + return counts | ||
| + | ||
| +def read_many_files(filenames, aggregate=False, region=None): | ||
| + all_loci = set() | ||
| + all_counts = [] | ||
| + for filename in filenames: | ||
| + counts = read_file(filename, region=region) | ||
| + all_loci.update(counts) | ||
| + all_counts.append(counts) | ||
| + | ||
| + merged_counts = {} | ||
| + for locus in all_loci: | ||
| + if aggregate: | ||
| + row = [0, 0] | ||
| + else: | ||
| + row = [] | ||
| + | ||
| + for counts in all_counts: | ||
| + count = counts.get(locus, (0, 0)) | ||
| + if aggregate: | ||
| + row[0] += count[0] | ||
| + row[1] += count[1] | ||
| + else: | ||
| + row.extend(count) | ||
| + | ||
| + merged_counts[locus] = row | ||
| + | ||
| + return merged_counts | ||
| + | ||
| +def script(filenames, aggregate=False, add_header=False, region=None): | ||
| + counts = read_many_files(filenames, aggregate=aggregate, region=region) | ||
| + loci = sorted(counts) | ||
| + | ||
| + if add_header: | ||
| + colnames = ['chrom', 'pos', 'cpg'] | ||
| + if aggregate: | ||
| + colnames.extend(['methyl', 'unmethyl']) | ||
| + else: | ||
| + for filename in filenames: | ||
| + base = os.path.dirname(filename) | ||
| + colnames.extend([base + '_methyl', base + '_unmethyl']) | ||
| + print('#{}'.format('\t'.join(colnames))) | ||
| + for locus in loci: | ||
| + row = counts[locus] | ||
| + line = list(locus) | ||
| + line.extend(row) | ||
| + print('\t'.join(map(str, line))) | ||
| + | ||
| +def parse_args(args): | ||
| + from argparse import ArgumentParser | ||
| + description = __doc__.strip() | ||
| + | ||
| + parser = ArgumentParser(description=description) | ||
| + parser.add_argument('filenames', metavar='BISMARK.COV', nargs='+') | ||
| + parser.add_argument('--add-header', default=False, action='store_true') | ||
| + parser.add_argument('--region', help='Filter to only sites within given' | ||
| + ' chr:start-end region.') | ||
| + parser.add_argument('--aggregate', default=False, action='store_true', | ||
| + help='Add together coverage counts, rather than' | ||
| + ' appending columns') | ||
| + | ||
| + return parser.parse_args(args) | ||
| + | ||
| +def main(args=sys.argv[1:]): | ||
| + args = parse_args(args) | ||
| + script(**vars(args)) | ||
| + | ||
| +if __name__ == '__main__': | ||
| + sys.exit(main()) |
0 comments on commit
3b0392e