From 15b0ffda4dec316fe377cb28b47d1f5e4312cea1 Mon Sep 17 00:00:00 2001 From: Orion Buske Date: Fri, 7 Nov 2014 19:00:18 -0500 Subject: [PATCH] Added capture-c probe creation script --- src/capture_c/fa2oligo.py | 99 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 src/capture_c/fa2oligo.py diff --git a/src/capture_c/fa2oligo.py b/src/capture_c/fa2oligo.py new file mode 100644 index 0000000..ba32f00 --- /dev/null +++ b/src/capture_c/fa2oligo.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python + +""" +Given a single-sequence fasta, generate candidate oligonuclotide sequences +around restriction sites in a fasta output. +""" + +__author__ = 'Orion Buske (buske@cs.toronto.edu)' + +import os +import sys +import logging + +def read_fasta(filename): + parts = [] + with open(filename) as ifp: + header = ifp.readline() + assert header.startswith('>') + seq_name = header.lstrip('>').strip() + logging.info('Designing probes for sequence: {}'.format(seq_name)) + for line in ifp: + line = line.rstrip('\n') + assert not line.startswith('>') + parts.append(line) + + return ''.join(parts) + +def generate_probes(fragment, probe_length, max_probe_distance): + probes = set() + if len(fragment) < probe_length: + return probes + + # Number of steps to tile from each end + n_tiles = min(len(fragment), max_probe_distance) - probe_length + logging.debug('Tiling {} probes from each end'.format(n_tiles)) + + for offset in range(n_tiles): + oligo = fragment[offset:offset + probe_length] + probes.add(oligo) + + # If smaller than max_probe_distance, probes already span full fragment + if len(fragment) > max_probe_distance: + # Otherwise, need to capture some probes from other end + for offset in range(n_tiles): + start = len(fragment) - offset - probe_length + end = len(fragment) - offset + oligo = fragment[start:end] + probes.add(oligo) + + return probes + +def print_probes(probes): + for i, probe in enumerate(probes): + print('>probe_{}\n{}'.format(i, probe)) + +def script(fasta_filename, restriction_site, probe_length, max_probe_distance): + seq = read_fasta(fasta_filename) + logging.info('Sequence is {}bp'.format(len(seq))) + + fragments = seq.split(restriction_site) + logging.info('{} fragments expected after restriction with {}'.format(len(fragments), restriction_site)) + + probes = set() + for fragment in fragments: + logging.debug('Fragment of length: {}'.format(len(fragment))) + frag_probes = generate_probes(fragment, probe_length, max_probe_distance) + logging.debug('Designed {} probes'.format(len(frag_probes))) + probes.update(frag_probes) + + logging.info('Generated {} unique {}bp probe sequences'.format(len(probes), probe_length)) + print_probes(probes) + +def parse_args(args): + from argparse import ArgumentParser + description = __doc__.strip() + + parser = ArgumentParser(description=description) + parser.add_argument('fasta_filename', metavar='FASTA', + help='Single-sequence fasta to design probes for') + parser.add_argument('--restriction-site', default='GATC', + help='Restriction site sequence (default for DpnII)') + parser.add_argument('--probe-length', default=120, + help='The length of each probe oligo') + parser.add_argument('--max-probe-distance', default=140, + help='Maximum distance from far end of probe to' + ' nearest restriction site') + parser.add_argument('--log', default='INFO', + help='Logging level (e.g. DEBUG, INFO, WARNING)') + + return parser.parse_args(args) + +def main(args=sys.argv[1:]): + args = vars(parse_args(args)) + logging.basicConfig(level=args.pop('log')) + + script(**args) + +if __name__ == '__main__': + sys.exit(main())