diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 00000000..d476e2ef
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,14 @@
+# .coveragerc to control test coverage reporting
+[report]
+
+exclude_lines =
+    pragma: no cover
+
+    raise AssertionError
+    raise NotImplementedError
+
+    def __repr__
+    if self\.debug
+
+    if 0:
+    if __name__ == .__main__.:
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 00000000..ac9fb640
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,29 @@
+language: python
+sudo: false
+
+cache:
+  directories:
+    - $HOME/virtualenv
+
+env:
+  global:
+    - CACHE_DIR="$HOME/virtualenv"
+    - PYTHONIOENCODING=UTF8
+
+python:
+  - 2.7
+  - 3.4
+
+before_install:
+  - travis/before_install.sh
+
+install:
+  - travis/install-pip.sh
+  - travis/install-tools.sh
+
+script:
+  - travis/tests-unit.sh
+  - travis/tests-long.sh
+
+after_success:
+  - coveralls
diff --git a/README.md b/README.md
index a791f2fb..ad086b35 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,8 @@
 **Composite of Multiple Signals**: tests for selection in meiotically recombinant populations
 
+[![Travis CI Status](https://api.travis-ci.org/broadinstitute/cms.svg)](https://travis-ci.org/broadinstitute/cms)
+[![Coverage Status](https://coveralls.io/repos/broadinstitute/cms/badge.svg)](https://coveralls.io/r/broadinstitute/cms)
 [![Documentation Status](https://readthedocs.org/projects/broad-cms/badge/?version=latest)](https://readthedocs.org/projects/broad-cms/?badge=latest)
 
 More detailed documentation can be found at http://broad-cms.readthedocs.org/
diff --git a/cms/VERSION b/cms/VERSION
new file mode 100644
index 00000000..f5fc016e
--- /dev/null
+++ b/cms/VERSION
@@ -0,0 +1 @@
+7bc1b96-dirty
diff --git a/cms/cms_exceptions.py b/cms/cms_exceptions.py
new file mode 100644
index 00000000..3622362b
--- /dev/null
+++ b/cms/cms_exceptions.py
@@ -0,0 +1,6 @@
+class CMSException(Exception):
+    '''Base class for CMS exceptions. Except for built-ins, all CMS exceptions
+    should descend from this one so that an except block for it catches all others.'''
+    pass
+
+class CMSInputError(CMSException):
+    pass
diff --git a/cms/main.py b/cms/main.py
new file mode 100644
index 00000000..ef4726d9
--- /dev/null
+++ b/cms/main.py
@@ -0,0 +1,12 @@
+#!/usr/bin/python
+
+class CMS(object):
+    def __init__(self):
+        pass
+
+    def main(self):
+        return 0
+
+if __name__ == "__main__":
+    cms = CMS()
+    cms.main()
\ No newline at end of file
diff --git a/cms/selection.py b/cms/selection.py
new file mode 100644
index 00000000..54ab0c23
--- /dev/null
+++ b/cms/selection.py
@@ -0,0 +1,8 @@
+#!/usr/bin/python
+
+class Selection(object):
+    def __init__(self):
+        pass
+
+    def main(self):
+        return 0
diff --git a/cms/test/__init__.py b/cms/test/__init__.py
new file mode 100644
index 00000000..6fb3c5dd
--- /dev/null
+++ b/cms/test/__init__.py
@@ -0,0 +1,35 @@
+'''utilities for tests'''
+
+__author__ = "irwin@broadinstitute.org"
+
+import filecmp, os, unittest
+
+import util.file
+
+def assert_equal_contents(testCase, filename1, filename2):
+    'Assert that the contents of two files are equal, for a unittest.TestCase'
+    testCase.assertTrue(filecmp.cmp(filename1, filename2, shallow=False))
+
+class TestCaseWithTmp(unittest.TestCase):
+    'Base class for tests that use tempDir'
+    def setUp(self):
+        util.file.set_tmpDir(type(self).__name__)
+
+    def tearDown(self):
+        util.file.destroy_tmpDir()
+
+    def assertEqualContents(self, f1, f2):
+        assert_equal_contents(self, f1, f2)
+
+
+"""
+When "nose" executes python scripts for automated testing, it excludes ones
+with the executable bit set (in case they aren't import safe). To prevent any
+of the tests in this folder from being silently excluded, assure this bit is
+not set.
+"""
+def assert_none_executable():
+    testDir = os.path.dirname(__file__)
+    assert all(not os.access(os.path.join(testDir, filename), os.X_OK)
+               for filename in os.listdir(testDir)
+               if filename.endswith('.py'))
+assert_none_executable()
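TestCaseWithTmp gives every test class its own scratch directory (created in setUp, destroyed in tearDown) plus a file-comparison assertion. A minimal sketch of a test that leans on it (the file contents are illustrative only)::

    import util.file
    from test import TestCaseWithTmp

    class TestExample(TestCaseWithTmp):
        def test_identical_files_compare_equal(self):
            # both temp files land in this test's private tmpDir
            f1 = util.file.mkstempfname(suffix='.txt')
            f2 = util.file.mkstempfname(suffix='.txt')
            for fn in (f1, f2):
                with open(fn, 'wt') as f:
                    f.write('hello\n')
            self.assertEqualContents(f1, f2)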
+""" +def assert_none_executable() : + testDir = os.path.dirname(__file__) + assert all(not os.access(os.path.join(testDir, filename), os.X_OK) + for filename in os.listdir(testDir) + if filename.endswith('.py')) +assert_none_executable() diff --git a/util/__init__.py b/cms/test/integration/__init__.py similarity index 100% rename from util/__init__.py rename to cms/test/integration/__init__.py diff --git a/cms/test/integration/test_main.py b/cms/test/integration/test_main.py new file mode 100644 index 00000000..bbd570a9 --- /dev/null +++ b/cms/test/integration/test_main.py @@ -0,0 +1,18 @@ +#!/usr/bin/python + +import unittest +import main + +class TestCMS(unittest.TestCase): + # to be executed prior to running tests + def setUp(self): + self.cms = main.CMS() + pass + + # to be executed after running tests, regardless of pass/fail + # only called if setUp succeeds + def tearDown(self): + pass + + def test_main_return_code(self): + self.assertEqual( self.cms.main(), 0 ) diff --git a/cms/test/unit/__init__.py b/cms/test/unit/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/cms/test/unit/test_selection.py b/cms/test/unit/test_selection.py new file mode 100644 index 00000000..e30d8b2c --- /dev/null +++ b/cms/test/unit/test_selection.py @@ -0,0 +1,18 @@ +#!/usr/bin/python + +import unittest +from selection import Selection + +class TestSelection(unittest.TestCase): + # to be executed prior to running tests + def setUp(self): + self.cms_selection = Selection() + pass + + # to be executed after running tests, regardless of pass/fail + # only called if setUp succeeds + def tearDown(self): + pass + + def test_selection_return_code(self): + self.assertEqual( self.cms_selection.main(), 0 ) diff --git a/cms/test/unit/test_tools.py b/cms/test/unit/test_tools.py new file mode 100644 index 00000000..0dcc2452 --- /dev/null +++ b/cms/test/unit/test_tools.py @@ -0,0 +1,33 @@ +# Unit tests for tools/__init__.py + +__author__ = "dpark@broadinstitute.org" + +import tools +from tools import * +import unittest, tempfile, shutil, os, logging +import util.cmd, util.file +from test import TestCaseWithTmp + +log = logging.getLogger(__name__) + +class TestToolsInstallation(TestCaseWithTmp): + def setUp(self): + super(TestToolsInstallation, self).setUp() + util.cmd.setup_logger('INFO') + def testAllToolInstallers(self): + def iter_leaf_subclasses(aClass) : + "Iterate over subclasses at all levels that don't themselves have a subclass" + isLeaf = True + for subclass in aClass.__subclasses__() : + isLeaf = False + for leafClass in iter_leaf_subclasses(subclass) : + yield leafClass + #if isLeaf : + # yield aClass + '''Load every tool's default chain of install methods and try them.''' + for tool_class in iter_leaf_subclasses(tools.Tool): + t = tool_class() + t.install() + self.assertTrue(t.is_installed(), "installation of tool %s failed" % tool_class.__name__) + log.info(".. 
installation of %s succeeded with installer %s" % (tool_class.__name__, t.installed_method.__class__.__name__)) + diff --git a/cms/tools/__init__.py b/cms/tools/__init__.py new file mode 100644 index 00000000..7fbcc04e --- /dev/null +++ b/cms/tools/__init__.py @@ -0,0 +1,222 @@ +'''class Tool, class InstallMethod, and related subclasses and methods''' + +__author__ = "dpark@broadinstitute.org,irwin@broadinstitute.org" + +import os, logging, tempfile, shutil +import util.file + +try: + # Python 3.x + from urllib.request import urlretrieve + from urllib.parse import urlparse +except ImportError: + # Python 2.x + from urllib import urlretrieve + from urlparse import urlparse + +# Put all tool files in __all__ +# allows "from tools import *" to import all tooles for testtools +__all__ = [filename[:-3] # Remove .py + for filename in os.listdir(os.path.dirname(__file__)) # tools directory + if filename.endswith('.py') and filename != '__init__.py' and + filename not in [ # Add any files to exclude here: + # e.g. 'sometool.py', + ] + ] +installed_tools = {} + +log = logging.getLogger(__name__) + +def get_tool_by_name(name): + if name not in installed_tools: + pass + raise NotImplementedError + return installed_tools[name] + +class Tool(object): + ''' Base tool class that includes install machinery. + + TO DO: add something about dependencies.. + ''' + def __init__(self, install_methods=[]): + self.install_methods = install_methods + self.installed_method = None + self.exec_path = None + def is_installed(self): + return (self.installed_method != None) + def install(self): + if not self.is_installed(): + for m in self.install_methods: + if not m.is_attempted(): + m.attempt_install() + if m.is_installed(): + self.installed_method = m + self.exec_path = m.executable_path() + installed_tools[self.__class__.__name__] = self + break + def get_install_methods(self): + return self.install_methods + def set_install_methods(self, methods): + self.install_methods = methods + def version(self): + return None + def executable_path(self): + return self.exec_path + def execute(self, args): + assert not os.system(self.exec_path + ' ' + args) + def install_and_get_path(self) : + self.install() + if self.executable_path()==None: + raise NameError("unsuccessful in installing " + type(self).__name__) + return self.executable_path() + +class InstallMethod(object): + ''' Base class for installation methods for a given tool. + None of these methods should ever fail/error. attempt_install should + return silently regardless of the outcome (is_installed must be + called to verify success or failure). + ''' + def __init__(self): + self.attempts = 0 + def is_attempted(self): + return self.attempts + def attempt_install(self): # Override _attempt_install, not this. + self.attempts += 1 + self._attempt_install() + def _attempt_install(self): + raise NotImplementedError + def is_installed(self): + raise NotImplementedError + def executable_path(self): + raise NotImplementedError + +class PrexistingUnixCommand(InstallMethod): + ''' This is an install method that tries to find whether an executable + binary already exists for free on the unix file system--it doesn't + actually try to install anything. 
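A concrete tool plugs into this machinery by subclassing Tool with an ordered list of install methods; install() walks the list and keeps the first method that verifies. A hypothetical sketch using PrexistingUnixCommand (defined just below); the samtools binary and path are illustrative, not part of this changeset::

    import tools

    class SamtoolsTool(tools.Tool):
        def __init__(self, install_methods=None):
            if install_methods is None:
                # look for a system binary before trying anything heavier
                install_methods = [tools.PrexistingUnixCommand('/usr/bin/samtools')]
            tools.Tool.__init__(self, install_methods)

    samtools = SamtoolsTool()
    path = samtools.install_and_get_path()  # raises NameError if no method verified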
+class PrexistingUnixCommand(InstallMethod):
+    ''' This install method tries to find an executable binary that already
+        exists on the unix file system--it doesn't actually try to install
+        anything.
+    '''
+    def __init__(self, path, verifycmd=None, verifycode=0,
+                 require_executability=True):
+        self.path = path
+        self.verifycmd = verifycmd
+        self.verifycode = verifycode
+        self.installed = False
+        self.require_executability = require_executability
+        InstallMethod.__init__(self)
+    def _attempt_install(self):
+        if os.access(self.path, (os.X_OK | os.R_OK) if
+                     self.require_executability else os.R_OK):
+            if self.verifycmd:
+                self.installed = (os.system(self.verifycmd) == self.verifycode)
+            else:
+                self.installed = True
+        else:
+            self.installed = False
+    def is_installed(self):
+        if not self.is_attempted():
+            self.attempt_install()
+        return self.installed
+    def executable_path(self):
+        return self.installed and self.path or None
+
+class DownloadPackage(InstallMethod):
+    ''' An install method that downloads, unpacks, and post-processes a
+        package straight from the source.
+        target_rel_path is the executable's path relative to destination_dir.
+        destination_dir defaults to the project build directory.
+        post_download_command, if not None, is executed in destination_dir.
+        If post_download_ret is not None, assert that post_download_command
+        returns it.
+    '''
+    def __init__(self, url, target_rel_path, destination_dir=None,
+                 verifycmd=None, verifycode=0, require_executability=True,
+                 post_download_command=None, post_download_ret=0):
+        if destination_dir is None:
+            destination_dir = util.file.get_build_path()
+        self.url = url
+        self.targetpath = os.path.join(destination_dir, target_rel_path)
+        self.destination_dir = destination_dir
+        self.verifycmd = verifycmd
+        self.verifycode = verifycode
+        self.installed = False
+        self.require_executability = require_executability
+        self.post_download_command = post_download_command
+        self.post_download_ret = post_download_ret
+        InstallMethod.__init__(self)
+    def is_installed(self):
+        return self.installed
+    def executable_path(self):
+        return self.installed and self.targetpath or None
+    def verify_install(self):
+        if os.access(self.targetpath, (os.X_OK | os.R_OK) if
+                     self.require_executability else os.R_OK):
+            if self.verifycmd:
+                log.debug("validating")
+                self.installed = (os.system(self.verifycmd) == self.verifycode)
+            else:
+                self.installed = True
+        else:
+            self.installed = False
+        return self.installed
+    def _attempt_install(self):
+        if not self.verify_install():
+            self.pre_download()
+            self.download()
+            self.post_download()
+            self.verify_install()
+    def pre_download(self):
+        pass
+    def download(self):
+        download_dir = tempfile.gettempdir()
+        util.file.mkdir_p(download_dir)
+        filepath = urlparse(self.url).path
+        filename = filepath.split('/')[-1]
+        log.info("Downloading from {} to {}/{} ...".format(self.url,
+                                                           download_dir,
+                                                           filename))
+        urlretrieve(self.url, "%s/%s" % (download_dir, filename))
+        self.download_file = filename
+        self.unpack(download_dir)
+    def post_download(self):
+        if self.post_download_command:
+            return_code = os.system('cd "{}" && {}'.format(
+                self.destination_dir, self.post_download_command))
+            if self.post_download_ret is not None:
+                assert return_code == self.post_download_ret
+    def unpack(self, download_dir):
+        log.debug("unpacking")
+        util.file.mkdir_p(self.destination_dir)
+        if self.download_file.endswith('.zip'):
+            if os.system("unzip -o %s/%s -d %s > /dev/null" % (download_dir,
+                         self.download_file, self.destination_dir)):
+                return
+            else:
+                os.unlink(os.path.join(download_dir, self.download_file))
+        elif (self.download_file.endswith('.tar.gz') or
+              self.download_file.endswith('.tgz') or
+              self.download_file.endswith('.tar.bz2') or
+              self.download_file.endswith('.tar')):
+            if self.download_file.endswith('.tar'):
+                compression_option = ''
+            elif self.download_file.endswith('.tar.bz2'):
+                compression_option = 'j'
+            else:
+                compression_option = 'z'
+            untar_cmd = "tar -C {} -x{}pf {}/{}".format(self.destination_dir,
+                                                        compression_option,
+                                                        download_dir,
+                                                        self.download_file)
+            log.debug("Untarring with command: {}".format(untar_cmd))
+            exitCode = os.system(untar_cmd)
+            if exitCode:
+                log.info("tar returned non-zero exit code {}".format(exitCode))
+                return
+            else:
+                log.debug("tar returned with exit code 0")
+                os.unlink(os.path.join(download_dir, self.download_file))
+        else:
+            shutil.move(os.path.join(download_dir, self.download_file),
+                        os.path.join(self.destination_dir, self.download_file))
diff --git a/cms/util/__init__.py b/cms/util/__init__.py
new file mode 100644
index 00000000..f5d73ac7
--- /dev/null
+++ b/cms/util/__init__.py
@@ -0,0 +1,10 @@
+import os
+
+# allows for "from util import *"
+__all__ = [filename[:-3]  # Remove .py extension
+           for filename in os.listdir(os.path.dirname(__file__))
+           if filename.endswith('.py') and filename != '__init__.py' and
+           filename not in [  # Add any files to exclude here:
+                              # e.g. 'sometool.py',
+           ]
+          ]
\ No newline at end of file
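In the same spirit, DownloadPackage usually serves as the fallback install method after PrexistingUnixCommand. A hedged sketch; the URL, archive layout, and build command are hypothetical::

    from tools import DownloadPackage

    method = DownloadPackage(
        'https://example.org/sometool-1.0.tar.gz',  # hypothetical URL
        'sometool-1.0/sometool',                    # executable path inside the unpacked tree
        verifycmd='sometool --version > /dev/null 2>&1',
        post_download_command='cd sometool-1.0 && make')
    method.attempt_install()  # never raises; poll is_installed() instead
    if method.is_installed():
        print(method.executable_path())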
diff --git a/cms/util/annot.py b/cms/util/annot.py
new file mode 100644
index 00000000..b58dc4d7
--- /dev/null
+++ b/cms/util/annot.py
@@ -0,0 +1,161 @@
+'''A few methods for dealing with gene annotation from snpEff.'''
+
+__author__ = "dpark@broadinstitute.org"
+__version__ = "PLACEHOLDER"
+__date__ = "PLACEHOLDER"
+
+import sqlite3, itertools, urllib, logging, re, os
+import util.file, util.misc
+
+log = logging.getLogger(__name__)
+
+class SnpAnnotater:
+    ''' Add annotations to snps based on a snpEff-annotated VCF file.
+    '''
+    def __init__(self, snpEffVcf=None, snpIterator=None):
+        self.snpIterator = snpIterator
+        self.dbFile = util.file.mkstempfname(prefix='SnpAnnotater-', suffix='.db')
+        self.conn = sqlite3.connect(self.dbFile, isolation_level='DEFERRED')
+        self.cur = self.conn.cursor()
+        self.cur.execute("""create table annot (
+            chr not null,
+            pos integer not null,
+            allele_ref not null,
+            allele_alt not null,
+            effect not null,
+            impact not null,
+            gene_id,
+            gene_name,
+            protein_pos integer,
+            residue_ref,
+            residue_alt
+        )""")
+        self.cur.execute("create index idx_annot on annot(chr,pos)")
+        if snpEffVcf:
+            self.loadVcf(snpEffVcf)
+    def loadVcf(self, snpEffVcf):
+        #log.info("reading in snpEff VCF file: %s" % snpEffVcf)
+        with util.file.open_or_gzopen(snpEffVcf, 'rt') as inf:
+            ffp = util.file.FlatFileParser(inf)
+            try:
+                imap = hasattr(itertools, 'imap') and itertools.imap or map  # py2 & py3 compatibility
+                ifilter = hasattr(itertools, 'ifilter') and itertools.ifilter or filter  # py2 & py3 compatibility
+                self.cur.executemany("""insert into annot (chr,pos,allele_ref,allele_alt,
+                    effect,impact,gene_id,gene_name,protein_pos,residue_ref,residue_alt)
+                    values (?,?,?,?,?,?,?,?,?,?,?)""",
+                    imap(lambda row:
+                        [row['CHROM'], int(row['POS']), row['REF'], row['ALT']]
+                        + parse_eff(row['CHROM'], row['POS'], row['INFO']),
+                        ifilter(lambda r: r['ALT'] != '.', ffp)))
+            except Exception:
+                log.exception("exception processing file %s line %s" % (snpEffVcf, ffp.line_num))
+                raise
+        self.cur.execute("select chr,pos from annot group by chr,pos having count(*)>1")
+        dupes = [(c,p) for c,p in self.cur]
+        if dupes:
+            log.info("deleting annotation for %d duplicate positions: %s" % (len(dupes), ', '.join(['%s:%s'%(c,p) for c,p in dupes])))
+            self.cur.executemany("delete from annot where chr=? and pos=?", dupes)
+        self.conn.commit()
+    def __iter__(self):
+        assert self.snpIterator
+        for snp in self.snpIterator:
+            yield self.annotate(snp)
+    def annotate(self, row):
+        self.cur.execute("""select effect,impact,gene_id,gene_name,protein_pos,
+            allele_ref,allele_alt,residue_ref,residue_alt
+            from annot where chr=? and pos=?""", [row['chr'], int(row['pos'])])
+        x = self.cur.fetchone()
+        if x is not None:
+            row['effect'],row['impact'],row['gene_id'],row['gene_name'],row['protein_pos'],row['allele_ref'],row['allele_alt'],row['residue_ref'],row['residue_alt'] = x
+            row['alleles'] = '/'.join((row['allele_ref'], row['allele_alt']))
+            if row['residue_alt']:
+                row['residues'] = '/'.join((row['residue_ref'], row['residue_alt']))
+            else:
+                row['residues'] = row['residue_ref']
+        else:
+            row['effect'] = 'UNKNOWN'
+            row['impact'] = 'UNKNOWN'
+        return row
+    def new_fields(self):
+        return ('effect', 'impact', 'gene_id', 'gene_name', 'protein_pos', 'alleles', 'residues')
+    def __enter__(self):
+        return self
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+        return 0
+    def close(self):
+        self.cur.close()
+        self.conn.close()
+        os.unlink(self.dbFile)
+
+
+def parse_eff(chrom, pos, info, required=True):
+    try:
+        impact_rank = {'HIGH':0, 'MODERATE':1, 'LOW':2, 'MODIFIER':3}
+        infos = [x for x in info.split(';') if x.startswith('EFF=')]
+        assert len(infos)<=1
+        if not infos:
+            assert not required
+            return ['', '', '', '', '', '', '']
+        info = infos[0]
+        effs = info[4:].split(',')
+        out = []
+        for eff in effs:
+            assert eff.endswith(')')
+            effect, other = eff[:-1].split('(')
+            other = other.split('|')
+            assert len(other)>=10
+            impact = other[0]
+            gene_id = other[8]
+            assert not gene_id or (gene_id.endswith('-1') and gene_id.startswith('rna_'))
+            if gene_id:
+                gene_id = gene_id[4:-2]
+            if gene_id=='PF14_0620' or gene_id=='PF3D7_1465300':
+                gene_name = 'tRNA 3-trailer sequence RNase, putative'
+            else:
+                try:
+                    gene_name = urllib.unquote_plus(other[5]).encode('ascii')
+                except UnicodeDecodeError:
+                    log.error("error at %s:%s decoding the string '%s'" % (chrom, pos, other[5]))
+                    raise
+            aa_chg = other[3]
+            if aa_chg:
+                if effect.startswith('SYNON'):
+                    aas = (aa_chg[0], '')
+                    codon = int(aa_chg[1:])
+                elif effect.startswith('NON_SYNON') or effect.startswith('START_') or effect.startswith('STOP_') or effect.startswith('CODON_'):
+                    mo = re.match(r'^(\D+)(\d+)(\D+)$', aa_chg)
+                    assert mo, "unrecognized coding change format for %s (%s)" % (effect, aa_chg)
+                    aas = (mo.group(1), mo.group(3))
+                    codon = int(mo.group(2))
+                elif effect=='FRAME_SHIFT':
+                    mo = re.match(r'^(\D*)(\d+)(\D*)$', aa_chg)
+                    assert mo, "unrecognized coding change format for %s (%s)" % (effect, aa_chg)
+                    aas = ('', '')
+                    codon = int(mo.group(2))
+                else:
+                    assert 0, "unrecognized effect type (%s) for variant with coding change (%s)" % (effect, aa_chg)
+            else:
+                aas = ('', '')
+                codon = ''
+            out.append([impact_rank[impact], effect, impact, gene_id, gene_name,
+                        codon, aas[0], aas[1]])
+
+        if len(out)>1:
+            out.sort()
+            if out[0][0] == out[1][0]:
+                #log.debug("SNP found with multiple effects of the same impact level: %s:%s - %s" % (chrom, pos, info))
+                #assert out[0][2] in ('MODIFIER','LOW'), "Error: SNP found with multiple effects of the same impact level"
+                out = [[';'.join(util.misc.unique([str(o[i]) for o in out])) for i in range(len(out[0]))]]
+        eff = out[0][1:]
+        return eff
+
+    except Exception:
+        log.exception("exception parsing snpEff on row %s:%s - %s" % (chrom, pos, info))
+        raise
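End to end, SnpAnnotater is meant to wrap an iterator of row dicts (each carrying 'chr' and 'pos') and decorate every row from the sqlite table built out of the snpEff VCF. A minimal sketch; the filename and input rows are hypothetical::

    import util.annot

    rows = [{'chr': 'Pf3D7_01', 'pos': '12345'}]  # hypothetical input
    with util.annot.SnpAnnotater(snpEffVcf='snps.eff.vcf.gz',  # hypothetical file
                                 snpIterator=iter(rows)) as annotater:
        for row in annotater:
            print(row['effect'], row['impact'])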
diff --git a/cms/util/cmd.py b/cms/util/cmd.py
new file mode 100644
index 00000000..11a6b1ff
--- /dev/null
+++ b/cms/util/cmd.py
@@ -0,0 +1,161 @@
+'''This gives a main() function that serves as a nice wrapper
+around other commands and presents the ability to serve up multiple
+command-line functions from a single python script.
+'''
+
+import os, os.path, tempfile, sys, shutil, logging, argparse
+import util.version
+
+__author__ = "dpark@broadinstitute.org"
+__version__ = util.version.get_version()
+
+log = logging.getLogger()
+tmpDir = None
+
+def setup_logger(log_level):
+    loglevel = getattr(logging, log_level.upper(), None)
+    assert loglevel, "unrecognized log level: %s" % log_level
+    log.setLevel(loglevel)
+    h = logging.StreamHandler()
+    h.setFormatter(logging.Formatter("%(asctime)s - %(module)s:%(lineno)d:%(funcName)s - %(levelname)s - %(message)s"))
+    log.addHandler(h)
+
+def script_name():
+    return os.path.basename(sys.argv[0]).rsplit('.', 1)[0]
+
+def common_args(parser, arglist=(('tmpDir', None), ('loglevel', None))):
+    for k, v in arglist:
+        if k=='loglevel':
+            if not v:
+                v = 'DEBUG'
+            parser.add_argument("--loglevel", dest="loglevel",
+                help="Verboseness of output. [default: %(default)s]",
+                default=v,
+                choices=('DEBUG','INFO','WARNING','ERROR','CRITICAL','EXCEPTION'))
+        elif k=='tmpDir':
+            if not v:
+                v = find_tmpDir()
+            parser.add_argument("--tmpDir", dest="tmpDir",
+                help="Base directory for temp files. [default: %(default)s]",
+                default=v)
+            parser.add_argument("--tmpDirKeep",
+                action="store_true", dest="tmpDirKeep",
+                help="""Keep the tmpDir if an exception occurs while
+                    running. Default is to delete all temp files at
+                    the end, even if there's a failure.""",
+                default=False)
+        elif k=='version':
+            if not v:
+                v = __version__
+            parser.add_argument('--version', '-V', action='version', version=v)
+        else:
+            raise Exception("unrecognized argument %s" % k)
+    return parser
+
+def main_command(mainfunc):
+    ''' This wraps a python method in another method that can be called
+        with an argparse.Namespace object. When called, it will pass all
+        the values of the object on as parameters to the function call.
+    '''
+    def _main(args):
+        args2 = dict((k, v) for k, v in vars(args).items() if k not in ('loglevel', 'tmpDir', 'tmpDirKeep', 'version', 'func_main', 'command'))
+        mainfunc(**args2)
+    _main.__doc__ = mainfunc.__doc__
+    return _main
+
+def attach_main(parser, cmd_main, split_args=False):
+    ''' This attaches the main function call to a parser object.
+    '''
+    if split_args:
+        cmd_main = main_command(cmd_main)
+    parser.description = cmd_main.__doc__
+    parser.set_defaults(func_main=cmd_main)
+    return parser
+
+def make_parser(commands, description):
+    ''' commands: a list of pairs containing the following:
+            1. name of command (string, no whitespace)
+            2. method to call (no arguments) that returns an argparse parser.
+        If commands contains exactly one member and the name of the
+        only command is None, then we get rid of the whole multi-command
+        thing and just present the options for that one function.
+        description: a long string to present as a description of your script
+            as a whole, if the script is run with no arguments
+    '''
+    if len(commands)==1 and commands[0][0]==None:
+        # only one (nameless) command in this script, simplify
+        parser = commands[0][1]()
+        parser.set_defaults(command='')
+    else:
+        # multiple commands available
+        parser = argparse.ArgumentParser(description=description,
+            usage='%(prog)s subcommand', add_help=False)
+        parser.add_argument('--help', '-h', action='help', help=argparse.SUPPRESS)
+        parser.add_argument('--version', '-V', action='version', version=__version__, help=argparse.SUPPRESS)
+        subparsers = parser.add_subparsers(title='subcommands', dest='command')
+        for cmd_name, cmd_parser in commands:
+            p = subparsers.add_parser(cmd_name)
+            cmd_parser(p)
+    return parser
+
+def main_argparse(commands, description):
+    parser = make_parser(commands, description)
+
+    # if called with no arguments, print help
+    if len(sys.argv)==1:
+        parser.parse_args(['--help'])
+    elif len(sys.argv)==2 and (len(commands)>1 or commands[0][0]!=None):
+        parser.parse_args([sys.argv[1], '--help'])
+    args = parser.parse_args()
+
+    setup_logger(not hasattr(args, 'loglevel') and 'DEBUG' or args.loglevel)
+    log.info("software version: %s, python version: %s" % (__version__, sys.version))
+    log.info("command: %s %s %s" % (
+        sys.argv[0], sys.argv[1],
+        ' '.join(["%s=%s" % (k, v) for k, v in vars(args).items() if k not in ('command', 'func_main')])))
+
+    if hasattr(args, 'tmpDir'):
+        ''' If this command has a tmpDir option, use that as a base directory
+            and create a subdirectory within it which we will then destroy at
+            the end of execution.
+        '''
+        proposed_dir = 'tmp-%s-%s' % (script_name(), args.command)
+        if 'LSB_JOBID' in os.environ:
+            proposed_dir = 'tmp-%s-%s-%s-%s' % (script_name(), args.command, os.environ['LSB_JOBID'], os.environ['LSB_JOBINDEX'])
+        tempfile.tempdir = tempfile.mkdtemp(prefix='%s-' % proposed_dir, dir=args.tmpDir)
+        log.debug("using tempDir: %s" % tempfile.tempdir)
+        os.environ['TMPDIR'] = tempfile.tempdir  # this is for running R
+        try:
+            ret = args.func_main(args)
+        except:
+            if hasattr(args, 'tmpDirKeep') and args.tmpDirKeep and not (tempfile.tempdir.startswith('/tmp') or tempfile.tempdir.startswith('/local')):
+                log.exception("Exception occurred while running %s, saving tmpDir at %s" % (args.command, tempfile.tempdir))
+            else:
+                shutil.rmtree(tempfile.tempdir)
+            raise
+        else:
+            shutil.rmtree(tempfile.tempdir)
+    else:
+        # otherwise just run the command
+        ret = args.func_main(args)
+    if ret==None:
+        ret = 0
+    return ret
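Tying these helpers together, a script declares one parser-builder per subcommand and hands the list to main_argparse. A hypothetical sketch (the greet command is invented for illustration)::

    import util.cmd

    def greet(name):
        '''Print a greeting.'''
        print("hello, %s" % name)

    def parser_greet(parser):
        parser.add_argument('name')
        util.cmd.common_args(parser, (('tmpDir', None), ('loglevel', None)))
        util.cmd.attach_main(parser, greet, split_args=True)
        return parser

    __commands__ = [('greet', parser_greet)]

    if __name__ == '__main__':
        util.cmd.main_argparse(__commands__, __doc__)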
+def find_tmpDir():
+    ''' This provides a suggested base directory for a temp dir for use in your
+        argparse-based tmpDir option.
+    '''
+    tmpdir = '/tmp'
+    if os.access('/local/scratch', os.X_OK | os.W_OK | os.R_OK) and os.path.isdir('/local/scratch'):
+        tmpdir = '/local/scratch'
+        if 'LSB_JOBID' in os.environ:
+            # this directory often exists for LSF jobs, but not always.
+            # for example, if the job is part of a job array, this directory is called
+            # something unpredictable and unfindable, so just use /local/scratch
+            proposed_dir = '/local/scratch/%s.tmpdir' % os.environ['LSB_JOBID']
+            if os.access(proposed_dir, os.X_OK | os.W_OK | os.R_OK):
+                tmpdir = proposed_dir
+    elif 'TMPDIR' in os.environ and os.path.isdir(os.environ['TMPDIR']):
+        tmpdir = os.environ['TMPDIR']
+    return tmpdir
diff --git a/cms/util/file.py b/cms/util/file.py
new file mode 100644
index 00000000..dd399e5b
--- /dev/null
+++ b/cms/util/file.py
@@ -0,0 +1,217 @@
+'''This gives a number of useful quick methods for dealing with
+tab-text files and gzipped files.
+'''
+
+__author__ = "dpark@broadinstitute.org"
+__version__ = "PLACEHOLDER"
+__date__ = "PLACEHOLDER"
+
+import os, gzip, tempfile, shutil, errno, logging, json
+import util.cmd
+
+log = logging.getLogger(__name__)
+
+def get_project_path():
+    '''Return the absolute path of the top-level project, assumed to be the
+       parent of the directory containing this script.'''
+    # abspath converts relative to absolute path; expanduser interprets ~
+    path = __file__                  # path to this script
+    path = os.path.expanduser(path)  # interpret ~
+    path = os.path.abspath(path)     # convert to absolute path
+    path = os.path.dirname(path)     # containing directory: util
+    path = os.path.dirname(path)     # containing directory: main project dir
+    return path
+
+def get_build_path():
+    '''Return absolute path of "build" directory'''
+    return os.path.join(get_project_path(), 'tools', 'build')
+
+def get_scripts_path():
+    '''Return absolute path of "scripts" directory'''
+    return os.path.join(get_project_path(), 'tools', 'scripts')
+
+def get_binaries_path():
+    '''Return absolute path of "binaries" directory'''
+    return os.path.join(get_project_path(), 'tools', 'binaries')
+
+def get_test_path():
+    '''Return absolute path of "test" directory'''
+    return os.path.join(get_project_path(), 'test')
+
+def get_test_input_path(testClassInstance=None):
+    '''Return the path to the directory containing input files for the
+       specified test class
+    '''
+    if testClassInstance is not None:
+        return os.path.join(get_test_path(), 'input',
+                            type(testClassInstance).__name__)
+    else:
+        return os.path.join(get_test_path(), 'input')
+
+def get_resources():
+    ''' Return the project resources dictionary '''
+    jsonfile = os.path.join(get_project_path(), 'resources.json')
+    with open(jsonfile, 'rt') as inf:
+        resources = json.load(inf)
+    return resources
+
+def mkstempfname(suffix='', prefix='tmp', dir=None, text=False):
+    ''' There's no other one-liner way to securely ask for a temp file by
+        filename only. This calls mkstemp, which does what we want, except
+        that it returns an open file handle, which causes huge problems on NFS
+        if we don't close it. So close it first, then return the name part only.
+    '''
+    fd, fn = tempfile.mkstemp(prefix=prefix, suffix=suffix, dir=dir, text=text)
+    os.close(fd)
+    return fn
+
+def set_tmpDir(name):
+    proposed_prefix = ['tmp']
+    if name:
+        proposed_prefix.append(name)
+    for e in ('LSB_JOBID', 'LSB_JOBINDEX'):
+        if e in os.environ:
+            proposed_prefix.append(os.environ[e])
+    tempfile.tempdir = tempfile.mkdtemp(prefix='-'.join(proposed_prefix)+'-',
+                                        dir=util.cmd.find_tmpDir())
+
+def destroy_tmpDir():
+    if tempfile.tempdir:
+        shutil.rmtree(tempfile.tempdir)
+    tempfile.tempdir = None
+
+def mkdir_p(dirpath):
+    ''' Verify that the directory given exists, and if not, create it.
+    '''
+    try:
+        os.makedirs(dirpath)
+    except OSError as exc:  # Python >2.5
+        if exc.errno == errno.EEXIST and os.path.isdir(dirpath):
+            pass
+        else:
+            raise
+
+def open_or_gzopen(fname, *opts):
+    return fname.endswith('.gz') and gzip.open(fname, *opts) or open(fname, *opts)
+
+def read_tabfile_dict(inFile):
+    ''' Read a tab text file (possibly gzipped) and return contents as an
+        iterator of dicts.
+    '''
+    with open_or_gzopen(inFile, 'rt') as inf:
+        header = None
+        for line in inf:
+            row = line.rstrip('\n').split('\t')
+            if line.startswith('#'):
+                row[0] = row[0][1:]
+                header = row
+            elif header is None:
+                header = row
+            else:
+                assert len(header)==len(row)
+                yield dict((k, v) for k, v in zip(header, row) if v)
+
+def read_tabfile(inFile):
+    ''' Read a tab text file (possibly gzipped) and return contents as an
+        iterator of arrays.
+    '''
+    with open_or_gzopen(inFile, 'rt') as inf:
+        for line in inf:
+            if not line.startswith('#'):
+                yield line.rstrip('\n').split('\t')
+
+def readFlatFileHeader(filename, headerPrefix='#', delim='\t'):
+    with open_or_gzopen(filename, 'rt') as inf:
+        header = inf.readline().rstrip('\n').split(delim)
+    if header and header[0].startswith(headerPrefix):
+        header[0] = header[0][len(headerPrefix):]
+    return header
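read_tabfile_dict pairs every data row with the header row (a leading '#' on the header is stripped), yielding one dict per line. A quick self-contained check::

    import util.file

    fn = util.file.mkstempfname(suffix='.txt')
    with open(fn, 'wt') as f:
        f.write('#chr\tpos\n')
        f.write('chr1\t100\n')
    rows = list(util.file.read_tabfile_dict(fn))
    assert rows == [{'chr': 'chr1', 'pos': '100'}]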
+class FlatFileParser:
+    ''' Generic flat file parser that parses tabular text input
+    '''
+    def __init__(self, lineIter=None, name=None, outType='dict',
+                 readHeader=True, headerPrefix='#', delim='\t',
+                 requireUniqueHeader=False):
+        self.lineIter = lineIter
+        self.header = None
+        self.name = name
+        self.headerPrefix = headerPrefix
+        self.readHeader = readHeader
+        self.delim = delim
+        self.requireUniqueHeader = requireUniqueHeader
+        self.line_num = 0
+        assert outType in ('dict', 'arrayStrict', 'arrayLoose', 'both')
+        self.outType = outType
+        assert readHeader or outType in ('arrayStrict', 'arrayLoose')
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        return 0
+
+    def __iter__(self):
+        assert self.lineIter
+        for row in self.lineIter:
+            out = self.parse(row)
+            if out is not None:
+                yield out
+
+    def parse(self, row):
+        self.line_num += 1
+        try:
+            line = row.rstrip('\n').split(self.delim)
+            if self.readHeader:
+                if self.headerPrefix and row.startswith(self.headerPrefix):
+                    line[0] = line[0][len(self.headerPrefix):]
+                    assert not (self.requireUniqueHeader and self.header)
+                    self.parseHeader(line)
+                    return None
+                elif not self.header:
+                    self.parseHeader(line)
+                    return None
+                else:
+                    return self.parseRow(line)
+            else:
+                return self.parseRow(line)
+        except Exception:
+            template = "Exception parsing file at line {}. Line contents: '{}'"
+            message = template.format(self.line_num, row)
+            if self.name:
+                log.exception("{message} File: {self.name}".format(**locals()))
+            else:
+                log.exception(message)
+            raise
+
+    def parseHeader(self, row):
+        assert row
+        self.header = row
+        if self.outType != 'arrayLoose':
+            assert len(row) == len(dict([(x, 0) for x in row]))
+
+    def parseRow(self, row):
+        assert self.outType == 'arrayLoose' or (self.header and
+            len(self.header) == len(row))
+
+        if self.outType == 'arrayLoose' or self.outType == 'arrayStrict':
+            return row
+        out = {self.header[i]: row[i] for i in range(len(self.header))}
+        if self.outType == 'both':
+            for i in range(len(self.header)):
+                out[i] = row[i]
+        return out
+
+
+def fastaMaker(seqs, linewidth=60):
+    assert linewidth > 0
+
+    for id, seq in seqs:
+        yield ">{}\n".format(id)
+
+        while len(seq) > linewidth:
+            line = seq[:linewidth]
+            seq = seq[linewidth:]
+            yield "{}\n".format(line)
+
+        if seq:
+            yield seq+"\n"
diff --git a/cms/util/misc.py b/cms/util/misc.py
new file mode 100644
index 00000000..8052bea6
--- /dev/null
+++ b/cms/util/misc.py
@@ -0,0 +1,78 @@
+'''A few miscellaneous tools. '''
+from __future__ import division  # Division of integers with / should never round!
+import itertools
+
+__author__ = "dpark@broadinstitute.org"
+
+def unique(items):
+    ''' Return unique items in the same order as seen in the input. '''
+    seen = set()
+    for i in items:
+        if i not in seen:
+            seen.add(i)
+            yield i
+
+def histogram(items):
+    ''' Count the number of times each item appears and return a dict of counts. '''
+    out = {}
+    for i in items:
+        out.setdefault(i, 0)
+        out[i] += 1
+    return out
+
+def freqs(items, zero_checks=set()):
+    ''' Given a list of comparable, non-unique items, produce an iterator of
+        (item, count, freq) tuples.
+        item is a unique instance of one of the items seen on input.
+        count is a positive integer describing the number of times this item was observed.
+        freq is count / sum(count) for all outputs.
+        If zero_checks is specified, then the output iterator will emit tuples for the
+        items in zero_checks even if they do not appear in the input. If they are not in
+        the input, they will be emitted with a zero count and freq.
+        See histogram(items)
+    '''
+    tot = 0
+    out = {}
+    for i in items:
+        out.setdefault(i, 0)
+        out[i] += 1
+        tot += 1
+    for k, v in out.items():
+        yield (k, v, float(v)/tot)
+    for i in zero_checks:
+        if i not in out:
+            yield (i, 0, 0.0)
+
+def intervals(i, n, l):
+    ''' Divide something of length l into n equally sized parts and return the
+        start-stop values of the i'th part. Values are 1-based. Each part
+        will be adjacent and non-overlapping with the next part. i must be a
+        number from 1 to n.
+    '''
+    assert 1 <= i <= n and l >= n
+    part_size = l//n
+    start = 1 + part_size * (i-1)
+    stop = part_size * i
+    if i==n:
+        stop = l
+    return (start, stop)
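A quick worked example of intervals: a length of 10 split into 3 parts has part_size 10//3 = 3, and the last part absorbs the remainder::

    from util.misc import intervals

    assert intervals(1, 3, 10) == (1, 3)
    assert intervals(2, 3, 10) == (4, 6)
    assert intervals(3, 3, 10) == (7, 10)  # i == n, so stop extends to l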
+ """ + it = iter(iterator) + item = list(itertools.islice(it, batch_size)) + while item: + yield item + item = list(itertools.islice(it, batch_size)) + diff --git a/cms/util/stats.py b/cms/util/stats.py new file mode 100644 index 00000000..98f1120c --- /dev/null +++ b/cms/util/stats.py @@ -0,0 +1,202 @@ +'''A few pure-python statistical tools to avoid the need to install scipy. ''' +from __future__ import division # Division of integers with / should never round! +from math import exp, log, pi, sqrt, gamma, lgamma, erf +import itertools + +__author__ = "dpark@broadinstitute.org, irwin@broadinstitute.org" + +try: + # Python 3.4 + from statistics import mean, median +except ImportError: + # Python <3.4, avoid numpy if these two methods are all we really need + def mean(l): + if len(l)>0: + return float(sum(l))/len(l) + else: + raise Exception("empty list for mean") + def median(l): + if len(l)>0: + half = len(l) // 2 + l.sort() + if len(l) % 2 == 0: + return (l[half-1] + l[half]) / 2.0 + else: + return l[half] + else: + raise Exception("empty list for median") + +def product(iter) : + prod = 1 + for x in iter : + prod *= x + return prod + +def chi2_contingency(contingencyTable, correction = True) : + """ contingencyTable is a sequence of m sequences, each of length n. + Return an estimate using the chi-square distribution of the two-tailed + p-value for an m x n contingency table against the null hypothesis + that the row and column criteria are independent. This is not as + accurate as fisher_exact, but is faster (and is implemented for all + m and n). + If correction is True and there is 1 degree of freedom, apply Yates's + correction for continuity, i.e., adjust each observed value + by 0.5 towards the corresponding expected value, which brings + the result closer to the Fisher exact test result. + Not recommended if any of the counts or expected counts are + less than 5. + """ + # scipy equivalent: scipy.stats.chi2_contingency(contingencyTable)[1] + + if len(contingencyTable) == 0 : + return 1.0 + if len(set(map(len, contingencyTable))) != 1 : + raise ValueError('Not all rows have the same length') + + # Eliminate rows and columns with 0 sum + colSums = [sum(row[col] for row in contingencyTable) + for col in range(len(contingencyTable[0]))] + table = [[x for x, colSum in zip(row, colSums) if colSum != 0] + for row in contingencyTable if sum(row) != 0] + + if len(table) < 2 or len(table[0]) < 2 : + return 1.0 + + m = len(table) + n = len(table[0]) + rowSums = [sum(row) for row in table] + colSums = [sum(row[col] for row in table) for col in range(n)] + N = sum(rowSums) + expect = [[rowSums[i] * colSums[j] / N for j in range(n)] for i in range(m)] + if correction and m == n == 2 : + def corr(i, j) : + if expect[i][j] > table[i][j] : + return min(table[i][j] + 0.5, expect[i][j]) + else : + return max(table[i][j] - 0.5, expect[i][j]) + table = [[corr(i, j) for j in range(n)] for i in range(m)] + chisq = sum((table[i][j] - expect[i][j]) ** 2 / expect[i][j] + for j in range(n) + for i in range(m)) + pval = 1 - pchisq(chisq, (m - 1) * (n - 1)) + return pval + +def fisher_exact(contingencyTable) : + """ Fisher exact test for the 2 x n case. + contingencyTable is a sequence of 2 length-n sequences of integers. + Return the two-tailed p-value against the null hypothesis that the row + and column criteria are independent, using Fisher's exact test. + For n larger than 2, this is very slow, O(S^(n-1)), where S is the + smaller of the two row sums. 
+def fisher_exact(contingencyTable):
+    """ Fisher exact test for the 2 x n case.
+        contingencyTable is a sequence of 2 length-n sequences of integers.
+        Return the two-tailed p-value against the null hypothesis that the row
+        and column criteria are independent, using Fisher's exact test.
+        For n larger than 2, this is very slow, O(S^(n-1)), where S is the
+        smaller of the two row sums. Better to use chi2_contingency unless
+        one of the row sums is small.
+        Handles m x n contingencyTable with m > 2 if it can be reduced to the
+        2 x n case by transposing or by removing rows that are all 0s. Also
+        handles degenerate cases of 0 or 1 row by returning 1.0.
+    """
+    if len(contingencyTable) == 0:
+        return 1.0
+    if len(set(map(len, contingencyTable))) != 1:
+        raise ValueError('Not all rows have the same length')
+    if any(x != int(x) for row in contingencyTable for x in row):
+        raise ValueError('Some table entry is not an integer')
+    if any(x < 0 for row in contingencyTable for x in row):
+        raise ValueError('Some table entry is negative')
+
+    # Eliminate rows and columns with 0 sum
+    colSums = [sum(row[col] for row in contingencyTable)
+               for col in range(len(contingencyTable[0]))]
+    table = [[x for x, colSum in zip(row, colSums) if colSum != 0]
+             for row in contingencyTable if sum(row) != 0]
+
+    if len(table) < 2 or len(table[0]) < 2:
+        return 1.0
+
+    if len(table) > len(table[0]):
+        table = list(zip(*table))  # Transpose
+
+    m = len(table)
+    n = len(table[0])
+    if m != 2:
+        raise NotImplementedError('More than 2 non-zero rows and columns.')
+
+    # Put row with smaller sum first. Makes the loop iterations simpler.
+    table.sort(key=sum)
+    # Put column with largest sum last. Makes loop quick rejection faster.
+    table = list(zip(*table))  # Transpose
+    table.sort(key=sum)
+    table = list(zip(*table))  # Transpose back
+
+    # There are many optimizations possible for the following code, but it would
+    # still be O(S^(n-1)) so it would still be too slow for anything
+    # sizeable, and it's usable as is for small things.
+
+    rowSums = [sum(row) for row in table]
+    colSums = [sum(row[col] for row in table) for col in range(n)]
+
+    logChooseNrowSum = log_choose(sum(rowSums), rowSums[0])
+
+    def prob_of_table(firstRow):
+        return exp(sum(log_choose(cs, a) for cs, a in zip(colSums, firstRow)) -
+                   logChooseNrowSum)
+
+    p0 = prob_of_table(table[0])
+    result = 0
+    for firstRowM1 in itertools.product(*[range(min(rowSums[0], colSums[i]) + 1)
+                                          for i in range(n - 1)]):
+        lastElmt = rowSums[0] - sum(firstRowM1)
+        if lastElmt < 0 or lastElmt > colSums[-1]:
+            continue
+        prob = prob_of_table(firstRowM1 + (lastElmt,))
+        if prob <= p0 + 1e-9:  # (1e-9 handles floating point round off)
+            result += prob
+
+    return result
+
+def log_choose(n, k):
+    # Return log(n choose k). Compute using lgamma(x + 1) = log(x!)
+    if not (0 <= k <= n):
+        raise ValueError('%d is negative or more than %d' % (k, n))
+    return lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1)
+
+def gammainc_halfint(s, x):
+    """ Lower incomplete gamma function =
+        integral from 0 to x of t ** (s-1) exp(-t) dt divided by gamma(s),
+        i.e., the fraction of gamma that you get if you integrate only until
+        x instead of all the way to infinity.
+        Implemented here only if s is a positive multiple of 0.5.
+    """
+    # scipy equivalent: scipy.special.gammainc(s,x)
+
+    if s <= 0:
+        raise ValueError('%s is not positive' % s)
+    if x < 0:
+        raise ValueError('%s < 0' % x)
+    if s * 2 != int(s * 2):
+        raise NotImplementedError('%s is not a multiple of 0.5' % s)
+
+    # Handle integers analytically
+    if s == int(s):
+        term = 1
+        total = 1
+        for k in range(1, int(s)):
+            term *= x / k
+            total += term
+        return 1 - exp(-x) * total
+
+    # Otherwise s is integer + 0.5. Decrease to 0.5 using the recursion formula:
+    result = 0.0
+    while s > 1:
+        result -= x ** (s - 1) * exp(-x) / gamma(s)
+        s = s - 1
+    # Then use gammainc(0.5, x) = erf(sqrt(x))
+    result += erf(sqrt(x))
+    return result
+
+def pchisq(x, k):
+    "Cumulative distribution function of chi squared with k degrees of freedom."
+    if k < 1 or k != int(k):
+        raise ValueError('%s is not a positive integer' % k)
+    if x < 0:
+        raise ValueError('%s < 0' % x)
+    return gammainc_halfint(k / 2, x / 2)
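chi2_contingency and fisher_exact answer the same independence question at different cost, so on a small 2 x 2 table the approximation should land near the exact value. A loose sanity check with illustrative counts::

    from util.stats import chi2_contingency, fisher_exact

    table = [[8, 2],
             [1, 5]]
    exact = fisher_exact(table)       # exact, but O(S^(n-1))
    approx = chi2_contingency(table)  # fast chi-square approximation
    assert abs(exact - approx) < 0.1  # loose tolerance; they need not match closely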
diff --git a/cms/util/vcf.py b/cms/util/vcf.py
new file mode 100644
index 00000000..d23a8c96
--- /dev/null
+++ b/cms/util/vcf.py
@@ -0,0 +1,386 @@
+'''This gives a number of useful quick methods for dealing with VCF files.
+'''
+
+__author__ = "dpark@broadinstitute.org"
+__version__ = "PLACEHOLDER"
+__date__ = "PLACEHOLDER"
+
+import os, shutil, logging, itertools, sqlite3
+import pysam
+import util.file, util.misc
+
+log = logging.getLogger(__name__)
+
+def make_intervals(i, n, fasta, chr_prefix='', verbose=False):
+    ''' Divide a sorted genome into n equally sized parts and return the i'th
+        part. We will return a list of intervals: chr, start, stop. It may
+        contain multiple chromosomes in order to fill out a total length equal
+        to the other parts. Each part will be adjacent and non-overlapping with
+        the next part. i must be a number from 1 to n.
+    '''
+    assert 1 <= i <= n
+
+    # read genome dict file
+    tot = 0
+    chrlens = []
+    for c, c_len in get_chrlens(fasta):
+        if c.startswith(chr_prefix):
+            chrlens.append((c, c_len, tot))
+            tot += c_len
+
+    # define our chunk by gpos:
+    part_size = tot//n
+    g_start = 1 + part_size * (i-1)
+    g_stop = part_size * i
+    if i==n:
+        g_stop = tot
+
+    # find the genomic intervals that correspond to our gpos window
+    out = []
+    for c, c_len, c_g_start in chrlens:
+        c_g_stop = c_g_start + c_len
+        c_g_start += 1
+        if c_g_stop >= g_start and c_g_start <= g_stop:
+            start = max(g_start, c_g_start) - c_g_start + 1
+            stop = min(g_stop, c_g_stop) - c_g_start + 1
+            out.append((c, start, stop))
+
+    if verbose:
+        log.info("Dividing the %d bp genome into %d chunks of %d bp each. The %dth chunk contains the following %d intervals: %s" % (
+            tot, n, part_size, i, len(out), ', '.join(["%s:%d-%d"%x for x in out])))
+    return out
+
+
+def sliding_windows(fasta, width, offset, chr_prefix=''):
+    ''' Divide a genome into sliding windows and return as an iterator of
+        intervals (chr, start, stop). The windows are of fixed width
+        (except maybe the last window on each chromosome) and may overlap
+        (if offset < width).
+    '''
+    assert width>0 and offset>0
+    for c, c_len in get_chrlens(fasta):
+        if c.startswith(chr_prefix):
+            start = 1
+            while start <= c_len:
+                stop = min(c_len, start + width - 1)
+                yield (c, start, stop)
+                start += offset
+class GenomePosition:
+    ''' Provide a mapping from chr:pos to genomic position.
+        Read chromosome lengths and order from either a Picard/GATK-index for
+        a FASTA file (a .dict file) or from a VCF header.
+    '''
+    def __init__(self, seqDb):
+        self.gpos_map = {}
+        self.clen_map = {}
+        self.chrs = []
+        totlen = 0
+        for c, clen in get_chrlens(seqDb):
+            self.chrs.append((c, clen))
+            self.gpos_map[c] = totlen
+            self.clen_map[c] = clen
+            totlen += clen
+        self.total = totlen
+    def get_gpos(self, c, p):
+        assert isinstance(p, int)
+        assert c in self.gpos_map
+        assert 1 <= p <= self.clen_map[c]
+        return p + self.gpos_map[c]
+    def get_chr_pos(self, gpos):
+        assert isinstance(gpos, int)
+        assert 1 <= gpos <= self.total
+        totlen = 0
+        for c, clen in self.chrs:
+            if gpos <= totlen+clen:
+                break
+            totlen += clen
+        return (c, gpos-totlen)
+
+def get_chrlens(inFile):
+    ''' Read chromosome lengths and order from either a Picard/GATK-index for
+        a FASTA file (a .dict file) or from "contig" rows in the VCF header.
+    '''
+    chrlens = []
+    if hasattr(inFile, 'chrlens'):
+        chrlens = inFile.chrlens()
+    else:
+        if inFile.endswith('.fasta'):
+            inFile = inFile[:-len('.fasta')] + '.dict'
+        elif inFile.endswith('.fa'):
+            inFile = inFile[:-len('.fa')] + '.dict'
+        if inFile.endswith('.dict'):
+            with open(inFile, 'rt') as inf:
+                for line in inf:
+                    row = line.rstrip('\n').split('\t')
+                    if row[0]=='@SQ':
+                        assert row[1].startswith('SN:') and row[2].startswith('LN:')
+                        c = row[1][3:]
+                        c_len = int(row[2][3:])
+                        chrlens.append((c, c_len))
+        elif inFile.endswith('.vcf') or inFile.endswith('.vcf.gz'):
+            with util.file.open_or_gzopen(inFile, 'rt') as inf:
+                for line in inf:
+                    line = line.rstrip('\n')
+                    if line.startswith('##contig='):
+                        line = line[13:-1]
+                        c = line.split(',')[0]
+                        clen = int(line.split('=')[1])
+                        chrlens.append((c, clen))
+                    elif line.startswith('#CHROM'):
+                        break
+        else:
+            raise AssertionError("unrecognized file type %s" % inFile)
+    assert chrlens, "no sequence data found in %s" % inFile
+    return chrlens
+
+def calc_maf(genos, ancestral=None, ploidy=1):
+    # get list of alleles
+    if ploidy==1:
+        alleles = genos
+    else:
+        alleles = []
+        for g in genos:
+            g = g.split('/')
+            assert len(g)==ploidy
+            alleles += g
+
+    # count up
+    out = {'n_tot': len(alleles)}
+    acounts = util.misc.histogram(alleles)
+    alist = sorted([(n, a) for a, n in acounts.items()])
+
+    # daf
+    if ancestral is not None:
+        out['a_ancestral'] = ancestral
+        derived = list(sorted([a for a in acounts.keys() if a != ancestral]))
+        out['a_derived'] = ','.join(derived)
+        out['dac'] = sum(acounts[a] for a in derived)
+        out['daf'] = out['n_tot'] and float(out['dac'])/out['n_tot'] or None
+
+    # maf
+    if out['n_tot']:
+        out['a_major'] = alist[-1][1]
+        out['a_minor'] = ','.join([a for n, a in alist[:-1]])
+        out['mac'] = out['n_tot'] - alist[-1][0]
+        out['maf'] = float(out['mac'])/out['n_tot']
+    else:
+        out['a_major'] = None
+        out['a_minor'] = None
+        out['mac'] = None
+        out['maf'] = None
+
+    return out
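calc_maf reduces a list of genotype calls to the allele-frequency fields used downstream. A small haploid worked example::

    from util.vcf import calc_maf

    out = calc_maf(['A', 'A', 'A', 'T'], ancestral='A', ploidy=1)
    assert out['n_tot'] == 4
    assert out['a_major'] == 'A' and out['a_minor'] == 'T'
    assert out['mac'] == 1 and out['maf'] == 0.25
    assert out['dac'] == 1 and out['daf'] == 0.25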
+class TabixReader(pysam.Tabixfile):
+    ''' A wrapper around pysam.Tabixfile that provides a context and
+        allows us to query using 1-based coordinates.
+
+        We should request upstream to the Pysam people to implement
+        methods __reversed__() and __len__() in pysam.TabixIterator.
+        __getitem__ could be a bonus, but prob unnecessary.
+    '''
+    def __init__(self, inFile, parser=pysam.asTuple()):
+        # because of odd Cython weirdness, we don't actually want to call
+        # super.__init__ here..
+        #super(TabixReader, self).__init__(inFile, parser=parser)
+        self.parser = parser
+    def __enter__(self):
+        return self
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+        return 0
+    def close(self):
+        super(TabixReader, self).close()
+    def chroms(self):
+        return self.contigs
+    def get(self, chrom=None, start=None, stop=None, region=None):
+        if start is not None:
+            start -= 1
+        return self.fetch(reference=chrom, start=start, end=stop,
+                          region=region, parser=self.parser)
+
+
+def get_pos_from_vcf_record(vcfrec):
+    # new versions of pysam return a zero-based position here
+    return vcfrec.pos + 1
+
+def bytes_to_string(o):
+    if type(o) == bytes:
+        o = o.decode('utf-8')
+    return o
+
+class VcfReader(TabixReader):
+    ''' Same as TabixReader with a few more perks for VCF files:
+        - emit results parsed as pysam VCF rows
+        - provide self.chrlens(), a dict mapping chrom name to chrom length
+        - provide self.samples(), a list of sample names in order of appearance
+        - provide get_range(c,start,stop) and get_snp_genos(c,pos)
+    '''
+    def __init__(self, inFile, ploidy=1, parser=pysam.asVCF()):
+        super(VcfReader, self).__init__(inFile, parser=parser)
+        assert ploidy in (1, 2)
+        self.ploidy = ploidy
+        self.clens = []
+        self.sample_names = None
+        for line in self.header:
+            line = bytes_to_string(line)
+            if line.startswith('##contig='):
+                line = line[13:-1]
+                c = line.split(',')[0]
+                clen = int(line.split('=')[1])
+                self.clens.append((c, clen))
+            elif line.startswith('#CHROM'):
+                row = line.split('\t')
+                self.sample_names = row[9:]
+        self.clens = dict(self.clens)
+        assert self.sample_names
+    def samples(self):
+        return self.sample_names
+    def chrlens(self):
+        return self.clens
+    def get_positions(self, c=None, start=None, stop=None, region=None):
+        for snp in self.get(c, start, stop, region):
+            yield (bytes_to_string(snp.contig), get_pos_from_vcf_record(snp), get_pos_from_vcf_record(snp)+len(snp.ref)-1)
+    def get_range(self, c=None, start=None, stop=None, region=None, as_strings=True, more=False):
+        ''' Read a VCF file (optionally just a piece of it) and return contents
+            as an iterator with parsed contents. Each row is returned as
+            a 4-tuple:
+                1: chr
+                2: pos (int)
+                3: list of allele strings (in order)
+                4: list of genotypes as 2-tuples:
+                    haploid: (sample, allele)
+                    diploid: (sample, [allele, allele])
+            If as_strings, the alleles will be actual alleles. Otherwise,
+            alleles will be integers.
+            If more is true, a fifth column will be emitted with the pysam VCF object.
+        '''
+        for snp in self.get(c, start, stop, region):
+            alleles = [bytes_to_string(snp.ref)] + bytes_to_string(snp.alt).split(',')
+            alleles = [a for a in alleles if a != '.']
+            if self.ploidy==1:
+                genos = [(self.sample_names[i], int(bytes_to_string(snp[i])[0]))
+                         for i in range(len(self.sample_names))
+                         if bytes_to_string(snp[i])[0] != '.']
+                if as_strings:
+                    genos = [(s, alleles[a]) for s, a in genos]
+            else:
+                genos = [(self.sample_names[i], [int(bytes_to_string(snp[i])[j*2]) for j in range(self.ploidy)])
+                         for i in range(len(self.sample_names))
+                         if bytes_to_string(snp[i])[0] != '.']
+                if as_strings:
+                    genos = [(s, [alleles[a] for a in g]) for s, g in genos]
+            if more:
+                yield (bytes_to_string(snp.contig), get_pos_from_vcf_record(snp), alleles, genos, snp)
+            else:
+                yield (bytes_to_string(snp.contig), get_pos_from_vcf_record(snp), alleles, genos)
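Reading genotypes back out of a bgzipped, tabix-indexed VCF then looks like this; a hedged sketch, with the filename and region hypothetical::

    from util.vcf import VcfReader

    with VcfReader('calls.vcf.gz') as vcf:  # hypothetical indexed file
        for chrom, pos, alleles, genos in vcf.get_range('chr1', 1000, 2000):
            for sample, allele in genos:  # haploid: (sample, allele) pairs
                print(chrom, pos, sample, allele)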
+    def get_snp_genos(self, c, p, as_strings=True):
+        ''' Read a single position from a VCF file and return the genotypes
+            as a sample -> allele map. If there is not exactly one matching
+            row in the VCF file at this position (if there are none or multiple)
+            then we return an empty map: {}.
+        '''
+        snps = [x for x in self.get_range(c, p, p, as_strings=as_strings)]
+        return len(snps)==1 and dict(snps[0][3]) or {}
+    def getFullSequences(self, c, start, stop, samples,
+                         na='-', refSeq=None, refInvariantsOnly=False, ignoreIndels=False):
+        ''' chr - chromosome name
+            start - start position
+            stop - default = start
+            samples is a list of samples. None can be used for the ref sample.
+            if refSeq is a string with length = stop-start+1, then use this as
+            the base sequence for missing data, otherwise, use the na string.
+            if refInvariantsOnly is True, refSeq is only used for invariant
+            sites or sites with no entries for any samples. if False, refSeq
+            is used for all missing data.
+        '''
+        assert 1<=start<=stop
+        assert len(na)==1
+
+        # get all the VCF records
+        vcf_records = [(p-start, alleles, dict(genos)) for chrom, p, alleles, genos
+                       in self.get_range(c, start, stop, as_strings=True)
+                       if not ignoreIndels or set(map(len, alleles)) == set([1])]
+
+        # Construct a list called "seq" into which we will replace alleles as
+        # we discover them. This is a list, not a string, because each position
+        # might be replaced with an arbitrary length string (deletions will be
+        # empty strings, insertions will be strings > length 1). This all gets
+        # converted to a string at the end.
+        if refSeq:
+            # assume refSeq alleles as a baseline for missing data
+            assert len(refSeq)==(stop-start+1)
+            seq = list(refSeq)
+            if refInvariantsOnly:
+                # but don't assume refSeq alleles for any known variant sites
+                for i, alleles, genos in vcf_records:
+                    if len(set(genos[s] for s in samples if s in genos))>1:
+                        for j in range(len(alleles[0])):
+                            if i+j < len(seq):
+                                seq[i+j] = na
+        else:
+            # assume nothing when data is missing
+            seq = list(na * (stop-start+1))
+
+        for sample in samples:
+            assert sample==None or sample in self.samples()
+
+            # Make a copy of the reference sequence
+            newseq = [s for s in seq]
+
+            # Replace alleles, one site at a time.
+            replaceAlleles(sample, newseq, vcf_records)
+
+            # Convert list back to string. This string may or may not be the same
+            # length as the original (if ignoreIndels is True, it must be the same
+            # length as the original).
+            newseq = ''.join(newseq)
+            assert len(newseq)==(stop-start+1) or not ignoreIndels
+            yield (sample, newseq)
+
+
+def replaceAlleles(sample, seq, vcf_records):
+    ''' Replace alleles, one site at a time. '''
+    for i, alleles, genos in vcf_records:
+
+        # set allele to the DNA sequence we will replace at positions i through i+len(refallele)-1
+        if sample==None:
+            # caller is asking for the reference sample's sequence
+            allele = alleles[0]
+            alleles = [allele]
+        else:
+            # caller is asking for a normal sample
+            samp_geno = genos.get(sample)
+            if not samp_geno:
+                continue
+            if isinstance(samp_geno, list):
+                log.warn("TO DO: add code to turn hets into IUPAC ambiguity codes (%s %s = %s)." % (i, sample, '/'.join(samp_geno)))
+                continue
+            allele = samp_geno
+
+        # replace portion of sequence with allele
+        # NEED BUGFIXES HERE for overlapping VCF records
+        if allele == None:
+            # nothing here ... is this even possible anymore?
+            pass
+        elif len(alleles[0])==1:
+            # the most common cases: SNP, novar, or indel replacing one ref base (with zero or more bases)
+            seq[i] = allele
+        else:
+            # most complicated case: something replacing multiple ref bases
+            # TODO: beware--this is all untested!
+            for j in range(max(len(alleles[0]), len(allele))):
+                if i+j < len(seq):
+                    if j < len(allele):
+                        if j == len(alleles[0])-1:
+                            # new allele is >= ref length, fill out the rest of the bases
+                            seq[i+j] = allele[j:]
+                        else:
+                            seq[i+j] = allele[j]
+                    else:
+                        # new allele is shorter than ref, so delete extra bases
+                        seq[i+j] = ''
+    return seq
diff --git a/cms/util/version.py b/cms/util/version.py
new file mode 100644
index 00000000..eb55cb66
--- /dev/null
+++ b/cms/util/version.py
@@ -0,0 +1,69 @@
+''' This gets the git version into python-land
+'''
+
+__author__ = "dpark@broadinstitute.org"
+__version__ = None
+
+import subprocess, os, os.path
+
+def get_project_path():
+    '''Return the absolute path of the top-level project, assumed to be the
+       parent of the directory containing this script.'''
+    # abspath converts relative to absolute path; expanduser interprets ~
+    path = __file__                  # path to this script
+    path = os.path.expanduser(path)  # interpret ~
+    path = os.path.abspath(path)     # convert to absolute path
+    path = os.path.dirname(path)     # containing directory: util
+    path = os.path.dirname(path)     # containing directory: main project dir
+    return path
+
+def call_git_describe():
+    cwd = os.getcwd()
+    try:
+        os.chdir(get_project_path())
+        cmd = ['git', 'describe', '--tags', '--always', '--dirty']
+        out = subprocess.check_output(cmd)
+        if type(out) != str:
+            out = out.decode('utf-8')
+        ver = out.strip()
+    except:
+        ver = None
+    os.chdir(cwd)
+    return ver
+
+def release_file():
+    return os.path.join(get_project_path(), 'VERSION')
+
+def read_release_version():
+    try:
+        with open(release_file(), 'rt') as inf:
+            version = inf.readlines()[0].strip()
+    except:
+        version = None
+    return version
+
+def write_release_version(version):
+    with open(release_file(), 'wt') as outf:
+        outf.write(version+'\n')
+
+def get_version():
+    global __version__
+    if __version__ is None:
+        from_git = call_git_describe()
+        from_file = read_release_version()
+
+        if from_git:
+            if from_file != from_git:
+                write_release_version(from_git)
+            __version__ = from_git
+        else:
+            __version__ = from_file
+
+        if __version__ is None:
+            raise ValueError("Cannot find the version number!")
+
+    return __version__
+
+
+if __name__ == "__main__":
+    print(get_version())
-release = '1.0'
+release = __version__
+version = '.'.join(release.split('.')[:2])
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -76,7 +91,7 @@
 # non-false value, then it is used:
 #today = ''
 # Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
+today_fmt = '%B %d, %Y'
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
@@ -114,7 +129,12 @@
 
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
-html_theme = 'alabaster'
+#html_theme = 'default'
+import sphinx_rtd_theme
+
+html_theme = "sphinx_rtd_theme"
+
+html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
 
 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the
@@ -152,7 +172,7 @@
 # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
 # using the given strftime format.
-#html_last_updated_fmt = '%b %d, %Y'
+html_last_updated_fmt = '%Y-%m-%d. {}'.format(release)
 
 # If true, SmartyPants will be used to convert quotes and dashes to
 # typographically correct entities.
diff --git a/docs/install.rst b/docs/install.rst
index bad8afaf..de94dd15 100644
--- a/docs/install.rst
+++ b/docs/install.rst
@@ -35,4 +35,4 @@
 most of the bioinformatic tools we rely on here. They will auto-download
 and install the first time they are needed by any command. If you want to
 pre-install all of the external tools, simply type this::
-   python -m unittest test.test_tools.TestToolsInstallation -v
+   python -m unittest test.unit.test_tools.TestToolsInstallation -v
diff --git a/requirements.txt b/requirements.txt
index 95311c9f..1008ad0f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 biopython==1.65
-numpy==1.8.0rc1
+numpy==1.8.0
 pandas==0.16.0
 paramiko==1.15.2
 crypto==1.3.4
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 00000000..2e40912e
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,23 @@
+try:
+    from setuptools import setup, find_packages
+except ImportError:
+    # distutils.core provides no find_packages, so stub one out for the fallback
+    from distutils.core import setup
+    find_packages = lambda exclude=[]: ['cms']
+
+EXCLUDE_FROM_PACKAGES = ['old','cms.test*']
+
+config = {
+    'description': 'Composite of Multiple Signals: tests for selection in meiotically recombinant populations',
+    'author': 'Broad Institute',
+    'url': 'https://github.com/broadinstitute/cms',
+    'download_url': 'https://github.com/broadinstitute/cms',
+    'author_email': '',
+    'version': '0.1',
+    'install_requires': ['nose','biopython','pysam'],
+    'packages': find_packages(exclude=EXCLUDE_FROM_PACKAGES),
+    'scripts': ['cms/selection.py','cms/main.py'],
+    'name': 'python-cms'
+}
+
+setup(**config)
diff --git a/travis/before_install.sh b/travis/before_install.sh
new file mode 100755
index 00000000..e69de29b
diff --git a/travis/install-pip.sh b/travis/install-pip.sh
new file mode 100755
index 00000000..39f2d25b
--- /dev/null
+++ b/travis/install-pip.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+set -e
+
+echo "pip installing required python packages"
+pip install -r requirements.txt
+
+#PYVER=`python -V 2>&1 | cut -c 8`
+# PYVER=`echo $TRAVIS_PYTHON_VERSION | cut -c 1`
+# if [ "$PYVER" = "3" ]; then
+#     echo "pip installing snakemake packages (py3 only)"
+#     pip install -r requirements-pipes.txt
+# fi
+
+echo "pip installing coveralls"
+pip install -q coveralls nose-cov
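The MOCK_MODULES stanza added to docs/conf.py (above) leans on the fact that Python consults sys.modules before running the import machinery, so autodoc can import project modules whose heavyweight dependencies are absent on the docs builder. A self-contained sketch of the trick, using a hypothetical package name:

    import sys
    import mock   # the same third-party package conf.py imports

    sys.modules['heavyweight_dep'] = mock.Mock()   # hypothetical, not installed

    import heavyweight_dep          # succeeds: the Mock is found in sys.modules
    heavyweight_dep.anything(1, 2)  # any attribute access or call returns another Mock

The limitation is that anything the documented code does with these fake modules at import time (subclassing, version comparisons) will see Mocks rather than real objects.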
diff --git a/travis/install-tools.sh b/travis/install-tools.sh
new file mode 100755
index 00000000..e69de29b
diff --git a/travis/tests-long.sh b/travis/tests-long.sh
new file mode 100755
index 00000000..1ccf0c10
--- /dev/null
+++ b/travis/tests-long.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+set -e
+
+echo "TRAVIS_BRANCH: $TRAVIS_BRANCH"
+echo "TRAVIS_PULL_REQUEST: $TRAVIS_PULL_REQUEST"
+
+# quote the variables so the test does not break when they are empty or unset
+if [ "$TRAVIS_PULL_REQUEST" != "false" -o "$TRAVIS_BRANCH" = "master" -o -n "$TRAVIS_TAG" ]; then
+    echo "This is master, a pull request, or a tagged release: executing long running tests..."
+    nosetests -v --with-xunit --with-coverage --nocapture \
+        --cover-inclusive --cover-branches --cover-tests \
+        --cover-package main \
+        -w cms/test/integration/
+else
+    echo "This is not master, a pull request, or a tagged release: skipping long running tests."
+fi
diff --git a/travis/tests-unit.sh b/travis/tests-unit.sh
new file mode 100755
index 00000000..8843d0f6
--- /dev/null
+++ b/travis/tests-unit.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+set -e
+
+nosetests -v --with-xunit --with-coverage --nocapture \
+    --cover-inclusive --cover-branches --cover-tests \
+    --cover-package selection \
+    -w cms/test/unit/
\ No newline at end of file
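tests-unit.sh delegates discovery to nose, which collects files, classes, and functions under cms/test/unit/ whose names match its testMatch pattern (roughly: 'test' or 'Test' at a word boundary). A minimal sketch of a module it would pick up; the file name is hypothetical, and it assumes the stub selection.Selection().main() returns 0:

    # cms/test/unit/test_smoke.py (hypothetical)
    import unittest
    import selection

    class TestSmoke(unittest.TestCase):
        def test_main_exits_cleanly(self):
            # exercising the covered package is what makes it appear in
            # the --cover-package selection report
            self.assertEqual(selection.Selection().main(), 0)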