diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 00000000..d476e2ef
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,14 @@
+# .coveragerc to control test coverage reporting
+[report]
+
+exclude_lines =
+    pragma: no cover
+
+    raise AssertionError
+    raise NotImplementedError
+
+    def __repr__
+    if self\.debug
+
+    if 0:
+    if __name__ == .__main__.:
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 00000000..ac9fb640
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,29 @@
+language: python
+sudo: false
+
+cache:
+  directories:
+    - $HOME/virtualenv
+
+env:
+  global:
+    - CACHE_DIR="$HOME/virtualenv"
+    - PYTHONIOENCODING=UTF8
+
+python:
+  - 2.7
+  - 3.4
+
+before_install:
+  - travis/before_install.sh
+
+install:
+  - travis/install-pip.sh
+  - travis/install-tools.sh
+
+script:
+  - travis/tests-unit.sh
+  - travis/tests-long.sh
+
+after_success:
+  - coveralls
diff --git a/README.md b/README.md
index a791f2fb..ad086b35 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,8 @@
 **Composite of Multiple Signals**: tests for selection in meiotically recombinant populations
 
+[![Travis CI Status](https://api.travis-ci.org/broadinstitute/cms.svg)](https://travis-ci.org/broadinstitute/cms)
+[![Coverage Status](https://coveralls.io/repos/broadinstitute/cms/badge.svg)](https://coveralls.io/r/broadinstitute/cms)
 [![Documentation Status](https://readthedocs.org/projects/broad-cms/badge/?version=latest)](https://readthedocs.org/projects/broad-cms/?badge=latest)
 
 More detailed documentation can be found at http://broad-cms.readthedocs.org/
diff --git a/cms/VERSION b/cms/VERSION
new file mode 100644
index 00000000..f5fc016e
--- /dev/null
+++ b/cms/VERSION
@@ -0,0 +1 @@
+7bc1b96-dirty
diff --git a/cms/cms_exceptions.py b/cms/cms_exceptions.py
new file mode 100644
index 00000000..3622362b
--- /dev/null
+++ b/cms/cms_exceptions.py
@@ -0,0 +1,6 @@
+class CMSException(Exception):
+    '''Base class for CMS exceptions. Except for built-ins, all CMS exceptions
+    should descend from this one so that an except block for it catches all others.'''
+    pass
+
+class CMSInputError(CMSException):
+    pass
diff --git a/cms/main.py b/cms/main.py
new file mode 100644
index 00000000..ef4726d9
--- /dev/null
+++ b/cms/main.py
@@ -0,0 +1,12 @@
+#!/usr/bin/python
+
+class CMS(object):
+    def __init__(self):
+        pass
+
+    def main(self):
+        return 0
+
+if __name__ == "__main__":
+    cms = CMS()
+    cms.main()
\ No newline at end of file
diff --git a/cms/selection.py b/cms/selection.py
new file mode 100644
index 00000000..54ab0c23
--- /dev/null
+++ b/cms/selection.py
@@ -0,0 +1,8 @@
+#!/usr/bin/python
+
+class Selection(object):
+    def __init__(self):
+        pass
+
+    def main(self):
+        return 0
diff --git a/cms/test/__init__.py b/cms/test/__init__.py
new file mode 100644
index 00000000..6fb3c5dd
--- /dev/null
+++ b/cms/test/__init__.py
@@ -0,0 +1,35 @@
+'''utilities for tests'''
+
+__author__ = "irwin@broadinstitute.org"
+
+import filecmp, os, unittest
+
+import util.file
+
+def assert_equal_contents(testCase, filename1, filename2):
+    'Assert that the contents of two files are equal, for a unittest.TestCase'
+    testCase.assertTrue(filecmp.cmp(filename1, filename2, shallow=False))
+
+class TestCaseWithTmp(unittest.TestCase):
+    'Base class for tests that use tempDir'
+    def setUp(self):
+        util.file.set_tmpDir(type(self).__name__)
+
+    def tearDown(self):
+        util.file.destroy_tmpDir()
+
+    def assertEqualContents(self, f1, f2):
+        assert_equal_contents(self, f1, f2)
+
+
+"""
+When "nose" executes python scripts for automated testing, it excludes ones
+with the executable bit set (in case they aren't import safe). To prevent any
+of the tests in this folder from being silently excluded, assure this bit is
+not set.
+"""
+def assert_none_executable():
+    testDir = os.path.dirname(__file__)
+    assert all(not os.access(os.path.join(testDir, filename), os.X_OK)
+               for filename in os.listdir(testDir)
+               if filename.endswith('.py'))
+assert_none_executable()
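TestCaseWithTmp gives every test class its own scratch directory (created in setUp, destroyed in tearDown) plus a file-comparison assertion. A minimal sketch of a test that leans on it (the file contents are illustrative only)::

    import util.file
    from test import TestCaseWithTmp

    class TestExample(TestCaseWithTmp):
        def test_identical_files_compare_equal(self):
            # both temp files land in this test's private tmpDir
            f1 = util.file.mkstempfname(suffix='.txt')
            f2 = util.file.mkstempfname(suffix='.txt')
            for fn in (f1, f2):
                with open(fn, 'wt') as f:
                    f.write('hello\n')
            self.assertEqualContents(f1, f2)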
+""" +def assert_none_executable() : + testDir = os.path.dirname(__file__) + assert all(not os.access(os.path.join(testDir, filename), os.X_OK) + for filename in os.listdir(testDir) + if filename.endswith('.py')) +assert_none_executable() diff --git a/util/__init__.py b/cms/test/integration/__init__.py similarity index 100% rename from util/__init__.py rename to cms/test/integration/__init__.py diff --git a/cms/test/integration/test_main.py b/cms/test/integration/test_main.py new file mode 100644 index 00000000..bbd570a9 --- /dev/null +++ b/cms/test/integration/test_main.py @@ -0,0 +1,18 @@ +#!/usr/bin/python + +import unittest +import main + +class TestCMS(unittest.TestCase): + # to be executed prior to running tests + def setUp(self): + self.cms = main.CMS() + pass + + # to be executed after running tests, regardless of pass/fail + # only called if setUp succeeds + def tearDown(self): + pass + + def test_main_return_code(self): + self.assertEqual( self.cms.main(), 0 ) diff --git a/cms/test/unit/__init__.py b/cms/test/unit/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/cms/test/unit/test_selection.py b/cms/test/unit/test_selection.py new file mode 100644 index 00000000..e30d8b2c --- /dev/null +++ b/cms/test/unit/test_selection.py @@ -0,0 +1,18 @@ +#!/usr/bin/python + +import unittest +from selection import Selection + +class TestSelection(unittest.TestCase): + # to be executed prior to running tests + def setUp(self): + self.cms_selection = Selection() + pass + + # to be executed after running tests, regardless of pass/fail + # only called if setUp succeeds + def tearDown(self): + pass + + def test_selection_return_code(self): + self.assertEqual( self.cms_selection.main(), 0 ) diff --git a/cms/test/unit/test_tools.py b/cms/test/unit/test_tools.py new file mode 100644 index 00000000..0dcc2452 --- /dev/null +++ b/cms/test/unit/test_tools.py @@ -0,0 +1,33 @@ +# Unit tests for tools/__init__.py + +__author__ = "dpark@broadinstitute.org" + +import tools +from tools import * +import unittest, tempfile, shutil, os, logging +import util.cmd, util.file +from test import TestCaseWithTmp + +log = logging.getLogger(__name__) + +class TestToolsInstallation(TestCaseWithTmp): + def setUp(self): + super(TestToolsInstallation, self).setUp() + util.cmd.setup_logger('INFO') + def testAllToolInstallers(self): + def iter_leaf_subclasses(aClass) : + "Iterate over subclasses at all levels that don't themselves have a subclass" + isLeaf = True + for subclass in aClass.__subclasses__() : + isLeaf = False + for leafClass in iter_leaf_subclasses(subclass) : + yield leafClass + #if isLeaf : + # yield aClass + '''Load every tool's default chain of install methods and try them.''' + for tool_class in iter_leaf_subclasses(tools.Tool): + t = tool_class() + t.install() + self.assertTrue(t.is_installed(), "installation of tool %s failed" % tool_class.__name__) + log.info(".. 
installation of %s succeeded with installer %s" % (tool_class.__name__, t.installed_method.__class__.__name__)) + diff --git a/cms/tools/__init__.py b/cms/tools/__init__.py new file mode 100644 index 00000000..7fbcc04e --- /dev/null +++ b/cms/tools/__init__.py @@ -0,0 +1,222 @@ +'''class Tool, class InstallMethod, and related subclasses and methods''' + +__author__ = "dpark@broadinstitute.org,irwin@broadinstitute.org" + +import os, logging, tempfile, shutil +import util.file + +try: + # Python 3.x + from urllib.request import urlretrieve + from urllib.parse import urlparse +except ImportError: + # Python 2.x + from urllib import urlretrieve + from urlparse import urlparse + +# Put all tool files in __all__ +# allows "from tools import *" to import all tooles for testtools +__all__ = [filename[:-3] # Remove .py + for filename in os.listdir(os.path.dirname(__file__)) # tools directory + if filename.endswith('.py') and filename != '__init__.py' and + filename not in [ # Add any files to exclude here: + # e.g. 'sometool.py', + ] + ] +installed_tools = {} + +log = logging.getLogger(__name__) + +def get_tool_by_name(name): + if name not in installed_tools: + pass + raise NotImplementedError + return installed_tools[name] + +class Tool(object): + ''' Base tool class that includes install machinery. + + TO DO: add something about dependencies.. + ''' + def __init__(self, install_methods=[]): + self.install_methods = install_methods + self.installed_method = None + self.exec_path = None + def is_installed(self): + return (self.installed_method != None) + def install(self): + if not self.is_installed(): + for m in self.install_methods: + if not m.is_attempted(): + m.attempt_install() + if m.is_installed(): + self.installed_method = m + self.exec_path = m.executable_path() + installed_tools[self.__class__.__name__] = self + break + def get_install_methods(self): + return self.install_methods + def set_install_methods(self, methods): + self.install_methods = methods + def version(self): + return None + def executable_path(self): + return self.exec_path + def execute(self, args): + assert not os.system(self.exec_path + ' ' + args) + def install_and_get_path(self) : + self.install() + if self.executable_path()==None: + raise NameError("unsuccessful in installing " + type(self).__name__) + return self.executable_path() + +class InstallMethod(object): + ''' Base class for installation methods for a given tool. + None of these methods should ever fail/error. attempt_install should + return silently regardless of the outcome (is_installed must be + called to verify success or failure). + ''' + def __init__(self): + self.attempts = 0 + def is_attempted(self): + return self.attempts + def attempt_install(self): # Override _attempt_install, not this. + self.attempts += 1 + self._attempt_install() + def _attempt_install(self): + raise NotImplementedError + def is_installed(self): + raise NotImplementedError + def executable_path(self): + raise NotImplementedError + +class PrexistingUnixCommand(InstallMethod): + ''' This is an install method that tries to find whether an executable + binary already exists for free on the unix file system--it doesn't + actually try to install anything. 
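A concrete tool plugs into this machinery by subclassing Tool with an ordered list of install methods; install() walks the list and keeps the first method that verifies. A hypothetical sketch using PrexistingUnixCommand (defined just below); the samtools binary and path are illustrative, not part of this changeset::

    import tools

    class SamtoolsTool(tools.Tool):
        def __init__(self, install_methods=None):
            if install_methods is None:
                # look for a system binary before trying anything heavier
                install_methods = [tools.PrexistingUnixCommand('/usr/bin/samtools')]
            tools.Tool.__init__(self, install_methods)

    samtools = SamtoolsTool()
    path = samtools.install_and_get_path()  # raises NameError if no method verified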
+class PrexistingUnixCommand(InstallMethod):
+    ''' This install method tries to find an executable binary that already
+        exists on the unix file system--it doesn't actually try to install
+        anything.
+    '''
+    def __init__(self, path, verifycmd=None, verifycode=0,
+                 require_executability=True):
+        self.path = path
+        self.verifycmd = verifycmd
+        self.verifycode = verifycode
+        self.installed = False
+        self.require_executability = require_executability
+        InstallMethod.__init__(self)
+    def _attempt_install(self):
+        if os.access(self.path, (os.X_OK | os.R_OK) if
+                     self.require_executability else os.R_OK):
+            if self.verifycmd:
+                self.installed = (os.system(self.verifycmd) == self.verifycode)
+            else:
+                self.installed = True
+        else:
+            self.installed = False
+    def is_installed(self):
+        if not self.is_attempted():
+            self.attempt_install()
+        return self.installed
+    def executable_path(self):
+        return self.installed and self.path or None
+
+class DownloadPackage(InstallMethod):
+    ''' An install method that downloads, unpacks, and post-processes a
+        package straight from the source.
+        target_rel_path is the executable's path relative to destination_dir.
+        destination_dir defaults to the project build directory.
+        post_download_command, if not None, is executed in destination_dir.
+        If post_download_ret is not None, assert that post_download_command
+        returns it.
+    '''
+    def __init__(self, url, target_rel_path, destination_dir=None,
+                 verifycmd=None, verifycode=0, require_executability=True,
+                 post_download_command=None, post_download_ret=0):
+        if destination_dir is None:
+            destination_dir = util.file.get_build_path()
+        self.url = url
+        self.targetpath = os.path.join(destination_dir, target_rel_path)
+        self.destination_dir = destination_dir
+        self.verifycmd = verifycmd
+        self.verifycode = verifycode
+        self.installed = False
+        self.require_executability = require_executability
+        self.post_download_command = post_download_command
+        self.post_download_ret = post_download_ret
+        InstallMethod.__init__(self)
+    def is_installed(self):
+        return self.installed
+    def executable_path(self):
+        return self.installed and self.targetpath or None
+    def verify_install(self):
+        if os.access(self.targetpath, (os.X_OK | os.R_OK) if
+                     self.require_executability else os.R_OK):
+            if self.verifycmd:
+                log.debug("validating")
+                self.installed = (os.system(self.verifycmd) == self.verifycode)
+            else:
+                self.installed = True
+        else:
+            self.installed = False
+        return self.installed
+    def _attempt_install(self):
+        if not self.verify_install():
+            self.pre_download()
+            self.download()
+            self.post_download()
+            self.verify_install()
+    def pre_download(self):
+        pass
+    def download(self):
+        download_dir = tempfile.gettempdir()
+        util.file.mkdir_p(download_dir)
+        filepath = urlparse(self.url).path
+        filename = filepath.split('/')[-1]
+        log.info("Downloading from {} to {}/{} ...".format(self.url,
+                                                           download_dir,
+                                                           filename))
+        urlretrieve(self.url, "%s/%s" % (download_dir, filename))
+        self.download_file = filename
+        self.unpack(download_dir)
+    def post_download(self):
+        if self.post_download_command:
+            return_code = os.system('cd "{}" && {}'.format(
+                self.destination_dir, self.post_download_command))
+            if self.post_download_ret is not None:
+                assert return_code == self.post_download_ret
+    def unpack(self, download_dir):
+        log.debug("unpacking")
+        util.file.mkdir_p(self.destination_dir)
+        if self.download_file.endswith('.zip'):
+            if os.system("unzip -o %s/%s -d %s > /dev/null" % (download_dir,
+                         self.download_file, self.destination_dir)):
+                return
+            else:
+                os.unlink(os.path.join(download_dir, self.download_file))
+        elif (self.download_file.endswith('.tar.gz') or
+              self.download_file.endswith('.tgz') or
+              self.download_file.endswith('.tar.bz2') or
+              self.download_file.endswith('.tar')):
+            if self.download_file.endswith('.tar'):
+                compression_option = ''
+            elif self.download_file.endswith('.tar.bz2'):
+                compression_option = 'j'
+            else:
+                compression_option = 'z'
+            untar_cmd = "tar -C {} -x{}pf {}/{}".format(self.destination_dir,
+                                                        compression_option,
+                                                        download_dir,
+                                                        self.download_file)
+            log.debug("Untarring with command: {}".format(untar_cmd))
+            exitCode = os.system(untar_cmd)
+            if exitCode:
+                log.info("tar returned non-zero exit code {}".format(exitCode))
+                return
+            else:
+                log.debug("tar returned with exit code 0")
+                os.unlink(os.path.join(download_dir, self.download_file))
+        else:
+            shutil.move(os.path.join(download_dir, self.download_file),
+                        os.path.join(self.destination_dir, self.download_file))
diff --git a/cms/util/__init__.py b/cms/util/__init__.py
new file mode 100644
index 00000000..f5d73ac7
--- /dev/null
+++ b/cms/util/__init__.py
@@ -0,0 +1,10 @@
+import os
+
+# allows for "from util import *"
+__all__ = [filename[:-3]  # Remove .py extension
+           for filename in os.listdir(os.path.dirname(__file__))
+           if filename.endswith('.py') and filename != '__init__.py' and
+           filename not in [  # Add any files to exclude here:
+                              # e.g. 'sometool.py',
+           ]
+          ]
\ No newline at end of file
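In the same spirit, DownloadPackage usually serves as the fallback install method after PrexistingUnixCommand. A hedged sketch; the URL, archive layout, and build command are hypothetical::

    from tools import DownloadPackage

    method = DownloadPackage(
        'https://example.org/sometool-1.0.tar.gz',  # hypothetical URL
        'sometool-1.0/sometool',                    # executable path inside the unpacked tree
        verifycmd='sometool --version > /dev/null 2>&1',
        post_download_command='cd sometool-1.0 && make')
    method.attempt_install()  # never raises; poll is_installed() instead
    if method.is_installed():
        print(method.executable_path())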
diff --git a/cms/util/annot.py b/cms/util/annot.py
new file mode 100644
index 00000000..b58dc4d7
--- /dev/null
+++ b/cms/util/annot.py
@@ -0,0 +1,161 @@
+'''A few methods for dealing with gene annotation from snpEff.'''
+
+__author__ = "dpark@broadinstitute.org"
+__version__ = "PLACEHOLDER"
+__date__ = "PLACEHOLDER"
+
+import sqlite3, itertools, urllib, logging, re, os
+import util.file, util.misc
+
+log = logging.getLogger(__name__)
+
+class SnpAnnotater:
+    ''' Add annotations to snps based on a snpEff-annotated VCF file.
+    '''
+    def __init__(self, snpEffVcf=None, snpIterator=None):
+        self.snpIterator = snpIterator
+        self.dbFile = util.file.mkstempfname(prefix='SnpAnnotater-', suffix='.db')
+        self.conn = sqlite3.connect(self.dbFile, isolation_level='DEFERRED')
+        self.cur = self.conn.cursor()
+        self.cur.execute("""create table annot (
+            chr not null,
+            pos integer not null,
+            allele_ref not null,
+            allele_alt not null,
+            effect not null,
+            impact not null,
+            gene_id,
+            gene_name,
+            protein_pos integer,
+            residue_ref,
+            residue_alt
+        )""")
+        self.cur.execute("create index idx_annot on annot(chr,pos)")
+        if snpEffVcf:
+            self.loadVcf(snpEffVcf)
+    def loadVcf(self, snpEffVcf):
+        #log.info("reading in snpEff VCF file: %s" % snpEffVcf)
+        with util.file.open_or_gzopen(snpEffVcf, 'rt') as inf:
+            ffp = util.file.FlatFileParser(inf)
+            try:
+                imap = hasattr(itertools, 'imap') and itertools.imap or map  # py2 & py3 compatibility
+                ifilter = hasattr(itertools, 'ifilter') and itertools.ifilter or filter  # py2 & py3 compatibility
+                self.cur.executemany("""insert into annot (chr,pos,allele_ref,allele_alt,
+                    effect,impact,gene_id,gene_name,protein_pos,residue_ref,residue_alt)
+                    values (?,?,?,?,?,?,?,?,?,?,?)""",
+                    imap(lambda row:
+                        [row['CHROM'], int(row['POS']), row['REF'], row['ALT']]
+                        + parse_eff(row['CHROM'], row['POS'], row['INFO']),
+                        ifilter(lambda r: r['ALT'] != '.', ffp)))
+            except Exception:
+                log.exception("exception processing file %s line %s" % (snpEffVcf, ffp.line_num))
+                raise
+        self.cur.execute("select chr,pos from annot group by chr,pos having count(*)>1")
+        dupes = [(c,p) for c,p in self.cur]
+        if dupes:
+            log.info("deleting annotation for %d duplicate positions: %s" % (len(dupes), ', '.join(['%s:%s'%(c,p) for c,p in dupes])))
+            self.cur.executemany("delete from annot where chr=? and pos=?", dupes)
+        self.conn.commit()
+    def __iter__(self):
+        assert self.snpIterator
+        for snp in self.snpIterator:
+            yield self.annotate(snp)
+    def annotate(self, row):
+        self.cur.execute("""select effect,impact,gene_id,gene_name,protein_pos,
+            allele_ref,allele_alt,residue_ref,residue_alt
+            from annot where chr=? and pos=?""", [row['chr'], int(row['pos'])])
+        x = self.cur.fetchone()
+        if x is not None:
+            row['effect'],row['impact'],row['gene_id'],row['gene_name'],row['protein_pos'],row['allele_ref'],row['allele_alt'],row['residue_ref'],row['residue_alt'] = x
+            row['alleles'] = '/'.join((row['allele_ref'], row['allele_alt']))
+            if row['residue_alt']:
+                row['residues'] = '/'.join((row['residue_ref'], row['residue_alt']))
+            else:
+                row['residues'] = row['residue_ref']
+        else:
+            row['effect'] = 'UNKNOWN'
+            row['impact'] = 'UNKNOWN'
+        return row
+    def new_fields(self):
+        return ('effect', 'impact', 'gene_id', 'gene_name', 'protein_pos', 'alleles', 'residues')
+    def __enter__(self):
+        return self
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+        return 0
+    def close(self):
+        self.cur.close()
+        self.conn.close()
+        os.unlink(self.dbFile)
+
+
+def parse_eff(chrom, pos, info, required=True):
+    try:
+        impact_rank = {'HIGH':0, 'MODERATE':1, 'LOW':2, 'MODIFIER':3}
+        infos = [x for x in info.split(';') if x.startswith('EFF=')]
+        assert len(infos)<=1
+        if not infos:
+            assert not required
+            return ['', '', '', '', '', '', '']
+        info = infos[0]
+        effs = info[4:].split(',')
+        out = []
+        for eff in effs:
+            assert eff.endswith(')')
+            effect, other = eff[:-1].split('(')
+            other = other.split('|')
+            assert len(other)>=10
+            impact = other[0]
+            gene_id = other[8]
+            assert not gene_id or (gene_id.endswith('-1') and gene_id.startswith('rna_'))
+            if gene_id:
+                gene_id = gene_id[4:-2]
+            if gene_id=='PF14_0620' or gene_id=='PF3D7_1465300':
+                gene_name = 'tRNA 3-trailer sequence RNase, putative'
+            else:
+                try:
+                    gene_name = urllib.unquote_plus(other[5]).encode('ascii')
+                except UnicodeDecodeError:
+                    log.error("error at %s:%s decoding the string '%s'" % (chrom, pos, other[5]))
+                    raise
+            aa_chg = other[3]
+            if aa_chg:
+                if effect.startswith('SYNON'):
+                    aas = (aa_chg[0], '')
+                    codon = int(aa_chg[1:])
+                elif effect.startswith('NON_SYNON') or effect.startswith('START_') or effect.startswith('STOP_') or effect.startswith('CODON_'):
+                    mo = re.match(r'^(\D+)(\d+)(\D+)$', aa_chg)
+                    assert mo, "unrecognized coding change format for %s (%s)" % (effect, aa_chg)
+                    aas = (mo.group(1), mo.group(3))
+                    codon = int(mo.group(2))
+                elif effect=='FRAME_SHIFT':
+                    mo = re.match(r'^(\D*)(\d+)(\D*)$', aa_chg)
+                    assert mo, "unrecognized coding change format for %s (%s)" % (effect, aa_chg)
+                    aas = ('', '')
+                    codon = int(mo.group(2))
+                else:
+                    assert 0, "unrecognized effect type (%s) for variant with coding change (%s)" % (effect, aa_chg)
+            else:
+                aas = ('', '')
+                codon = ''
+            out.append([impact_rank[impact], effect, impact, gene_id, gene_name,
+                        codon, aas[0], aas[1]])
+
+        if len(out)>1:
+            out.sort()
+            if out[0][0] == out[1][0]:
+                #log.debug("SNP found with multiple effects of the same impact level: %s:%s - %s" % (chrom, pos, info))
+                #assert out[0][2] in ('MODIFIER','LOW'), "Error: SNP found with multiple effects of the same impact level"
+                out = [[';'.join(util.misc.unique([str(o[i]) for o in out])) for i in range(len(out[0]))]]
+        eff = out[0][1:]
+        return eff
+
+    except Exception:
+        log.exception("exception parsing snpEff on row %s:%s - %s" % (chrom, pos, info))
+        raise
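End to end, SnpAnnotater is meant to wrap an iterator of row dicts (each carrying 'chr' and 'pos') and decorate every row from the sqlite table built out of the snpEff VCF. A minimal sketch; the filename and input rows are hypothetical::

    import util.annot

    rows = [{'chr': 'Pf3D7_01', 'pos': '12345'}]  # hypothetical input
    with util.annot.SnpAnnotater(snpEffVcf='snps.eff.vcf.gz',  # hypothetical file
                                 snpIterator=iter(rows)) as annotater:
        for row in annotater:
            print(row['effect'], row['impact'])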
diff --git a/cms/util/cmd.py b/cms/util/cmd.py
new file mode 100644
index 00000000..11a6b1ff
--- /dev/null
+++ b/cms/util/cmd.py
@@ -0,0 +1,161 @@
+'''This gives a main() function that serves as a nice wrapper
+around other commands and presents the ability to serve up multiple
+command-line functions from a single python script.
+'''
+
+import os, os.path, tempfile, sys, shutil, logging, argparse
+import util.version
+
+__author__ = "dpark@broadinstitute.org"
+__version__ = util.version.get_version()
+
+log = logging.getLogger()
+tmpDir = None
+
+def setup_logger(log_level):
+    loglevel = getattr(logging, log_level.upper(), None)
+    assert loglevel, "unrecognized log level: %s" % log_level
+    log.setLevel(loglevel)
+    h = logging.StreamHandler()
+    h.setFormatter(logging.Formatter("%(asctime)s - %(module)s:%(lineno)d:%(funcName)s - %(levelname)s - %(message)s"))
+    log.addHandler(h)
+
+def script_name():
+    return os.path.basename(sys.argv[0]).rsplit('.', 1)[0]
+
+def common_args(parser, arglist=(('tmpDir', None), ('loglevel', None))):
+    for k, v in arglist:
+        if k=='loglevel':
+            if not v:
+                v = 'DEBUG'
+            parser.add_argument("--loglevel", dest="loglevel",
+                help="Verboseness of output. [default: %(default)s]",
+                default=v,
+                choices=('DEBUG','INFO','WARNING','ERROR','CRITICAL','EXCEPTION'))
+        elif k=='tmpDir':
+            if not v:
+                v = find_tmpDir()
+            parser.add_argument("--tmpDir", dest="tmpDir",
+                help="Base directory for temp files. [default: %(default)s]",
+                default=v)
+            parser.add_argument("--tmpDirKeep",
+                action="store_true", dest="tmpDirKeep",
+                help="""Keep the tmpDir if an exception occurs while
+                    running. Default is to delete all temp files at
+                    the end, even if there's a failure.""",
+                default=False)
+        elif k=='version':
+            if not v:
+                v = __version__
+            parser.add_argument('--version', '-V', action='version', version=v)
+        else:
+            raise Exception("unrecognized argument %s" % k)
+    return parser
+
+def main_command(mainfunc):
+    ''' This wraps a python method in another method that can be called
+        with an argparse.Namespace object. When called, it will pass all
+        the values of the object on as parameters to the function call.
+    '''
+    def _main(args):
+        args2 = dict((k, v) for k, v in vars(args).items() if k not in ('loglevel', 'tmpDir', 'tmpDirKeep', 'version', 'func_main', 'command'))
+        mainfunc(**args2)
+    _main.__doc__ = mainfunc.__doc__
+    return _main
+
+def attach_main(parser, cmd_main, split_args=False):
+    ''' This attaches the main function call to a parser object.
+    '''
+    if split_args:
+        cmd_main = main_command(cmd_main)
+    parser.description = cmd_main.__doc__
+    parser.set_defaults(func_main=cmd_main)
+    return parser
+
+def make_parser(commands, description):
+    ''' commands: a list of pairs containing the following:
+            1. name of command (string, no whitespace)
+            2. method to call (no arguments) that returns an argparse parser.
+        If commands contains exactly one member and the name of the
+        only command is None, then we get rid of the whole multi-command
+        thing and just present the options for that one function.
+        description: a long string to present as a description of your script
+            as a whole, if the script is run with no arguments
+    '''
+    if len(commands)==1 and commands[0][0]==None:
+        # only one (nameless) command in this script, simplify
+        parser = commands[0][1]()
+        parser.set_defaults(command='')
+    else:
+        # multiple commands available
+        parser = argparse.ArgumentParser(description=description,
+            usage='%(prog)s subcommand', add_help=False)
+        parser.add_argument('--help', '-h', action='help', help=argparse.SUPPRESS)
+        parser.add_argument('--version', '-V', action='version', version=__version__, help=argparse.SUPPRESS)
+        subparsers = parser.add_subparsers(title='subcommands', dest='command')
+        for cmd_name, cmd_parser in commands:
+            p = subparsers.add_parser(cmd_name)
+            cmd_parser(p)
+    return parser
+
+def main_argparse(commands, description):
+    parser = make_parser(commands, description)
+
+    # if called with no arguments, print help
+    if len(sys.argv)==1:
+        parser.parse_args(['--help'])
+    elif len(sys.argv)==2 and (len(commands)>1 or commands[0][0]!=None):
+        parser.parse_args([sys.argv[1], '--help'])
+    args = parser.parse_args()
+
+    setup_logger(not hasattr(args, 'loglevel') and 'DEBUG' or args.loglevel)
+    log.info("software version: %s, python version: %s" % (__version__, sys.version))
+    log.info("command: %s %s %s" % (
+        sys.argv[0], sys.argv[1],
+        ' '.join(["%s=%s" % (k, v) for k, v in vars(args).items() if k not in ('command', 'func_main')])))
+
+    if hasattr(args, 'tmpDir'):
+        ''' If this command has a tmpDir option, use that as a base directory
+            and create a subdirectory within it which we will then destroy at
+            the end of execution.
+        '''
+        proposed_dir = 'tmp-%s-%s' % (script_name(), args.command)
+        if 'LSB_JOBID' in os.environ:
+            proposed_dir = 'tmp-%s-%s-%s-%s' % (script_name(), args.command, os.environ['LSB_JOBID'], os.environ['LSB_JOBINDEX'])
+        tempfile.tempdir = tempfile.mkdtemp(prefix='%s-' % proposed_dir, dir=args.tmpDir)
+        log.debug("using tempDir: %s" % tempfile.tempdir)
+        os.environ['TMPDIR'] = tempfile.tempdir  # this is for running R
+        try:
+            ret = args.func_main(args)
+        except:
+            if hasattr(args, 'tmpDirKeep') and args.tmpDirKeep and not (tempfile.tempdir.startswith('/tmp') or tempfile.tempdir.startswith('/local')):
+                log.exception("Exception occurred while running %s, saving tmpDir at %s" % (args.command, tempfile.tempdir))
+            else:
+                shutil.rmtree(tempfile.tempdir)
+            raise
+        else:
+            shutil.rmtree(tempfile.tempdir)
+    else:
+        # otherwise just run the command
+        ret = args.func_main(args)
+    if ret==None:
+        ret = 0
+    return ret
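Tying these helpers together, a script declares one parser-builder per subcommand and hands the list to main_argparse. A hypothetical sketch (the greet command is invented for illustration)::

    import util.cmd

    def greet(name):
        '''Print a greeting.'''
        print("hello, %s" % name)

    def parser_greet(parser):
        parser.add_argument('name')
        util.cmd.common_args(parser, (('tmpDir', None), ('loglevel', None)))
        util.cmd.attach_main(parser, greet, split_args=True)
        return parser

    __commands__ = [('greet', parser_greet)]

    if __name__ == '__main__':
        util.cmd.main_argparse(__commands__, __doc__)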
+def find_tmpDir():
+    ''' This provides a suggested base directory for a temp dir for use in your
+        argparse-based tmpDir option.
+    '''
+    tmpdir = '/tmp'
+    if os.access('/local/scratch', os.X_OK | os.W_OK | os.R_OK) and os.path.isdir('/local/scratch'):
+        tmpdir = '/local/scratch'
+        if 'LSB_JOBID' in os.environ:
+            # this directory often exists for LSF jobs, but not always.
+            # for example, if the job is part of a job array, this directory is called
+            # something unpredictable and unfindable, so just use /local/scratch
+            proposed_dir = '/local/scratch/%s.tmpdir' % os.environ['LSB_JOBID']
+            if os.access(proposed_dir, os.X_OK | os.W_OK | os.R_OK):
+                tmpdir = proposed_dir
+    elif 'TMPDIR' in os.environ and os.path.isdir(os.environ['TMPDIR']):
+        tmpdir = os.environ['TMPDIR']
+    return tmpdir
diff --git a/cms/util/file.py b/cms/util/file.py
new file mode 100644
index 00000000..dd399e5b
--- /dev/null
+++ b/cms/util/file.py
@@ -0,0 +1,217 @@
+'''This gives a number of useful quick methods for dealing with
+tab-text files and gzipped files.
+'''
+
+__author__ = "dpark@broadinstitute.org"
+__version__ = "PLACEHOLDER"
+__date__ = "PLACEHOLDER"
+
+import os, gzip, tempfile, shutil, errno, logging, json
+import util.cmd
+
+log = logging.getLogger(__name__)
+
+def get_project_path():
+    '''Return the absolute path of the top-level project, assumed to be the
+       parent of the directory containing this script.'''
+    # abspath converts relative to absolute path; expanduser interprets ~
+    path = __file__                  # path to this script
+    path = os.path.expanduser(path)  # interpret ~
+    path = os.path.abspath(path)     # convert to absolute path
+    path = os.path.dirname(path)     # containing directory: util
+    path = os.path.dirname(path)     # containing directory: main project dir
+    return path
+
+def get_build_path():
+    '''Return absolute path of "build" directory'''
+    return os.path.join(get_project_path(), 'tools', 'build')
+
+def get_scripts_path():
+    '''Return absolute path of "scripts" directory'''
+    return os.path.join(get_project_path(), 'tools', 'scripts')
+
+def get_binaries_path():
+    '''Return absolute path of "binaries" directory'''
+    return os.path.join(get_project_path(), 'tools', 'binaries')
+
+def get_test_path():
+    '''Return absolute path of "test" directory'''
+    return os.path.join(get_project_path(), 'test')
+
+def get_test_input_path(testClassInstance=None):
+    '''Return the path to the directory containing input files for the
+       specified test class
+    '''
+    if testClassInstance is not None:
+        return os.path.join(get_test_path(), 'input',
+                            type(testClassInstance).__name__)
+    else:
+        return os.path.join(get_test_path(), 'input')
+
+def get_resources():
+    ''' Return the project resources dictionary '''
+    jsonfile = os.path.join(get_project_path(), 'resources.json')
+    with open(jsonfile, 'rt') as inf:
+        resources = json.load(inf)
+    return resources
+
+def mkstempfname(suffix='', prefix='tmp', dir=None, text=False):
+    ''' There's no other one-liner way to securely ask for a temp file by
+        filename only. This calls mkstemp, which does what we want, except
+        that it returns an open file handle, which causes huge problems on NFS
+        if we don't close it. So close it first, then return the name part only.
+    '''
+    fd, fn = tempfile.mkstemp(prefix=prefix, suffix=suffix, dir=dir, text=text)
+    os.close(fd)
+    return fn
+
+def set_tmpDir(name):
+    proposed_prefix = ['tmp']
+    if name:
+        proposed_prefix.append(name)
+    for e in ('LSB_JOBID', 'LSB_JOBINDEX'):
+        if e in os.environ:
+            proposed_prefix.append(os.environ[e])
+    tempfile.tempdir = tempfile.mkdtemp(prefix='-'.join(proposed_prefix)+'-',
+                                        dir=util.cmd.find_tmpDir())
+
+def destroy_tmpDir():
+    if tempfile.tempdir:
+        shutil.rmtree(tempfile.tempdir)
+    tempfile.tempdir = None
+
+def mkdir_p(dirpath):
+    ''' Verify that the directory given exists, and if not, create it.
+    '''
+    try:
+        os.makedirs(dirpath)
+    except OSError as exc:  # Python >2.5
+        if exc.errno == errno.EEXIST and os.path.isdir(dirpath):
+            pass
+        else:
+            raise
+
+def open_or_gzopen(fname, *opts):
+    return fname.endswith('.gz') and gzip.open(fname, *opts) or open(fname, *opts)
+
+def read_tabfile_dict(inFile):
+    ''' Read a tab text file (possibly gzipped) and return contents as an
+        iterator of dicts.
+    '''
+    with open_or_gzopen(inFile, 'rt') as inf:
+        header = None
+        for line in inf:
+            row = line.rstrip('\n').split('\t')
+            if line.startswith('#'):
+                row[0] = row[0][1:]
+                header = row
+            elif header is None:
+                header = row
+            else:
+                assert len(header)==len(row)
+                yield dict((k, v) for k, v in zip(header, row) if v)
+
+def read_tabfile(inFile):
+    ''' Read a tab text file (possibly gzipped) and return contents as an
+        iterator of arrays.
+    '''
+    with open_or_gzopen(inFile, 'rt') as inf:
+        for line in inf:
+            if not line.startswith('#'):
+                yield line.rstrip('\n').split('\t')
+
+def readFlatFileHeader(filename, headerPrefix='#', delim='\t'):
+    with open_or_gzopen(filename, 'rt') as inf:
+        header = inf.readline().rstrip('\n').split(delim)
+    if header and header[0].startswith(headerPrefix):
+        header[0] = header[0][len(headerPrefix):]
+    return header
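read_tabfile_dict pairs every data row with the header row (a leading '#' on the header is stripped), yielding one dict per line. A quick self-contained check::

    import util.file

    fn = util.file.mkstempfname(suffix='.txt')
    with open(fn, 'wt') as f:
        f.write('#chr\tpos\n')
        f.write('chr1\t100\n')
    rows = list(util.file.read_tabfile_dict(fn))
    assert rows == [{'chr': 'chr1', 'pos': '100'}]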
+class FlatFileParser:
+    ''' Generic flat file parser that parses tabular text input
+    '''
+    def __init__(self, lineIter=None, name=None, outType='dict',
+                 readHeader=True, headerPrefix='#', delim='\t',
+                 requireUniqueHeader=False):
+        self.lineIter = lineIter
+        self.header = None
+        self.name = name
+        self.headerPrefix = headerPrefix
+        self.readHeader = readHeader
+        self.delim = delim
+        self.requireUniqueHeader = requireUniqueHeader
+        self.line_num = 0
+        assert outType in ('dict', 'arrayStrict', 'arrayLoose', 'both')
+        self.outType = outType
+        assert readHeader or outType in ('arrayStrict', 'arrayLoose')
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        return 0
+
+    def __iter__(self):
+        assert self.lineIter
+        for row in self.lineIter:
+            out = self.parse(row)
+            if out is not None:
+                yield out
+
+    def parse(self, row):
+        self.line_num += 1
+        try:
+            line = row.rstrip('\n').split(self.delim)
+            if self.readHeader:
+                if self.headerPrefix and row.startswith(self.headerPrefix):
+                    line[0] = line[0][len(self.headerPrefix):]
+                    assert not (self.requireUniqueHeader and self.header)
+                    self.parseHeader(line)
+                    return None
+                elif not self.header:
+                    self.parseHeader(line)
+                    return None
+                else:
+                    return self.parseRow(line)
+            else:
+                return self.parseRow(line)
+        except Exception:
+            template = "Exception parsing file at line {}. Line contents: '{}'"
+            message = template.format(self.line_num, row)
+            if self.name:
+                log.exception("{message} File: {self.name}".format(**locals()))
+            else:
+                log.exception(message)
+            raise
+
+    def parseHeader(self, row):
+        assert row
+        self.header = row
+        if self.outType != 'arrayLoose':
+            assert len(row) == len(dict([(x, 0) for x in row]))
+
+    def parseRow(self, row):
+        assert self.outType == 'arrayLoose' or (self.header and
+            len(self.header) == len(row))
+
+        if self.outType == 'arrayLoose' or self.outType == 'arrayStrict':
+            return row
+        out = {self.header[i]: row[i] for i in range(len(self.header))}
+        if self.outType == 'both':
+            for i in range(len(self.header)):
+                out[i] = row[i]
+        return out
+
+
+def fastaMaker(seqs, linewidth=60):
+    assert linewidth > 0
+
+    for id, seq in seqs:
+        yield ">{}\n".format(id)
+
+        while len(seq) > linewidth:
+            line = seq[:linewidth]
+            seq = seq[linewidth:]
+            yield "{}\n".format(line)
+
+        if seq:
+            yield seq+"\n"
diff --git a/cms/util/misc.py b/cms/util/misc.py
new file mode 100644
index 00000000..8052bea6
--- /dev/null
+++ b/cms/util/misc.py
@@ -0,0 +1,78 @@
+'''A few miscellaneous tools. '''
+from __future__ import division  # Division of integers with / should never round!
+import itertools
+
+__author__ = "dpark@broadinstitute.org"
+
+def unique(items):
+    ''' Return unique items in the same order as seen in the input. '''
+    seen = set()
+    for i in items:
+        if i not in seen:
+            seen.add(i)
+            yield i
+
+def histogram(items):
+    ''' Count the number of times each item appears and return a dict of counts. '''
+    out = {}
+    for i in items:
+        out.setdefault(i, 0)
+        out[i] += 1
+    return out
+
+def freqs(items, zero_checks=set()):
+    ''' Given a list of comparable, non-unique items, produce an iterator of
+        (item, count, freq) tuples.
+        item is a unique instance of one of the items seen on input.
+        count is a positive integer describing the number of times this item was observed.
+        freq is count / sum(count) for all outputs.
+        If zero_checks is specified, then the output iterator will emit tuples for the
+        items in zero_checks even if they do not appear in the input. If they are not in
+        the input, they will be emitted with a zero count and freq.
+        See histogram(items)
+    '''
+    tot = 0
+    out = {}
+    for i in items:
+        out.setdefault(i, 0)
+        out[i] += 1
+        tot += 1
+    for k, v in out.items():
+        yield (k, v, float(v)/tot)
+    for i in zero_checks:
+        if i not in out:
+            yield (i, 0, 0.0)
+
+def intervals(i, n, l):
+    ''' Divide something of length l into n equally sized parts and return the
+        start-stop values of the i'th part. Values are 1-based. Each part
+        will be adjacent and non-overlapping with the next part. i must be a
+        number from 1 to n.
+    '''
+    assert 1 <= i <= n and l >= n
+    part_size = l//n
+    start = 1 + part_size * (i-1)
+    stop = part_size * i
+    if i==n:
+        stop = l
+    return (start, stop)
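A quick worked example of intervals: a length of 10 split into 3 parts has part_size 10//3 = 3, and the last part absorbs the remainder::

    from util.misc import intervals

    assert intervals(1, 3, 10) == (1, 3)
    assert intervals(2, 3, 10) == (4, 6)
    assert intervals(3, 3, 10) == (7, 10)  # i == n, so stop extends to l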
+ """ + it = iter(iterator) + item = list(itertools.islice(it, batch_size)) + while item: + yield item + item = list(itertools.islice(it, batch_size)) + diff --git a/cms/util/stats.py b/cms/util/stats.py new file mode 100644 index 00000000..98f1120c --- /dev/null +++ b/cms/util/stats.py @@ -0,0 +1,202 @@ +'''A few pure-python statistical tools to avoid the need to install scipy. ''' +from __future__ import division # Division of integers with / should never round! +from math import exp, log, pi, sqrt, gamma, lgamma, erf +import itertools + +__author__ = "dpark@broadinstitute.org, irwin@broadinstitute.org" + +try: + # Python 3.4 + from statistics import mean, median +except ImportError: + # Python <3.4, avoid numpy if these two methods are all we really need + def mean(l): + if len(l)>0: + return float(sum(l))/len(l) + else: + raise Exception("empty list for mean") + def median(l): + if len(l)>0: + half = len(l) // 2 + l.sort() + if len(l) % 2 == 0: + return (l[half-1] + l[half]) / 2.0 + else: + return l[half] + else: + raise Exception("empty list for median") + +def product(iter) : + prod = 1 + for x in iter : + prod *= x + return prod + +def chi2_contingency(contingencyTable, correction = True) : + """ contingencyTable is a sequence of m sequences, each of length n. + Return an estimate using the chi-square distribution of the two-tailed + p-value for an m x n contingency table against the null hypothesis + that the row and column criteria are independent. This is not as + accurate as fisher_exact, but is faster (and is implemented for all + m and n). + If correction is True and there is 1 degree of freedom, apply Yates's + correction for continuity, i.e., adjust each observed value + by 0.5 towards the corresponding expected value, which brings + the result closer to the Fisher exact test result. + Not recommended if any of the counts or expected counts are + less than 5. + """ + # scipy equivalent: scipy.stats.chi2_contingency(contingencyTable)[1] + + if len(contingencyTable) == 0 : + return 1.0 + if len(set(map(len, contingencyTable))) != 1 : + raise ValueError('Not all rows have the same length') + + # Eliminate rows and columns with 0 sum + colSums = [sum(row[col] for row in contingencyTable) + for col in range(len(contingencyTable[0]))] + table = [[x for x, colSum in zip(row, colSums) if colSum != 0] + for row in contingencyTable if sum(row) != 0] + + if len(table) < 2 or len(table[0]) < 2 : + return 1.0 + + m = len(table) + n = len(table[0]) + rowSums = [sum(row) for row in table] + colSums = [sum(row[col] for row in table) for col in range(n)] + N = sum(rowSums) + expect = [[rowSums[i] * colSums[j] / N for j in range(n)] for i in range(m)] + if correction and m == n == 2 : + def corr(i, j) : + if expect[i][j] > table[i][j] : + return min(table[i][j] + 0.5, expect[i][j]) + else : + return max(table[i][j] - 0.5, expect[i][j]) + table = [[corr(i, j) for j in range(n)] for i in range(m)] + chisq = sum((table[i][j] - expect[i][j]) ** 2 / expect[i][j] + for j in range(n) + for i in range(m)) + pval = 1 - pchisq(chisq, (m - 1) * (n - 1)) + return pval + +def fisher_exact(contingencyTable) : + """ Fisher exact test for the 2 x n case. + contingencyTable is a sequence of 2 length-n sequences of integers. + Return the two-tailed p-value against the null hypothesis that the row + and column criteria are independent, using Fisher's exact test. + For n larger than 2, this is very slow, O(S^(n-1)), where S is the + smaller of the two row sums. 
+def fisher_exact(contingencyTable):
+    """ Fisher exact test for the 2 x n case.
+        contingencyTable is a sequence of 2 length-n sequences of integers.
+        Return the two-tailed p-value against the null hypothesis that the row
+        and column criteria are independent, using Fisher's exact test.
+        For n larger than 2, this is very slow, O(S^(n-1)), where S is the
+        smaller of the two row sums. Better to use chi2_contingency unless
+        one of the row sums is small.
+        Handles m x n contingencyTable with m > 2 if it can be reduced to the
+        2 x n case by transposing or by removing rows that are all 0s. Also
+        handles degenerate cases of 0 or 1 row by returning 1.0.
+    """
+    if len(contingencyTable) == 0:
+        return 1.0
+    if len(set(map(len, contingencyTable))) != 1:
+        raise ValueError('Not all rows have the same length')
+    if any(x != int(x) for row in contingencyTable for x in row):
+        raise ValueError('Some table entry is not an integer')
+    if any(x < 0 for row in contingencyTable for x in row):
+        raise ValueError('Some table entry is negative')
+
+    # Eliminate rows and columns with 0 sum
+    colSums = [sum(row[col] for row in contingencyTable)
+               for col in range(len(contingencyTable[0]))]
+    table = [[x for x, colSum in zip(row, colSums) if colSum != 0]
+             for row in contingencyTable if sum(row) != 0]
+
+    if len(table) < 2 or len(table[0]) < 2:
+        return 1.0
+
+    if len(table) > len(table[0]):
+        table = list(zip(*table))  # Transpose
+
+    m = len(table)
+    n = len(table[0])
+    if m != 2:
+        raise NotImplementedError('More than 2 non-zero rows and columns.')
+
+    # Put row with smaller sum first. Makes the loop iterations simpler.
+    table.sort(key=sum)
+    # Put column with largest sum last. Makes loop quick rejection faster.
+    table = list(zip(*table))  # Transpose
+    table.sort(key=sum)
+    table = list(zip(*table))  # Transpose back
+
+    # There are many optimizations possible for the following code, but it would
+    # still be O(S^(n-1)) so it would still be too slow for anything
+    # sizeable, and it's usable as is for small things.
+
+    rowSums = [sum(row) for row in table]
+    colSums = [sum(row[col] for row in table) for col in range(n)]
+
+    logChooseNrowSum = log_choose(sum(rowSums), rowSums[0])
+
+    def prob_of_table(firstRow):
+        return exp(sum(log_choose(cs, a) for cs, a in zip(colSums, firstRow)) -
+                   logChooseNrowSum)
+
+    p0 = prob_of_table(table[0])
+    result = 0
+    for firstRowM1 in itertools.product(*[range(min(rowSums[0], colSums[i]) + 1)
+                                          for i in range(n - 1)]):
+        lastElmt = rowSums[0] - sum(firstRowM1)
+        if lastElmt < 0 or lastElmt > colSums[-1]:
+            continue
+        prob = prob_of_table(firstRowM1 + (lastElmt,))
+        if prob <= p0 + 1e-9:  # (1e-9 handles floating point round off)
+            result += prob
+
+    return result
+
+def log_choose(n, k):
+    # Return log(n choose k). Compute using lgamma(x + 1) = log(x!)
+    if not (0 <= k <= n):
+        raise ValueError('%d is negative or more than %d' % (k, n))
+    return lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1)
+
+def gammainc_halfint(s, x):
+    """ Lower incomplete gamma function =
+        integral from 0 to x of t ** (s-1) exp(-t) dt divided by gamma(s),
+        i.e., the fraction of gamma that you get if you integrate only until
+        x instead of all the way to infinity.
+        Implemented here only if s is a positive multiple of 0.5.
+    """
+    # scipy equivalent: scipy.special.gammainc(s,x)
+
+    if s <= 0:
+        raise ValueError('%s is not positive' % s)
+    if x < 0:
+        raise ValueError('%s < 0' % x)
+    if s * 2 != int(s * 2):
+        raise NotImplementedError('%s is not a multiple of 0.5' % s)
+
+    # Handle integers analytically
+    if s == int(s):
+        term = 1
+        total = 1
+        for k in range(1, int(s)):
+            term *= x / k
+            total += term
+        return 1 - exp(-x) * total
+
+    # Otherwise s is integer + 0.5. Decrease to 0.5 using the recursion formula:
+    result = 0.0
+    while s > 1:
+        result -= x ** (s - 1) * exp(-x) / gamma(s)
+        s = s - 1
+    # Then use gammainc(0.5, x) = erf(sqrt(x))
+    result += erf(sqrt(x))
+    return result
+
+def pchisq(x, k):
+    "Cumulative distribution function of chi squared with k degrees of freedom."
+    if k < 1 or k != int(k):
+        raise ValueError('%s is not a positive integer' % k)
+    if x < 0:
+        raise ValueError('%s < 0' % x)
+    return gammainc_halfint(k / 2, x / 2)
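chi2_contingency and fisher_exact answer the same independence question at different cost, so on a small 2 x 2 table the approximation should land near the exact value. A loose sanity check with illustrative counts::

    from util.stats import chi2_contingency, fisher_exact

    table = [[8, 2],
             [1, 5]]
    exact = fisher_exact(table)       # exact, but O(S^(n-1))
    approx = chi2_contingency(table)  # fast chi-square approximation
    assert abs(exact - approx) < 0.1  # loose tolerance; they need not match closely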
diff --git a/cms/util/vcf.py b/cms/util/vcf.py
new file mode 100644
index 00000000..d23a8c96
--- /dev/null
+++ b/cms/util/vcf.py
@@ -0,0 +1,386 @@
+'''This gives a number of useful quick methods for dealing with VCF files.
+'''
+
+__author__ = "dpark@broadinstitute.org"
+__version__ = "PLACEHOLDER"
+__date__ = "PLACEHOLDER"
+
+import os, shutil, logging, itertools, sqlite3
+import pysam
+import util.file, util.misc
+
+log = logging.getLogger(__name__)
+
+def make_intervals(i, n, fasta, chr_prefix='', verbose=False):
+    ''' Divide a sorted genome into n equally sized parts and return the i'th
+        part. We will return a list of intervals: chr, start, stop. It may
+        contain multiple chromosomes in order to fill out a total length equal
+        to the other parts. Each part will be adjacent and non-overlapping with
+        the next part. i must be a number from 1 to n.
+    '''
+    assert 1 <= i <= n
+
+    # read genome dict file
+    tot = 0
+    chrlens = []
+    for c, c_len in get_chrlens(fasta):
+        if c.startswith(chr_prefix):
+            chrlens.append((c, c_len, tot))
+            tot += c_len
+
+    # define our chunk by gpos:
+    part_size = tot//n
+    g_start = 1 + part_size * (i-1)
+    g_stop = part_size * i
+    if i==n:
+        g_stop = tot
+
+    # find the genomic intervals that correspond to our gpos window
+    out = []
+    for c, c_len, c_g_start in chrlens:
+        c_g_stop = c_g_start + c_len
+        c_g_start += 1
+        if c_g_stop >= g_start and c_g_start <= g_stop:
+            start = max(g_start, c_g_start) - c_g_start + 1
+            stop = min(g_stop, c_g_stop) - c_g_start + 1
+            out.append((c, start, stop))
+
+    if verbose:
+        log.info("Dividing the %d bp genome into %d chunks of %d bp each. The %dth chunk contains the following %d intervals: %s" % (
+            tot, n, part_size, i, len(out), ', '.join(["%s:%d-%d"%x for x in out])))
+    return out
+
+
+def sliding_windows(fasta, width, offset, chr_prefix=''):
+    ''' Divide a genome into sliding windows and return as an iterator of
+        intervals (chr, start, stop). The windows are of fixed width
+        (except maybe the last window on each chromosome) and may overlap
+        (if offset < width).
+    '''
+    assert width>0 and offset>0
+    for c, c_len in get_chrlens(fasta):
+        if c.startswith(chr_prefix):
+            start = 1
+            while start <= c_len:
+                stop = min(c_len, start + width - 1)
+                yield (c, start, stop)
+                start += offset
+class GenomePosition:
+    ''' Provide a mapping from chr:pos to genomic position.
+        Read chromosome lengths and order from either a Picard/GATK-index for
+        a FASTA file (a .dict file) or from a VCF header.
+    '''
+    def __init__(self, seqDb):
+        self.gpos_map = {}
+        self.clen_map = {}
+        self.chrs = []
+        totlen = 0
+        for c, clen in get_chrlens(seqDb):
+            self.chrs.append((c, clen))
+            self.gpos_map[c] = totlen
+            self.clen_map[c] = clen
+            totlen += clen
+        self.total = totlen
+    def get_gpos(self, c, p):
+        assert isinstance(p, int)
+        assert c in self.gpos_map
+        assert 1 <= p <= self.clen_map[c]
+        return p + self.gpos_map[c]
+    def get_chr_pos(self, gpos):
+        assert isinstance(gpos, int)
+        assert 1 <= gpos <= self.total
+        totlen = 0
+        for c, clen in self.chrs:
+            if gpos <= totlen+clen:
+                break
+            totlen += clen
+        return (c, gpos-totlen)
+
+def get_chrlens(inFile):
+    ''' Read chromosome lengths and order from either a Picard/GATK-index for
+        a FASTA file (a .dict file) or from "contig" rows in the VCF header.
+    '''
+    chrlens = []
+    if hasattr(inFile, 'chrlens'):
+        chrlens = inFile.chrlens()
+    else:
+        if inFile.endswith('.fasta'):
+            inFile = inFile[:-len('.fasta')] + '.dict'
+        elif inFile.endswith('.fa'):
+            inFile = inFile[:-len('.fa')] + '.dict'
+        if inFile.endswith('.dict'):
+            with open(inFile, 'rt') as inf:
+                for line in inf:
+                    row = line.rstrip('\n').split('\t')
+                    if row[0]=='@SQ':
+                        assert row[1].startswith('SN:') and row[2].startswith('LN:')
+                        c = row[1][3:]
+                        c_len = int(row[2][3:])
+                        chrlens.append((c, c_len))
+        elif inFile.endswith('.vcf') or inFile.endswith('.vcf.gz'):
+            with util.file.open_or_gzopen(inFile, 'rt') as inf:
+                for line in inf:
+                    line = line.rstrip('\n')
+                    if line.startswith('##contig='):
+                        line = line[13:-1]
+                        c = line.split(',')[0]
+                        clen = int(line.split('=')[1])
+                        chrlens.append((c, clen))
+                    elif line.startswith('#CHROM'):
+                        break
+        else:
+            raise AssertionError("unrecognized file type %s" % inFile)
+    assert chrlens, "no sequence data found in %s" % inFile
+    return chrlens
+
+def calc_maf(genos, ancestral=None, ploidy=1):
+    # get list of alleles
+    if ploidy==1:
+        alleles = genos
+    else:
+        alleles = []
+        for g in genos:
+            g = g.split('/')
+            assert len(g)==ploidy
+            alleles += g
+
+    # count up
+    out = {'n_tot': len(alleles)}
+    acounts = util.misc.histogram(alleles)
+    alist = sorted([(n, a) for a, n in acounts.items()])
+
+    # daf
+    if ancestral is not None:
+        out['a_ancestral'] = ancestral
+        derived = list(sorted([a for a in acounts.keys() if a != ancestral]))
+        out['a_derived'] = ','.join(derived)
+        out['dac'] = sum(acounts[a] for a in derived)
+        out['daf'] = out['n_tot'] and float(out['dac'])/out['n_tot'] or None
+
+    # maf
+    if out['n_tot']:
+        out['a_major'] = alist[-1][1]
+        out['a_minor'] = ','.join([a for n, a in alist[:-1]])
+        out['mac'] = out['n_tot'] - alist[-1][0]
+        out['maf'] = float(out['mac'])/out['n_tot']
+    else:
+        out['a_major'] = None
+        out['a_minor'] = None
+        out['mac'] = None
+        out['maf'] = None
+
+    return out
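calc_maf reduces a list of genotype calls to the allele-frequency fields used downstream. A small haploid worked example::

    from util.vcf import calc_maf

    out = calc_maf(['A', 'A', 'A', 'T'], ancestral='A', ploidy=1)
    assert out['n_tot'] == 4
    assert out['a_major'] == 'A' and out['a_minor'] == 'T'
    assert out['mac'] == 1 and out['maf'] == 0.25
    assert out['dac'] == 1 and out['daf'] == 0.25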
+class TabixReader(pysam.Tabixfile):
+    ''' A wrapper around pysam.Tabixfile that provides a context and
+        allows us to query using 1-based coordinates.
+
+        We should request upstream to the Pysam people to implement
+        methods __reversed__() and __len__() in pysam.TabixIterator.
+        __getitem__ could be a bonus, but prob unnecessary.
+    '''
+    def __init__(self, inFile, parser=pysam.asTuple()):
+        # because of odd Cython weirdness, we don't actually want to call
+        # super.__init__ here..
+        #super(TabixReader, self).__init__(inFile, parser=parser)
+        self.parser = parser
+    def __enter__(self):
+        return self
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+        return 0
+    def close(self):
+        super(TabixReader, self).close()
+    def chroms(self):
+        return self.contigs
+    def get(self, chrom=None, start=None, stop=None, region=None):
+        if start is not None:
+            start -= 1
+        return self.fetch(reference=chrom, start=start, end=stop,
+                          region=region, parser=self.parser)
+
+
+def get_pos_from_vcf_record(vcfrec):
+    # new versions of pysam return a zero-based position here
+    return vcfrec.pos + 1
+
+def bytes_to_string(o):
+    if type(o) == bytes:
+        o = o.decode('utf-8')
+    return o
+
+class VcfReader(TabixReader):
+    ''' Same as TabixReader with a few more perks for VCF files:
+        - emit results parsed as pysam VCF rows
+        - provide self.chrlens(), a dict mapping chrom name to chrom length
+        - provide self.samples(), a list of sample names in order of appearance
+        - provide get_range(c,start,stop) and get_snp_genos(c,pos)
+    '''
+    def __init__(self, inFile, ploidy=1, parser=pysam.asVCF()):
+        super(VcfReader, self).__init__(inFile, parser=parser)
+        assert ploidy in (1, 2)
+        self.ploidy = ploidy
+        self.clens = []
+        self.sample_names = None
+        for line in self.header:
+            line = bytes_to_string(line)
+            if line.startswith('##contig='):
+                line = line[13:-1]
+                c = line.split(',')[0]
+                clen = int(line.split('=')[1])
+                self.clens.append((c, clen))
+            elif line.startswith('#CHROM'):
+                row = line.split('\t')
+                self.sample_names = row[9:]
+        self.clens = dict(self.clens)
+        assert self.sample_names
+    def samples(self):
+        return self.sample_names
+    def chrlens(self):
+        return self.clens
+    def get_positions(self, c=None, start=None, stop=None, region=None):
+        for snp in self.get(c, start, stop, region):
+            yield (bytes_to_string(snp.contig), get_pos_from_vcf_record(snp), get_pos_from_vcf_record(snp)+len(snp.ref)-1)
+    def get_range(self, c=None, start=None, stop=None, region=None, as_strings=True, more=False):
+        ''' Read a VCF file (optionally just a piece of it) and return contents
+            as an iterator with parsed contents. Each row is returned as
+            a 4-tuple:
+                1: chr
+                2: pos (int)
+                3: list of allele strings (in order)
+                4: list of genotypes as 2-tuples:
+                    haploid: (sample, allele)
+                    diploid: (sample, [allele, allele])
+            If as_strings, the alleles will be actual alleles. Otherwise,
+            alleles will be integers.
+            If more is true, a fifth column will be emitted with the pysam VCF object.
+        '''
+        for snp in self.get(c, start, stop, region):
+            alleles = [bytes_to_string(snp.ref)] + bytes_to_string(snp.alt).split(',')
+            alleles = [a for a in alleles if a != '.']
+            if self.ploidy==1:
+                genos = [(self.sample_names[i], int(bytes_to_string(snp[i])[0]))
+                         for i in range(len(self.sample_names))
+                         if bytes_to_string(snp[i])[0] != '.']
+                if as_strings:
+                    genos = [(s, alleles[a]) for s, a in genos]
+            else:
+                genos = [(self.sample_names[i], [int(bytes_to_string(snp[i])[j*2]) for j in range(self.ploidy)])
+                         for i in range(len(self.sample_names))
+                         if bytes_to_string(snp[i])[0] != '.']
+                if as_strings:
+                    genos = [(s, [alleles[a] for a in g]) for s, g in genos]
+            if more:
+                yield (bytes_to_string(snp.contig), get_pos_from_vcf_record(snp), alleles, genos, snp)
+            else:
+                yield (bytes_to_string(snp.contig), get_pos_from_vcf_record(snp), alleles, genos)
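Reading genotypes back out of a bgzipped, tabix-indexed VCF then looks like this; a hedged sketch, with the filename and region hypothetical::

    from util.vcf import VcfReader

    with VcfReader('calls.vcf.gz') as vcf:  # hypothetical indexed file
        for chrom, pos, alleles, genos in vcf.get_range('chr1', 1000, 2000):
            for sample, allele in genos:  # haploid: (sample, allele) pairs
                print(chrom, pos, sample, allele)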
+    def get_snp_genos(self, c, p, as_strings=True):
+        ''' Read a single position from a VCF file and return the genotypes
+            as a sample -> allele map. If there is not exactly one matching
+            row in the VCF file at this position (if there are none or multiple)
+            then we return an empty map: {}.
+        '''
+        snps = [x for x in self.get_range(c, p, p, as_strings=as_strings)]
+        return len(snps)==1 and dict(snps[0][3]) or {}
+    def getFullSequences(self, c, start, stop, samples,
+                         na='-', refSeq=None, refInvariantsOnly=False, ignoreIndels=False):
+        ''' chr - chromosome name
+            start - start position
+            stop - default = start
+            samples is a list of samples. None can be used for the ref sample.
+            if refSeq is a string with length = stop-start+1, then use this as
+            the base sequence for missing data, otherwise, use the na string.
+            if refInvariantsOnly is True, refSeq is only used for invariant
+            sites or sites with no entries for any samples. if False, refSeq
+            is used for all missing data.
+        '''
+        assert 1<=start<=stop
+        assert len(na)==1
+
+        # get all the VCF records
+        vcf_records = [(p-start, alleles, dict(genos)) for chrom, p, alleles, genos
+                       in self.get_range(c, start, stop, as_strings=True)
+                       if not ignoreIndels or set(map(len, alleles)) == set([1])]
+
+        # Construct a list called "seq" into which we will replace alleles as
+        # we discover them. This is a list, not a string, because each position
+        # might be replaced with an arbitrary length string (deletions will be
+        # empty strings, insertions will be strings > length 1). This all gets
+        # converted to a string at the end.
+        if refSeq:
+            # assume refSeq alleles as a baseline for missing data
+            assert len(refSeq)==(stop-start+1)
+            seq = list(refSeq)
+            if refInvariantsOnly:
+                # but don't assume refSeq alleles for any known variant sites
+                for i, alleles, genos in vcf_records:
+                    if len(set(genos[s] for s in samples if s in genos))>1:
+                        for j in range(len(alleles[0])):
+                            if i+j < len(seq):
+                                seq[i+j] = na
+        else:
+            # assume nothing when data is missing
+            seq = list(na * (stop-start+1))
+
+        for sample in samples:
+            assert sample==None or sample in self.samples()
+
+            # Make a copy of the reference sequence
+            newseq = [s for s in seq]
+
+            # Replace alleles, one site at a time.
+            replaceAlleles(sample, newseq, vcf_records)
+
+            # Convert list back to string. This string may or may not be the same
+            # length as the original (if ignoreIndels is True, it must be the same
+            # length as the original).
+            newseq = ''.join(newseq)
+            assert len(newseq)==(stop-start+1) or not ignoreIndels
+            yield (sample, newseq)
+
+
+def replaceAlleles(sample, seq, vcf_records):
+    ''' Replace alleles, one site at a time. '''
+    for i, alleles, genos in vcf_records:
+
+        # set allele to the DNA sequence we will replace at positions i through i+len(refallele)-1
+        if sample==None:
+            # caller is asking for the reference sample's sequence
+            allele = alleles[0]
+            alleles = [allele]
+        else:
+            # caller is asking for a normal sample
+            samp_geno = genos.get(sample)
+            if not samp_geno:
+                continue
+            if isinstance(samp_geno, list):
+                log.warn("TO DO: add code to turn hets into IUPAC ambiguity codes (%s %s = %s)." % (i, sample, '/'.join(samp_geno)))
+                continue
+            allele = samp_geno
+
+        # replace portion of sequence with allele
+        # NEED BUGFIXES HERE for overlapping VCF records
+        if allele == None:
+            # nothing here ... is this even possible anymore?
+            pass
+        elif len(alleles[0])==1:
+            # the most common cases: SNP, novar, or indel replacing one ref base (with zero or more bases)
+            seq[i] = allele
+        else:
+            # most complicated case: something replacing multiple ref bases
+            # TODO: beware--this is all untested!
+            for j in range(max(len(alleles[0]), len(allele))):
+                if i+j < len(seq):
+                    if j < len(allele):
+                        if j == len(alleles[0])-1:
+                            # new allele is >= ref length, fill out the rest of the bases
+                            seq[i+j] = allele[j:]
+                        else:
+                            seq[i+j] = allele[j]
+                    else:
+                        # new allele is shorter than ref, so delete extra bases
+                        seq[i+j] = ''
+    return seq
diff --git a/cms/util/version.py b/cms/util/version.py
new file mode 100644
index 00000000..eb55cb66
--- /dev/null
+++ b/cms/util/version.py
@@ -0,0 +1,69 @@
+''' This gets the git version into python-land
+'''
+
+__author__ = "dpark@broadinstitute.org"
+__version__ = None
+
+import subprocess, os, os.path
+
+def get_project_path():
+    '''Return the absolute path of the top-level project, assumed to be the
+       parent of the directory containing this script.'''
+    # abspath converts relative to absolute path; expanduser interprets ~
+    path = __file__                  # path to this script
+    path = os.path.expanduser(path)  # interpret ~
+    path = os.path.abspath(path)     # convert to absolute path
+    path = os.path.dirname(path)     # containing directory: util
+    path = os.path.dirname(path)     # containing directory: main project dir
+    return path
+
+def call_git_describe():
+    cwd = os.getcwd()
+    try:
+        os.chdir(get_project_path())
+        cmd = ['git', 'describe', '--tags', '--always', '--dirty']
+        out = subprocess.check_output(cmd)
+        if type(out) != str:
+            out = out.decode('utf-8')
+        ver = out.strip()
+    except:
+        ver = None
+    os.chdir(cwd)
+    return ver
+
+def release_file():
+    return os.path.join(get_project_path(), 'VERSION')
+
+def read_release_version():
+    try:
+        with open(release_file(), 'rt') as inf:
+            version = inf.readlines()[0].strip()
+    except:
+        version = None
+    return version
+
+def write_release_version(version):
+    with open(release_file(), 'wt') as outf:
+        outf.write(version+'\n')
+
+def get_version():
+    global __version__
+    if __version__ is None:
+        from_git = call_git_describe()
+        from_file = read_release_version()
+
+        if from_git:
+            if from_file != from_git:
+                write_release_version(from_git)
+            __version__ = from_git
+        else:
+            __version__ = from_file
+
+        if __version__ is None:
+            raise ValueError("Cannot find the version number!")
+
+    return __version__
+
+
+if __name__ == "__main__":
+    print(get_version())
-release = '1.0'
+release = __version__
+version = '.'.join(release.split('.')[:2])
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -76,7 +91,7 @@
 # non-false value, then it is used:
 #today = ''
 # Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
+today_fmt = '%B %d, %Y'
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
@@ -114,7 +129,12 @@
 
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
-html_theme = 'alabaster'
+#html_theme = 'default'
+import sphinx_rtd_theme
+
+html_theme = "sphinx_rtd_theme"
+
+html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
 
 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the
@@ -152,7 +172,7 @@
 # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
 # using the given strftime format.
-#html_last_updated_fmt = '%b %d, %Y'
+html_last_updated_fmt = '%Y-%m-%d. {}'.format(release)
 
 # If true, SmartyPants will be used to convert quotes and dashes to
 # typographically correct entities.
diff --git a/docs/install.rst b/docs/install.rst
index bad8afaf..de94dd15 100644
--- a/docs/install.rst
+++ b/docs/install.rst
@@ -35,4 +35,4 @@
 most of the bioinformatic tools we rely on here. They will auto-download
 and install the first time they are needed by any command. If you want to
 pre-install all of the external tools, simply type this::
-   python -m unittest test.test_tools.TestToolsInstallation -v
+   python -m unittest test.unit.test_tools.TestToolsInstallation -v
diff --git a/requirements.txt b/requirements.txt
index 95311c9f..1008ad0f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 biopython==1.65
-numpy==1.8.0rc1
+numpy==1.8.0
 pandas==0.16.0
 paramiko==1.15.2
 crypto==1.3.4
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 00000000..2e40912e
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,23 @@
+try:
+    from setuptools import setup, find_packages
+except ImportError:
+    # distutils.core provides no find_packages, so stub one out for the fallback
+    from distutils.core import setup
+    find_packages = lambda exclude=[]: ['cms']
+
+EXCLUDE_FROM_PACKAGES = ['old','cms.test*']
+
+config = {
+    'description': 'Composite of Multiple Signals: tests for selection in meiotically recombinant populations',
+    'author': 'Broad Institute',
+    'url': 'https://github.com/broadinstitute/cms',
+    'download_url': 'https://github.com/broadinstitute/cms',
+    'author_email': '',
+    'version': '0.1',
+    'install_requires': ['nose','biopython','pysam'],
+    'packages': find_packages(exclude=EXCLUDE_FROM_PACKAGES),
+    'scripts': ['cms/selection.py','cms/main.py'],
+    'name': 'python-cms'
+}
+
+setup(**config)
diff --git a/travis/before_install.sh b/travis/before_install.sh
new file mode 100755
index 00000000..e69de29b
diff --git a/travis/install-pip.sh b/travis/install-pip.sh
new file mode 100755
index 00000000..39f2d25b
--- /dev/null
+++ b/travis/install-pip.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+set -e
+
+echo "pip installing required python packages"
+pip install -r requirements.txt
+
+#PYVER=`python -V 2>&1 | cut -c 8`
+# PYVER=`echo $TRAVIS_PYTHON_VERSION | cut -c 1`
+# if [ "$PYVER" = "3" ]; then
+#     echo "pip installing snakemake packages (py3 only)"
+#     pip install -r requirements-pipes.txt
+# fi
+
+echo "pip installing coveralls"
+pip install -q coveralls nose-cov
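The MOCK_MODULES stanza added to docs/conf.py (above) leans on the fact that Python consults sys.modules before running the import machinery, so autodoc can import project modules whose heavyweight dependencies are absent on the docs builder. A self-contained sketch of the trick, using a hypothetical package name:

    import sys
    import mock   # the same third-party package conf.py imports

    sys.modules['heavyweight_dep'] = mock.Mock()   # hypothetical, not installed

    import heavyweight_dep          # succeeds: the Mock is found in sys.modules
    heavyweight_dep.anything(1, 2)  # any attribute access or call returns another Mock

The limitation is that anything the documented code does with these fake modules at import time (subclassing, version comparisons) will see Mocks rather than real objects.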
diff --git a/travis/install-tools.sh b/travis/install-tools.sh
new file mode 100755
index 00000000..e69de29b
diff --git a/travis/tests-long.sh b/travis/tests-long.sh
new file mode 100755
index 00000000..1ccf0c10
--- /dev/null
+++ b/travis/tests-long.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+set -e
+
+echo "TRAVIS_BRANCH: $TRAVIS_BRANCH"
+echo "TRAVIS_PULL_REQUEST: $TRAVIS_PULL_REQUEST"
+
+# quote the variables so the test does not break when they are empty or unset
+if [ "$TRAVIS_PULL_REQUEST" != "false" -o "$TRAVIS_BRANCH" = "master" -o -n "$TRAVIS_TAG" ]; then
+    echo "This is master, a pull request, or a tagged release: executing long running tests..."
+    nosetests -v --with-xunit --with-coverage --nocapture \
+        --cover-inclusive --cover-branches --cover-tests \
+        --cover-package main \
+        -w cms/test/integration/
+else
+    echo "This is not master, a pull request, or a tagged release: skipping long running tests."
+fi
diff --git a/travis/tests-unit.sh b/travis/tests-unit.sh
new file mode 100755
index 00000000..8843d0f6
--- /dev/null
+++ b/travis/tests-unit.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+set -e
+
+nosetests -v --with-xunit --with-coverage --nocapture \
+    --cover-inclusive --cover-branches --cover-tests \
+    --cover-package selection \
+    -w cms/test/unit/
\ No newline at end of file
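tests-unit.sh delegates discovery to nose, which collects files, classes, and functions under cms/test/unit/ whose names match its testMatch pattern (roughly: 'test' or 'Test' at a word boundary). A minimal sketch of a module it would pick up; the file name is hypothetical, and it assumes the stub selection.Selection().main() returns 0:

    # cms/test/unit/test_smoke.py (hypothetical)
    import unittest
    import selection

    class TestSmoke(unittest.TestCase):
        def test_main_exits_cleanly(self):
            # exercising the covered package is what makes it appear in
            # the --cover-package selection report
            self.assertEqual(selection.Selection().main(), 0)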