diff --git a/.gitignore b/.gitignore index 1e536d7..07db275 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ # Autotools cruft (NEVER check this into VCS) Makefile Makefile.in +.version aclocal.m4 .deps/ .dirstamp @@ -17,37 +18,36 @@ configure depcomp install-sh missing +py-compile +ar-lib +/shorah # generated tarball shorah*.tar.bz2 # executables b2w -contain diri_sampler fil -freqEst # samtools lib libbam.a -# python/perl scripts are generated by Autoconf from *.{py,pl}.in -# files, replacing the shebang in the process -fas2read.pl -src/scripts/amplian.py -src/scripts/dec.py -src/scripts/mm.py -src/scripts/shorah.py -src/scripts/snv.py - # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class -py-compile # OS X temporary files that should never be committed .DS_Store *.swp *.lock profile + +# other intermediate stuff +build +.eggs +src/ShoRAH.egg-info/ + +# shorah output +shorah.log diff --git a/.travis.yml b/.travis.yml index 699457a..a6bc46a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,13 +1,17 @@ -language: c cpp +language: c cpp python + +python: + - "3.6" # GSL and Autotools needed before_install: - sudo add-apt-repository ppa:ondrej/autotools -y - sudo apt-get update -qq - sudo apt-get install -y gsl-bin libgsl0-dev autoconf automake-1.15 pkg-config m4 + - source ~/virtualenv/python3.6/bin/activate compiler: - gcc before_script: autoreconf -vif -script: ./configure && make -j1 && make distcheck +script: ./configure && make -j1 distcheck diff --git a/CHANGELOG b/CHANGELOG.md similarity index 100% rename from CHANGELOG rename to CHANGELOG.md diff --git a/Makefile.am b/Makefile.am index d96df09..3e0ef0f 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,48 +1,38 @@ # install scripts in ${bindir} bin_SCRIPTS = \ - src/scripts/amplian.py \ - src/scripts/dec.py \ - src/scripts/mm.py \ - src/scripts/shorah.py \ - src/scripts/snv.py \ - fas2read.pl + shorah CLEANFILES = $(bin_SCRIPTS) EXTRA_DIST = \ - src/scripts/amplian.py.in \ - src/scripts/dec.py.in \ - src/scripts/mm.py.in \ - src/scripts/shorah.py.in \ - src/scripts/snv.py.in \ - src/scripts/fas2read.pl.in \ - src/samtools/COPYING + .version \ + meson.build \ + setup.py \ + src/meson.build \ + src/cpp/meson.build \ + src/samtools/COPYING \ + src/samtools/meson.build +BUILT_SOURCES = .version -fas2read.pl: src/scripts/fas2read.pl.in Makefile - $(SED) 's,[@]perldir[@],$(perldir),g' $< > $@ +shorah: $(top_srcdir)/src/shorah/__main__.py Makefile + $(SED) 's,#!/usr/bin/env python3,#!/usr/bin/env $(PYTHON),g' $(top_srcdir)/src/shorah/__main__.py > $@ # actual C/C++ programs doing the real computation -bin_PROGRAMS = diri_sampler contain freqEst b2w fil +bin_PROGRAMS = diri_sampler b2w fil diri_sampler_SOURCES = \ - src/shorah/data_structures.hpp \ - src/shorah/dpm_sampler.hpp \ - src/shorah/dpm_sampler.cpp + src/cpp/data_structures.hpp \ + src/cpp/dpm_sampler.hpp \ + src/cpp/dpm_sampler.cpp diri_sampler_CPPFLAGS = $(GSL_CFLAGS) diri_sampler_LDADD = $(GSL_LIBS) -contain_SOURCES = \ - src/shorah/contain.cpp - -freqEst_SOURCES = \ - src/shorah/freqEst.cpp - b2w_SOURCES = \ - src/shorah/b2w.cpp + src/cpp/b2w.cpp b2w_CXXFLAGS = $(PTHREAD_CFLAGS) b2w_CPPFLAGS = -I$(top_srcdir)/src/samtools b2w_LDADD = libbam.a $(ZLIB_LIBS) $(PTHREAD_LIBS) fil_SOURCES = \ - src/shorah/fil.cpp + src/cpp/fil.cpp fil_CXXFLAGS = $(PTHREAD_CFLAGS) fil_CPPFLAGS = $(GSL_CFLAGS) -I$(top_srcdir)/src/samtools fil_LDADD = libbam.a $(ZLIB_LIBS) $(GSL_LIBS) $(PTHREAD_LIBS) @@ -85,19 +75,14 @@ libbam_a_SOURCES = \ libbam_a_CPPFLAGS = $(ZLIB_CFLAGS) 
-D_USE_KNETFILE libbam_a_CFLAGS = $(PTHREAD_CFLAGS) -# perl modules -dist_perl_DATA = \ - src/perl/DNA.pm \ - src/perl/NikoUtil.pm \ - src/perl/ReadUtil.pm \ - src/perl/Util.pm - # python modules -python_PYTHON = \ - src/python/shorah_dec.py \ - src/python/shorah_matching.py \ - src/python/shorah_mm.py \ - src/python/shorah_snv.py +dist_pkgpython_PYTHON = \ + src/shorah/__main__.py \ + src/shorah/amplicon.py \ + src/shorah/cli.py \ + src/shorah/shorah_snv.py \ + src/shorah/shotgun.py +pkgpython_PYTHON = .version # example files dist_pkgdata_DATA = \ @@ -110,3 +95,9 @@ dist_ampliconpkgdata_DATA = \ examples/amplicon_test/ampli_sorted.bam \ examples/amplicon_test/amplicon_reads.fastq \ examples/amplicon_test/reference.fasta + +# version handling +$(top_srcdir)/.version: + echo $(VERSION) > $@-t && mv $@-t $@ +dist-hook: + echo $(VERSION) > $(distdir)/.tarball-version diff --git a/README.md b/README.md index 1883942..ef828fe 100755 --- a/README.md +++ b/README.md @@ -132,10 +132,10 @@ These can be run one after the other, or one can invoke `shorah.py`, that runs the whole process from bam file to frequency estimation and SNV calling. ## Coding style -All changes to the C++ code in `src/shorah` should always be formatted according to the included `.clang-format` style by doing +All changes to the C++ code in `src/cpp` should always be formatted according to the included `.clang-format` style by doing - clang-format -style=file -i src/shorah/*.[ch]pp + clang-format -style=file -i src/cpp/*.[ch]pp in the root of the repository. -All changes to the python code in `src/python` and `src/scripts` should always be formatted conforming to the [PEP 8](https://www.python.org/dev/peps/pep-0008/) style guide. To this end, we advise to use [autopep8](https://pypi.python.org/pypi/autopep8). +All changes to the python code in `src/shorah` should always be formatted conforming to the [PEP 8](https://www.python.org/dev/peps/pep-0008/) style guide. To this end, we advise to use [autopep8](https://pypi.python.org/pypi/autopep8). diff --git a/build-aux/git-version-gen b/build-aux/git-version-gen new file mode 100755 index 0000000..6d073fc --- /dev/null +++ b/build-aux/git-version-gen @@ -0,0 +1,227 @@ +#!/bin/sh +# Print a version string. +scriptversion=2018-03-07.03; # UTC + +# Copyright (C) 2007-2018 Free Software Foundation, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# This script is derived from GIT-VERSION-GEN from GIT: https://git-scm.com/. +# It may be run two ways: +# - from a git repository in which the "git describe" command below +# produces useful output (thus requiring at least one signed tag) +# - from a non-git-repo directory containing a .tarball-version file, which +# presumes this script is invoked like "./git-version-gen .tarball-version". 
+ +# In order to use intra-version strings in your project, you will need two +# separate generated version string files: +# +# .tarball-version - present only in a distribution tarball, and not in +# a checked-out repository. Created with contents that were learned at +# the last time autoconf was run, and used by git-version-gen. Must not +# be present in either $(srcdir) or $(builddir) for git-version-gen to +# give accurate answers during normal development with a checked out tree, +# but must be present in a tarball when there is no version control system. +# Therefore, it cannot be used in any dependencies. GNUmakefile has +# hooks to force a reconfigure at distribution time to get the value +# correct, without penalizing normal development with extra reconfigures. +# +# .version - present in a checked-out repository and in a distribution +# tarball. Usable in dependencies, particularly for files that don't +# want to depend on config.h but do want to track version changes. +# Delete this file prior to any autoconf run where you want to rebuild +# files to pick up a version string change; and leave it stale to +# minimize rebuild time after unrelated changes to configure sources. +# +# As with any generated file in a VC'd directory, you should add +# /.version to .gitignore, so that you don't accidentally commit it. +# .tarball-version is never generated in a VC'd directory, so needn't +# be listed there. +# +# Use the following line in your configure.ac, so that $(VERSION) will +# automatically be up-to-date each time configure is run (and note that +# since configure.ac no longer includes a version string, Makefile rules +# should not depend on configure.ac for version updates). +# +# AC_INIT([GNU project], +# m4_esyscmd([build-aux/git-version-gen .tarball-version]), +# [bug-project@example]) +# +# Then use the following lines in your Makefile.am, so that .version +# will be present for dependencies, and so that .version and +# .tarball-version will exist in distribution tarballs. +# +# EXTRA_DIST = $(top_srcdir)/.version +# BUILT_SOURCES = $(top_srcdir)/.version +# $(top_srcdir)/.version: +# echo $(VERSION) > $@-t && mv $@-t $@ +# dist-hook: +# echo $(VERSION) > $(distdir)/.tarball-version + + +me=$0 + +version="git-version-gen $scriptversion + +Copyright 2011 Free Software Foundation, Inc. +There is NO warranty. You may redistribute this software +under the terms of the GNU General Public License. +For more information about these matters, see the files named COPYING." + +usage="\ +Usage: $me [OPTION]... \$srcdir/.tarball-version [TAG-NORMALIZATION-SED-SCRIPT] +Print a version string. + +Options: + + --prefix PREFIX prefix of git tags (default 'v') + --fallback VERSION + fallback version to use if \"git --version\" fails + + --help display this help and exit + --version output version information and exit + +Running without arguments will suffice in most cases." + +prefix=v +fallback= + +while test $# -gt 0; do + case $1 in + --help) echo "$usage"; exit 0;; + --version) echo "$version"; exit 0;; + --prefix) shift; prefix=${1?};; + --fallback) shift; fallback=${1?};; + -*) + echo "$0: Unknown option '$1'." >&2 + echo "$0: Try '--help' for more information." >&2 + exit 1;; + *) + if test "x$tarball_version_file" = x; then + tarball_version_file="$1" + elif test "x$tag_sed_script" = x; then + tag_sed_script="$1" + else + echo "$0: extra non-option argument '$1'." 
>&2 + exit 1 + fi;; + esac + shift +done + +if test "x$tarball_version_file" = x; then + echo "$usage" + exit 1 +fi + +tag_sed_script="${tag_sed_script:-s/x/x/}" + +nl=' +' + +# Avoid meddling by environment variable of the same name. +v= +v_from_git= + +# First see if there is a tarball-only version file. +# then try "git describe", then default. +if test -f $tarball_version_file +then + v=`cat $tarball_version_file` || v= + case $v in + *$nl*) v= ;; # reject multi-line output + [0-9]*) ;; + *) v= ;; + esac + test "x$v" = x \ + && echo "$0: WARNING: $tarball_version_file is missing or damaged" 1>&2 +fi + +if test "x$v" != x +then + : # use $v +# Otherwise, if there is at least one git commit involving the working +# directory, and "git describe" output looks sensible, use that to +# derive a version string. +elif test "`git log -1 --pretty=format:x . 2>&1`" = x \ + && v=`git describe --abbrev=4 --match="$prefix*" HEAD 2>/dev/null \ + || git describe --abbrev=4 HEAD 2>/dev/null` \ + && v=`printf '%s\n' "$v" | sed "$tag_sed_script"` \ + && case $v in + $prefix[0-9]*) ;; + *) (exit 1) ;; + esac +then + # Is this a new git that lists number of commits since the last + # tag or the previous older version that did not? + # Newer: v6.10-77-g0f8faeb + # Older: v6.10-g0f8faeb + vprefix=`expr "X$v" : 'X\(.*\)-g[^-]*$'` || vprefix=$v + case $vprefix in + *-*) : git describe is probably okay three part flavor ;; + *) + : git describe is older two part flavor + # Recreate the number of commits and rewrite such that the + # result is the same as if we were using the newer version + # of git describe. + vtag=`echo "$v" | sed 's/-.*//'` + commit_list=`git rev-list "$vtag"..HEAD 2>/dev/null` \ + || { commit_list=failed; + echo "$0: WARNING: git rev-list failed" 1>&2; } + numcommits=`echo "$commit_list" | wc -l` + v=`echo "$v" | sed "s/\(.*\)-\(.*\)/\1-$numcommits-\2/"`; + test "$commit_list" = failed && v=UNKNOWN + ;; + esac + + # Change the penultimate "-" to ".", for version-comparing tools. + # Remove the "g" to save a byte. + v=`echo "$v" | sed 's/-\([^-]*\)-g\([^-]*\)$/.\1-\2/'`; + v_from_git=1 +elif test "x$fallback" = x || git --version >/dev/null 2>&1; then + v=UNKNOWN +else + v=$fallback +fi + +v=`echo "$v" |sed "s/^$prefix//"` + +# Test whether to append the "-dirty" suffix only if the version +# string we're using came from git. I.e., skip the test if it's "UNKNOWN" +# or if it came from .tarball-version. +if test "x$v_from_git" != x; then + # Don't declare a version "dirty" merely because a timestamp has changed. + git update-index --refresh > /dev/null 2>&1 + + dirty=`exec 2>/dev/null;git diff-index --name-only HEAD` || dirty= + case "$dirty" in + '') ;; + *) # Append the suffix only if there isn't one already. + case $v in + *-dirty) ;; + *) v="$v-dirty" ;; + esac ;; + esac +fi + +# Omit the trailing newline, so that m4_esyscmd can use the result directly. 
+printf %s "$v" + +# Local variables: +# eval: (add-hook 'before-save-hook 'time-stamp) +# time-stamp-start: "scriptversion=" +# time-stamp-format: "%:y-%02m-%02d.%02H" +# time-stamp-time-zone: "UTC0" +# time-stamp-end: "; # UTC" +# End: diff --git a/configure.ac b/configure.ac index 058fcc1..ed793c5 100644 --- a/configure.ac +++ b/configure.ac @@ -21,10 +21,10 @@ dnl https://github.com/cbg-ethz/shorah/releases dnl Initialise Autoconf AC_PREREQ([2.69]) AC_INIT( - [ShoRAH], - [1.1.1], + [shorah], + m4_esyscmd([build-aux/git-version-gen .tarball-version]), [osvaldo.zagordi@gmail.com]) -AC_CONFIG_SRCDIR([src/shorah/freqEst.cpp]) +AC_CONFIG_SRCDIR([src/cpp/dpm_sampler.cpp]) AC_CONFIG_MACRO_DIR([m4]) @@ -64,6 +64,7 @@ AC_PROG_SED AC_PROG_CC AC_PROG_CXX AC_PROG_RANLIB +AM_PROG_AR AC_LANG([C]) AX_COMPILER_VENDOR @@ -142,23 +143,14 @@ AM_SILENT_RULES([yes]) dnl ====================================== dnl Initialise Perl and Python directories dnl ====================================== -AM_PATH_PYTHON -AC_ARG_VAR([perldir], [path in which to install perl modules, '${libdir}/perl' by default]) -: ${perldir="${libdir}/perl"} +AM_PATH_PYTHON([3.5]) dnl ======== dnl Finalise dnl ======== -AC_CONFIG_FILES([ - Makefile - src/scripts/amplian.py - src/scripts/dec.py - src/scripts/mm.py - src/scripts/shorah.py - src/scripts/snv.py - ]) +AC_CONFIG_FILES([Makefile]) AC_OUTPUT @@ -169,12 +161,15 @@ dnl ============================ AC_MSG_RESULT([ ${PACKAGE_NAME} ${VERSION} - CC: ${CC} (${ax_cv_c_compiler_vendor}, ${ax_cv_c_compiler_version}) - CFLAGS: ${CFLAGS} + CC : ...................... ${CC} (${ax_cv_c_compiler_vendor}, ${ax_cv_c_compiler_version}) + CFLAGS : .................. ${CFLAGS} + + CXX : ..................... ${CXX} (${ax_cv_cxx_compiler_vendor}, ${ax_cv_cxx_compiler_version}) + CXXFLAGS : ................ ${CXXFLAGS} - CXX: ${CXX} (${ax_cv_cxx_compiler_vendor}, ${ax_cv_cxx_compiler_version}) - CXXFLAGS: ${CXXFLAGS} + CPPFLAGS : ................ ${CPPFLAGS} + LDFLAGS : ................. ${LDFLAGS} - CPPFLAGS: ${CPPFLAGS} - LDFLAGS: ${LDFLAGS} + PYTHON : .................. ${PYTHON} + Python version : .......... ${PYTHON_VERSION} ]) diff --git a/meson.build b/meson.build new file mode 100644 index 0000000..e2ae37c --- /dev/null +++ b/meson.build @@ -0,0 +1,18 @@ +project('shorah', + ['c', 'cpp'], + version : '1.9.95', + default_options : [ + 'buildtype=release', + 'default_library=shared', + 'warning_level=3', + 'b_ndebug=if-release'], + license : 'GPL-2+') + +# info +as_version = meson.project_version() +ver_arr = as_version.split('.') +as_major_version = ver_arr[0] +as_minor_version = ver_arr[1] +as_micro_version = ver_arr[2] + +subdir('src') diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..ae76819 --- /dev/null +++ b/setup.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python +import glob +import sys +import os +import shutil + +from setuptools import setup, find_packages +from setuptools.command.test import test as TestCommand +from setuptools.command.develop import develop +from setuptools.command.install import install + +def move_files(): + """Identify the build directory and copy python files and executables into src/shorah.""" + try: + main_file = glob.glob('*/__main__.py')[0] + except IndexError: + sys.exit('No __main__.py found. 
Build first.') + build_dir = os.path.dirname(main_file) + for py_file in glob.glob('%s/*py' % build_dir): + shutil.copy(py_file, 'src/shorah/') + for exe in ['b2w', 'diri_sampler', 'fil']: + ppp = shutil.copy('%s/src/cpp/%s' % (build_dir, exe), 'src/shorah/bin/') + print(ppp) + +class CustomDevelop(develop): + """Subclassing develop to install files in the correct location.""" + def run(self): + move_files() + develop.run(self) + + +class CustomInstall(install): + """Subclassing develop to install files in the correct location.""" + def run(self): + move_files() + install.run(self) + +class PyTest(TestCommand): + user_options = [('pytest-args=', 'a', "Arguments to pass to py.test")] + + def initialize_options(self): + TestCommand.initialize_options(self) + self.pytest_args = [] + + def finalize_options(self): + TestCommand.finalize_options(self) + self.test_args = [] + self.test_suite = True + + def run_tests(self): + #import here, cause outside the eggs aren't loaded + import pytest + errno = pytest.main(self.pytest_args) + sys.exit(errno) + + +setup( + use_scm_version=True, + setup_requires=['setuptools_scm', 'setuptools_scm_git_archive'], + install_requires=['setuptools_scm'], + tests_require=['pytest', 'flake8', 'pep257'], + cmdclass={ + 'test': PyTest, + 'install': CustomInstall, + 'develop': CustomDevelop + }, + name='ShoRAH', + description='SHOrt Reads Assembly into Haplotypes', + url='http://github.com/cbg-ethz/shorah', + # author='Osvaldo Zagordi', + # author_email='firstname.lastname@gmail.com', + packages=find_packages('src'), # include all packages under src + package_dir={'': 'src'}, # tell setuptools packages are under src + entry_points={ + 'console_scripts': ['shorah = shorah.cli:main'] + }, + license='GPL 3.0', + long_description=''' + ShoRAH is designed to analyse genetically heterogeneous samples. It provides error correction, + local haplotype reconstruction, and estimation of the frequency of the different genetic variants + present in a mixed sample. + ''', + project_urls={ + 'Documentation': 'http://cbg-ethz.github.io/shorah', + 'Source': 'https://github.com/cbg-ethz/shorah', + }, + classifiers=[ + # How mature is this project? Common values are + # 3 - Alpha + # 4 - Beta + # 5 - Production/Stable + 'Development Status :: 5 - Production/Stable', + # Indicate who your project is intended for + 'Intended Audience :: Developers', + 'Topic :: Scientific/Engineering :: Bio-Informatics', + 'Topic :: Scientific/Engineering :: Medical Science Apps.', + # Pick your license as you wish (should match "license" above) + 'License :: GPL 3.0', + # Specify the Python versions you support here. In particular, ensure + # that you indicate whether you support Python 2, Python 3 or both. 
+ # 'Programming Language :: Python :: 2', + # 'Programming Language :: Python :: 2.6', + # 'Programming Language :: Python :: 2.7', + # 'Programming Language :: Python :: 3', + # 'Programming Language :: Python :: 3.2', + # 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6'] +) diff --git a/src/shorah/b2w.cpp b/src/cpp/b2w.cpp similarity index 98% rename from src/shorah/b2w.cpp rename to src/cpp/b2w.cpp index 2733c3a..4a50de8 100644 --- a/src/shorah/b2w.cpp +++ b/src/cpp/b2w.cpp @@ -39,6 +39,8 @@ adapted from samtools/calDep.c #include "faidx.h" #include "sam.h" +#define UNUSED(expr) (void)(expr) + // data for fetch_func and pileup_func typedef struct { @@ -79,7 +81,8 @@ static int fetch_func1(const bam1_t* b, void* data) overlap = tmp->end - Rstart + 1; } if (overlap >= tmp->min_overlap) { - char rd[wSize + 1]; // store blank read segment of 'N's + //char rd[wSize + 1]; store blank read segment of 'N's + char *rd = new char[wSize + 1]; for (int i = 0; i < wSize; i++) { rd[i] = 'N'; } @@ -93,6 +96,7 @@ static int fetch_func1(const bam1_t* b, void* data) << '>' << bam1_qname(b) << ' ' << Rstart << '\n'; tmp->outFile // write read << rd << "\n"; + delete [] rd; } } return 0; @@ -102,6 +106,8 @@ static int fetch_func1(const bam1_t* b, void* data) static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t* pl, void* data) { tmpstruct_t* tmp = (tmpstruct_t*)data; + UNUSED(tid); + UNUSED(n); int i = pos - tmp->beg; // base position within window char* mm = (char*)tmp->rd + i; // location of base if (pos >= tmp->beg && pos <= tmp->end) { // make sure read position within window @@ -199,6 +205,7 @@ int main(int argc, char* argv[]) tmp.fai = fai_load(argv[optind + 1]); int iBuild = bam_index_build(argv[optind]); // generate bam index + UNUSED(iBuild); bam_index_t* idx; idx = bam_index_load(argv[optind]); // load bam index if (idx == 0) { diff --git a/src/shorah/data_structures.hpp b/src/cpp/data_structures.hpp similarity index 100% rename from src/shorah/data_structures.hpp rename to src/cpp/data_structures.hpp diff --git a/src/shorah/dpm_sampler.cpp b/src/cpp/dpm_sampler.cpp similarity index 99% rename from src/shorah/dpm_sampler.cpp rename to src/cpp/dpm_sampler.cpp index f6e72cc..71be74d 100644 --- a/src/shorah/dpm_sampler.cpp +++ b/src/cpp/dpm_sampler.cpp @@ -810,7 +810,7 @@ void build_assignment(std::ofstream& out_file) if (cn->next->size == 0) { #ifndef NDEBUG - printf("removing some node... p=%p\n", cn->next); + printf("removing some node... 
p=%p\n", (void *)cn->next); #endif remove_comp(&cn->next); } else { @@ -1300,7 +1300,7 @@ ssret* sample_class(unsigned int i, unsigned int step) temp = new int[J / 10 + 1]; #ifndef NDEBUG for (removed = 0; removed < q; removed++) - printf("c_ptr[%d]=%p\n", removed, c_ptr[removed]); + printf("c_ptr[%d]=%p\n", removed, (void *)c_ptr[removed]); removed = 0; printf("---------------------------------------------------\n"); printf("-----------> sampling class for %ith read <----------\n", i); @@ -1496,7 +1496,7 @@ ssret* sample_class(unsigned int i, unsigned int step) #ifndef NDEBUG for (j = 0; j <= st; j++) - printf("with P[%i] = %e to class %p\n", j, P[j], cl_ptr[j]); + printf("with P[%i] = %e to class %p\n", j, P[j], (void *)cl_ptr[j]); #endif g = gsl_ran_discrete_preproc(st + 1, P); @@ -1518,7 +1518,7 @@ ssret* sample_class(unsigned int i, unsigned int step) if (from_class == to_class) { #ifndef NDEBUG - printf("from %p to itself\n", from_class); + printf("from %p to itself\n", (void *)from_class); #endif // free(P); @@ -1532,7 +1532,7 @@ ssret* sample_class(unsigned int i, unsigned int step) else if (to_class != NULL) { #ifndef NDEBUG - printf("moving the read from %p to %p\n", from_class, to_class); + printf("moving the read from %p to %p\n", (void *)from_class, (void *)to_class); #endif c_ptr[i] = @@ -1554,7 +1554,7 @@ ssret* sample_class(unsigned int i, unsigned int step) else if (to_class == NULL) { #ifndef NDEBUG - printf("moving %i to a new class from %p\n", i, from_class); + printf("moving %i to a new class from %p\n", i, (void *)from_class); #endif remove_read(search_read(&from_class->rlist, i)); diff --git a/src/shorah/dpm_sampler.hpp b/src/cpp/dpm_sampler.hpp similarity index 100% rename from src/shorah/dpm_sampler.hpp rename to src/cpp/dpm_sampler.hpp diff --git a/src/shorah/fil.cpp b/src/cpp/fil.cpp similarity index 100% rename from src/shorah/fil.cpp rename to src/cpp/fil.cpp diff --git a/src/cpp/meson.build b/src/cpp/meson.build new file mode 100644 index 0000000..5295438 --- /dev/null +++ b/src/cpp/meson.build @@ -0,0 +1,44 @@ +message('Is popcnt available?') +code = ''' + #include + int main() { return _mm_popcnt_u64(0); }''' +cpp = meson.get_compiler('cpp') +success = cpp.links(code, args : '-mpopcnt', name : 'pocnt') +if success + message('popcnt intrinsics are available') + extra_cpp_args = ['-mpopcnt'] +else + message('popcnt is broken') + extra_cpp_args = [] +endif + +gsl_dep = dependency('gsl') +thread_dep = dependency('threads') + +dpm_exe = executable( + 'diri_sampler', files([ + 'data_structures.hpp', + 'dpm_sampler.hpp', + 'dpm_sampler.cpp']), + dependencies : [ gsl_dep, thread_dep ], + include_directories : bam_inc, + cpp_args : extra_cpp_args, + install : true) + +b2w_exe = executable( + 'b2w', files([ + 'b2w.cpp']), + dependencies : thread_dep, + include_directories : bam_inc, + cpp_args : extra_cpp_args, + link_with : bam_library, + install : true) + +fil_exe = executable( + 'fil', files([ + 'fil.cpp']), + dependencies : [ gsl_dep, thread_dep ], + include_directories : bam_inc, + cpp_args : extra_cpp_args, + link_with : bam_library, + install : true) diff --git a/src/meson.build b/src/meson.build new file mode 100644 index 0000000..b390df9 --- /dev/null +++ b/src/meson.build @@ -0,0 +1,2 @@ +subdir('samtools') +subdir('cpp') diff --git a/src/perl/DNA.pm b/src/perl/DNA.pm deleted file mode 100755 index 0ff403a..0000000 --- a/src/perl/DNA.pm +++ /dev/null @@ -1,761 +0,0 @@ -# DNA.pm - Utilities - -# Copyright 2007, 2008, 2009 -# Niko Beerenwinkel, -# Nicholas 
Eriksson, -# Moritz Gerstung, -# Lukas Geyrhofer, -# Osvaldo Zagordi, -# ETH Zurich - -# This file is part of ShoRAH. -# ShoRAH is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# ShoRAH is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with ShoRAH. If not, see . - -package DNA; - -use strict; -use NikoUtil; -use Util; -use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION); -use Exporter; - -$VERSION = 1.00; -@ISA = qw(Exporter); - -# autoexport: -@EXPORT = qw( - translate - makeCodons - reverseCodons - revComp - NW SW - generalizedSW - blastAlign - correctGaps - correctGaps2 - ambigString - ); -# export on request: -@EXPORT_OK = qw(); - -sub translate { - # input: - # dna string - # codon hash - # optional: if ambiguous, should we output just one aa or - # a whole block [CKEIF] ?? - - my $dna = shift; - my $codon = shift; - my $JUSTONEAA = 0; - if (@_ > 0) { - $JUSTONEAA = shift; - } - my $pro = ''; - my $cod; - my $choices = 1; - my $ambig = 0; - - for(my $i=0; $i < length($dna); $i+=3) { - $cod = substr($dna,$i,3); - if (length($cod) == 3) { - if (defined $codon->{$cod}) { - $pro .= $codon->{$cod}; - } else { - my $l = resolveCodon($cod); - my %poss; - foreach (@$l) { - if (defined ($codon->{$_})) { - $poss{$codon->{$_}} = 1; - } else { - $poss{' '} = 1; - } - } - my $tmp = join('', sort(keys(%poss))); - if (keys(%poss) > 1) { - if ($JUSTONEAA) { - $pro .= substr($tmp,0,1); - } else { - $pro .= '[' . $tmp . ']'; - } - $choices *= keys(%poss); - $ambig += 1; - } else { - $pro .= $tmp; - } -#print "#error : don't know codon ", substr($dna,$i,3),"\n"; - } - } - } - #print "$choices choices for protein\n"; - #print "$ambig ambiguous codons\n"; - #print "$pro\n"; - return $pro; -} - -sub resolveCodon { - #take codon with ambiguity codes and output list of all codons that could - #give it - my $c = shift; - my @l = split '', $c; - my @ans = ('', '', ''); - my @codons; - foreach my $p (0..2) { - if ($l[$p] eq 'A') { - $ans[$p] .= 'A'; - } elsif ($l[$p] eq 'C') { - $ans[$p] .= 'C'; - } elsif ($l[$p] eq 'G') { - $ans[$p] .= 'G'; - } elsif ($l[$p] eq 'T') { - $ans[$p] .= 'T'; - } elsif ($l[$p] eq 'R') { - $ans[$p] .= 'AG'; - } elsif ($l[$p] eq 'Y') { - $ans[$p] .= 'CT'; - } elsif ($l[$p] eq 'K') { - $ans[$p] .= 'GT'; - } elsif ($l[$p] eq 'M') { - $ans[$p] .= 'AC'; - } elsif ($l[$p] eq 'S') { - $ans[$p] .= 'CG'; - } elsif ($l[$p] eq 'W') { - $ans[$p] .= 'AT'; - } elsif ($l[$p] eq 'D') { - $ans[$p] .= 'AGT'; - } elsif ($l[$p] eq 'H') { - $ans[$p] .= 'ACT'; - } elsif ($l[$p] eq 'B') { - $ans[$p] .= 'CGT'; - } elsif ($l[$p] eq 'V') { - $ans[$p] .= 'ACG'; - } elsif ($l[$p] eq 'X') { - $ans[$p] .= 'ACGT'; - } elsif ($l[$p] eq 'N') { - $ans[$p] .= 'ACGT'; - } - } - #print "$c -> @ans\n"; - foreach my $i (0 .. length($ans[0])-1) { - foreach my $j (0 .. length($ans[1])-1) { - foreach my $k (0 .. length($ans[2])-1) { - push(@codons, substr($ans[0],$i,1) . substr($ans[1],$j,1) . 
substr($ans[2],$k,1)); - } - } - } - #print "Codon $c -> @codons\n"; - - return \@codons; -} - -sub makeCodons { - my %codon; - $codon{GCT} = $codon{GCC} = $codon{GCA} = $codon{GCG} = 'A'; - $codon{TTA} = $codon{TTG} = $codon{CTT} = $codon{CTC} = $codon{CTA} = $codon{CTG} = 'L'; - $codon{CGT} = $codon{CGC} = $codon{CGA} = $codon{CGG} = $codon{AGA} = $codon{AGG} = 'R'; - $codon{AAA} = $codon{AAG} = 'K'; - $codon{AAT} = $codon{AAC} = 'N'; - $codon{ATG} = 'M'; - $codon{GAT} = $codon{GAC} = 'D'; - $codon{TTT} = $codon{TTC} = 'F'; - $codon{TGT} = $codon{TGC} = 'C'; - $codon{CCT} = $codon{CCC} = $codon{CCA} = $codon{CCG} = 'P'; - $codon{CAA} = $codon{CAG} = 'Q'; - $codon{TCT} = $codon{TCC} = $codon{TCA} = $codon{TCG} = $codon{AGT} = $codon{AGC} = 'S'; - $codon{GAA} = $codon{GAG} = 'E'; - $codon{ACT} = $codon{ACC} = $codon{ACA} = $codon{ACG} = 'T'; - $codon{GGT} = $codon{GGC} = $codon{GGA} = $codon{GGG} = 'G'; - $codon{TGG} = 'W'; - $codon{CAT} = $codon{CAC} = 'H'; - $codon{TAT} = $codon{TAC} = 'Y'; - $codon{ATT} = $codon{ATC} = $codon{ATA} = 'I'; - $codon{GTT} = $codon{GTC} = $codon{GTA} = $codon{GTG} = 'V'; - #$codon{ATG} = '+'; - $codon{TAG} = $codon{TGA} = $codon{TAA} = '*'; - return \%codon; -} -sub reverseCodons { - my $codon = shift; - my %a2c; - #$a2c{A} = [GCT, GCC, GCA, GCG] - foreach my $cod (keys(%$codon)) { - push(@{$a2c{$$codon{$cod}}},$cod); - } - return \%a2c; -} - -sub revComp { - my $seq = $_[0]; - my $seqRC = ""; - my $I; - - for($I = length($seq)-1; $I >= 0; $I--) { - my $base = substr($seq, $I, 1); - if($base eq "A") { - $seqRC .= "T"; - } elsif($base eq "C") { - $seqRC .= "G"; - } elsif($base eq "G") { - $seqRC .= "C"; - } elsif($base eq "T") { - $seqRC .= "A"; - } else { - $seqRC .= $base; - } - } - return($seqRC); -} - - -sub trans { - my $x = shift; - if ($x eq 'A') { - return 0; - } elsif ($x eq 'C') { - return 1; - } elsif ($x eq 'G') { - return 2; - } else { - return 3; - } -} - -sub NW { - # be careful if using... .fixes went into SW - my $a = shift; - my $b = shift; - my $d; - my $S; - if (@_ > 0) { - $d = shift; #linear gap - $S = shift; #[][] = costs - } else { - $d = -400; - $S = [[91,-114,-31,-123],[-114,100,-125,-31],[-31,-125,100,-114],[-123,-31,-114,91]]; - } - - my @A = split('', $a); - my @B = split('', $b); - my ($i, $j, $choice1, $choice2, $choice3); - my $AlignmentA; - my $AlignmentB; - my @F; - - foreach $i (0..@A) { - $F[$i][0] = $d * $i; - } - foreach $j (0..@B) { - $F[0][$j] = $d * $j; - } - - foreach $i (1..@A) { - foreach $j (1..@B) { - $choice1 = $F[$i-1][$j-1] + $S->[trans($A[$i-1])][trans($B[$j-1])]; - $choice2 = $F[$i-1][$j] + $d; - $choice3 = $F[$i][$j-1] + $d; - $F[$i][$j] = max($choice1,$choice2,$choice3); - } - } - - #foreach $i (0..@A) { - # foreach $j (0..@B) { - # print "$F[$i][$j] "; - # } - # print "\n"; - #} - - $AlignmentA = ''; - $AlignmentB = ''; - $i = @A; - $j = @B; - - while ($i > 0 and $j > 0) { - my $Score = $F[$i][$j]; - my $ScoreDiag = $F[$i - 1][$j - 1]; - my $ScoreUp = $F[$i][$j - 1]; - my $ScoreLeft = $F[$i - 1][$j]; - my $Ai = trans($A[$i-1]); - my $Bj = trans($B[$j-1]); - - if ($Score == $ScoreDiag + $S->[$Ai][$Bj]) { - $AlignmentA = $A[$i-1] . $AlignmentA; - $AlignmentB = $B[$j-1] . $AlignmentB; - $i--; - $j--; - } elsif ($Score == $ScoreLeft + $d) { - $AlignmentA = $A[$i-1] . $AlignmentA; - $AlignmentB = '-' . $AlignmentB; - $i--; - } else { # (Score == ScoreUp + d) - $AlignmentA = '-' . $AlignmentA; - $AlignmentB = $B[$j-1] . $AlignmentB; - $j--; - } - } - while ($i > 0) { - $AlignmentA = $A[$i-1] . 
$AlignmentA; - $AlignmentB = '-' . $AlignmentB; - $i--; - } - while ($j > 0) { - $AlignmentA = '-' . $AlignmentA; - $AlignmentB = $B[$j-1] . $AlignmentB; - $j--; - } - return ($AlignmentA,$AlignmentB); -} - - -sub SW { - - #Smith Waterman local alignment - # assume that first sequence is the short one - my $a = shift; - my $b = shift; - my $d; - my $S; - if (@_ > 0) { - $d = shift; #linear gap - $S = shift; #[][] = costs - } else { - #$d = -100; - #$S = [[91,-114,-31,-123],[-114,100,-125,-31],[-31,-125,100,-114],[-123,-31,-114,91]]; - $d = -4; - $S = [[4,-4,-4,-4],[-4,4,-4,-4],[-4,-4,4,-4],[-4,-4,-4,4]]; - } - my $DEBUG = 0; - if (@_ > 0) { - $DEBUG = shift; - } - - - my @A = split('', $a); - my @B = split('', $b); - my @tA = map{trans($_)} @A; - my @tB = map{trans($_)} @B; - #my @tA = map {trans($_)} split ('', $a); - #my @tB = map {trans($_)} split ('', $b); - my ($i, $j); - my $AlignmentA; - my $AlignmentB; - my @F; - my ($largestF, $largesti,$largestj); - $largestF = 1; - - foreach $i (0..@A) { - $F[$i][0] = 0; - } - foreach $j (0..@B) { - $F[0][$j] = 0; - } - - - my ($c1, $c2, $c3, $m); - - foreach $i (1..@A) { - foreach $j (1..@B) { - $c1 = $F[$i-1][$j-1] + $S->[$tA[$i-1]][$tB[$j-1]]; - $c2 = $F[$i-1][$j] + $d; - $c3 = $F[$i][$j-1] + $d; - $m = $c1 > $c2 ? ($c1 > $c3 ? $c1 : $c3) : ($c2 > $c3 ? $c2 : $c3); - $F[$i][$j] = $m > 0 ? $m : 0; - #$F[$i][$j] = max($choice1,$choice2,$choice3,0); - - #$F[$i][$j] = max($F[$i-1][$j-1] + $S->[$tA[$i-1]][$tB[$j-1]], - # $F[$i-1][$j] + $d, $F[$i][$j-1] + $d, 0); - - if ($F[$i][$j] >= $largestF) { - #print "max at $i $j -- $F[$i][$j]\n"; - $largestF = $F[$i][$j]; - $largesti = $i; - $largestj = $j; - } - } - } - if ($DEBUG) { - foreach my $row (@F) { - print join("\t", @$row), "\n"; - } - } - - - #now trace back from the largest value in the matrix and build up the alignment - - $AlignmentA = ''; - $AlignmentB = ''; - $i = $largesti; - $j = $largestj; - #print "Largest F at $i, $j\n"; - # - # my $gapend = (length($a) - $largesti); - # if ($gapend > 0) { - # print "adding $gapend gaps at end\n"; - # $AlignmentA = 'x' x $gapend; - # #foreach my $x ($j .. ($j + $gapend - 1)) { - # # print "$x $B[$x]\n"; - # #} - # #$AlignmentB = lc(join('',@B[$j .. ($j + $gapend-1)])); - # $AlignmentB = 'x' x $gapend; - # } - # - while ($F[$i][$j] > 0) { - if ($DEBUG) { - print "$F[$i-1][$j-1] $F[$i-1][$j]\n$F[$i][$j-1] $F[$i][$j]\n"; - } - if ($F[$i][$j] == $F[$i-1][$j-1] + $S->[$tA[$i-1]][$tB[$j-1]]) { - $AlignmentA = $A[$i-1] . $AlignmentA; - $AlignmentB = $B[$j-1] . $AlignmentB; - $i--; $j--; - } elsif ($F[$i][$j] == $F[$i-1][$j] + $d) { - $AlignmentA = $A[$i-1] . $AlignmentA; - $AlignmentB = '-' . $AlignmentB; - $i--; - } else { - $AlignmentA = '-' . $AlignmentA; - $AlignmentB = $B[$j-1] . $AlignmentB; - $j--; - } - if ($DEBUG) { - print "$AlignmentA\n$AlignmentB\n"; - print '-' x 20, "\n"; - } - } - - if (0) { - - while ($F[$i][$j] > 0) { - if ($F[$i][$j] == $F[$i-1][$j] + $d) { - $AlignmentA = $A[$i-1] . $AlignmentA; - $AlignmentB = '-' . $AlignmentB; - $i--; - } elsif ($F[$i][$j] == $F[$i][$j-1] + $d) { - $AlignmentA = '-' . $AlignmentA; - $AlignmentB = $B[$j-1] . $AlignmentB; - $j--; - } elsif ($F[$i][$j] == $F[$i-1][$j-1] + $S->[$tA[$i-1]][$tB[$j-1]]) { - $AlignmentA = $A[$i-1] . $AlignmentA; - $AlignmentB = $B[$j-1] . $AlignmentB; - $i--; $j--; - } - print "$AlignmentA\n$AlignmentB\n"; - print '-' x 20, "\n"; - } -} - #while ($i > 0) { - # $AlignmentA = '-' . $AlignmentA; - # $AlignmentB = $B[$i] . $AlignmentB; - # #$AlignmentB = 'x' . 
$AlignmentB; - # $i--; - #} - - return ($AlignmentA,$AlignmentB, $j); -} - -sub generalizedSW { - - #Smith Waterman local alignment - #align a sequence to a quasi-sequence: - # - # so $a = 'AACCTT' - # and $b = [ {A => .5, C => .5}, ... ] - # assume that first sequence is the short one - # b is a list of lists, actually, in order ACGT - # b = [ [.5, .5, 0, 0], ... ] - my $a = shift; - my $b = shift; - # ltrIdx is hash A => 0, ... T => 3 - # alphabet is list [A C G T] - my $LtrIdx = shift; - my $alphabet = shift; - my $d; - my $S; - if (@_ > 0) { - $d = shift; #linear gap - $S = shift; #[][] = costs - } else { - #$d = -100; - #$S = [[91,-114,-31,-123],[-114,100,-125,-31],[-31,-125,100,-114],[-123,-31,-114,91]]; - $d = -4; - #$S = [[4,-8,-8,-8],[-8,4,-8,-8],[-8,-8,4,-8],[-8,-8,-8,4]]; - $S = [[4,-4,-4,-4],[-4,4,-4,-4],[-4,-4,4,-4],[-4,-4,-4,4]]; - } - my $DEBUG = 0; - if (@_ > 0) { - $DEBUG = shift; - } - - - if ($DEBUG) { - print "Aligning\n$a\n"; - print "[@$_], " foreach (@$b); - print "\n"; - } - my @A = split('', $a); - my @tA = map{$LtrIdx->{$_}} @A; - - my $numB = @$b; - #my @B = split('', $b); - #my @tB = map{trans($_)} @B; - - my @B = map {$alphabet->[maxind($_)]} @$b; - - my ($i, $j); - my $AlignmentA; - my $AlignmentB; - my @F; - my ($largestF, $largesti,$largestj); - $largestF = 1; - - foreach $i (0..@A) { - $F[$i][0] = 0; - } - foreach $j (0..$numB) { - $F[0][$j] = 0; - } - - - my ($c1, $c2, $c3, $m); - - foreach $i (1..@A) { - foreach $j (1..$numB) { - #$c1 = $F[$i-1][$j-1] + $S->[$tA[$i-1]][$tB[$j-1]]; - #$c1 = F[i-1][j-1] + sum_alphabet pr(b_j-1 = K)*S->[a_i-1][K] - $c1 = $F[$i-1][$j-1] + score($tA[$i-1], $b->[$j-1], $S); - - $c2 = $F[$i-1][$j] + $d; - $c3 = $F[$i][$j-1] + $d; - #print "$i $j = max($c1 $c2 $c3)\n"; - $m = $c1 > $c2 ? ($c1 > $c3 ? $c1 : $c3) : ($c2 > $c3 ? $c2 : $c3); - $F[$i][$j] = $m > 0 ? $m : 0; - #$F[$i][$j] = max($choice1,$choice2,$choice3,0); - - #$F[$i][$j] = max($F[$i-1][$j-1] + $S->[$tA[$i-1]][$tB[$j-1]], - # $F[$i-1][$j] + $d, $F[$i][$j-1] + $d, 0); - - if ($F[$i][$j] >= $largestF) { - #print "max at $i $j -- $F[$i][$j]\n"; - $largestF = $F[$i][$j]; - $largesti = $i; - $largestj = $j; - } - } - } - if ($DEBUG) { - foreach my $row (@F) { - print join("\t", @$row), "\n"; - } - } - - - #now trace back from the largest value in the matrix and build up the alignment - - $AlignmentA = ''; - $AlignmentB = ''; - $i = $largesti; - $j = $largestj; - $i = @A; - $j = $numB; - #print "Largest F at $i, $j\n"; - # - # my $gapend = (length($a) - $largesti); - # if ($gapend > 0) { - # print "adding $gapend gaps at end\n"; - # $AlignmentA = 'x' x $gapend; - # #foreach my $x ($j .. ($j + $gapend - 1)) { - # # print "$x $B[$x]\n"; - # #} - # #$AlignmentB = lc(join('',@B[$j .. ($j + $gapend-1)])); - # $AlignmentB = 'x' x $gapend; - # } - # - while ($F[$i][$j] > 0) { - if ($DEBUG) { - print "$F[$i-1][$j-1] $F[$i-1][$j]\n$F[$i][$j-1] $F[$i][$j]\n"; - } - if ($F[$i][$j] == $F[$i-1][$j-1] + score($tA[$i-1], $b->[$j-1], $S)) { - $AlignmentA = $A[$i-1] . $AlignmentA; - $AlignmentB = $B[$j-1] . $AlignmentB; - $i--; $j--; - } elsif ($F[$i][$j] == $F[$i-1][$j] + $d) { - $AlignmentA = $A[$i-1] . $AlignmentA; - $AlignmentB = '-' . $AlignmentB; - $i--; - } else { - $AlignmentA = '-' . $AlignmentA; - $AlignmentB = $B[$j-1] . 
$AlignmentB; - $j--; - } - if ($DEBUG) { - print "$AlignmentA\n$AlignmentB\n"; - print '-' x 20, "\n"; - } - } - - return ($AlignmentA,$AlignmentB, $j); -} - -sub blastAlign { -#s2 should be a subsequence of s1 - my $s1 = shift; - my $s2 = shift; - system("echo $s1 > /tmp/seq1"); - system("echo $s2 > /tmp/seq2"); - system("bl2seq -i /tmp/seq1 -j /tmp/seq2 -p blastn |grep -A 2 Query:"); -} - -sub score { - my $a = shift; - my $b = shift; - my $S = shift; - my $ans = 0; - foreach my $baseInd (0 .. @$b-1) { - $ans += $S->[$a][$baseInd] * $b->[$baseInd]; - } - #print "Score $a, [@$b] = $ans\n"; - return $ans; -} - -sub correctGaps { - my $reads = shift; - - my @AAlist; - foreach my $rd (@$reads) { - $rd->{seq} =~ s/N/-/g; - foreach my $pos ($$rd{st} .. $$rd{end}-1) { - ${$AAlist[$pos]}{substr($$rd{seq},$pos - $$rd{st}, 1)}++; - } - } - my @corInd; - my @cons; - - foreach my $i (0..@AAlist-1) { - #my @chars = keys(%{$AAlist[$i]}); - if ((defined ${$AAlist[$i]}{'-'}) and (keys(%{$AAlist[$i]}) == 2)) { - #foreach (sort{$AAlist[$i]{$b} <=> $AAlist[$i]{$a}}keys(%{$AAlist[$i]})) { - delete ${$AAlist[$i]}{'-'}; - push(@corInd, $i); - #my @c = keys(%{$AAlist[$i]}); - my @c = sort {$AAlist[$i]{$b} <=> $AAlist[$i]{$a}} keys(%{$AAlist[$i]}); - push(@cons, $c[0]); - } - } - - foreach my $r (@$reads) { - foreach my $i (0 .. @corInd-1) { - #if r->st <= corInd[i] <= r->end - #and r->seq . substr(i-start) == '-' - #correct to cons - if (($r->{st} <= $corInd[$i]) and ($r->{end} >= $corInd[$i])) { - if (substr($r->{seq},$corInd[$i] - $r->{st},1) eq '-') { - substr($r->{seq},$corInd[$i] - $r->{st},1) = $cons[$i]; - } - } - } - } -} - - - -sub correctGaps2 { - #take a set of reads (given as two lists, @head and @seq) - #and correct all gaps for which there is only one choice - my $head = shift; - my $seq = shift; - my $starts; - my $ends; - #fix starts and ends - my $n = @$head; - my $num = 0; - foreach my $i (0 .. $n-1) { - my ($left, $right) = split /\$/, $head->[$i]; - my ($id, $start, undef) = split / /, trim($left); - my $len = length($seq->[$i]); - push(@$starts, $start); - push(@$ends, $start + $len); - } - - my @AAlist; - foreach my $rd (0 .. $n-1) { - $seq->[$rd] =~ s/N/-/g; - #if ($seq->[$rd] =~ / /) { - # print "SPACE: read $rd == .$seq->[$rd].\n"; - #} - #print "read $rd starts at $starts->[$rd] ends $ends->[$rd]\n"; - #print "length ", length($seq->[$rd]), "\n"; - foreach my $pos ($starts->[$rd] .. $ends->[$rd] - 1) { - ${$AAlist[$pos]}{substr($seq->[$rd],$pos - $starts->[$rd], 1)}++; - } - } - my @corInd; - my @cons; - - foreach my $i (0..@AAlist-1) { - #my @chars = keys(%{$AAlist[$i]}); - #print "pos $i, keys = [", join(',', keys(%{$AAlist[$i]})), "]\n"; - if ((defined ${$AAlist[$i]}{'-'}) and (keys(%{$AAlist[$i]}) == 2)) { - #foreach (sort{$AAlist[$i]{$b} <=> $AAlist[$i]{$a}}keys(%{$AAlist[$i]})) - delete ${$AAlist[$i]}{'-'}; - push(@corInd, $i); - my @c = sort {$AAlist[$i]{$b} <=> $AAlist[$i]{$a}} keys(%{$AAlist[$i]}); - push(@cons, $c[0]); - } - } - #print "correcting at @corInd\n"; - - foreach my $r (0 .. $n-1) { - foreach my $i (0 .. @corInd-1) { - #if r->st <= corInd[i] <= r->end - #and r->seq . 
substr(i-start) == '-' - #correct to cons - if (($starts->[$r] <= $corInd[$i]) and ($ends->[$r] >= $corInd[$i])) { - if (substr($seq->[$r],$corInd[$i] - $starts->[$r],1) eq '-') { - substr($seq->[$r],$corInd[$i] - $starts->[$r],1) = $cons[$i]; - #print "Correct at $corInd[$i]\n"; - $num++; - } - } - } - } - return $num; -} - -sub ambigString { - #input: $a = asdf[123] - #output @b = ({a=>1} {s=>1} {d=>1} {f=>1} {1=>1, 2=>1, 3=>1}) - # - my $a = shift; - my @b; - - my $i = 0; - - while ($i < length($a)) { - my $chunk = ''; - if (substr($a,$i,1) ne '[') { - $chunk = substr($a,$i,1); - } else { - # find end of [...] block - while (substr($a,$i,1) ne ']') { - $chunk .= substr($a,$i,1); - $i++; - } - } - $i++; - my %tmp = map {$_ => 1.0/length($chunk)} split '', $chunk; - push(@b, \%tmp); - } - return \@b; -} - - -1; diff --git a/src/perl/NikoUtil.pm b/src/perl/NikoUtil.pm deleted file mode 100755 index 665d2f3..0000000 --- a/src/perl/NikoUtil.pm +++ /dev/null @@ -1,473 +0,0 @@ -# -# NikoUtil.pm - Utilities -# Copyright 2007, 2008, 2009 -# Niko Beerenwinkel, -# Nicholas Eriksson, -# Moritz Gerstung, -# Lukas Geyrhofer, -# Osvaldo Zagordi, -# ETH Zurich - -# This file is part of ShoRAH. -# ShoRAH is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# ShoRAH is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with ShoRAH. If not, see . - -package NikoUtil; - -use strict; -use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION); -use Exporter; - -$VERSION = 1.00; -@ISA = qw(Exporter); - -# autoexport: -@EXPORT = qw( - trim - is_numeric - round - intrand discr_rand - concat_list - read_fasta read_fastas print_fasta - print_matrix transpose exp_matrix - where_clause_list - hamming - hamming_distance alphabet - makeDistMatrix - random - Nmin Nmax -); - -# export on request: -@EXPORT_OK = qw(); - -sub concat_list { # concatenate lists - - my @lists = @_; # list of refs. to lists to be concatenated - -# concatenate all lists - my @all; - for (@lists) { - push @all, @{$_}; - } - -# count occurences in concatenation - my (@union, %count); - for (@all) { - $count{"@{$_}"}++; - push @union, $_ unless ($count{"@{$_}"} > 1); - } - - return \@union; # multiple occurences appear only once -} - - - -sub is_numeric { # Cookbook p. 44 - my $val = shift || ''; # return FALSE if $val is FALSE - - return ($val =~ /^-?(?:\d+(?:\.\d*)?|\.\d+)$/); -} - - - -sub round { # round to nearest integer - - my @input = @_; - - for (@input) { - $_ = int($_ + 0.5); - } - - return wantarray ? @input : $input[0]; -} - - - -sub intrand { - -# random integer number between and including $a and $b - - my $a = shift; - my $b = shift; - - return int($a) + int(rand($b-$a+1)); -} - - - -sub discr_rand { - -# random integers in given proportion - - my @proportion = @_; - my $n = scalar(@proportion); - - my @cum; - my $i; - - my $sum = 0.0; - for $i (0 .. $n - 1) { - $sum += $proportion[$i]; - } - $cum[0] = ($proportion[0] / $sum); - for $i (1 .. 
$n - 1) { - $cum[$i] = $cum[$i-1] + ($proportion[$i] / $sum); - } - - my $r = rand(); -#print "@cum ---> $r\n"; - - $i = 0; - while (($i < $n-1) && ($cum[$i] < $r)) { - $i++; - } - - return $i; -} - - - -sub trim { # remove white spaces to the left and right - - my @input = @_; - - for (@input) { - s/^\s+//; - s/\s+$//; - } - - return wantarray ? @input : $input[0]; -} - - - -sub read_fasta { - - my $filename = shift; - - my $sep = chr(13); # CR - - open FASTA, $filename - or die "Can't open fasta file $filename!\n"; - my @lines = ; - close FASTA; - - for my $i (0 .. $#lines) { - splice (@lines, $i, 1, (split /$sep/, $lines[$i])); - } - - return (substr($lines[0], 1), join('', @lines[1..$#lines])); -} - - - -sub read_fastas { - - my $filename = shift; - - my (@header, @seq); - my $i = -1; - - open FASTA, $filename - or die "Can't open fasta file -- $filename!\n"; - - while (my $line = ) { - next if ($line =~ /^#/); - if (substr($line, 0, 1) eq ">") { # header - $i++; - $header[$i] = trim(substr($line, 1)); - } - else { # sequence - die "Error - fasta file without header lines @ $line\n" if ($i == -1); - $seq[$i] .= trim($line); - } - } - - close FASTA; - - return (\@header, \@seq); - -} - - - -sub print_fasta { - - my $head = shift; - my $seq = shift; - - my $width = 60; - - my @seq = split //, $seq; - - my $fasta = ">$head\n"; - while (scalar(@seq) > 0) { - $fasta .= join('', splice(@seq, 0, $width)) . "\n"; - } - -#return $fasta . "\n"; - return $fasta; -} - - - -sub print_matrix { - -# print matrix (tab- and newline-separated) - - my @mat = @{shift @_}; # data im $mat[][] - my $row_label = shift; # row labels == first column to print - my $col_label = shift; # column labels == first row to print - my $mat_label = shift || "\\"; # upper leftmost cell - - print $mat_label, "\t" if ($row_label); - if ($col_label) { - print join("\t", @$col_label), "\n"; - } - - for my $i (0 .. $#mat) { - print $row_label->[$i], "\t" if ($row_label); - for my $j (0 .. $#{$mat[$i]}) { - if (defined($mat[$i][$j])) { - print $mat[$i][$j], "\t"; - } - else { - print "undef\t"; - } - } - print "\n"; - } -} - - - -sub transpose { - -# transpose matrix - - my @M = @{shift @_}; - - my @Mt; - for my $i (0 .. $#M) { - for my $j (0 .. $#{$M[0]}) { - $Mt[$j][$i] = $M[$i][$j]; - } - } - - return \@Mt; -} - - - -sub exp_matrix { - - my $mat = shift; - my $base = shift || 10; - - my $emat; - - for my $i (0 .. scalar(@$mat)-1) { - for my $j (0 .. scalar(@{$mat->[0]})-1) { - $emat->[$i][$j] = $base ** $mat->[$i][$j]; - } - } - - return $emat; -} - - - -sub where_clause_list { - - my $field = shift; - my @values = @_; - - my @cond = (); - - for (@values) { - if (is_numeric($_)) { - push @cond, sprintf "%s=%d", $field, $_; - } - else { - push @cond, sprintf "%s='%s'", $field, $_; - } - } - - return join(' or ', @cond); -} - - - - -sub Nmax { - -# Maximum of a vector containing undef's - - my @input = @_; - - my ($max, $argmax); - - for my $i (0 .. $#input) { - if (defined($input[$i])) { - if (! defined($argmax)) { - $max = $input[$i]; - $argmax = $i; - } - elsif ($input[$i] > $max) { # same action, but avoids warning - $max = $input[$i]; - $argmax = $i; - } - } - } - - return wantarray ? ($max, $argmax) : $max; -} - - -sub Nmin { - -# Minimum of a vector containing undef's - - my @input = @_; - - my ($min, $argmin); - - for my $i (0 .. $#input) { - if (defined($input[$i])) { - if (! 
defined($argmin)) { - $min = $input[$i]; - $argmin = $i; - } - elsif ($input[$i] < $min) { # same action, but avoids warning - $min = $input[$i]; - $argmin = $i; - } - } - } - return wantarray ? ($min, $argmin) : $min; -} - -sub hamming_distance { - - my $X = uc(shift); - my $Y = uc(shift); - - my $min; - if (length($X) > length($Y)) { - $min = length($Y); - } else { - $min = length($X); - } - - my $comparisons = 0; - my $dist = 0; - my ($x, $y); - for my $pos (0 .. $min - 1) { - $x = substr($X,$pos,1); - $y = substr($Y,$pos,1); - if (($x ne '.') && ($y ne '.')) { - $comparisons++; - if ($x ne $y) { - $dist++; - } - } - } - - #die "Incomparable strings:\n$X\n$Y\n" if ($comparisons == 0); - if ($comparisons == 0) { - #print "Error: incomparable strings:\n$X\n$Y\n"; - return 100; - } - - return ($dist / $comparisons); -} - -sub makeDistMatrix { - my $seqs = shift; - my @ans; - my $N = @$seqs; - foreach my $i (0..$N-1) { - foreach my $j (0..$N-1) { - $ans[$i][$j] = hamming($$seqs[$i],$$seqs[$j]); - } - #print "$i\n" if ($i % 10 == 0); - } - #print "distance matrix:\n"; - #print_matrix(\@ans); - #print "\n\n"; - return \@ans; -} - -sub hamming { - #hamming distance, absolute, not caring about '.' - my $s = shift; - my $t = shift; - my $diff = 0; - # compute numer of differences between $s and $t - #die "ERROR in hamming $s\n$t\n" if (length($s) != length($t)); - my $len = Nmin(length($s), length($t)); - foreach my $i (0.. $len-1) { - if (!(substr($s,$i,1) eq substr($t,$i,1))) { - $diff++; - } - } - return $diff; -} - - - -sub alphabet { - - my $type = uc(shift); - - my (@Alphabet, @Special); - - if ($type eq 'AA') { - @Alphabet = qw(A C D E F G H I K L M N P Q R S T V W Y); - @Special = ("-", ".", "*"); - } - elsif ($type eq 'NT') { - @Alphabet = qw(A C G T); - @Special = ("-", ".", "N"); - } - elsif ($type eq '01') { - @Alphabet = qw(0 1); - @Special = ("-", "."); - } - else { - die "Unknown alphabet -- $type!\n"; - } - - # indices: - my %LtrIdx; - for my $k (0 .. $#Special) { - $LtrIdx{$Special[$k]} = -$k - 1; - } - for my $k (0 .. $#Alphabet) { - $LtrIdx{$Alphabet[$k]} = $k; - } - - return (\@Alphabet, \@Special, \%LtrIdx); -} - -sub random { # generate an integer random number - - my $x = shift; # between $x - my $y = shift; # and $y (including both) - - return int(rand($y - $x + 1)) + $x; -} - - -1; diff --git a/src/perl/ReadUtil.pm b/src/perl/ReadUtil.pm deleted file mode 100755 index 644db85..0000000 --- a/src/perl/ReadUtil.pm +++ /dev/null @@ -1,219 +0,0 @@ -# utilities for reading files of reads -# -# Copyright 2007, 2008, 2009 -# Niko Beerenwinkel, -# Nicholas Eriksson, -# Moritz Gerstung, -# Lukas Geyrhofer, -# Osvaldo Zagordi, -# ETH Zurich - -# This file is part of ShoRAH. -# ShoRAH is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# ShoRAH is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with ShoRAH. If not, see . 
- -package ReadUtil; - -use strict; -use NikoUtil; -use DNA; -use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION); -use Exporter; - -$VERSION = 1.00; -@ISA = qw(Exporter); - -# autoexport: -@EXPORT = qw(read_ReadSim -readReadFile readFastaFile -compareToCons compareReadToCons -); -# export on request: -@EXPORT_OK = qw(); - - - -sub read_ReadSim { - my $ReadSimOut = shift; - my @readList; - - my ($h, $s) = read_fastas($ReadSimOut); - foreach my $i (0 .. @$s-1) { - # process $h->[$i] = read_0001|beg|2|length|92|forward|HIV - # and $s->[$i] = TCAAATCACTCTTTGGCAACGACCCCTCGTCGACAATAA - - #split on '|' doesn't work well, so first substitute spaces - $h->[$i] =~ s/\|/ /g; - my @head = split ' ', $h->[$i]; - - die "missing header $h->[$i]\n" if (@head <= 1); - my $seqID = $head[0]; - my $startpos = $head[2]; - my $dir = $head[5]; - my $source = $head[6]; - - my %read; - $read{id} = $seqID; - $read{st} = $startpos; - if ($dir eq 'forward') { - $read{seq} = $s->[$i]; - } else { - $read{seq} = revComp($s->[$i]); - } - $read{len} = length($s->[$i]); - $read{end} = $startpos + $read{len} - 1; - $read{dir} = $dir; - $read{source} = $source; - push(@readList, \%read); - } - - return \@readList; -} - -sub readReadFile { - # read a file of reads given by - # startpos sequence - # startpos sequence - # startpos sequence - # ... - my $file = $_[0]; - my @readList; - - if (! defined $file) { - die "Error: No fasta file specified\n"; - } - - if (! -r $file) { - die "Fasta file \"$file\" not readable\n"; - } - open(IN, "<$file") or die "Can't open $file: $!"; - while () { - chomp; - next if (/^#/); - my ($start, $seq) = split; - my %read; - $read{st} = $start; - $read{seq} = $seq; - $read{len} = length($seq); - $read{end} = $start + $read{len} - 1; - push(@readList, \%read); - } - return \@readList; -} - - - - -sub readFastaFile { - my $file = $_[0]; - my $seqID; - my $startpos; - my @readList; - my $seq = ''; - my @head; - - if (! defined $file) { - die "Error: No fasta file specified\n"; - } - - if (! -r $file) { - die "Fasta file \"$file\" not readable\n"; - } - open(IN, "<$file") or die "Can't open $file: $!"; - while () { - chomp; - if (/^\#/) { - next; - } elsif ($_ eq '') { - next; - } elsif (/^>/) { - # header looks like ">seqID startpos ..." - if (length($seq) > 0) { - #we've found a sequence, so store it - - my %read; - $read{id} = $seqID; - $read{st} = $startpos; - $read{seq} = $seq; - $read{len} = length($seq); - $read{end} = $startpos + $read{len} - 1; - #my @tmp = @head; - #$read{qs} = \@tmp; - push(@readList, \%read); - $seq = ''; - } - @head = split; - if (@head > 1) { - $seqID = shift @head; #$head[0]; - $seqID =~ s/>//; - $startpos = shift @head; #$head[1]; - } else { - print "Error - @head missing characters\n"; - } - } else { - #a sequence line - #remove spaces at beginning and end - s/^\s+//; - s/\s+$//; - $seq .= $_; - } - - } - #still have sequence in the queue, need to put the last one in - #this should be done in a better fashion... 
- my %read; - $read{id} = $seqID; - $read{st} = $startpos; - $read{seq} = $seq; - $read{len} = length($seq); - $read{end} = $startpos + $read{len} - 1; -# $read{qs} = \@head; - push(@readList, \%read); - - close IN; - return \@readList; -} - -sub compareToCons { - my $seq = shift; - my $cons = shift; - my @ans; - #die "$seq\n$cons\n" if (length($seq) != length($cons)); - my $minl = Nmin(length($seq), length($cons)); - #input: two strings of same length - #output: list of differences in format A90I - foreach my $i (1..$minl) { - my $c1 = substr($seq,$i-1,1); - my $c0 = substr($cons,$i-1,1); - if (!($c1 eq $c0)) { - push(@ans, $c0 . $i . $c1); - } - } - return \@ans; -} - -sub compareReadToCons { - # not really right, since compareToCons - # prints in local coordinates... - my $read = shift; - my $cons = shift; - #input a read %r = {id:AHAHA seq : ACGTAT, len : 6, start : 80} - #and a string - #output: %r = {id: AHAHA, ..., muts = join(',', compareToCons(read, cons) } - #my %r; - #$r{id} = $read->{id}; - #$r{len} = $read->{len}; - $read->{muts} = join(',', compareToCons($read->{seq}, substr($cons, $read->{st}, $read->{len}))); -} - -1; diff --git a/src/perl/Util.pm b/src/perl/Util.pm deleted file mode 100755 index 0b7b3a1..0000000 --- a/src/perl/Util.pm +++ /dev/null @@ -1,255 +0,0 @@ -package Util; -# various statistical utilities - -# Copyright 2007, 2008, 2009 -# Niko Beerenwinkel, -# Nicholas Eriksson, -# Moritz Gerstung, -# Lukas Geyrhofer, -# Osvaldo Zagordi, -# ETH Zurich - -# This file is part of ShoRAH. -# ShoRAH is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# ShoRAH is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with ShoRAH. If not, see . 
- -use strict; -use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION); -use Exporter; -use NikoUtil; - -$VERSION = 1.00; -@ISA = qw(Exporter); - -# autoexport: -@EXPORT = qw(dec2bin findProg Psystem average median countChar -normalize maxind makeRandomDist makeRandomDistMin binomial fact rand_perm -computeInDelMut); -# export on request: -@EXPORT_OK = qw(); - -sub dec2bin { - my $arg = shift; - my $N = shift; - substr(unpack("B32", pack('C', $arg)),-$N); -} - -sub findProg { - my $name = shift; - my $ans; - - my @pathList = split(':',$ENV{'PATH'}); - push(@pathList,"$ENV{'HOME'}/bin"); - push(@pathList,'.'); - - my $found = 0; - foreach my $dir (@pathList) { - if (-r "$dir/$name") { - $ans = "$dir/$name"; - $found = 1; - print "found $name as $ans\n"; - last; - } - } - if ($found == 0) { - print "error, can't find $name in PATH = @pathList\n"; - exit 1; - } - return $ans; -} - -sub Psystem { - my $s = shift; - print "exec : $s\n"; - system ("$s"); -} - -sub Fsystem { - my $s = shift; - print "$s\n"; -} - -sub average { - my $l = shift; - if (@$l == 0) { - return undef; - } - my $ave = 0; - foreach (@$l) { - $ave += $_; - } - $ave /= @$l; - return $ave; -} - -sub median { - #actually computes the mode - my $l = shift; - my %c; - foreach (@$l) { - $c{$_}++; - } - my $maxcnt = 0; - my $med; - foreach (keys(%c)) { - if ($c{$_} > $maxcnt) { - $maxcnt = $c{$_}; - $med = $_; - } - } - return $med; -} - -sub countChar { - #count occurances of $char in $s - # - my $s = shift; - my $char = shift; - #print "counting $char in $s\n"; - # this little trick doesn't work if char = '.' - #$_ = $s; - #return tr/$char/$char/; - - my $count = 0; - foreach (0.. length($s)) { - if (substr($s,$_,1) eq $char) { - $count++; - } - } - #print "get $count\n"; - return $count; -} - -sub normalize { - #input: list of positive integers - #output: list / sum - my $l = shift; - #print "Calling norm with @$l\n"; - my $sum = 0; - foreach my $a (@$l) { - if (defined $a) { - $sum += $a; - } - } - if ($sum > 0) { - foreach my $a (@$l) { - if (defined $a) { - $a = $a / $sum; - } else { - $a = 0; - } - } - } -} - -sub maxind { - my $l = shift; - my $max = -1e100; - my $maxind = 0; - foreach my $i (0..@$l-1) { - if ($l->[$i] > $max) { - $max = $l->[$i]; - $maxind = $i; - } - } - return $maxind; -} - -sub makeRandomDist { - my $n = int(shift); - return 0 if ($n < 1); - my @ans; - - foreach my $i (1 .. $n) { - push(@ans, int(rand(1000))); - } - normalize(\@ans); - return \@ans; -} - -sub makeRandomDistMin { - my $n = int(shift); - my $minFreq = shift; - return 0 if ($n < 1); - die "$n * $minFreq > 1\n" if ($n * $minFreq > 1); - my @ans; - - push @ans, 0; - foreach my $i (1 .. $n-1) { - push(@ans, int(rand(1000))); - } - # normalize so that sum @ans = 1 - $minFreq*$n - my $num = 1 - $minFreq*$n; - my $sum = 0; - $sum += $_ foreach (@ans); - $_ *= $num/$sum foreach (@ans); - $_ += $minFreq foreach (@ans); - return \@ans; -} - -sub binomial { - my ($n, $k) = round(@_); - if ($k > $n) { - return 0; - } - return (fact($n) / (fact($k) * fact($n - $k))); -} - -sub fact { - my $n = shift; - if ($n == 0) { - return 1; - } else { - my $ans = 1; - foreach (1..$n) { - $ans *= $_; - } - return $ans; - } -} - - -sub rand_perm { - - # returns a random permutation of the $N integers 0, ..., $N-1 - - my $N = shift; - - my @list = (0 .. 
$N-1); - my @perm; - - while (@list) { - my $r = random(0, $#list); - push @perm, splice(@list, $r, 1); - } - - return @perm; -} - -sub computeInDelMut { - my $a = shift; - my $b = shift; - my $del = countChar($a, '-'); - my $ins = countChar($b, '-'); - my $Mut = 0; - foreach my $i (0 .. length($a) - 1) { - my $x = substr($a,$i,1); - my $y = substr($b,$i,1); - if (($x ne $y) and ($x ne '-') and ($y ne '-')) { - $Mut++; - } - } - return ($ins, $del, $Mut); -} - -1; diff --git a/src/python/shorah_matching.py b/src/python/shorah_matching.py deleted file mode 100755 index 6b36ef8..0000000 --- a/src/python/shorah_matching.py +++ /dev/null @@ -1,81 +0,0 @@ -# Hopcroft-Karp bipartite max-cardinality matching and max independent set -# David Eppstein, UC Irvine, 27 Apr 2002 -# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/123641 - -# Example from Cormen, Leiserson, and Rivest, section 27.3: -#a = bipartiteMatch({ 0:[0], 1:[0,2], 2:[1,2,3], 3:[2], 4:[2] }) - - -def bipartiteMatch(graph): - '''Find maximum cardinality matching of a bipartite graph (U,V,E). - The input format is a dictionary mapping members of U to a list - of their neighbors in V. The output is a triple (M,A,B) where M is a - dictionary mapping members of V to their matches in U, A is the part - of the maximum independent set in U, and B is the part of the MIS in V. - The same object may occur in both U and V, and is treated as two - distinct vertices if this happens.''' - - # initialize greedy matching (redundant, but faster than full search) - matching = {} - for u in graph: - for v in graph[u]: - if v not in matching: - matching[v] = u - break - - while 1: - # structure residual graph into layers - # pred[u] gives the neighbor in the previous layer for u in U - # preds[v] gives a list of neighbors in the previous layer for v in V - # unmatched gives a list of unmatched vertices in final layer of V, - # and is also used as a flag value for pred[u] when u is in the first - # layer - preds = {} - unmatched = [] - pred = dict([(u, unmatched) for u in graph]) - for v in matching: - del pred[matching[v]] - layer = list(pred) - - # repeatedly extend layering structure by another pair of layers - while layer and not unmatched: - newLayer = {} - for u in layer: - for v in graph[u]: - if v not in preds: - newLayer.setdefault(v, []).append(u) - layer = [] - for v in newLayer: - preds[v] = newLayer[v] - if v in matching: - layer.append(matching[v]) - pred[matching[v]] = v - else: - unmatched.append(v) - - # did we finish layering without finding any alternating paths? - if not unmatched: - unlayered = {} - for u in graph: - for v in graph[u]: - if v not in preds: - unlayered[v] = None - return (matching, list(pred), list(unlayered)) - - # recursively search backward through layers to find alternating paths - # recursion returns true if found path, false otherwise - def recurse(v): - if v in preds: - L = preds[v] - del preds[v] - for u in L: - if u in pred: - pu = pred[u] - del pred[u] - if pu is unmatched or recurse(pu): - matching[v] = u - return 1 - return 0 - - for v in unmatched: - recurse(v) diff --git a/src/python/shorah_mm.py b/src/python/shorah_mm.py deleted file mode 100755 index 9944e4e..0000000 --- a/src/python/shorah_mm.py +++ /dev/null @@ -1,778 +0,0 @@ -# Copyright 2007, 2008, 2009 -# Niko Beerenwinkel, -# Nicholas Eriksson, -# Moritz Gerstung, -# Lukas Geyrhofer, -# Osvaldo Zagordi, -# Susana Posada Cespedes, -# ETH Zurich - -# This file is part of ShoRAH. 
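The removed shorah_matching.bipartiteMatch expects a dict mapping each vertex of U to its neighbours in V and returns the matching as a dict from V back to U. Its greedy initialisation step, run on the Cormen et al. example quoted in the deleted file, illustrates that interface; this is only the warm start, not the full Hopcroft-Karp augmentation:

    # U -> neighbours in V, as in the example quoted above
    graph = {0: [0], 1: [0, 2], 2: [1, 2, 3], 3: [2], 4: [2]}

    matching = {}                  # V -> U, the format bipartiteMatch returns
    for u in graph:
        for v in graph[u]:
            if v not in matching:  # greedily claim the first free neighbour
                matching[v] = u
                break

    print(matching)                # {0: 0, 2: 1, 1: 2} -- already maximum here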
-# ShoRAH is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# ShoRAH is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with ShoRAH. If not, see . - -''' - -usage: mm.py filename [max_haplos] - ------------- -Input: filename should consist of lines like -0 ACAGACGA -1 CAGACGA -4 AAAAAAAAAA - -where the first column is the start position and the second column is the read. -The start positions must be indexed from 0. - -Any alphabet is allowed for the reads. - -Duplicate reads or reads which are redundant might cause unexpected results. -For example, the reads -0 ACGT -1 CGT -are redundant (since the first contains the second exactly). - ------------- -Output: - the last 4 characters of filename are deleted and replaced with geno - so if input is file.read then output is file.geno ------------- -parameters (see the beginning of main() in the code to change these) - -MAXGENO = 10000 (can be changed with max_haplos on command line) - - maximum number of haplotypes we output - -MAXITER = 500 - - maximum number of times we rerun the matching algorithm to try to find new - haplotypes - -ONEPATH = True - - extend chains to just one path in the graph? If false, chains get extended - to all possible paths. - This can be a problem if the chain is small, you can get too many paths. - -TODO - ONEPATH should have an option to output up to MAXPATHS paths -''' - -from __future__ import print_function -import random -import shorah_matching - -MAXGENO_DEFAULT = 10000 -MAXITER_DEFAULT = 50 -MINOVERLAP_DEFAULT = 1 -ONEPATH_DEFAULT = True -OUTPUTGRAPH_DEFAULT = False - - -def parse_com_line(): - - import argparse - - parser = argparse.ArgumentParser(description="Global reconstruction: minimal set of haplotypes that\ - explains all reads in the dataset", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - opts = main.__defaults__ - - parser.add_argument("filename", nargs=1, - help="Input file containing starting position and sequences\ - for each of the reads") - - parser.add_argument("-m", "--maxhaplo", action="store", metavar='INT', type=int, - required=False, dest="MAXGENO", default=opts[0], - help="keep searching until we find at least MAXGENO \ - haplotypes") - - parser.add_argument("-i", "--maxiter", action="store", metavar='INT', type=int, - required=False, dest="MAXITER", default=opts[1], - help="but only look up to MAXITER rounds") - - parser.add_argument("-l", "--length", action="store", metavar='INT', type=int, - required=False, dest="MINOVERLAP", default=opts[2], - help="minimum overlap to include in graph") - - # logical options - parser.add_argument("-o", "--one", action="store_true", required=False, - dest="ONEPATH", default=ONEPATH_DEFAULT, - help="search one path only") - - parser.add_argument("-a", "--all", action="store_true", required=False, - dest="MULTIPATH", help="extend search to many paths") - - parser.add_argument("-g", "--graph", action="store_true", required=False, - dest="OUTPUTGRAPH", default=OUTPUTGRAPH_DEFAULT, - help="output a graph?") - - args = parser.parse_args() - if args.MULTIPATH: - args.ONEPATH = False - - return args - - 
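A minimal reader for the input format documented above (one read per line, "<0-based start> <sequence>"), mirroring the readFasta helper that follows; the function name and example filename are hypothetical:

    def read_reads(filename):
        """Return ([(start, sequence), ...], reference_length)."""
        reads, seq_len = [], 0
        with open(filename) as handle:
            for line in handle:
                if not line.strip() or line.startswith('#'):
                    continue                    # skip blanks and comments
                pos, seq = line.split()
                pos = int(pos)
                reads.append((pos, seq))
                seq_len = max(seq_len, pos + len(seq))
        return reads, seq_len

    # reads, seq_len = read_reads('sample_cor.rest')   # hypothetical file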
-def permuteList(l, perm): - '''apply a permutation to a list''' - return [perm[i] for i in l] - - -def permuteGraph(graph, perm): - '''apply a permutation to a graph''' - return dict([(perm[g], permuteList(child, perm)) - for (g, child) in graph.items()]) - - -def permute(graph, reads, descList): - ''' pick a random elt of S_{len(graph) - 2} and apply the permutation - (fixing the source and sink) to graph, reads, and descList - ''' - # build permutation in S_n - n = len(graph) - 2 - perm = list(range(n)) - random.shuffle(perm) - # and then make sure to fix the source and sink - perm.append(source) - perm.append(sink) - if verbose: - print('permuting with', perm) - newGraph = permuteGraph(graph, perm) - #newReads = [reads[i] for i in perm[:-2]] - # DOESN'T WORK - NEED INVERSE OF PERM - - newReads = [[] for i in range(n)] - for i in range(n): - newReads[perm[i]] = reads[i] - # permute labels in descList - tmpdescList = list( - map(lambda l: permuteList(l, perm), map(list, descList))) - # also permute places ### VIA INVERSE - newdescList = [[] for i in tmpdescList] - for i in range(n + 2): - newdescList[perm[i]] = tmpdescList[i] - return newGraph, newReads, newdescList - - -def readFasta(filename): - '''name says it''' - try: - f = open(filename, 'r') - except IOError: - return [], 0 - ans = [] - maxEnd = 0 - for line in f: - pos, seq = line.split() - pos = int(pos) - end = pos + len(seq) - if end > maxEnd: - maxEnd = end - ans.append([pos, seq]) - return ans, maxEnd - - -def printFasta(seqs, lineLen=80): - '''simple enough''' - start = 0 - totalLen = len(seqs[0]) - - while start < totalLen: - length = min(lineLen, totalLen - start) - print() - for i in seqs: - print(i[start:start + length]) - start += length - return 0 - - -def addSourceSink(graph, descList, reads, seqLen): - """ - adds a source and sink onto the graph - and into descList - """ - source = len(reads) - sink = len(reads) + 1 - maxstart = 0 - for r in reads: - if r[0] > maxstart: - maxstart = r[0] - # reads.append([-1,'']) - #reads.append([seqLen + 1,'']) - - # need edges source -> r whenever r has no parents - # and r -> sink whenever it has no children - - # find everything which is not a child - - allChildren = set([]) - list(map(allChildren.update, [graph[i] for i in graph])) - initialReads = set(list(range(len(graph)))).difference(allChildren) - graph.setdefault(source, list(initialReads)) - - # find everything without children and create edge to sink - for r in range(len(graph)): - if graph[r] == []: - graph[r].append(sink) - graph.setdefault(sink, []) - - # print(graph) - if OUTPUTGRAPH: - print('BEGIN') - for r in graph: - #print('node name', r) - #print('node start', reads[r][0]) - #print('children', graph[r]) - if r == sink: - print(1000, ':', r, ':', []) - if r == source: - print(-1, ':', r, ':', graph[r]) - if (r != sink and r != source): - print(reads[r][0], ':', r, ':', graph[r]) - print('END') - print(graph) - # fix up descList - # everything as sink as desc - for i in descList: - i.add(sink) - # push source and then sink on descList - # source has everthing as desc - sourceDesc = set(list(range(len(descList)))) - sourceDesc.add(sink) - sinkDesc = set([]) - descList.append(sourceDesc) - descList.append(sinkDesc) - - -def makeGraph(reads): - '''missing''' - def readsMatch(r, s): - '''check if two reads agree on their overlap - called with r starting before s - ''' - overlap = r[0] + len(r[1]) - s[0] - if overlap < MINOVERLAP: - return 0 - if overlap > len(s[1]): - overlap = len(s[1]) - #if overlap < 0: overlap = 
0 - return r[1][-overlap:] == s[1][:overlap] - - graph = {} - # initialize graph - for r in range(len(reads)): - graph.setdefault(r, []) - - # set edges between reads - for r in range(len(reads)): - if r % 100 == 0: - print(r) - for s in range(len(reads)): - if reads[r][0] < reads[s][0]: - if readsMatch(reads[r], reads[s]): - # edge from r to s - graph.setdefault(r, []).append(s) - return graph - - -def findDescendants(graph): - '''missing''' - descList = [] - for i in range(len(graph)): - descList.append(set([])) - - def getDesc(node): - '''missing''' - #print('getDesc called on', node) - # node is an int, - # descList is a list of lists of ints - # set descList[node] by: - # union of all descList[x] for x in graph[node] - if len(descList[node]) > 0: - #print(node, descList[node]) - return descList[node] - else: - #descList[node] = descList[node].union(set(graph[node])) - descList[node].update(set(graph[node])) - for x in graph[node]: - # doesn't speed it up - # if len(descList[x]) > 0: - # descList[node] = descList[node].union(descList[x]) - # else: - #descList[node] = descList[node].union(getDesc(x)) - descList[node].update(getDesc(x)) - return descList[node] - - # must call getDesc on the entire graph - # but most of the calls are trivial - for i in range(len(graph)): - getDesc(i) - #print('desc of', i, 'are', getDesc(i)) - return descList - - -def removeCycles(graph, descList): - '''missing''' - for i in graph.keys(): - ##print('checking the',len(graph[i]),'children of node',i) - # python is a pain if we modify graph[i] while looping over it - # so need a temporary variable - tmp = [] - tmp.extend(graph[i]) - for j in tmp: - for k in tmp: - if j != k: - # print('checking',j,k) - if j in descList[k]: - #print('removing node',j) - #print('graph[i] =',graph[i]) - graph[i].remove(j) - break - ###print('reduced to',len(graph[i]), 'children') - return graph - - -def matchToChain(match, n): - '''takes a matching and creates the - corresponding chain in the poset - Who knows how it works...''' - startpoints = set(list(range(n))).difference(list(match.values())) - allChains = [] - for i in startpoints: - curChain = [] - curChain.append(i) - tmp = i - while True: - if tmp in match.keys(): - tmp = match[tmp] - curChain.append(tmp) - else: - break - allChains.append(curChain) - return allChains - - -def onePath(x, y): - ''' find one path between x and y - using descList - currently unused, but might be nice - ''' - path = [x] - if y not in descList[x]: - return None - if x == y: - return path - done = 0 - while not done: - for child in graph[path[-1]]: - if y in descList[child]: - path.append(child) - break - if y == child: - path.append(y) - done = 1 - break - return path - - -def find_all_paths(GRAPH, start, end, path=[]): - '''from Guido's essay''' - path = path + [start] - if start == end: - return [path] -# if not GRAPH.has_key(start): - if start not in GRAPH: - return [] - paths = [] - for node in GRAPH[start]: - if node not in path: - newpaths = find_all_paths(GRAPH, node, end, path) - for newpath in newpaths: - paths.append(newpath) - return paths - - -def allPaths(x, y): - '''find all paths from x to y in the graph''' - ##print('allPaths', x, y) - paths = [[x]] - if y not in descList[x]: - return None - if x == y: - return paths - finishedpaths = [] - while len(paths) > 0: - # for last element of each list in paths - # loop over children, making new list of paths - newpaths = [] - # for every path - for i in range(len(paths)): - # try to extend it - for child in graph[paths[i][-1]]: - # 
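The edge test above (readsMatch) links two reads only when they agree on every overlapping position. A rough standalone sketch, with reads as (start, sequence) tuples and hypothetical names:

    def reads_match(r, s, min_overlap=1):
        """True if read r (starting first) and read s agree on their overlap."""
        overlap = r[0] + len(r[1]) - s[0]
        if overlap < min_overlap:
            return False                        # too little (or no) overlap
        overlap = min(overlap, len(s[1]))       # cap at the length of s
        return r[1][-overlap:] == s[1][:overlap]

    print(reads_match((0, 'ACGT'), (2, 'GTAA')))   # True:  'GT' == 'GT'
    print(reads_match((0, 'ACGT'), (2, 'CTAA')))   # False: 'GT' != 'CT'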
if we can extend along this child - if y in descList[child]: - # y is later, so create a new, longer path - # FIXME: test this, possibly just use find_all_paths - #tmppath = [] - # tmppath.extend(paths[i]) - # tmppath.append(child) - newpaths.append(paths[i] + [child]) - if y == child: - # done, push on final - tmppath = [] - tmppath.extend(paths[i]) - # don't include y - # tmppath.append(child) - finishedpaths.append(tmppath) - #print('newpaths = ', newpaths) - paths = newpaths - #print('returns', len(finishedpaths), 'paths') - return finishedpaths - - -def pathToGeno(path): - ''' - first get rid of sink and source - (this has the side effect of modifying the paths) - ''' - geno = '' - if len(path) == 0: - return None - if path[0] == source: - path.pop(0) - if len(path) == 0: - return None - if path[-1] == sink: - path.pop(-1) - for i in range(len(path) - 1): - r = path[i] - s = path[i + 1] - # nonoverlap is how much of read r occurs before read s starts - nonoverlap = reads[s][0] - reads[r][0] - geno = geno + reads[r][1][:nonoverlap] - # geno.append(reads[r][1][:nonoverlap] - # add the last read - geno = geno + reads[path[-1]][1] - # geno.append(reads[path[-1]][1]) - - # pad with .'s at beginning and end to make right length - # countBegin = 0 - # countEnd = 0 - - for i in range(reads[path[0]][0]): - # geno.append('.') - geno = '.' + geno - # countBegin += 1 - - offsetEnd = seqLen - reads[path[-1]][0] - len(reads[path[-1]][1]) - for i in range(offsetEnd): - geno = geno + '.' - # countEnd += 1 - -# print('path of length',len(path), 'has', countBegin, '. and', countEnd) -# if countBegin > .5 * len(geno): -# geno = None -# if countEnd > .5 * len(geno): -# geno = None - - return geno - - -def extendChainToOne(chain): - '''extend chain to ONE path in the graph - input: a chain (list of ints) - output: a list containing a path that extends the chain - ''' - ans = [] - if len(chain) == 0: - return ans - if chain[0] == source: - chain.pop(0) - if len(chain) == 0: - return None - if chain[-1] == sink: - chain.pop(-1) - # deal with paths from source to chain[0] but - # if chain started as [source, sink] then it is of - # length 0 now - if len(chain) == 0: - ans = onePath(source, sink) - else: - ans = onePath(source, chain[0]) - for i in range(len(chain)): - if i != len(chain) - 1: - nxt = chain[i + 1] - else: - nxt = sink - if nxt not in graph[chain[i]]: - ans.extend(onePath(chain[i], nxt)) - else: - ans.append(chain[i]) - return [ans] - - -def extendChain(chain): - '''extend chain to all possible paths in graph - input: a chain (list of ints) - output: a list of all paths that extend the chain - print 'input:',chain - ''' - ans = [[]] - newans = [] - if len(chain) == 0: - return ans - if chain[0] == source: - chain.pop(0) - if len(chain) == 0: - return None - if chain[-1] == sink: - chain.pop(-1) - # deal with paths from source to chain[0] but - # if chain started as [source, sink] then it is of - # length 0 now - if len(chain) == 0: - ans = allPaths(source, sink) - else: - ans = allPaths(source, chain[0]) - for i in range(len(chain)): - #print('currently have', len(ans),'chains') - if i != len(chain) - 1: - nxt = chain[i + 1] - else: - nxt = sink - if nxt not in graph[chain[i]]: - for p in allPaths(chain[i], nxt): - for j in ans: - newans.append(j + p) - ans = newans - newans = [] - else: - for j in ans: - j.append(chain[i]) - ##print('output:', ans) - return ans - - -def minimalCover(graph, reads, descList): - """ the main code: takes a graph and creates a matching - problem. 
then solves the matching to get chains in - the graph, which are extended to paths in all ways """ - - #print('making bipartite graph') - # make bipartiteGraph - biGraph = {} - for i in range(len(graph)): - biGraph.setdefault(i, list(descList[i])) - - print('...finding matching') - # find matching - a = shorah_matching.bipartiteMatch(biGraph) - # a[0] is the matching, a[1] and a[2] the parts of - # the maximal independent sets in the two parts of biGraph - print('...Reversing matching') - # reverse the matching - match = dict([(v, k) for (k, v) in a[0].items()]) - # free some memory, perhaps - del a - #print('matching =', match) - - # transform to chains in poset - chainList = matchToChain(match, len(graph)) - if OUTPUTGRAPH: - print('chains=', chainList) - maximalAntiChain = len(chainList) - print('...largest antichain of size', maximalAntiChain) - if verbose: - for zz in range(len(chainList)): - print(zz, '==', chainList[zz]) - - # now that we're done with the matching, add the source and sink into graph - # this is needed for the extendChain and pathToGeno functions - # now done before the matching - #addSourceSink(graph, descList, reads, seqLen) - - print('...transforming chains to paths') - # transform chains to paths in all possible ways - if ONEPATH: - paths = list(map(extendChainToOne, chainList)) - else: - paths = list(map(extendChain, chainList)) - - print(len(chainList), 'chains give', sum(map(len, paths)), - 'paths (', list(map(len, paths)), ')') - if verbose: - print('paths=', paths) - - print('...translating paths->genotypes') - genos = [] - for i in range(len(paths)): - genos.extend(map(pathToGeno, paths[i])) - del(reads) - - return genos, maximalAntiChain - - -def countPaths(graph, source, sink): - '''missing''' - # numPaths[i] = # of paths from i to the sink in graph - numPaths = [-1 for i in range(len(graph))] - - numPaths[sink] = 1 - - def getPaths(node): - ''' count paths from node to sink ''' - if numPaths[node] < 0: - numPaths[node] = sum(map(getPaths, graph[node])) - #print('node', node, 'has', numPaths[node], 'paths to sink') - return numPaths[node] - getPaths(source) - #print('paths =', numPaths) - return numPaths[source] - - -###########main code######### - -def main(readfile, maxhaplo=MAXGENO_DEFAULT, maxiter=MAXITER_DEFAULT, - minoverlap=MINOVERLAP_DEFAULT, onepath=ONEPATH_DEFAULT, - outputgraph_local=OUTPUTGRAPH_DEFAULT): - '''This can be called from shorah.py directly - ''' - # want to do outfile = readfile; outfile ~= s/rest/geno$/ - if readfile[-4:] == 'rest' or readfile[-4:] == 'read': - #outFileName = readfile[:-4] + str(MAXGENO) + '.geno' - outFileName = readfile[:-4] + 'geno' - else: - outFileName = readfile + '.geno' - - # This use of global variables should definitely be changed - global verbose, seqLen, descList, graph, reads, source, sink - global MAXGENO - MAXGENO = maxhaplo - - global MAXITER - MAXITER = maxiter - - global ONEPATH - ONEPATH = onepath - - global MINOVERLAP - MINOVERLAP = minoverlap - - global OUTPUTGRAPH - OUTPUTGRAPH = outputgraph_local - - print('inputfile =', readfile) - print('outputfile=', outFileName) - print('Creating up to', MAXGENO, 'haplotypes') - - print('Permuting up to', MAXITER, 'times') - print('ONEPATH = ', ONEPATH) - print('minimum overlap in graph:', MINOVERLAP) - - # get reads - reads, seqLen = readFasta(readfile) - if seqLen == 0: - print('No reads in file (or couldn\'t open file)') - return 0 - source = len(reads) - sink = len(reads) + 1 - if len(reads) < 51: - verbose = 1 - else: - verbose = 0 - 
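The "translating paths->genotypes" step relies on pathToGeno above: consecutive reads on a source-to-sink path are stitched together on their overlaps and the result is padded with '.' to the reference length. A rough sketch of that translation, with hypothetical names and reads as (start, sequence) tuples:

    def path_to_haplotype(path_reads, seq_len):
        """Stitch the ordered reads of one path into a padded haplotype."""
        geno = ''
        for (r_start, r_seq), (s_start, _seq) in zip(path_reads, path_reads[1:]):
            geno += r_seq[:s_start - r_start]   # part of r preceding read s
        last_start, last_seq = path_reads[-1]
        geno += last_seq                        # the last read goes in whole
        left_pad = '.' * path_reads[0][0]
        right_pad = '.' * (seq_len - last_start - len(last_seq))
        return left_pad + geno + right_pad

    print(path_to_haplotype([(0, 'ACGT'), (2, 'GTAA')], 8))   # 'ACGTAA..'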
print(len(reads), 'reads, sequence length =', seqLen) - if verbose: - for r in reads: - print(reads.index(r), r) - - # Build read graph - graph = makeGraph(reads) - if verbose: - print('graph = ', graph) - print('finished graph') - - # take transitive closure of graph - descList = findDescendants(graph) - if verbose: - print('desc =', list(map(list, descList))) - print('...done with transitive closure') - - print('...removing cycles') - graph = removeCycles(graph, descList) - if verbose: - print('graph \ cycles = ', graph) - - # add on a source and sink to the graph - addSourceSink(graph, descList, reads, seqLen) - if verbose: - print('graph with SS', graph) - print('new Desclist', list(map(list, descList))) - # count paths in the graph - - totalPaths = countPaths(graph, source, sink) - print('There are', totalPaths, 'paths in the graph') - - #global maximalAntiChain - - allGenos = set([]) - DONE = False - - iteration = 0 - if totalPaths < MAXGENO: - print('This is less than MAXGENO =', MAXGENO) - print('So matching is not run') - paths = extendChain([source, sink]) - genos = list(map(pathToGeno, paths)) - allGenos.update(genos) - DONE = True - genos, maximalAntiChain = minimalCover(graph, reads, descList) - else: - print('starting matching') - print() - while not DONE and (len(allGenos) < MAXGENO) and (iteration < MAXITER): - # Done with preproccessing. Now permute - # graph, reads, and descList and get a new set of genotypes - # then find a minimal chain decomposition of the graph - # and return all genotypes obtained by extending - # these chains - if iteration > 1: - print('...permuting') - graph, reads, descList = permute(graph, reads, descList) - genos, maximalAntiChain = minimalCover(graph, reads, descList) - sizeBefore = len(allGenos) - allGenos.update(genos) - sizeAfter = len(allGenos) - print('found', len(allGenos), 'genotypes out of', MAXGENO) - print('gained', sizeAfter - sizeBefore, 'in round', iteration) - print() - # printFasta(genos,1000) - iteration += 1 - - outfile = open(outFileName, 'w') - - for i in list(allGenos): - # check if i == None - if i: - outfile.write(i) - outfile.write('\n') - outfile.close() - if verbose: - printFasta(list(allGenos), 1000) - - # summary - print('graph has', totalPaths, 'total paths') - print('minimal path cover is of size', maximalAntiChain) - print('ran matching algorithm', iteration, 'times') - print('output', len(allGenos), 'haplotypes') - - print('% TOTALPATHS =', totalPaths) - print('% ANTICHAIN =', maximalAntiChain) - print('% HAPLO =', len(allGenos)) diff --git a/src/samtools/meson.build b/src/samtools/meson.build new file mode 100644 index 0000000..cb7b458 --- /dev/null +++ b/src/samtools/meson.build @@ -0,0 +1,45 @@ +bam_sources = files([ + 'bam.c', + 'bam_aux.c', + 'bam_cat.c', + 'bam_import.c', + 'bam_index.c', + 'bam_lpileup.c', + 'bam_md.c', + 'bam_pileup.c', + 'bam_reheader.c', + 'bam_sort.c', + 'bedidx.c', + 'bgzf.c', + 'faidx.c', + 'knetfile.c', + 'kprobaln.c', + 'kstring.c', + 'razf.c', + 'sam.c', + 'sam_header.c', + 'bam.h', + 'bam_endian.h', + 'bgzf.h', + 'faidx.h', + 'kaln.h', + 'khash.h', + 'knetfile.h', + 'kprobaln.h', + 'kseq.h', + 'ksort.h', + 'kstring.h', + 'razf.h', + 'sam.h', + 'sam_header.h']) + +zlib_dep = dependency('zlib') + +bam_library = static_library( + 'libbam', + bam_sources, + dependencies : zlib_dep, + c_args : ['-D_USE_KNETFILE'], + install : false) + +bam_inc = include_directories('.') diff --git a/src/scripts/dec.py.in b/src/scripts/dec.py.in deleted file mode 100755 index f75a24e..0000000 --- 
a/src/scripts/dec.py.in +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env @PYTHON@ - -from __future__ import print_function -import argparse -import os -import shorah_dec - -# parse command line -parser = argparse.ArgumentParser(description="Local haplotype recontruction: Dirichlet process mixture", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -# set the defaults (see http://bit.ly/2hCTQl) -opts = shorah_dec.main.__defaults__ - -group1 = parser.add_argument_group('Input files', 'Required input') -group2 = parser.add_argument_group('Run options', - 'Parameters that can (maybe should) be changed \ - according to the needs') -group3 = parser.add_argument_group('More options', - 'Do you really want to change this?') - - -group1.add_argument("-b", "--bam", metavar='BAM', required=True, type=str, - dest="in_bam", help="file with aligned reads in .bam format") - -group1.add_argument("-f", "--fasta", metavar='REF', required=True, type=str, - dest="in_fasta", help="reference genome in fasta format") - -group2.add_argument("-w", "--windowsize", metavar='INT', required=False, type=int, - default=opts[0], dest="win_length", help="window size") - -group2.add_argument("-r", "--region", metavar='chrm:start-stop', required=False, type=str, - default=opts[2], dest="region", help="region in format 'chrm:start-stop', \ - e.g. 'ch3:1000-3000'") - -group2.add_argument("-a", "--alpha", metavar='FLOAT', required=False, type=float, - default=opts[4], dest="alpha", help="alpha in dpm sampling") - -group2.add_argument("-S", "--seed", metavar='INT', required=False, type=int, - default=opts[6], dest="seed", help="set seed for reproducible results") - -group3.add_argument("-s", "--winshifts", metavar='INT', required=False, type=int, - default=opts[1], dest="win_shifts", help="number of window shifts") - -group3.add_argument("-x", "--maxcov", metavar='INT', required=False, type=int, - default=opts[3], dest="max_coverage", help="approximate max coverage allowed") - -group3.add_argument("-k", "--keep_files", required=False, action='store_true', - default=opts[5], dest="keep_files", help="keep all intermediate files") - -args = parser.parse_args() - -supported_formats = { - 'bam': 'aligned reads', - 'fasta': 'reference genome' -} -# check the input file is in supported format -try: - tmp_filename = os.path.split(args.in_bam)[1] - [in_stem, in_format] = [tmp_filename.split('.')[0], - tmp_filename.split('.')[-1]] - t = supported_formats[in_format] -except IndexError: - shorah_dec.declog.error('The input file must be filestem.format') - print('The input file must be filestem.format') - print('Supported formats are') - for sf in supported_formats.items(): - print(sf[0], ':', sf[1]) - sys.exit() -except KeyError: - shorah_dec.declog.error('format unknown') - print(in_format, 'format unknown') - print('Supported formats are') - for sf in supported_formats.items(): - print(sf[0], ':', sf[1]) - sys.exit() - -shorah_dec.main(args.in_bam, args.in_fasta, args.win_length, args.win_shifts, - args.region, args.max_coverage, args.alpha, args.keep_files, args.seed) diff --git a/src/scripts/fas2read.pl.in b/src/scripts/fas2read.pl.in deleted file mode 100755 index f948849..0000000 --- a/src/scripts/fas2read.pl.in +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env perl -#translate a fasta file to read format - -use lib "@perldir@"; -use strict; -use Getopt::Long; -use DNA; -use ReadUtil; - -my ($HELP, $VERBOSE, $FASTAFILE); - -GetOptions( - "help" => \$HELP, - "fasta=s" => \$FASTAFILE, - ); -help() if (($HELP) or (! 
defined $FASTAFILE)); -my $outfile = $FASTAFILE; -$outfile =~ s/fas$/read/; -print "Reading $FASTAFILE\nOutput to $outfile\n"; -open OUT, ">$outfile"; -my $reads = readFastaFile($FASTAFILE); - -foreach my $rd (@$reads) { - print OUT "$rd->{st} $rd->{seq}\n"; -} -close OUT; - -sub help { - print <. -''' shorah.py is the program that performs the global reconstruction. - It runs error correction, global reconstruction and frequency estimation - by calling the relevant programs. Additionally, it performs SNV discovery - by calling the program snv.py -''' -from __future__ import division -import os -import os.path -import sys - -import logging -import logging.handlers - -# Make a global logging object. -sholog = logging.getLogger(__name__) - -dn = os.path.dirname(__file__) - - -def run_child(exe_name, arg_string): - '''use subrocess to run an external program with arguments''' - import subprocess - - if not arg_string.startswith(' '): - arg_string = ' ' + arg_string - - try: - sholog.debug("running %s" % (exe_name + arg_string)) - retcode = subprocess.call(exe_name + arg_string, shell=True) - if retcode < 0: - sholog.error(exe_name + arg_string) - sholog.error("Child %s terminated by signal" % exe_name, -retcode) - else: - sholog.debug("Child %s returned %i" % (exe_name, retcode)) - except OSError as ee: - sholog.error("Execution of %s failed: %s" % (exe_name, ee)) - - return retcode - - -if __name__ == "__main__": - - import argparse - import shutil - - from Bio import SeqIO - - import shorah_dec - import shorah_mm - - # parse command line - parser = argparse.ArgumentParser(description="ShoRAH: Short Read Assembly into Haplotypes", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - # First define all option groups - group1 = parser.add_argument_group('Input files', 'Required input') - group2 = parser.add_argument_group('Run options', - "Parameters that can (maybe should) be changed \ - according to the needs") - group3 = parser.add_argument_group( - 'More options', 'Do you really want to change this?') - - group1.add_argument("-b", "--bam", metavar='BAM', required=True, type=str, - dest="b", help="sorted bam format alignment file") - - group1.add_argument("-f", "--fasta", metavar='REF', required=True, type=str, - dest="f", help="reference genome in fasta format") - - group2.add_argument("-a", "--alpha", metavar='FLOAT', required=False, type=float, - dest="a", default=0.1, help="alpha in dpm sampling") - - group2.add_argument("-w", "--windowsize", metavar='INT', required=False, type=int, - dest="w", default=201, help="window size") - - group2.add_argument("-r", "--region", metavar='chrm:start-stop', required=False, type=str, - dest="r", default='', help="region in format 'chr:start-stop', e.g.\ - 'chrm:1000-3000'") - - group2.add_argument("-S", "--seed", metavar='INT', required=False, type=int, - dest="S", default=None, help="set seed for reproducible results") - - group3.add_argument("-s", "--winshifts", metavar='INT', required=False, type=int, - dest="s", default=3, help="number of window shifts") - - group3.add_argument("-i", "--sigma", metavar='FLOAT', required=False, type=float, - dest="i", default=0.01, help="value of sigma to use when calling SNVs") - - group3.add_argument("-x", "--maxcov", metavar='INT', required=False, type=int, - dest="x", default=10000, help="approximate max coverage allowed") - - group3.add_argument("-k", "--keep_files", required=False, action="store_true", default=True, - dest="k", help="keep intermediate files") - - args = parser.parse_args() - - # set 
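The run_child wrapper above is the pattern used throughout these scripts to drive the compiled helpers (b2w, diri_sampler, ...): build a command line, run it through the shell, and log the return code. A trimmed sketch of the same idea; the example invocation at the end is illustrative only:

    import logging
    import subprocess

    def run_child(exe_name, arg_string):
        """Run an external program and return its exit code."""
        cmd = exe_name + ' ' + arg_string.lstrip()
        logging.debug('running %s', cmd)
        retcode = subprocess.call(cmd, shell=True)
        if retcode < 0:
            logging.error('child %s terminated by signal %d', exe_name, -retcode)
        else:
            logging.debug('child %s returned %d', exe_name, retcode)
        return retcode

    # run_child('./diri_sampler', '-i window.reads.fas -j 20000 -a 0.1 -t 2000')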
logging level - sholog.setLevel(logging.DEBUG) - # This handler writes everything to a file. - LOG_FILENAME = './shorah.log' - h = logging.handlers.RotatingFileHandler(LOG_FILENAME, 'w', - maxBytes=100000, backupCount=5) - f = logging.Formatter("%(levelname)s %(asctime)s %(funcName)s\ - %(lineno)d %(message)s") - h.setFormatter(f) - sholog.addHandler(h) - sholog.info(' '.join(sys.argv)) - - in_stem = '.'.join(os.path.split(args.b)[1].split('.')[:-1]) - - # 1. run main from shorah_dec; it will also launch snv.py - if not os.path.exists('%s.cor.fas' % in_stem): - sholog.debug('running dec.py') - shorah_dec.main(in_bam=args.b, in_fasta=args.f, win_length=args.w, - win_shifts=args.s, max_coverage=args.x, region=args.r, - alpha=args.a, keep_files=args.k, seed=args.S) - - # 2. copy file and run fas2read to convert from fasta - shutil.move('%s.cor.fas' % in_stem, '%s_cor.fas' % in_stem) - sholog.debug('running fas2reads') - my_prog = 'perl -I %s %s' % (dn + '/perllib', - os.path.join(dn, 'fas2read.pl')) - my_arg = " -f %s_cor.fas" % in_stem - assert os.path.isfile("%s_cor.fas" % in_stem), \ - 'File %s_cor.fas not found' % in_stem - retcode_f2r = run_child(my_prog, my_arg) - if retcode_f2r: - sholog.error('fas2read did not return 0') - sys.exit('Something went wrong in fas2read') - else: - sholog.debug('fas2read exited successfully') - - # 3. run contain to eliminate redundant reads - # output: filein.rest - sholog.debug('running contain') - my_prog = os.path.join(dn, 'contain') - my_arg = " -f %s_cor" % in_stem - assert os.path.isfile('%s_cor.fas' % in_stem), \ - 'File %s_cor.fas not found' % in_stem - retcode_c = run_child(my_prog, my_arg) - if retcode_c: - sholog.error('contain did not return 0') - sys.exit('Something went wrong while running contain') - else: - sholog.debug('contain exited successfully') - - # 4. run main from mm.py to find minimal matching - sholog.debug('running mm.py') - shorah_mm.main('%s_cor.rest' % in_stem, maxhaplo=200) - - # 5. run EM freqEst output: sample.$nr.popl - sholog.debug('running freqEst') - my_prog = os.path.join(dn, 'freqEst') - my_arg = " -f %s_cor" % in_stem - assert os.path.isfile('%s_cor.rest' % in_stem), \ - 'File %s_cor.rest not found' % in_stem - retcode_em = run_child(my_prog, my_arg) - if retcode_em: - sholog.error('freqEst did not return 0') - sys.exit('Something went wrong in the EM step') - else: - sholog.debug('freqEst exited successfully') - - # 6. 
copy high frequency haps from .popl to global_haps.fasta - n_rest = len(list(open('%s_cor.rest' % in_stem))) - min_freq = 1.0 / n_rest - g_recs = [] - for s in SeqIO.parse('%s_cor.popl' % in_stem, 'fasta'): - f_here = float(s.id.split('_')[1]) - if f_here >= min_freq: - s.description = '| global_haplotype frequency=%f\n' % f_here - g_recs.append(s) - SeqIO.write(g_recs, '%s_global_haps.fasta' % in_stem, 'fasta') diff --git a/src/scripts/snv.py.in b/src/scripts/snv.py.in deleted file mode 100755 index 7fbdbed..0000000 --- a/src/scripts/snv.py.in +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env @PYTHON@ - -import argparse -import shorah_snv - -# parse command line -parser = argparse.ArgumentParser(description="Single nucleotide variant calling", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - -# set the defaults (see http://bit.ly/2hCTQl) -opts = shorah_snv.main.__defaults__ - -requiredNamed = parser.add_argument_group('required named arguments') - -requiredNamed.add_argument("-r", "--ref", metavar='REF', type=str, required=True, - dest="reference", help="reference file") - -requiredNamed.add_argument("-b", "--bam", metavar='BAM', type=str, required=True, - dest="bam_file", help="sorted bam format alignment file") - -parser.add_argument("-s", "--sigma", metavar='FLOAT', default=opts[0], type=float, - required=False, dest="sigma", help="value of sigma to use when calling\ - SNVs") - -parser.add_argument("-i", "--increment", metavar='INT', default=opts[1], type=int, - required=False, dest="increment", - help="value of increment to use when calling \ - SNVs (1 used by amplian.py)") - -parser.add_argument("-x", "--maxcov", metavar='INT', default=opts[2], type=int, - dest="max_coverage", help="Maximum coverage allowed") - -args = parser.parse_args() - -shorah_snv.main(args.reference, args.bam_file, args.sigma, - args.increment, args.max_coverage) diff --git a/src/shorah/__main__.py b/src/shorah/__main__.py new file mode 100644 index 0000000..42e6c65 --- /dev/null +++ b/src/shorah/__main__.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python3 + +"""Entrypoint module, in case you use `python -mshorah`. + +Why does this file exist, and why __main__? For more info, read: + +- https://www.python.org/dev/peps/pep-0338/ +- https://docs.python.org/2/using/cmdline.html#cmdoption-m +- https://docs.python.org/3/using/cmdline.html#cmdoption-m +""" +from shorah.cli import main + +if __name__ == "__main__": + main() diff --git a/src/scripts/amplian.py.in b/src/shorah/amplicon.py old mode 100755 new mode 100644 similarity index 67% rename from src/scripts/amplian.py.in rename to src/shorah/amplicon.py index 81e1365..aef34e5 --- a/src/scripts/amplian.py.in +++ b/src/shorah/amplicon.py @@ -1,6 +1,6 @@ -#!/usr/bin/env @PYTHON@ +#!/usr/bin/env python3 -# Copyright 2007-2012 +# Copyright 2007-2018 # Niko Beerenwinkel, # Nicholas Eriksson, # Moritz Gerstung, @@ -32,20 +32,30 @@ import os import os.path import sys +import shlex import logging import logging.handlers +from pkg_resources import resource_filename from Bio import SeqIO -# Make a global logging object. -amplog = logging.getLogger(__name__) +dn_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if __name__ == '__main__': + if __package__ is None: + os.sys.path.insert(1, dn_dir) + mod = __import__('shorah') + sys.modules["shorah"] = mod + import shorah_snv +else: + from . 
import shorah_snv + +diri_exe = resource_filename(__name__, 'bin/diri_sampler') +b2w_exe = resource_filename(__name__, 'bin/b2w') win_min_ext = 0.95 cutoff_depth = 10000 -dn = os.path.dirname(__file__) - def run_child(exe_name, arg_string): '''use subrocess to run an external program with arguments''' @@ -54,17 +64,17 @@ def run_child(exe_name, arg_string): if not arg_string.startswith(' '): arg_string = ' ' + arg_string - amplog.debug(exe_name + arg_string) + logging.debug(exe_name + arg_string) try: retcode = subprocess.call(exe_name + arg_string, shell=True) if retcode > 0: - amplog.error(exe_name + arg_string) - amplog.error("Child %s terminated by signal" % exe_name, retcode) + logging.error(exe_name + arg_string) + logging.error("Child %s terminated by signal %s", exe_name, retcode) else: - amplog.debug("Child %s returned %i" % (exe_name, retcode)) + logging.debug("Child %s returned %i", exe_name, retcode) except OSError as ee: - amplog.error("Execution of %s failed:" % exe_name, ee) + logging.error("Execution of %s failed: %s", exe_name, ee) return retcode @@ -103,15 +113,15 @@ def run_diagnostics(window_file, reads): untouched.append(unt) theta.append(the) gamma.append(gam) - del(it) + del it - amplog.info('sample has %d reads' % reads) + logging.info('sample has %d reads', reads) untouched_hst = untouched[-2000:] unt_mean = sum(untouched_hst) / len(untouched_hst) unt_ratio = 100 * unt_mean / q unt_msg = '%3.1f %% of untouched objects ' % \ unt_ratio - amplog.info(unt_msg) + logging.info(unt_msg) if unt_ratio < 90.0: warnings.warn(unt_msg) @@ -123,7 +133,7 @@ def matchremove(matchobj): c = int(match[1:]) else: c = 0 - del(c) + del c return '' @@ -145,7 +155,7 @@ def plot_entropy(pos_ent, pos_coords, ave_ent, win_coords): try: import matplotlib.pyplot as plt except ImportError: - amplog.error('could not import matplotlib, no pdf produced') + logging.error('could not import matplotlib, no pdf produced') return high_start, high_stop = win_coords @@ -173,7 +183,7 @@ def plot_entropy(pos_ent, pos_coords, ave_ent, win_coords): plt.title('chosen entropy window is %d-%d' % (high_start, high_stop)) plt.savefig('entropy.pdf') - + del fig def highest_entropy(bam_file, fasta_file, ent_sel='relative'): '''Parse reads to have their length distribution and compute the @@ -189,13 +199,13 @@ def highest_entropy(bam_file, fasta_file, ent_sel='relative'): os.remove('rl.txt') read_len = sorted(read_len) n_reads = len(read_len) - amplog.info('n_reads: %d' % n_reads) + logging.info('n_reads: %d', n_reads) # max_len = max(read_len) trimmed_mean = sum([read_len[i] for i in range(int(0.1 * n_reads), int(0.9 * n_reads))]) trimmed_mean /= (0.8 * n_reads) trimmed_mean = int(round(trimmed_mean, 0)) - amplog.info('trimmed_mean: %d' % trimmed_mean) + logging.info('trimmed_mean: %d', trimmed_mean) # Build the mpileup and compute the entropy per position ref_seq = list(SeqIO.parse(fasta_file, 'fasta'))[0] entropy = [None] * (len(ref_seq) + 1) @@ -232,7 +242,7 @@ def highest_entropy(bam_file, fasta_file, ent_sel='relative'): stop = i break stop = i - amplog.info('start: %d, stop: %d' % (start, stop)) + logging.info('start: %d, stop: %d', start, stop) # mean entropy ent_mean = [None] * (len(ref_seq) + 1) @@ -246,7 +256,7 @@ def highest_entropy(bam_file, fasta_file, ent_sel='relative'): if entropy[i] > max_ent_per_pos: max_ent_per_pos = entropy[i] highest_ent_pos = i - amplog.info('highest entropy found at position %d' % highest_ent_pos) + logging.info('highest entropy found at position %d', highest_ent_pos) # the 
window is chosen as the absolute max mean_entropy or as the # max mean entropy covering the position with max entropy @@ -293,26 +303,24 @@ def highest_entropy(bam_file, fasta_file, ent_sel='relative'): return high_ent_start, high_ent_stop -def main(in_bam, in_fasta, min_overlap=0.95, max_coverage=50000, - alpha=0.5, s=0.01, region='', diversity=False): +#def main(in_bam, in_fasta, min_overlap=0.95, max_coverage=50000, +# alpha=0.5, s=0.01, region='', diversity=False): +def main(args): ''' Performs the amplicon analysis, running diri_sampler and analyzing the result ''' - - import shorah_snv - - # set logging level - amplog.setLevel(logging.DEBUG) - # This handler writes everything to a file. - LOG_FILENAME = './amplian.log' - hl = logging.handlers.RotatingFileHandler(LOG_FILENAME, 'w', - maxBytes=100000, backupCount=5) - f = logging.Formatter("%(levelname)s %(asctime)s %(funcName)s\ - %(lineno)d %(message)s") - hl.setFormatter(f) - amplog.addHandler(hl) - amplog.info(' '.join(sys.argv)) + in_bam = args.b + in_fasta = args.f + region = args.r + max_coverage = args.max_coverage + alpha = args.a + seed = args.seed + sigma = args.sigma + diversity = args.diversity + min_overlap = args.min_overlap + + logging.info(' '.join(sys.argv)) # info on reference and region if given, or discover high entropy one ref_seq = list(SeqIO.parse(in_fasta, 'fasta'))[0] ref_name = ref_seq.id @@ -329,101 +337,29 @@ def main(in_bam, in_fasta, min_overlap=0.95, max_coverage=50000, ref_length = len(ref_seq) reg_stop = ref_length - amplog.info('analysing region from %d to %d' % (reg_start, reg_stop)) + logging.info('analysing region from %d to %d', reg_start, reg_stop) # output the reads, aligned to the amplicon - b2w_exe = os.path.join(dn, 'b2w') + b2w_args = ' -i 0 -w %d -m %d -x %d %s %s %s' % \ (ref_length, int(min_overlap * ref_length), max_coverage, in_bam, in_fasta, region) - ret_b2w = run_child(b2w_exe, b2w_args) - amplog.debug('b2w returned %d' % ret_b2w) + ret_b2w = run_child(shlex.quote(b2w_exe), b2w_args) + logging.debug('b2w returned %d', ret_b2w) # run diri_sampler on the aligned reads win_file = 'w-%s-%d-%d.reads.fas' % (ref_name, reg_start, reg_stop) h = list(open('coverage.txt'))[0] n_reads = int(h.split()[-1]) assert os.path.exists(win_file), 'window file %s not found' % win_file - diri_exe = os.path.join(dn, 'diri_sampler') + iterations = min(30000, n_reads * 20) diri_args = '-i %s -j %d -a %f -t 2000' % (win_file, iterations, alpha) - ret_diri = run_child(diri_exe, diri_args) - amplog.debug('diri_sampler returned %d' % ret_diri) + ret_diri = run_child(shlex.quote(diri_exe), diri_args) + logging.debug('diri_sampler returned %d', ret_diri) # diagnostics on the convergence run_diagnostics(win_file, n_reads) # run snv.py to parse single nucleotide variants - shorah_snv.main(reference=in_fasta, bam_file=in_bam, sigma=s, increment=1) - - -if __name__ == "__main__": - - import argparse - # parse command line - parser = argparse.ArgumentParser(description="Local haplotype reconstruction - amplicon mode", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - opts = main.__defaults__ # set the defaults (see http://bit.ly/2hCTQl) - - # First define all option groups - group1 = parser.add_argument_group("Input files", "Required input") - - group2 = parser.add_argument_group("Type of run", - "You can specify a region, or look for the highest diversity\ - region") - group3 = parser.add_argument_group("Run options", "Fine tuning") - group4 = parser.add_argument_group("More options", - "Do you really want 
to change this?") - - group1.add_argument("-b", "--bam", metavar='BAM', required=True, type=str, - dest="in_bam", help="file with aligned reads in .bam format") - - group1.add_argument("-f", "--fasta", metavar='REF', required=True, type=str, - dest="in_fasta", help="reference genome in fasta format") - - group2.add_argument("-r", "--region", metavar='chrm:start-stop', default=opts[4], type=str, - dest="region", help="region in format 'chrm:start-stop' e.g.\ - 'ch3:1000-1300'") - - group2.add_argument("-d", "--diversity", action="store_true", default=opts[5], - dest="diversity", help="if set, automatically detects the highest entropy\ - region and runs there") - - group3.add_argument("-m", "--min_overlap", metavar='FLOAT', default=opts[0], type=float, - dest="min_overlap", help="fraction of read overlap to be included") - - group3.add_argument("-a", "--alpha", metavar='FLOAT', default=opts[2], type=float, - dest="alpha", help="alpha in dpm sampling") - - group4.add_argument("-x", "--maxcov", metavar='INT', default=opts[1], type=int, - dest="max_coverage", help="approximate max coverage allowed") - - group4.add_argument("-s", "--sigma", metavar='FLOAT', default=opts[3], type=float, - dest="s", help="sigma value to use when calling SNVs") - - args = parser.parse_args() - - supported_formats = { - 'bam': 'aligned reads', - 'fasta': 'reference genome' - } - # check the input file is in supported format - try: - tmp_filename = os.path.split(args.in_bam)[1] - [in_stem, in_format] = [tmp_filename.split('.')[0], - tmp_filename.split('.')[-1]] - t = supported_formats[in_format] - except IndexError: - print('The input file must be filestem.format') - print('Supported formats are') - for sf in supported_formats.items(): - print(sf[0], ':', sf[1]) - sys.exit() - except KeyError: - print('usage: amplian.py -b bam_file -f fasta_reference') - sys.exit('Please run with -h for all options') - if args.diversity and args.region != '': - sys.exit('Either detect the highest entropy region, or specify one') - - main(args.in_bam, args.in_fasta, args.min_overlap, args.max_coverage, args.alpha, - args.s, args.region, args.diversity) + shorah_snv.main(reference=in_fasta, bam_file=in_bam, sigma=sigma, increment=1) diff --git a/src/shorah/bin/.gitignore b/src/shorah/bin/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/src/shorah/cli.py b/src/shorah/cli.py new file mode 100644 index 0000000..f2ab9a4 --- /dev/null +++ b/src/shorah/cli.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 + +# Copyright 2007-2018 +# Niko Beerenwinkel, +# Nicholas Eriksson, +# Moritz Gerstung, +# Lukas Geyrhofer, +# Kerensa McElroy, +# Osvaldo Zagordi, +# ETH Zurich + +# This file is part of ShoRAH. +# ShoRAH is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# ShoRAH is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with ShoRAH. If not, see . +""" +Module that contains the command line app. + +Why does this file exist, and why not put this in __main__? 
+ + You might be tempted to import things from __main__ later, but that will cause + problems: the code will get executed twice: + + - When you run `python -mminvar` python will execute + ``__main__.py`` as a script. That means there won't be any + ``minvar.__main__`` in ``sys.modules``. + - When you import __main__ it will get executed again (as a module) because + there's no ``minvar.__main__`` in ``sys.modules``. + + Also see (1) from http://click.pocoo.org/5/setuptools/#setuptools-integration +""" +import os +import sys + +from pkg_resources import (get_distribution, DistributionNotFound) + +try: + __version__ = get_distribution('shorah').version +except DistributionNotFound: + # probably installed using Autotools + with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), '.version'), 'r') as version_file: + __version__ = version_file.read() + +# manipulate path to import functions +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if __name__ == '__main__': + if __package__ is None: + os.sys.path.insert(1, parent_dir) + mod = __import__('shorah') + sys.modules["shorah"] = mod + #from common import hcv_map, hiv_map, org_dict, wobbles + #from stats import (genome_coverage, start_stop_coverage) +#else: + #from .common import hcv_map, hiv_map, org_dict, wobbles + #from .stats import (genome_coverage, start_stop_coverage) + +# each subcommand takes one of these functions as default + +def shotgun_run(args): + """Default function for command line parser.""" + from shorah import shotgun + shotgun.main(args) + +def amplicon_run(args): + """Default function for command line parser.""" + from shorah import amplicon + amplicon.main(args) + +def main(): + """Parse command line, run default functions.""" + import argparse + # parse command line + # create the top-level parser + parser = argparse.ArgumentParser(usage='%(prog)s [options]', + epilog="Run `shorah subcommand -h` for more help") + parser.add_argument('-v', '--version', action='version', version=__version__) + + parser.add_argument("-b", "--bam", metavar='BAM', required=True, type=str, + dest="b", help="sorted bam format alignment file") + + parser.add_argument("-f", "--fasta", metavar='REF', required=True, type=str, + dest="f", help="reference genome in fasta format") + + parser.add_argument("-a", "--alpha", metavar='FLOAT', required=False, type=float, + dest="a", default=0.1, help="alpha in dpm sampling") + + parser.add_argument("-r", "--region", metavar='chrm:start-stop', required=False, type=str, + dest="r", default='', help="region in format 'chr:start-stop', e.g. 
'chrm:1000-3000'") + + parser.add_argument("-R", "--seed", metavar='INT', required=False, type=int, + dest="seed", default=None, help="set seed for reproducible results") + + parser.add_argument("-x", "--maxcov", metavar='INT', required=False, type=int, + default=10000, dest="max_coverage", help="approximate max coverage allowed") + + parser.add_argument("-S", "--sigma", metavar='FLOAT', default=0.01, type=float, + dest="sigma", help="sigma value to use when calling SNVs") + + subparsers = parser.add_subparsers(help='available sub-commands') + + # create the parser for command "shotgun" + parser_shotgun = subparsers.add_parser('shotgun', help='run local analysis in shotgun mode') + + parser_shotgun.add_argument("-w", "--windowsize", metavar='INT', required=False, type=int, + dest="w", default=201, help="window size") + + parser_shotgun.add_argument("-s", "--winshifts", metavar='INT', required=False, type=int, + default=3, dest="win_shifts", help="number of window shifts") + + parser_shotgun.add_argument("-k", "--keep_files", required=False, action='store_true', + default=True, dest="keep_files", help="keep all intermediate files") + + parser_shotgun.set_defaults(func=shotgun_run) + + parser_amplicon = subparsers.add_parser('amplicon', help='run local analysis in amplicon mode') + + parser_amplicon.add_argument("-d", "--diversity", action="store_true", default=False, + dest="diversity", help="detect the highest entropy region and run there") + + parser_amplicon.add_argument("-m", "--min_overlap", metavar='FLOAT', default=0.95, type=float, + dest="min_overlap", help="fraction of read overlap to be included") + + parser_amplicon.set_defaults(func=amplicon_run) + + # exit so that log file is not written + if len(sys.argv) == 1 or sys.argv[1] == '-h' or sys.argv[1] == '--help': + parser.print_help() + sys.exit() + + # logging configuration + import logging + import logging.handlers + logging.basicConfig(filename='shorah.log', level=logging.DEBUG, + format='%(levelname)s %(asctime)s %(filename)s: %(funcName)s() %(lineno)d: \t%(message)s', + datefmt='%Y/%m/%d %H:%M:%S') + + logging.info(' '.join(sys.argv)) + logging.info('shorah version:%s', __version__) + # parse the args + args = parser.parse_args() + args.func(args) + +if __name__ == "__main__": # and __package__ is None: + main() diff --git a/src/shorah/contain.cpp b/src/shorah/contain.cpp deleted file mode 100644 index 219d0f8..0000000 --- a/src/shorah/contain.cpp +++ /dev/null @@ -1,221 +0,0 @@ -/* - * contain.cpp -- eliminates redundant reads - * - * usage: contain -f basename - * input: basename.read - * output: basename.rest - * both in the format "start_pos read_string" - * - # Copyright 2007-2012 - # Niko Beerenwinkel, - # Nicholas Eriksson, - # Moritz Gerstung, - # Lukas Geyrhofer, - # Kerensa McElroy - # Osvaldo Zagordi, - # ETH Zurich - - # This file is part of ShoRAH. - # ShoRAH is free software: you can redistribute it and/or modify - # it under the terms of the GNU General Public License as published by - # the Free Software Foundation, either version 3 of the License, or - # (at your option) any later version. - - # ShoRAH is distributed in the hope that it will be useful, - # but WITHOUT ANY WARRANTY; without even the implied warranty of - # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - # GNU General Public License for more details. - - # You should have received a copy of the GNU General Public License - # along with ShoRAH. If not, see . 
-*/ - -#include -#include -#include -#include -#include -#include -#include - -struct Read -{ - unsigned int pos; - unsigned int len; - unsigned int end; - std::string seq; -}; - -void help(const int exit_code = EXIT_FAILURE) -{ - std::cout << "Usage: contain -f basename\n"; - std::cout << "Expects basename.read, outputs basename.rest\n"; - exit(exit_code); -} - -void getReads(std::istream& infile, std::vector& ans) -{ - /* - * get the reads from a file/stdin - * FIXME: this should be much more robust - FIXME: die if the file isn't readable!!! - */ - - Read r; - std::string t; - std::string tmp; - std::string throwaway; - - // line of file is "pos seq" - while (!std::getline(infile, tmp, ' ').eof()) { - if (tmp[0] == '#') { - std::getline(infile, throwaway); - std::cerr << "Comment : " << tmp << " " << throwaway << std::endl; - continue; - } - r.pos = atoi(tmp.c_str()); - if ((r.pos < 0)) { - std::cerr << "Error: " << r.pos << " is an invalid start position\n"; - exit(1); - } - - std::getline(infile, r.seq); - r.len = r.seq.length(); - r.end = r.pos + r.len - 1; - - if (r.len > 0) ans.push_back(r); - } -} - -int contain(std::vector& reads, int i, int j) -{ - // return i if read[i] contained in read[j] - // j if read[j] ............ read[i] - // -1 else - // - int tmp, overlap; - - if ((reads[i].pos > reads[j].end) || (reads[j].pos > reads[i].end)) { - // std::cout << "no overlap\n"; - return -1; - } - - // switch so that i starts first - if ((reads[i].pos > reads[j].pos) || - ((reads[i].pos == reads[j].pos) && (reads[i].len < reads[j].len))) { - tmp = j; - j = i; - i = tmp; - } - overlap = reads[i].pos + reads[i].len - reads[j].pos; - // std::cout << "read " << i << " at pos " << reads[i].pos << " length " << reads[i].len << - // "\n"; - // std::cout << "read " << j << " at pos " << reads[j].pos << " length " << reads[j].len << - // "\n"; - // std::cout << "overlap == " << overlap << "\n"; - if (overlap < reads[j].len) { - // std::cout << "not enough overlap\n"; - return -1; - } - if (overlap > reads[j].len) { - overlap = reads[j].len; - } - if (reads[i].seq.substr(reads[j].pos - reads[i].pos, overlap) == - reads[j].seq.substr(0, overlap)) { - // std::cout << "Agree\n"; - return j; - } else { - // std::cout << "Disagree\n"; - return -1; - } -} - -int main(int argc, char** argv) -{ - int c; - std::vector reads; - std::string basename; - std::string filename; - std::string outfilename; - int i, j; - - /* parse options - * -f file (expect file.read file.geno) - * -h (help) - */ - while (1) { - c = getopt(argc, argv, "f:h?"); - if (c == -1) break; - switch (c) { - case 'f': - basename = optarg; - break; - case '?': - help(); - break; - case 'h': - help(EXIT_SUCCESS); - break; - default: - help(); - } - } - if (basename == "") { - help(); - } - filename = basename + ".read"; - outfilename = basename + ".rest"; - - std::ifstream infile(filename.c_str()); - std::ofstream outfile(outfilename.c_str()); - std::map delReads; - - // pull all reads into file - getReads(infile, reads); - int idx; - int finalReads = 0; - int numDeleted = 0; - std::cout << "readfile read" << std::endl; - // do double loop over all reads. 
- // contain returns the index to delete: either i, j, or -1 - // should be able to speed this up by skipping i or j if - // delReads[i] or j is 1 - // but have to be careful - // - for (i = 0; i < reads.size(); ++i) { - // if reads[i] already deleted, then we will have already - // deleted everything reads[i] makes redundant - if (delReads[i]) { - continue; - } - if (i % 10 == 0) std::cout << "finished " << i << " reads, " << numDeleted << " deleted\n"; - - for (j = i + 1; j < reads.size(); ++j) { - if (delReads[j]) continue; - idx = contain(reads, i, j); - - if (idx == i) { - delReads[idx] = 1; - // skip to next iteration of i loop - j = reads.size(); - numDeleted++; - } else if (idx == j) { - delReads[idx] = 1; - numDeleted++; - } - /* - if (idx != -1) { - std::cout << "del " << idx << " (from " << i << " " << j << ")\n"; - } - */ - } - } - for (i = 0; i < reads.size(); ++i) { - if (delReads[i] != 1) { - finalReads++; - outfile << reads[i].pos << " " << reads[i].seq << "\n"; - } - } - std::cout << "Started with " << reads.size() << " reads.\n"; - std::cout << "Ended with " << finalReads << " reads.\n"; -} diff --git a/src/shorah/doxyfile b/src/shorah/doxyfile deleted file mode 100755 index 7ed8af0..0000000 --- a/src/shorah/doxyfile +++ /dev/null @@ -1,257 +0,0 @@ -# Doxyfile 1.6.1 - -#--------------------------------------------------------------------------- -# Project related configuration options -#--------------------------------------------------------------------------- -DOXYFILE_ENCODING = UTF-8 -PROJECT_NAME = SHORAH-DPM -PROJECT_NUMBER = 0.3a -OUTPUT_DIRECTORY = /home/barnab/Work/doc/ -CREATE_SUBDIRS = NO -OUTPUT_LANGUAGE = English -BRIEF_MEMBER_DESC = YES -REPEAT_BRIEF = YES -ABBREVIATE_BRIEF = -ALWAYS_DETAILED_SEC = NO -INLINE_INHERITED_MEMB = NO -FULL_PATH_NAMES = YES -STRIP_FROM_PATH = -STRIP_FROM_INC_PATH = -SHORT_NAMES = NO -JAVADOC_AUTOBRIEF = NO -QT_AUTOBRIEF = NO -MULTILINE_CPP_IS_BRIEF = NO -INHERIT_DOCS = YES -SEPARATE_MEMBER_PAGES = NO -TAB_SIZE = 8 -ALIASES = -OPTIMIZE_OUTPUT_FOR_C = NO -OPTIMIZE_OUTPUT_JAVA = NO -OPTIMIZE_FOR_FORTRAN = NO -OPTIMIZE_OUTPUT_VHDL = NO -EXTENSION_MAPPING = -BUILTIN_STL_SUPPORT = NO -CPP_CLI_SUPPORT = NO -SIP_SUPPORT = NO -IDL_PROPERTY_SUPPORT = YES -DISTRIBUTE_GROUP_DOC = NO -SUBGROUPING = YES -TYPEDEF_HIDES_STRUCT = NO -SYMBOL_CACHE_SIZE = 0 -#--------------------------------------------------------------------------- -# Build related configuration options -#--------------------------------------------------------------------------- -EXTRACT_ALL = YES -EXTRACT_PRIVATE = NO -EXTRACT_STATIC = NO -EXTRACT_LOCAL_CLASSES = YES -EXTRACT_LOCAL_METHODS = NO -EXTRACT_ANON_NSPACES = NO -HIDE_UNDOC_MEMBERS = NO -HIDE_UNDOC_CLASSES = NO -HIDE_FRIEND_COMPOUNDS = NO -HIDE_IN_BODY_DOCS = NO -INTERNAL_DOCS = NO -CASE_SENSE_NAMES = NO -HIDE_SCOPE_NAMES = NO -SHOW_INCLUDE_FILES = YES -INLINE_INFO = YES -SORT_MEMBER_DOCS = YES -SORT_BRIEF_DOCS = NO -SORT_MEMBERS_CTORS_1ST = NO -SORT_GROUP_NAMES = NO -SORT_BY_SCOPE_NAME = NO -GENERATE_TODOLIST = YES -GENERATE_TESTLIST = YES -GENERATE_BUGLIST = YES -GENERATE_DEPRECATEDLIST= YES -ENABLED_SECTIONS = -MAX_INITIALIZER_LINES = 30 -SHOW_USED_FILES = YES -SHOW_DIRECTORIES = NO -SHOW_FILES = YES -SHOW_NAMESPACES = YES -FILE_VERSION_FILTER = -LAYOUT_FILE = -#--------------------------------------------------------------------------- -# configuration options related to warning and progress messages -#--------------------------------------------------------------------------- -QUIET = NO -WARNINGS = YES 
-WARN_IF_UNDOCUMENTED = YES -WARN_IF_DOC_ERROR = YES -WARN_NO_PARAMDOC = NO -WARN_FORMAT = "$file:$line: $text" -WARN_LOGFILE = -#--------------------------------------------------------------------------- -# configuration options related to the input files -#--------------------------------------------------------------------------- -INPUT = ./dpm_src -INPUT_ENCODING = UTF-8 -FILE_PATTERNS = *.cpp *.h -RECURSIVE = NO -EXCLUDE = -EXCLUDE_SYMLINKS = NO -EXCLUDE_PATTERNS = -EXCLUDE_SYMBOLS = -EXAMPLE_PATH = -EXAMPLE_PATTERNS = -EXAMPLE_RECURSIVE = NO -IMAGE_PATH = -INPUT_FILTER = -FILTER_PATTERNS = -FILTER_SOURCE_FILES = NO -#--------------------------------------------------------------------------- -# configuration options related to source browsing -#--------------------------------------------------------------------------- -SOURCE_BROWSER = NO -INLINE_SOURCES = NO -STRIP_CODE_COMMENTS = YES -REFERENCED_BY_RELATION = NO -REFERENCES_RELATION = NO -REFERENCES_LINK_SOURCE = YES -USE_HTAGS = NO -VERBATIM_HEADERS = YES -#--------------------------------------------------------------------------- -# configuration options related to the alphabetical class index -#--------------------------------------------------------------------------- -ALPHABETICAL_INDEX = NO -COLS_IN_ALPHA_INDEX = 5 -IGNORE_PREFIX = -#--------------------------------------------------------------------------- -# configuration options related to the HTML output -#--------------------------------------------------------------------------- -GENERATE_HTML = YES -HTML_OUTPUT = html -HTML_FILE_EXTENSION = .html -HTML_HEADER = -HTML_FOOTER = -HTML_STYLESHEET = -HTML_ALIGN_MEMBERS = YES -HTML_DYNAMIC_SECTIONS = NO -GENERATE_DOCSET = NO -DOCSET_FEEDNAME = "Doxygen generated docs" -DOCSET_BUNDLE_ID = org.doxygen.Project -GENERATE_HTMLHELP = NO -CHM_FILE = -HHC_LOCATION = -GENERATE_CHI = NO -CHM_INDEX_ENCODING = -BINARY_TOC = NO -TOC_EXPAND = NO -GENERATE_QHP = NO -QCH_FILE = -QHP_NAMESPACE = -QHP_VIRTUAL_FOLDER = doc -QHP_CUST_FILTER_NAME = -QHP_CUST_FILTER_ATTRS = -QHP_SECT_FILTER_ATTRS = -QHG_LOCATION = -DISABLE_INDEX = NO -ENUM_VALUES_PER_LINE = 4 -GENERATE_TREEVIEW = NO -USE_INLINE_TREES = NO -TREEVIEW_WIDTH = 250 -FORMULA_FONTSIZE = 10 -SEARCHENGINE = YES -#--------------------------------------------------------------------------- -# configuration options related to the LaTeX output -#--------------------------------------------------------------------------- -GENERATE_LATEX = YES -LATEX_OUTPUT = latex -LATEX_CMD_NAME = latex -MAKEINDEX_CMD_NAME = makeindex -COMPACT_LATEX = NO -PAPER_TYPE = a4wide -EXTRA_PACKAGES = -LATEX_HEADER = -PDF_HYPERLINKS = YES -USE_PDFLATEX = YES -LATEX_BATCHMODE = NO -LATEX_HIDE_INDICES = NO -LATEX_SOURCE_CODE = NO -#--------------------------------------------------------------------------- -# configuration options related to the RTF output -#--------------------------------------------------------------------------- -GENERATE_RTF = NO -RTF_OUTPUT = rtf -COMPACT_RTF = NO -RTF_HYPERLINKS = NO -RTF_STYLESHEET_FILE = -RTF_EXTENSIONS_FILE = -#--------------------------------------------------------------------------- -# configuration options related to the man page output -#--------------------------------------------------------------------------- -GENERATE_MAN = NO -MAN_OUTPUT = man -MAN_EXTENSION = .3 -MAN_LINKS = NO -#--------------------------------------------------------------------------- -# configuration options related to the XML output 
-#--------------------------------------------------------------------------- -GENERATE_XML = NO -XML_OUTPUT = xml -XML_SCHEMA = -XML_DTD = -XML_PROGRAMLISTING = YES -#--------------------------------------------------------------------------- -# configuration options for the AutoGen Definitions output -#--------------------------------------------------------------------------- -GENERATE_AUTOGEN_DEF = NO -#--------------------------------------------------------------------------- -# configuration options related to the Perl module output -#--------------------------------------------------------------------------- -GENERATE_PERLMOD = NO -PERLMOD_LATEX = NO -PERLMOD_PRETTY = YES -PERLMOD_MAKEVAR_PREFIX = -#--------------------------------------------------------------------------- -# Configuration options related to the preprocessor -#--------------------------------------------------------------------------- -ENABLE_PREPROCESSING = YES -MACRO_EXPANSION = NO -EXPAND_ONLY_PREDEF = NO -SEARCH_INCLUDES = YES -INCLUDE_PATH = -INCLUDE_FILE_PATTERNS = -PREDEFINED = -EXPAND_AS_DEFINED = -SKIP_FUNCTION_MACROS = YES -#--------------------------------------------------------------------------- -# Configuration::additions related to external references -#--------------------------------------------------------------------------- -TAGFILES = -GENERATE_TAGFILE = -ALLEXTERNALS = NO -EXTERNAL_GROUPS = YES -PERL_PATH = /usr/bin/perl -#--------------------------------------------------------------------------- -# Configuration options related to the dot tool -#--------------------------------------------------------------------------- -CLASS_DIAGRAMS = YES -MSCGEN_PATH = -HIDE_UNDOC_RELATIONS = YES -HAVE_DOT = YES -DOT_FONTNAME = FreeSans -DOT_FONTSIZE = 10 -DOT_FONTPATH = -CLASS_GRAPH = YES -COLLABORATION_GRAPH = YES -GROUP_GRAPHS = YES -UML_LOOK = NO -TEMPLATE_RELATIONS = NO -INCLUDE_GRAPH = YES -INCLUDED_BY_GRAPH = YES -CALL_GRAPH = NO -CALLER_GRAPH = NO -GRAPHICAL_HIERARCHY = YES -DIRECTORY_GRAPH = YES -DOT_IMAGE_FORMAT = png -DOT_PATH = -DOTFILE_DIRS = -DOT_GRAPH_MAX_NODES = 50 -MAX_DOT_GRAPH_DEPTH = 0 -DOT_TRANSPARENT = NO -DOT_MULTI_TARGETS = NO -GENERATE_LEGEND = YES -DOT_CLEANUP = YES diff --git a/src/shorah/freqEst.cpp b/src/shorah/freqEst.cpp deleted file mode 100644 index f1bd291..0000000 --- a/src/shorah/freqEst.cpp +++ /dev/null @@ -1,548 +0,0 @@ -/* freqEst.cpp - runs the EM algorithm to estimate haplotype frequencies - * given reads and reconstructed haplotypes - - # Copyright 2007, 2008, 2009 - # Niko Beerenwinkel, - # Nicholas Eriksson, - # Moritz Gerstung, - # Lukas Geyrhofer, - # Osvaldo Zagordi, - # ETH Zurich - - # This file is part of ShoRAH. - # ShoRAH is free software: you can redistribute it and/or modify - # it under the terms of the GNU General Public License as published by - # the Free Software Foundation, either version 3 of the License, or - # (at your option) any later version. - - # ShoRAH is distributed in the hope that it will be useful, - # but WITHOUT ANY WARRANTY; without even the implied warranty of - # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - # GNU General Public License for more details. - - # You should have received a copy of the GNU General Public License - # along with ShoRAH. If not, see . 
-*/ - -#include -#include -#include -#include -#include -#include -#include -#include - -struct Read -{ - unsigned int pos; - unsigned int len; - std::string seq; -}; - -// needed to sort, by Osvaldo -typedef struct comp_class -{ - std::string* hap; - double freq; -} com_pair; - -bool comp_func(com_pair* pi, com_pair* pj) { return (pi->freq > pj->freq); } - -//#define SEEPROB - -int CORRECTPARAM = 1; // which definition of pr(r | h) do we use -int SEED = 1; // random seed -int ITERMAX = 5000; // maximum iterations in EM -double ACCURACY = .000001; // convergence for EM -double KILLP = 0; // kill all probabilities below this -int RUNS = 5; -int N, M; -int unmatchingReads = 0; - -static inline int readMatchesSomeHaplotype(Read& r, std::vector& haplotypes) -{ - for (std::vector::iterator g = haplotypes.begin(); g != haplotypes.end(); ++g) - if (r.seq == (*g).substr(r.pos, r.len)) return 1; - return 0; -} - -void getReads(std::istream& infile, std::vector& ans, std::vector& haplotypes) -{ - /* - * get the reads from a file/stdin - * FIXME: this should be much more robust - */ - - Read r; - std::string t; - std::string tmp; - std::string throwaway; - - // line of file is "pos seq" - while (!std::getline(infile, tmp, ' ').eof()) { - if (tmp[0] == '#') { - std::getline(infile, throwaway); - std::cerr << "Comment : " << tmp << " " << throwaway << std::endl; - continue; - } - r.pos = atoi(tmp.c_str()); - if ((r.pos < 0)) { - std::cerr << "Error: " << r.pos << " is an invalid start position\n"; - exit(1); - } - - std::getline(infile, r.seq); - r.len = r.seq.length(); - // std::cout << r.pos << " " << r.seq << std::endl; - - if (readMatchesSomeHaplotype(r, haplotypes)) { - ans.push_back(r); - } else { - unmatchingReads++; - } - } -} - -void getHaplotypes(std::istream& in, std::vector& genotypesFinal) -{ - /* read in the list of candidate haplotypes */ - std::string tmp; - genotypesFinal.clear(); - std::vector genotypes; - // int i = 0; - - // line of file is "pos seq" - while (!std::getline(in, tmp).eof()) { - if (tmp[0] == '#') { - std::cerr << "Comment : " << tmp << std::endl; - continue; - } - genotypes.push_back(tmp); - // std::cout << i++ << " =" << tmp << ".\n"; - } - if (genotypes.size() == 0) { - std::cout << "Error, no haplotypes\n"; - exit(1); - } - // now genotypes looks like - // PQITSLX - // .QITSSX - // PQITSS. - // So we build a consensus haplotype and fill in the .'s with the consensus - int seqLen = genotypes[0].size(); - char consChar = ' '; - std::string cons; - for (int pos = 0; pos < seqLen; ++pos) { - std::map col; - for (std::vector::iterator g = genotypes.begin(); g != genotypes.end(); ++g) { - col[(*g)[pos]]++; - } - int max = -1; - // don't want to fill in with a '.' - col['.'] = -100; - for (std::map::iterator base = col.begin(); base != col.end(); ++base) { - if (base->second > max) { - consChar = base->first; - max = base->second; - } - } - cons = cons + consChar; - } - std::cout << "consensus haplotype\n" << cons << std::endl; - - std::vector correctionCounts(genotypes.size(), 0); - int tmpCount; - for (std::vector::iterator g = genotypes.begin(); g != genotypes.end(); ++g) { - tmpCount = 0; - for (int pos = 0; pos < seqLen; ++pos) { - if ((*g)[pos] == '.') { - tmpCount++; - (*g)[pos] = cons[pos]; -#ifndef NDEBUG - std::cout << "correcting . 
to " << cons[pos] << " in sequence " << *g << std::endl; -#endif - } - } - correctionCounts.push_back(tmpCount); - } - - // for(std::vector::iterator a = correctionCounts.begin(); a != correctionCounts.end(); - // ++a) { - // std::cout << *a << " "; - //} - // std::cout << std::endl; - - // but the genotypes might not be unique right now - std::map uniq; - for (std::vector::iterator g = genotypes.begin(); g != genotypes.end(); ++g) { - uniq[*g]++; - } - for (std::map::iterator x = uniq.begin(); x != uniq.end(); ++x) { -#ifndef NDEBUG - std::cout << "Haplotype was repeated " << x->second << " times\n"; -#endif - genotypesFinal.push_back(x->first); - } -} - -inline int match(std::string geno, Read r) { return (r.seq == geno.substr(r.pos, r.len)); } - -inline int readsEqual(Read& r, Read& s) -{ - if (r.pos == s.pos) { - if (r.seq == s.seq) { - return 1; - } - } - return 0; -} - -std::vector countReads(std::vector& origReads, std::vector& reads) -{ - // take orig reads and do - // %hash = (rd -> number appearing in origReads - // u = values(hash) - // reads = keys(hash) - // - int tmp; - for (std::vector::iterator r = origReads.begin(); r != origReads.end(); ++r) { - tmp = 1; - for (std::vector::iterator s = origReads.begin(); s != r; ++s) { - if (readsEqual(*r, *s)) { - tmp = 0; - } - } - if (tmp) { - reads.push_back(*r); - } - } - - std::vector u(reads.size(), 0); - - // for(std::vector::iterator r = reads.begin(); r != reads.end(); ++r) { - for (unsigned int i = 0; i < reads.size(); ++i) { - for (std::vector::iterator s = origReads.begin(); s != origReads.end(); ++s) { - if (readsEqual(reads.at(i), *s)) { - u[i]++; - } - } - } -#ifndef NDEBUG - std::cout << "Started with " << origReads.size() << " reads.\n"; - std::cout << reads.size() << " are unique\n"; - std::cout << "Counts:\n"; - for (unsigned int i = 0; i < u.size(); ++i) { - std::cout << u[i] << " "; - } - std::cout << std::endl; -#endif - return u; -} - -void Estep(std::vector const& p, std::vector >& U, - std::vector > const& Z, std::vector const& u) -{ - // given p, fills U with expected frequencies - int i, j; - double ProbY; - -#ifndef NDEBUG - std::cout << "Estep input: " << std::endl; - for (i = 0; i < N; ++i) { - std::cout << p[i] << " "; - } - std::cout << std::endl; -#endif - - for (i = 0; i < M; ++i) { - ProbY = 0; - for (j = 0; j < N; ++j) { - ProbY += Z[i][j] * p[j]; - // std::cout << "ProbY = " << ProbY << std::endl; - } - for (j = 0; j < N; ++j) { - // std::cout << "U[i][j] = " << u[i] << " " << Z[i][j] << " " << p[j] << " " << - // ProbY - //<< - // std::endl; - U[i][j] = u[i] * ((Z[i][j] * p[j]) / ProbY); - } - } - // std::cout << "Estep output: " << std::endl; - // for (i = 0; i < M; ++i) { - // for (j = 0; j < N; ++j) { - // std::cout << U[i][j] << " "; - // } - // std::cout << std::endl; - // } -} - -void Mstep(std::vector& p, std::vector > const& U) -{ - std::vector v(N, 0); - double m = 0; - int i, j; - - for (j = 0; j < N; ++j) { - // std::cout << "." 
<< v[j] << ".\n"; - for (i = 0; i < M; ++i) { - // std::cout << U[i][j] << " \n"; - v[j] += U[i][j]; - } - m += v[j]; - } - - for (j = 0; j < N; ++j) { - p[j] = v[j] / m; - } - -#ifndef NDEBUG - for (j = 0; j < N; ++j) { - std::cout << p[j] << " "; - } - std::cout << std::endl; -#endif -} - -double logLike(std::vector& p, std::vector > const& Z, - std::vector const& u) -{ - int i, j; - - double ell = 0; - double Prob_Y; - for (i = 0; i < M; i++) { - Prob_Y = 0; - for (j = 0; j < N; j++) { - Prob_Y += Z[i][j] * p[j]; - } - if (Prob_Y > 0) { - ell += (u[i] * log(Prob_Y)); - } - } - return ell; -} - -void round(std::vector& p) -{ - for (std::vector::iterator i = p.begin(); i != p.end(); ++i) { - if ((*i) < KILLP) *i = 0; - } -} - -double EM(std::vector& newP, std::vector > const& Z, - std::vector const& u) -{ - double sum = 0; - double newEll = 0; - std::vector p(N, 0); - std::vector > U(M, std::vector(N, 0)); - double ell = 0; - int iter = 0; - int j; - - for (j = 0; j < N; ++j) { - p[j] = rand(); - sum += p[j]; - } - for (j = 0; j < N; ++j) { - p[j] = p[j] / sum; - } - -#ifndef NDEBUG - for (j = 0; j < N; ++j) { - std::cout << p[j] << " "; - } - std::cout << std::endl; -#endif - - while (((iter <= 2) || (std::abs(ell - newEll) > ACCURACY)) && (iter < ITERMAX)) { - if (iter > 0) { - round(newP); - p = newP; - ell = newEll; - } - - Estep(p, U, Z, u); // fills U - Mstep(newP, U); // fills p - newEll = logLike(newP, Z, u); - - // Print out some stats - if (iter % 100 == 50 || iter == 1) { - -#ifdef SEEPROB - printf("%4d\t%f\t", iter, newEll); - - for (j = 0; j < N; ++j) { - if (p[j] > 0.05) { - std::cout << j << ":" << p[j] << " "; - } - } - std::cout << std::endl; -#else - printf("%4d\t%f\n", iter, newEll); -#endif - } - // printf("%.3f %.3f %.3f ", newP[0], newP[1], newP[2]); - // printf("%.3f %.3f %.3f ", newP[3], newP[4], newP[5]); - // printf("%.3f %.3f %.3f\n", newP[6], newP[7], newP[8]); - iter++; - } - return newEll; -} -void help(const int exit_code = EXIT_FAILURE) -{ - std::cout << "Usage: freqEst -f basename [-p precision -i maxiter -r runs -h -k kill -?]\n"; - std::cout << "Expects basename.read and basename.geno\n"; - std::cout << "Outputs to basename.popl\n"; - exit(exit_code); -} - -int main(int argc, char** argv) -{ - int c; - std::vector haplotypes; - std::vector origReads, reads; - std::string basename; - std::string filename; - std::string genotypeFilename, outfilename; - int i, j; - - /* parse options - * -f file (expect file.read file.geno) - * -h (help) - */ - while (1) { - c = getopt(argc, argv, "f:h?p:i:r:k:s:"); - if (c == -1) break; - switch (c) { - case 'k': - KILLP = atof(optarg); - break; - case 's': - SEED = atoi(optarg); - break; - case 'f': - basename = optarg; - break; - case 'p': - ACCURACY = atof(optarg); - break; - case 'i': - ITERMAX = atoi(optarg); - break; - case 'r': - RUNS = atoi(optarg); - break; - case '?': - help(); - break; - case 'h': - help(EXIT_SUCCESS); - break; - default: - help(); - } - } - if (basename == "") { - help(); - } - filename = basename + ".read"; - genotypeFilename = basename + ".geno"; - outfilename = basename + ".popl"; - - srand(SEED); - - std::ifstream infile(filename.c_str()); - std::ifstream ginfile(genotypeFilename.c_str()); - if (ginfile.fail() || infile.fail()) { - std::cout << "ERROR -- can't read " << basename << ".geno or .read\n"; - exit(1); - } - std::ofstream outfile(outfilename.c_str()); - // outfile.open (outfilename.c_str()); - - getHaplotypes(ginfile, haplotypes); - - getReads(infile, origReads, haplotypes); - - 
// uniq reads - std::vector u; - u = countReads(origReads, reads); - - M = reads.size(); - N = haplotypes.size(); - - std::cout << N << " haplotypes\n" << M << " reads\n"; - std::cout << "Threw out " << unmatchingReads << " unexplained reads\n"; - - // count how many reads a haplotype is compat with - std::vector K(N, 0); - - // the M x N matrix of 0/1 incidences - std::vector > A(M, std::vector(N, 0)); - - for (i = 0; i < M; ++i) { - for (j = 0; j < N; ++j) { - if (match(haplotypes[j], reads[i])) { - A[i][j] = 1; - K[j]++; - } - } - } - - // my @Z; # Z[i][j] == Prob(Y = y_i | X = x_j) - std::vector > Z(M, std::vector(N, 0)); - for (i = 0; i < M; ++i) { - for (j = 0; j < N; ++j) { - if (A[i][j] == 1) { - if (CORRECTPARAM) { - Z[i][j] = 1.0 / 1000; - } else { - Z[i][j] = 1.0 / K[j]; - } - // std::cout << "Z[i][j] = "<< Z[i][j] << std::endl; - } - } - } - - std::vector prob(N, 0); - std::vector bestProb(N, 0); - double logL; - double bestL = -1e100; - - for (int run = 0; run < RUNS; ++run) { - std::cout << "run " << run << std::endl; - logL = EM(prob, Z, u); - if (logL > bestL) { - bestProb = prob; - bestL = logL; - } - } - - // sort the haplotypes by frequency before printing - // added by Osvaldo - std::string s6(10, 'x'); - com_pair** freq_hap; - freq_hap = (com_pair**)calloc(N, sizeof(com_pair*)); - for (int i = 0; i < N; ++i) { - freq_hap[i] = (com_pair*)calloc(1, sizeof(com_pair)); - freq_hap[i]->hap = &(haplotypes[i]); - freq_hap[i]->freq = bestProb[i]; - } - - std::sort(freq_hap, freq_hap + N, comp_func); - - for (int i = 0; i < N; ++i) { - if (freq_hap[i]->freq > 0) { - // outfile << ">HAP" << i << "_" << bestProb[i] << "\n" << haplotypes[i] << std::endl; - outfile << ">HAP" << i << "_" << freq_hap[i]->freq << "\n" - << *freq_hap[i]->hap << std::endl; - std::cout << i << "\t" << freq_hap[i]->freq << "\t" << *freq_hap[i]->hap << std::endl; - // std::cout << i << " " << freq_hap[i]->freq << std::endl; - } - } - // The output is now sorted - return 0; -} diff --git a/src/python/shorah_snv.py b/src/shorah/shorah_snv.py old mode 100755 new mode 100644 similarity index 91% rename from src/python/shorah_snv.py rename to src/shorah/shorah_snv.py index ec0c377..a8def08 --- a/src/python/shorah_snv.py +++ b/src/shorah/shorah_snv.py @@ -1,4 +1,6 @@ -# Copyright 2007-2014 +#!/usr/bin/env python3 + +# Copyright 2007-2018 # Niko Beerenwinkel, # Nicholas Eriksson, # Moritz Gerstung, @@ -38,12 +40,13 @@ import shutil import sys import warnings +import shlex import logging -import logging.handlers -# Make a global logging object. 
-snvlog = logging.getLogger(__name__) +from pkg_resources import resource_filename +fil_exe = resource_filename(__name__, 'bin/fil') + # used to parse variants from support files posterior_thresh = 0.9 @@ -57,7 +60,7 @@ def segments(incr): try: infile = open('coverage.txt') except IOError: - snvlog.error('Coverage file generated by b2w not found.') + logging.error('Coverage file generated by b2w not found.') sys.exit('Coverage file generated by b2w not found.') for f in infile: # window_file, reference, begin, end, value @@ -111,7 +114,7 @@ def parseWindow(line, ref1): else: window = open(filename, 'r') except IOError: - snvlog.error('File not found') + logging.error('File not found') return snp beg = int(beg) @@ -125,7 +128,7 @@ def parseWindow(line, ref1): post, av = float(match_obj.group(1)), float(match_obj.group(2)) if post > 1.0: warnings.warn('posterior = %4.3f > 1' % post) - snvlog.warning('posterior = %4.3f > 1' % post) + logging.warning('posterior = %4.3f > 1' % post) if post >= posterior_thresh: reads += av pos = beg @@ -143,7 +146,7 @@ def parseWindow(line, ref1): if tot_snv > max_snv: max_snv = tot_snv - snvlog.info('max number of snvs per sequence found: %d' % max_snv) + logging.info('max number of snvs per sequence found: %d', max_snv) # normalize for k, v in snp.items(): v[5] /= v[4] @@ -161,7 +164,7 @@ def getSNV(ref, segCov, incr): try: cov_file = open('coverage.txt') except IOError: - snvlog.error('Coverage file generated by b2w not found') + logging.error('Coverage file generated by b2w not found') sys.exit('Coverage file generated by b2w not found') # cycle over all windows reported in coverage.txt @@ -172,7 +175,7 @@ def getSNV(ref, segCov, incr): if incr == 1: incr = end - beg + 1 single_window = True - snvlog.info('working on single window as invoked by amplian') + logging.info('working on single window as invoked by amplian') key = list(snp.keys()) key.sort() for k in key: @@ -310,11 +313,11 @@ def sb_filter(in_bam, sigma, amplimode="", max_coverage=100000): """run strand bias filter calling the external program 'fil' """ import subprocess - dn = sys.path[0] - my_prog = os.path.join(dn, 'fil') + # dn = sys.path[0] + my_prog = shlex.quote(fil_exe) # os.path.join(dn, 'fil') my_arg = ' -b ' + in_bam + ' -v ' + str(sigma) + amplimode + ' -x ' \ + str(max_coverage) - snvlog.debug('running %s%s' % (my_prog, my_arg)) + logging.debug('running %s%s', my_prog, my_arg) retcode = subprocess.call(my_prog + my_arg, shell=True) return retcode @@ -348,32 +351,21 @@ def main(reference, bam_file, sigma=0.01, increment=1, max_coverage=100000): import csv import inspect - # set logging level - snvlog.setLevel(logging.DEBUG) - # This handler writes everything to a file. 
- LOG_FILENAME = './snv.log' - h = logging.handlers.RotatingFileHandler(LOG_FILENAME, 'w', - maxBytes=100000, backupCount=5) - fl = logging.Formatter("%(levelname)s %(asctime)s %(funcName)s\ - %(lineno)d %(message)s") - h.setFormatter(fl) - snvlog.addHandler(h) - - snvlog.info(str(inspect.getargspec(main))) + logging.info(str(inspect.getfullargspec(main))) ref_m = dict([[s.id, str(s.seq).upper()] for s in SeqIO.parse(reference, 'fasta')]) # number of windows per segment segCov_m = segments(increment) - snvlog.debug('coverage parsed') + logging.debug('coverage parsed') # snpD_m is the file with the 'consensus' SNVs (from different windows) - snvlog.debug('now parsing SNVs') + logging.debug('now parsing SNVs') if not os.path.isfile('snv/SNV.txt'): snpD_m = getSNV(ref_m, segCov_m, increment) printRaw(snpD_m, increment) else: - snvlog.debug('snv/SNV.txt found, moving to ./') + logging.debug('snv/SNV.txt found, moving to ./') shutil.move('snv/SNV.txt', './') # run strand bias filter, output in SNVs_%sigma.txt @@ -383,7 +375,7 @@ def main(reference, bam_file, sigma=0.01, increment=1, max_coverage=100000): else: retcode_m = sb_filter(bam_file, sigma, max_coverage=max_coverage) if retcode_m is not 0: - snvlog.error('sb_filter exited with error %d' % retcode_m) + logging.error('sb_filter exited with error %d', retcode_m) sys.exit() # parse the p values from SNVs*txt file diff --git a/src/python/shorah_dec.py b/src/shorah/shotgun.py old mode 100755 new mode 100644 similarity index 85% rename from src/python/shorah_dec.py rename to src/shorah/shotgun.py index d5b95a7..c957f88 --- a/src/python/shorah_dec.py +++ b/src/shorah/shotgun.py @@ -1,4 +1,6 @@ -# Copyright 2007-2012 +#!/usr/bin/env python3 + +# Copyright 2007-2018 # Niko Beerenwinkel, # Nicholas Eriksson, # Moritz Gerstung, @@ -27,17 +29,28 @@ from __future__ import division from __future__ import print_function -import numpy as np + import os import pipes import sys - +import shlex import logging -import logging.handlers +from pkg_resources import resource_filename -# Make a global logging object. -declog = logging.getLogger(__name__) +import numpy as np +dn_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if __name__ == '__main__': + if __package__ is None: + os.sys.path.insert(1, dn_dir) + mod = __import__('shorah') + sys.modules["shorah"] = mod + import shorah_snv +else: + from . 
import shorah_snv + +diri_exe = resource_filename(__name__, 'bin/diri_sampler') +b2w_exe = resource_filename(__name__, 'bin/b2w') ################################################# # a common user should not edit above this line # ################################################# @@ -60,13 +73,14 @@ # Dictionary storing by read ID (key) the posterior for each of window quality = {} -count = {} -count['A'] = 0 -count['C'] = 0 -count['G'] = 0 -count['T'] = 0 -count['X'] = 0 -count['-'] = 0 +count = { + 'A': 0, + 'C': 0, + 'G': 0, + 'T': 0, + 'X': 0, + '-': 0 +} clusters = [[]] untouched = [[]] @@ -91,18 +105,18 @@ def parse_aligned_reads(reads_file): out_reads = {} if not os.path.isfile(reads_file): - declog.error('There should be a file here: ' + reads_file) + logging.error('There should be a file here: ' + reads_file) sys.exit('There should be a file here: ' + reads_file) else: - declog.info('Using file ' + reads_file + ' of aligned reads') + logging.info('Using file ' + reads_file + ' of aligned reads') handle = open(reads_file) - declog.debug('Parsing aligned reads') + logging.debug('Parsing aligned reads') for h in handle: name, start, stop, mstart, mstop, this_m = h.rstrip().split('\t') if this_m == '': - declog.warning('parsing empty read: %s' % h) + logging.warning('parsing empty read: %s', h) out_reads[name] = [None, None, None, None, []] # start of the region of interest (0-based indexing) out_reads[name][0] = int(start) @@ -122,21 +136,22 @@ def windows(run_settings): """ import subprocess bam, fasta, w, i, m, x, reg = run_settings - dn = sys.path[0] - my_prog = os.path.join(dn, 'b2w') + #dn = sys.path[0] + my_prog = shlex.quote(b2w_exe) # os.path.join(dn, 'b2w') + my_arg = ' -w %i -i %i -m %i -x %i %s %s %s' % \ (w, i, m, x, bam, fasta, reg) try: retcode = subprocess.call(my_prog + my_arg, shell=True) if retcode > 0: - declog.error('%s %s' % (my_prog, my_arg)) - declog.error('b2w returned %i' % retcode) + logging.error('%s %s', my_prog, my_arg) + logging.error('b2w returned %i', retcode) else: - declog.debug('Finished making windows') - declog.debug('b2w returned %i' % retcode) + logging.debug('Finished making windows') + logging.debug('b2w returned %i', retcode) except OSError as ee: - declog.error('Execution of b2w failed: %s' % ee) + logging.error('Execution of b2w failed: %s', ee) return retcode @@ -153,7 +168,7 @@ def run_dpm(run_setting): stem = filein.split('.reads')[0] corgz = 'corrected/%s.reads-cor.fas.gz' % stem if os.path.exists(corgz): - declog.debug('file %s already analysed, skipping' % filein) + logging.debug('file %s already analysed, skipping', filein) return # if already run before, extract the read file @@ -164,8 +179,8 @@ def run_dpm(run_setting): shutil.move(fstgz, './') subprocess.check_call(["gunzip", "%s-reads.gz" % stem]) - dn = sys.path[0] - my_prog = os.path.join(dn, 'diri_sampler') + # dn = sys.path[0] + my_prog = shlex.quote(diri_exe) # os.path.join(dn, 'diri_sampler') my_arg = ' -i %s -j %i -t %i -a %f -K %d -R %d' % \ (pipes.quote(filein), j, int(j * hist_fraction), a, init_K, seed) @@ -174,18 +189,18 @@ def run_dpm(run_setting): # os.remove('./assignment.tmp') except OSError: pass - declog.debug(my_prog + my_arg) + logging.debug(my_prog + my_arg) # runs the gibbs sampler for the dirichlet process mixture try: retcode = subprocess.call(my_prog + my_arg, shell=True) if retcode < 0: - declog.error('%s %s' % (my_prog, my_arg)) - declog.error('Child %s terminated by SIG %d' % (my_prog, -retcode)) + logging.error('%s %s' % (my_prog, my_arg)) + 
logging.error('Child %s terminated by SIG %d' % (my_prog, -retcode)) else: - declog.debug('run %s finished' % my_arg) - declog.debug('Child %s returned %i' % (my_prog, retcode)) + logging.debug('run %s finished' % my_arg) + logging.debug('Child %s returned %i' % (my_prog, retcode)) except OSError as ee: - declog.error('Execution of %s failed: %s' % (my_prog, ee)) + logging.error('Execution of %s failed: %s' % (my_prog, ee)) return @@ -231,7 +246,7 @@ def correct_reads(chr_c, wstart, wend): handle.close() return except IOError: - declog.warning('No reads in window %s?' % wstart) + logging.warning('No reads in window %s?' % wstart) return @@ -350,8 +365,9 @@ def merge_corrected_reads(aligned_read): return(ID, merged_corrected_read) -def main(in_bam, in_fasta, win_length=201, win_shifts=3, region='', - max_coverage=10000, alpha=0.1, keep_files=True, seed=None): +#def main(in_bam, in_fasta, win_length=201, win_shifts=3, region='', +# max_coverage=10000, alpha=0.1, keep_files=True, seed=None): +def main(args): ''' Performs the error correction analysis, running diri_sampler and analyzing the result @@ -361,25 +377,25 @@ def main(in_bam, in_fasta, win_length=201, win_shifts=3, region='', import shutil import time - import shorah_snv + #import shorah_snv + + in_bam = args.b + in_fasta = args.f + win_length = args.w + win_shifts = args.win_shifts + region = args.r + max_coverage = args.max_coverage + alpha = args.a + keep_files = args.keep_files + seed = args.seed - # set logging level - declog.setLevel(logging.DEBUG) - # This handler writes everything to a file. - LOG_FILENAME = './dec.log' - hl = logging.handlers.RotatingFileHandler(LOG_FILENAME, 'w', - maxBytes=100000, backupCount=5) - f = logging.Formatter("%(levelname)s %(asctime)s\ - %(funcName)s %(lineno)d %(message)s") - hl.setFormatter(f) - declog.addHandler(hl) - declog.info(' '.join(sys.argv)) + logging.info(' '.join(sys.argv)) # check options if win_length % win_shifts != 0: sys.exit('Window size must be divisible by win_shifts') if win_min_ext < 1 / win_shifts: - declog.warning('Some bases might not be covered by any window') + logging.warning('Some bases might not be covered by any window') if max_coverage / win_length < 1: sys.exit('Please increase max_coverage') if not os.path.isfile(in_bam): @@ -406,14 +422,14 @@ def main(in_bam, in_fasta, win_length=201, win_shifts=3, region='', if win_length > gen_length: sys.exit('The window size must be smaller than the genome region') - declog.info('%s reads are being considered' % len(aligned_reads)) + logging.info('%s reads are being considered', len(aligned_reads)) ############################################ # Now the windows and the error correction # ############################################ runlist = win_to_run(alpha, seed) - declog.info('will run on %d windows' % len(runlist)) + logging.info('will run on %d windows', len(runlist)) # run diri_sampler on all available processors but one max_proc = max(cpu_count() - 1, 1) pool = Pool(processes=max_proc) @@ -434,24 +450,24 @@ def main(in_bam, in_fasta, win_length=201, win_shifts=3, region='', proposed = {} for i in runlist: winFile, j, a, s = i - del(a) # in future alpha might be different on each window + del a # in future alpha might be different on each window parts = winFile.split('.')[0].split('-') chrom = '-'.join(parts[1:-2]) beg = int(parts[-2]) end = int(parts[-1]) - declog.info('reading windows for start position %s' % beg) + logging.info('reading windows for start position %s', beg) # correct reads populates correction and 
quality, globally defined correct_reads(chrom, beg, end) stem = 'w-%s-%s-%s' % (chrom, beg, end) - declog.info('this is window %s' % stem) + logging.info('this is window %s' % stem) dbg_file = stem + '.dbg' # if os.path.exists(dbg_file): proposed[beg] = (get_prop(dbg_file), j) - declog.info('there were %s proposed' % str(proposed[beg][0])) + logging.info('there were %s proposed' % str(proposed[beg][0])) # (re)move intermediate files if not keep_all_files: - declog.info('removing intermediate files') + logging.info('removing intermediate files') tr_files = glob.glob('./w*reads.fas') tr_files.extend(glob.glob('./*.smp')) tr_files.extend(glob.glob('./w*.dbg')) @@ -538,7 +554,7 @@ def main(in_bam, in_fasta, win_length=201, win_shifts=3, region='', ## correction[read_id][wstart] = sequence ## ## quality[read_id][wstart] = posterior ## # ########################################## - declog.info('Merging windows of corrected reads') + logging.info('Merging windows of corrected reads') # Multi-threaded version params = list(aligned_reads.items()) pool = Pool(processes=max_proc) @@ -546,12 +562,12 @@ def main(in_bam, in_fasta, win_length=201, win_shifts=3, region='', pool.close() pool.join() - declog.info('All corrected reads have been merged') + logging.info('All corrected reads have been merged') ccx = {} cin_stem = '.'.join(os.path.split(in_bam)[1].split('.')[:-1]) fch = open('%s.cor.fas' % cin_stem, 'w') - declog.debug('writing to file %s.cor.fas' % cin_stem) + logging.debug('writing to file %s.cor.fas', cin_stem) for ID, seq_list in to_correct: cor_read = ''.join(seq_list) init_x = len(cor_read.lstrip('-')) - len(cor_read.lstrip('-X')) @@ -584,7 +600,7 @@ def main(in_bam, in_fasta, win_length=201, win_shifts=3, region='', (kp, proposed[kp][0] / proposed[kp][1])) ph.close() - declog.info('running snv.py') + logging.info('running snv.py') shorah_snv.main(reference=in_fasta, bam_file=in_bam, increment=win_length // win_shifts, max_coverage=max_coverage) @@ -597,4 +613,4 @@ def main(in_bam, in_fasta, win_length=201, win_shifts=3, region='', for snv_file in glob.glob('./SNV*'): shutil.move(snv_file, 'snv/') - declog.info('dec.py ends') + logging.info('shotgun run ends')
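
A minimal usage sketch, not part of the patch itself: with this refactoring, the sub-commands simply hand the parsed argparse namespace to `shotgun.main(args)` (or `amplicon.main(args)`), so the shotgun pipeline can also be driven programmatically by building an equivalent namespace. The attribute names below mirror the `dest` values defined in the new parser; `sample.bam` and `ref.fasta` are placeholder file names, and the sketch assumes the `shorah` package is importable (installed or on `sys.path`).

    # Hypothetical example: invoke the refactored shotgun entry point without the CLI.
    from argparse import Namespace

    from shorah import shotgun

    args = Namespace(
        b='sample.bam',        # sorted BAM alignment (placeholder), as for --bam
        f='ref.fasta',         # reference FASTA (placeholder), as for --fasta
        w=201,                 # window size (--windowsize)
        win_shifts=3,          # number of window shifts (--winshifts)
        r='',                  # optional region 'chr:start-stop' (--region)
        max_coverage=10000,    # approximate maximum coverage (--maxcov)
        a=0.1,                 # alpha of the Dirichlet process mixture (--alpha)
        keep_files=True,       # keep intermediate files (--keep_files)
        seed=None,             # seed for reproducible results (--seed)
    )

    shotgun.main(args)

The shell equivalent, assuming the `shorah` script is installed, would be along the lines of `shorah -b sample.bam -f ref.fasta shotgun`, since the global options are defined on the top-level parser and the sub-command follows them.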