diff --git a/.gitignore b/.gitignore
index 1e536d7..07db275 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
# Autotools cruft (NEVER check this into VCS)
Makefile
Makefile.in
+.version
aclocal.m4
.deps/
.dirstamp
@@ -17,37 +18,36 @@ configure
depcomp
install-sh
missing
+py-compile
+ar-lib
+/shorah
# generated tarball
shorah*.tar.bz2
# executables
b2w
-contain
diri_sampler
fil
-freqEst
# samtools lib
libbam.a
-# python/perl scripts are generated by Autoconf from *.{py,pl}.in
-# files, replacing the shebang in the process
-fas2read.pl
-src/scripts/amplian.py
-src/scripts/dec.py
-src/scripts/mm.py
-src/scripts/shorah.py
-src/scripts/snv.py
-
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
-py-compile
# OS X temporary files that should never be committed
.DS_Store
*.swp
*.lock
profile
+
+# other intermediate stuff
+build
+.eggs
+src/ShoRAH.egg-info/
+
+# shorah output
+shorah.log
diff --git a/.travis.yml b/.travis.yml
index 699457a..a6bc46a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,13 +1,17 @@
-language: c cpp
+language: c cpp python
+
+python:
+ - "3.6"
# GSL and Autotools needed
before_install:
- sudo add-apt-repository ppa:ondrej/autotools -y
- sudo apt-get update -qq
- sudo apt-get install -y gsl-bin libgsl0-dev autoconf automake-1.15 pkg-config m4
+ - source ~/virtualenv/python3.6/bin/activate
compiler:
- gcc
before_script: autoreconf -vif
-script: ./configure && make -j1 && make distcheck
+script: ./configure && make -j1 distcheck
diff --git a/CHANGELOG b/CHANGELOG.md
similarity index 100%
rename from CHANGELOG
rename to CHANGELOG.md
diff --git a/Makefile.am b/Makefile.am
index d96df09..3e0ef0f 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,48 +1,38 @@
# install scripts in ${bindir}
bin_SCRIPTS = \
- src/scripts/amplian.py \
- src/scripts/dec.py \
- src/scripts/mm.py \
- src/scripts/shorah.py \
- src/scripts/snv.py \
- fas2read.pl
+ shorah
CLEANFILES = $(bin_SCRIPTS)
EXTRA_DIST = \
- src/scripts/amplian.py.in \
- src/scripts/dec.py.in \
- src/scripts/mm.py.in \
- src/scripts/shorah.py.in \
- src/scripts/snv.py.in \
- src/scripts/fas2read.pl.in \
- src/samtools/COPYING
+ .version \
+ meson.build \
+ setup.py \
+ src/meson.build \
+ src/cpp/meson.build \
+ src/samtools/COPYING \
+ src/samtools/meson.build
+BUILT_SOURCES = .version
-fas2read.pl: src/scripts/fas2read.pl.in Makefile
- $(SED) 's,[@]perldir[@],$(perldir),g' $< > $@
+shorah: $(top_srcdir)/src/shorah/__main__.py Makefile
+ $(SED) 's,#!/usr/bin/env python3,#!/usr/bin/env $(PYTHON),g' $(top_srcdir)/src/shorah/__main__.py > $@
# actual C/C++ programs doing the real computation
-bin_PROGRAMS = diri_sampler contain freqEst b2w fil
+bin_PROGRAMS = diri_sampler b2w fil
diri_sampler_SOURCES = \
- src/shorah/data_structures.hpp \
- src/shorah/dpm_sampler.hpp \
- src/shorah/dpm_sampler.cpp
+ src/cpp/data_structures.hpp \
+ src/cpp/dpm_sampler.hpp \
+ src/cpp/dpm_sampler.cpp
diri_sampler_CPPFLAGS = $(GSL_CFLAGS)
diri_sampler_LDADD = $(GSL_LIBS)
-contain_SOURCES = \
- src/shorah/contain.cpp
-
-freqEst_SOURCES = \
- src/shorah/freqEst.cpp
-
b2w_SOURCES = \
- src/shorah/b2w.cpp
+ src/cpp/b2w.cpp
b2w_CXXFLAGS = $(PTHREAD_CFLAGS)
b2w_CPPFLAGS = -I$(top_srcdir)/src/samtools
b2w_LDADD = libbam.a $(ZLIB_LIBS) $(PTHREAD_LIBS)
fil_SOURCES = \
- src/shorah/fil.cpp
+ src/cpp/fil.cpp
fil_CXXFLAGS = $(PTHREAD_CFLAGS)
fil_CPPFLAGS = $(GSL_CFLAGS) -I$(top_srcdir)/src/samtools
fil_LDADD = libbam.a $(ZLIB_LIBS) $(GSL_LIBS) $(PTHREAD_LIBS)
@@ -85,19 +75,14 @@ libbam_a_SOURCES = \
libbam_a_CPPFLAGS = $(ZLIB_CFLAGS) -D_USE_KNETFILE
libbam_a_CFLAGS = $(PTHREAD_CFLAGS)
-# perl modules
-dist_perl_DATA = \
- src/perl/DNA.pm \
- src/perl/NikoUtil.pm \
- src/perl/ReadUtil.pm \
- src/perl/Util.pm
-
# python modules
-python_PYTHON = \
- src/python/shorah_dec.py \
- src/python/shorah_matching.py \
- src/python/shorah_mm.py \
- src/python/shorah_snv.py
+dist_pkgpython_PYTHON = \
+ src/shorah/__main__.py \
+ src/shorah/amplicon.py \
+ src/shorah/cli.py \
+ src/shorah/shorah_snv.py \
+ src/shorah/shotgun.py
+pkgpython_PYTHON = .version
# example files
dist_pkgdata_DATA = \
@@ -110,3 +95,9 @@ dist_ampliconpkgdata_DATA = \
examples/amplicon_test/ampli_sorted.bam \
examples/amplicon_test/amplicon_reads.fastq \
examples/amplicon_test/reference.fasta
+
+# version handling
+$(top_srcdir)/.version:
+ echo $(VERSION) > $@-t && mv $@-t $@
+dist-hook:
+ echo $(VERSION) > $(distdir)/.tarball-version
diff --git a/README.md b/README.md
index 1883942..ef828fe 100755
--- a/README.md
+++ b/README.md
@@ -132,10 +132,10 @@ These can be run one after the other, or one can invoke `shorah.py`, that runs
the whole process from bam file to frequency estimation and SNV calling.
## Coding style
-All changes to the C++ code in `src/shorah` should always be formatted according to the included `.clang-format` style by doing
+All changes to the C++ code in `src/cpp` should always be formatted according to the included `.clang-format` style by doing
- clang-format -style=file -i src/shorah/*.[ch]pp
+ clang-format -style=file -i src/cpp/*.[ch]pp
in the root of the repository.
-All changes to the python code in `src/python` and `src/scripts` should always be formatted conforming to the [PEP 8](https://www.python.org/dev/peps/pep-0008/) style guide. To this end, we advise to use [autopep8](https://pypi.python.org/pypi/autopep8).
+All changes to the python code in `src/shorah` should always be formatted conforming to the [PEP 8](https://www.python.org/dev/peps/pep-0008/) style guide. To this end, we advise to use [autopep8](https://pypi.python.org/pypi/autopep8).
diff --git a/build-aux/git-version-gen b/build-aux/git-version-gen
new file mode 100755
index 0000000..6d073fc
--- /dev/null
+++ b/build-aux/git-version-gen
@@ -0,0 +1,227 @@
+#!/bin/sh
+# Print a version string.
+scriptversion=2018-03-07.03; # UTC
+
+# Copyright (C) 2007-2018 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+# This script is derived from GIT-VERSION-GEN from GIT: https://git-scm.com/.
+# It may be run two ways:
+# - from a git repository in which the "git describe" command below
+# produces useful output (thus requiring at least one signed tag)
+# - from a non-git-repo directory containing a .tarball-version file, which
+# presumes this script is invoked like "./git-version-gen .tarball-version".
+
+# In order to use intra-version strings in your project, you will need two
+# separate generated version string files:
+#
+# .tarball-version - present only in a distribution tarball, and not in
+# a checked-out repository. Created with contents that were learned at
+# the last time autoconf was run, and used by git-version-gen. Must not
+# be present in either $(srcdir) or $(builddir) for git-version-gen to
+# give accurate answers during normal development with a checked out tree,
+# but must be present in a tarball when there is no version control system.
+# Therefore, it cannot be used in any dependencies. GNUmakefile has
+# hooks to force a reconfigure at distribution time to get the value
+# correct, without penalizing normal development with extra reconfigures.
+#
+# .version - present in a checked-out repository and in a distribution
+# tarball. Usable in dependencies, particularly for files that don't
+# want to depend on config.h but do want to track version changes.
+# Delete this file prior to any autoconf run where you want to rebuild
+# files to pick up a version string change; and leave it stale to
+# minimize rebuild time after unrelated changes to configure sources.
+#
+# As with any generated file in a VC'd directory, you should add
+# /.version to .gitignore, so that you don't accidentally commit it.
+# .tarball-version is never generated in a VC'd directory, so needn't
+# be listed there.
+#
+# Use the following line in your configure.ac, so that $(VERSION) will
+# automatically be up-to-date each time configure is run (and note that
+# since configure.ac no longer includes a version string, Makefile rules
+# should not depend on configure.ac for version updates).
+#
+# AC_INIT([GNU project],
+# m4_esyscmd([build-aux/git-version-gen .tarball-version]),
+# [bug-project@example])
+#
+# Then use the following lines in your Makefile.am, so that .version
+# will be present for dependencies, and so that .version and
+# .tarball-version will exist in distribution tarballs.
+#
+# EXTRA_DIST = $(top_srcdir)/.version
+# BUILT_SOURCES = $(top_srcdir)/.version
+# $(top_srcdir)/.version:
+# echo $(VERSION) > $@-t && mv $@-t $@
+# dist-hook:
+# echo $(VERSION) > $(distdir)/.tarball-version
+
+
+me=$0
+
+version="git-version-gen $scriptversion
+
+Copyright 2011 Free Software Foundation, Inc.
+There is NO warranty. You may redistribute this software
+under the terms of the GNU General Public License.
+For more information about these matters, see the files named COPYING."
+
+usage="\
+Usage: $me [OPTION]... \$srcdir/.tarball-version [TAG-NORMALIZATION-SED-SCRIPT]
+Print a version string.
+
+Options:
+
+ --prefix PREFIX prefix of git tags (default 'v')
+ --fallback VERSION
+ fallback version to use if \"git --version\" fails
+
+ --help display this help and exit
+ --version output version information and exit
+
+Running without arguments will suffice in most cases."
+
+prefix=v
+fallback=
+
+while test $# -gt 0; do
+ case $1 in
+ --help) echo "$usage"; exit 0;;
+ --version) echo "$version"; exit 0;;
+ --prefix) shift; prefix=${1?};;
+ --fallback) shift; fallback=${1?};;
+ -*)
+ echo "$0: Unknown option '$1'." >&2
+ echo "$0: Try '--help' for more information." >&2
+ exit 1;;
+ *)
+ if test "x$tarball_version_file" = x; then
+ tarball_version_file="$1"
+ elif test "x$tag_sed_script" = x; then
+ tag_sed_script="$1"
+ else
+ echo "$0: extra non-option argument '$1'." >&2
+ exit 1
+ fi;;
+ esac
+ shift
+done
+
+if test "x$tarball_version_file" = x; then
+ echo "$usage"
+ exit 1
+fi
+
+tag_sed_script="${tag_sed_script:-s/x/x/}"
+
+nl='
+'
+
+# Avoid meddling by environment variable of the same name.
+v=
+v_from_git=
+
+# First see if there is a tarball-only version file.
+# then try "git describe", then default.
+if test -f $tarball_version_file
+then
+ v=`cat $tarball_version_file` || v=
+ case $v in
+ *$nl*) v= ;; # reject multi-line output
+ [0-9]*) ;;
+ *) v= ;;
+ esac
+ test "x$v" = x \
+ && echo "$0: WARNING: $tarball_version_file is missing or damaged" 1>&2
+fi
+
+if test "x$v" != x
+then
+ : # use $v
+# Otherwise, if there is at least one git commit involving the working
+# directory, and "git describe" output looks sensible, use that to
+# derive a version string.
+elif test "`git log -1 --pretty=format:x . 2>&1`" = x \
+ && v=`git describe --abbrev=4 --match="$prefix*" HEAD 2>/dev/null \
+ || git describe --abbrev=4 HEAD 2>/dev/null` \
+ && v=`printf '%s\n' "$v" | sed "$tag_sed_script"` \
+ && case $v in
+ $prefix[0-9]*) ;;
+ *) (exit 1) ;;
+ esac
+then
+ # Is this a new git that lists number of commits since the last
+ # tag or the previous older version that did not?
+ # Newer: v6.10-77-g0f8faeb
+ # Older: v6.10-g0f8faeb
+ vprefix=`expr "X$v" : 'X\(.*\)-g[^-]*$'` || vprefix=$v
+ case $vprefix in
+ *-*) : git describe is probably okay three part flavor ;;
+ *)
+ : git describe is older two part flavor
+ # Recreate the number of commits and rewrite such that the
+ # result is the same as if we were using the newer version
+ # of git describe.
+ vtag=`echo "$v" | sed 's/-.*//'`
+ commit_list=`git rev-list "$vtag"..HEAD 2>/dev/null` \
+ || { commit_list=failed;
+ echo "$0: WARNING: git rev-list failed" 1>&2; }
+ numcommits=`echo "$commit_list" | wc -l`
+ v=`echo "$v" | sed "s/\(.*\)-\(.*\)/\1-$numcommits-\2/"`;
+ test "$commit_list" = failed && v=UNKNOWN
+ ;;
+ esac
+
+ # Change the penultimate "-" to ".", for version-comparing tools.
+ # Remove the "g" to save a byte.
+ v=`echo "$v" | sed 's/-\([^-]*\)-g\([^-]*\)$/.\1-\2/'`;
+ v_from_git=1
+elif test "x$fallback" = x || git --version >/dev/null 2>&1; then
+ v=UNKNOWN
+else
+ v=$fallback
+fi
+
+v=`echo "$v" |sed "s/^$prefix//"`
+
+# Test whether to append the "-dirty" suffix only if the version
+# string we're using came from git. I.e., skip the test if it's "UNKNOWN"
+# or if it came from .tarball-version.
+if test "x$v_from_git" != x; then
+ # Don't declare a version "dirty" merely because a timestamp has changed.
+ git update-index --refresh > /dev/null 2>&1
+
+ dirty=`exec 2>/dev/null;git diff-index --name-only HEAD` || dirty=
+ case "$dirty" in
+ '') ;;
+ *) # Append the suffix only if there isn't one already.
+ case $v in
+ *-dirty) ;;
+ *) v="$v-dirty" ;;
+ esac ;;
+ esac
+fi
+
+# Omit the trailing newline, so that m4_esyscmd can use the result directly.
+printf %s "$v"
+
+# Local variables:
+# eval: (add-hook 'before-save-hook 'time-stamp)
+# time-stamp-start: "scriptversion="
+# time-stamp-format: "%:y-%02m-%02d.%02H"
+# time-stamp-time-zone: "UTC0"
+# time-stamp-end: "; # UTC"
+# End:
diff --git a/configure.ac b/configure.ac
index 058fcc1..ed793c5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -21,10 +21,10 @@ dnl https://github.com/cbg-ethz/shorah/releases
dnl Initialise Autoconf
AC_PREREQ([2.69])
AC_INIT(
- [ShoRAH],
- [1.1.1],
+ [shorah],
+ m4_esyscmd([build-aux/git-version-gen .tarball-version]),
[osvaldo.zagordi@gmail.com])
-AC_CONFIG_SRCDIR([src/shorah/freqEst.cpp])
+AC_CONFIG_SRCDIR([src/cpp/dpm_sampler.cpp])
AC_CONFIG_MACRO_DIR([m4])
@@ -64,6 +64,7 @@ AC_PROG_SED
AC_PROG_CC
AC_PROG_CXX
AC_PROG_RANLIB
+AM_PROG_AR
AC_LANG([C])
AX_COMPILER_VENDOR
@@ -142,23 +143,14 @@ AM_SILENT_RULES([yes])
dnl ======================================
dnl Initialise Perl and Python directories
dnl ======================================
-AM_PATH_PYTHON
-AC_ARG_VAR([perldir], [path in which to install perl modules, '${libdir}/perl' by default])
-: ${perldir="${libdir}/perl"}
+AM_PATH_PYTHON([3.5])
dnl ========
dnl Finalise
dnl ========
-AC_CONFIG_FILES([
- Makefile
- src/scripts/amplian.py
- src/scripts/dec.py
- src/scripts/mm.py
- src/scripts/shorah.py
- src/scripts/snv.py
- ])
+AC_CONFIG_FILES([Makefile])
AC_OUTPUT
@@ -169,12 +161,15 @@ dnl ============================
AC_MSG_RESULT([
${PACKAGE_NAME} ${VERSION}
- CC: ${CC} (${ax_cv_c_compiler_vendor}, ${ax_cv_c_compiler_version})
- CFLAGS: ${CFLAGS}
+ CC : ...................... ${CC} (${ax_cv_c_compiler_vendor}, ${ax_cv_c_compiler_version})
+ CFLAGS : .................. ${CFLAGS}
+
+ CXX : ..................... ${CXX} (${ax_cv_cxx_compiler_vendor}, ${ax_cv_cxx_compiler_version})
+ CXXFLAGS : ................ ${CXXFLAGS}
- CXX: ${CXX} (${ax_cv_cxx_compiler_vendor}, ${ax_cv_cxx_compiler_version})
- CXXFLAGS: ${CXXFLAGS}
+ CPPFLAGS : ................ ${CPPFLAGS}
+ LDFLAGS : ................. ${LDFLAGS}
- CPPFLAGS: ${CPPFLAGS}
- LDFLAGS: ${LDFLAGS}
+ PYTHON : .................. ${PYTHON}
+ Python version : .......... ${PYTHON_VERSION}
])
diff --git a/meson.build b/meson.build
new file mode 100644
index 0000000..e2ae37c
--- /dev/null
+++ b/meson.build
@@ -0,0 +1,18 @@
+project('shorah',
+ ['c', 'cpp'],
+ version : '1.9.95',
+ default_options : [
+ 'buildtype=release',
+ 'default_library=shared',
+ 'warning_level=3',
+ 'b_ndebug=if-release'],
+ license : 'GPL-2+')
+
+# info
+as_version = meson.project_version()
+ver_arr = as_version.split('.')
+as_major_version = ver_arr[0]
+as_minor_version = ver_arr[1]
+as_micro_version = ver_arr[2]
+
+subdir('src')
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..ae76819
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python
+import glob
+import sys
+import os
+import shutil
+
+from setuptools import setup, find_packages
+from setuptools.command.test import test as TestCommand
+from setuptools.command.develop import develop
+from setuptools.command.install import install
+
+def move_files():
+ """Identify the build directory and copy python files and executables into src/shorah."""
+ try:
+ main_file = glob.glob('*/__main__.py')[0]
+ except IndexError:
+ sys.exit('No __main__.py found. Build first.')
+ build_dir = os.path.dirname(main_file)
+ for py_file in glob.glob('%s/*py' % build_dir):
+ shutil.copy(py_file, 'src/shorah/')
+ for exe in ['b2w', 'diri_sampler', 'fil']:
+ ppp = shutil.copy('%s/src/cpp/%s' % (build_dir, exe), 'src/shorah/bin/')
+ print(ppp)
+
+class CustomDevelop(develop):
+ """Subclassing develop to install files in the correct location."""
+ def run(self):
+ move_files()
+ develop.run(self)
+
+
+class CustomInstall(install):
+ """Subclassing develop to install files in the correct location."""
+ def run(self):
+ move_files()
+ install.run(self)
+
+class PyTest(TestCommand):
+ user_options = [('pytest-args=', 'a', "Arguments to pass to py.test")]
+
+ def initialize_options(self):
+ TestCommand.initialize_options(self)
+ self.pytest_args = []
+
+ def finalize_options(self):
+ TestCommand.finalize_options(self)
+ self.test_args = []
+ self.test_suite = True
+
+ def run_tests(self):
+ #import here, cause outside the eggs aren't loaded
+ import pytest
+ errno = pytest.main(self.pytest_args)
+ sys.exit(errno)
+
+
+setup(
+ use_scm_version=True,
+ setup_requires=['setuptools_scm', 'setuptools_scm_git_archive'],
+ install_requires=['setuptools_scm'],
+ tests_require=['pytest', 'flake8', 'pep257'],
+ cmdclass={
+ 'test': PyTest,
+ 'install': CustomInstall,
+ 'develop': CustomDevelop
+ },
+ name='ShoRAH',
+ description='SHOrt Reads Assembly into Haplotypes',
+ url='http://github.com/cbg-ethz/shorah',
+ # author='Osvaldo Zagordi',
+ # author_email='firstname.lastname@gmail.com',
+ packages=find_packages('src'), # include all packages under src
+ package_dir={'': 'src'}, # tell setuptools packages are under src
+ entry_points={
+ 'console_scripts': ['shorah = shorah.cli:main']
+ },
+ license='GPL 3.0',
+ long_description='''
+ ShoRAH is designed to analyse genetically heterogeneous samples. It provides error correction,
+ local haplotype reconstruction, and estimation of the frequency of the different genetic variants
+ present in a mixed sample.
+ ''',
+ project_urls={
+ 'Documentation': 'http://cbg-ethz.github.io/shorah',
+ 'Source': 'https://github.com/cbg-ethz/shorah',
+ },
+ classifiers=[
+ # How mature is this project? Common values are
+ # 3 - Alpha
+ # 4 - Beta
+ # 5 - Production/Stable
+ 'Development Status :: 5 - Production/Stable',
+ # Indicate who your project is intended for
+ 'Intended Audience :: Developers',
+ 'Topic :: Scientific/Engineering :: Bio-Informatics',
+ 'Topic :: Scientific/Engineering :: Medical Science Apps.',
+ # Pick your license as you wish (should match "license" above)
+ 'License :: GPL 3.0',
+ # Specify the Python versions you support here. In particular, ensure
+ # that you indicate whether you support Python 2, Python 3 or both.
+ # 'Programming Language :: Python :: 2',
+ # 'Programming Language :: Python :: 2.6',
+ # 'Programming Language :: Python :: 2.7',
+ # 'Programming Language :: Python :: 3',
+ # 'Programming Language :: Python :: 3.2',
+ # 'Programming Language :: Python :: 3.3',
+ 'Programming Language :: Python :: 3.5',
+ 'Programming Language :: Python :: 3.6']
+)
diff --git a/src/shorah/b2w.cpp b/src/cpp/b2w.cpp
similarity index 98%
rename from src/shorah/b2w.cpp
rename to src/cpp/b2w.cpp
index 2733c3a..4a50de8 100644
--- a/src/shorah/b2w.cpp
+++ b/src/cpp/b2w.cpp
@@ -39,6 +39,8 @@ adapted from samtools/calDep.c
#include "faidx.h"
#include "sam.h"
+#define UNUSED(expr) (void)(expr)
+
// data for fetch_func and pileup_func
typedef struct
{
@@ -79,7 +81,8 @@ static int fetch_func1(const bam1_t* b, void* data)
overlap = tmp->end - Rstart + 1;
}
if (overlap >= tmp->min_overlap) {
- char rd[wSize + 1]; // store blank read segment of 'N's
+ //char rd[wSize + 1]; store blank read segment of 'N's
+ char *rd = new char[wSize + 1];
for (int i = 0; i < wSize; i++) {
rd[i] = 'N';
}
@@ -93,6 +96,7 @@ static int fetch_func1(const bam1_t* b, void* data)
<< '>' << bam1_qname(b) << ' ' << Rstart << '\n';
tmp->outFile // write read
<< rd << "\n";
+ delete [] rd;
}
}
return 0;
@@ -102,6 +106,8 @@ static int fetch_func1(const bam1_t* b, void* data)
static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t* pl, void* data)
{
tmpstruct_t* tmp = (tmpstruct_t*)data;
+ UNUSED(tid);
+ UNUSED(n);
int i = pos - tmp->beg; // base position within window
char* mm = (char*)tmp->rd + i; // location of base
if (pos >= tmp->beg && pos <= tmp->end) { // make sure read position within window
@@ -199,6 +205,7 @@ int main(int argc, char* argv[])
tmp.fai = fai_load(argv[optind + 1]);
int iBuild = bam_index_build(argv[optind]); // generate bam index
+ UNUSED(iBuild);
bam_index_t* idx;
idx = bam_index_load(argv[optind]); // load bam index
if (idx == 0) {
diff --git a/src/shorah/data_structures.hpp b/src/cpp/data_structures.hpp
similarity index 100%
rename from src/shorah/data_structures.hpp
rename to src/cpp/data_structures.hpp
diff --git a/src/shorah/dpm_sampler.cpp b/src/cpp/dpm_sampler.cpp
similarity index 99%
rename from src/shorah/dpm_sampler.cpp
rename to src/cpp/dpm_sampler.cpp
index f6e72cc..71be74d 100644
--- a/src/shorah/dpm_sampler.cpp
+++ b/src/cpp/dpm_sampler.cpp
@@ -810,7 +810,7 @@ void build_assignment(std::ofstream& out_file)
if (cn->next->size == 0) {
#ifndef NDEBUG
- printf("removing some node... p=%p\n", cn->next);
+ printf("removing some node... p=%p\n", (void *)cn->next);
#endif
remove_comp(&cn->next);
} else {
@@ -1300,7 +1300,7 @@ ssret* sample_class(unsigned int i, unsigned int step)
temp = new int[J / 10 + 1];
#ifndef NDEBUG
for (removed = 0; removed < q; removed++)
- printf("c_ptr[%d]=%p\n", removed, c_ptr[removed]);
+ printf("c_ptr[%d]=%p\n", removed, (void *)c_ptr[removed]);
removed = 0;
printf("---------------------------------------------------\n");
printf("-----------> sampling class for %ith read <----------\n", i);
@@ -1496,7 +1496,7 @@ ssret* sample_class(unsigned int i, unsigned int step)
#ifndef NDEBUG
for (j = 0; j <= st; j++)
- printf("with P[%i] = %e to class %p\n", j, P[j], cl_ptr[j]);
+ printf("with P[%i] = %e to class %p\n", j, P[j], (void *)cl_ptr[j]);
#endif
g = gsl_ran_discrete_preproc(st + 1, P);
@@ -1518,7 +1518,7 @@ ssret* sample_class(unsigned int i, unsigned int step)
if (from_class == to_class) {
#ifndef NDEBUG
- printf("from %p to itself\n", from_class);
+ printf("from %p to itself\n", (void *)from_class);
#endif
// free(P);
@@ -1532,7 +1532,7 @@ ssret* sample_class(unsigned int i, unsigned int step)
else if (to_class != NULL) {
#ifndef NDEBUG
- printf("moving the read from %p to %p\n", from_class, to_class);
+ printf("moving the read from %p to %p\n", (void *)from_class, (void *)to_class);
#endif
c_ptr[i] =
@@ -1554,7 +1554,7 @@ ssret* sample_class(unsigned int i, unsigned int step)
else if (to_class == NULL) {
#ifndef NDEBUG
- printf("moving %i to a new class from %p\n", i, from_class);
+ printf("moving %i to a new class from %p\n", i, (void *)from_class);
#endif
remove_read(search_read(&from_class->rlist, i));
diff --git a/src/shorah/dpm_sampler.hpp b/src/cpp/dpm_sampler.hpp
similarity index 100%
rename from src/shorah/dpm_sampler.hpp
rename to src/cpp/dpm_sampler.hpp
diff --git a/src/shorah/fil.cpp b/src/cpp/fil.cpp
similarity index 100%
rename from src/shorah/fil.cpp
rename to src/cpp/fil.cpp
diff --git a/src/cpp/meson.build b/src/cpp/meson.build
new file mode 100644
index 0000000..5295438
--- /dev/null
+++ b/src/cpp/meson.build
@@ -0,0 +1,44 @@
+message('Is popcnt available?')
+code = '''
+ #include
+ int main() { return _mm_popcnt_u64(0); }'''
+cpp = meson.get_compiler('cpp')
+success = cpp.links(code, args : '-mpopcnt', name : 'pocnt')
+if success
+ message('popcnt intrinsics are available')
+ extra_cpp_args = ['-mpopcnt']
+else
+ message('popcnt is broken')
+ extra_cpp_args = []
+endif
+
+gsl_dep = dependency('gsl')
+thread_dep = dependency('threads')
+
+dpm_exe = executable(
+ 'diri_sampler', files([
+ 'data_structures.hpp',
+ 'dpm_sampler.hpp',
+ 'dpm_sampler.cpp']),
+ dependencies : [ gsl_dep, thread_dep ],
+ include_directories : bam_inc,
+ cpp_args : extra_cpp_args,
+ install : true)
+
+b2w_exe = executable(
+ 'b2w', files([
+ 'b2w.cpp']),
+ dependencies : thread_dep,
+ include_directories : bam_inc,
+ cpp_args : extra_cpp_args,
+ link_with : bam_library,
+ install : true)
+
+fil_exe = executable(
+ 'fil', files([
+ 'fil.cpp']),
+ dependencies : [ gsl_dep, thread_dep ],
+ include_directories : bam_inc,
+ cpp_args : extra_cpp_args,
+ link_with : bam_library,
+ install : true)
diff --git a/src/meson.build b/src/meson.build
new file mode 100644
index 0000000..b390df9
--- /dev/null
+++ b/src/meson.build
@@ -0,0 +1,2 @@
+subdir('samtools')
+subdir('cpp')
diff --git a/src/perl/DNA.pm b/src/perl/DNA.pm
deleted file mode 100755
index 0ff403a..0000000
--- a/src/perl/DNA.pm
+++ /dev/null
@@ -1,761 +0,0 @@
-# DNA.pm - Utilities
-
-# Copyright 2007, 2008, 2009
-# Niko Beerenwinkel,
-# Nicholas Eriksson,
-# Moritz Gerstung,
-# Lukas Geyrhofer,
-# Osvaldo Zagordi,
-# ETH Zurich
-
-# This file is part of ShoRAH.
-# ShoRAH is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-
-# ShoRAH is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-
-# You should have received a copy of the GNU General Public License
-# along with ShoRAH. If not, see .
-
-package DNA;
-
-use strict;
-use NikoUtil;
-use Util;
-use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION);
-use Exporter;
-
-$VERSION = 1.00;
-@ISA = qw(Exporter);
-
-# autoexport:
-@EXPORT = qw(
- translate
- makeCodons
- reverseCodons
- revComp
- NW SW
- generalizedSW
- blastAlign
- correctGaps
- correctGaps2
- ambigString
- );
-# export on request:
-@EXPORT_OK = qw();
-
-sub translate {
- # input:
- # dna string
- # codon hash
- # optional: if ambiguous, should we output just one aa or
- # a whole block [CKEIF] ??
-
- my $dna = shift;
- my $codon = shift;
- my $JUSTONEAA = 0;
- if (@_ > 0) {
- $JUSTONEAA = shift;
- }
- my $pro = '';
- my $cod;
- my $choices = 1;
- my $ambig = 0;
-
- for(my $i=0; $i < length($dna); $i+=3) {
- $cod = substr($dna,$i,3);
- if (length($cod) == 3) {
- if (defined $codon->{$cod}) {
- $pro .= $codon->{$cod};
- } else {
- my $l = resolveCodon($cod);
- my %poss;
- foreach (@$l) {
- if (defined ($codon->{$_})) {
- $poss{$codon->{$_}} = 1;
- } else {
- $poss{' '} = 1;
- }
- }
- my $tmp = join('', sort(keys(%poss)));
- if (keys(%poss) > 1) {
- if ($JUSTONEAA) {
- $pro .= substr($tmp,0,1);
- } else {
- $pro .= '[' . $tmp . ']';
- }
- $choices *= keys(%poss);
- $ambig += 1;
- } else {
- $pro .= $tmp;
- }
-#print "#error : don't know codon ", substr($dna,$i,3),"\n";
- }
- }
- }
- #print "$choices choices for protein\n";
- #print "$ambig ambiguous codons\n";
- #print "$pro\n";
- return $pro;
-}
-
-sub resolveCodon {
- #take codon with ambiguity codes and output list of all codons that could
- #give it
- my $c = shift;
- my @l = split '', $c;
- my @ans = ('', '', '');
- my @codons;
- foreach my $p (0..2) {
- if ($l[$p] eq 'A') {
- $ans[$p] .= 'A';
- } elsif ($l[$p] eq 'C') {
- $ans[$p] .= 'C';
- } elsif ($l[$p] eq 'G') {
- $ans[$p] .= 'G';
- } elsif ($l[$p] eq 'T') {
- $ans[$p] .= 'T';
- } elsif ($l[$p] eq 'R') {
- $ans[$p] .= 'AG';
- } elsif ($l[$p] eq 'Y') {
- $ans[$p] .= 'CT';
- } elsif ($l[$p] eq 'K') {
- $ans[$p] .= 'GT';
- } elsif ($l[$p] eq 'M') {
- $ans[$p] .= 'AC';
- } elsif ($l[$p] eq 'S') {
- $ans[$p] .= 'CG';
- } elsif ($l[$p] eq 'W') {
- $ans[$p] .= 'AT';
- } elsif ($l[$p] eq 'D') {
- $ans[$p] .= 'AGT';
- } elsif ($l[$p] eq 'H') {
- $ans[$p] .= 'ACT';
- } elsif ($l[$p] eq 'B') {
- $ans[$p] .= 'CGT';
- } elsif ($l[$p] eq 'V') {
- $ans[$p] .= 'ACG';
- } elsif ($l[$p] eq 'X') {
- $ans[$p] .= 'ACGT';
- } elsif ($l[$p] eq 'N') {
- $ans[$p] .= 'ACGT';
- }
- }
- #print "$c -> @ans\n";
- foreach my $i (0 .. length($ans[0])-1) {
- foreach my $j (0 .. length($ans[1])-1) {
- foreach my $k (0 .. length($ans[2])-1) {
- push(@codons, substr($ans[0],$i,1) . substr($ans[1],$j,1) . substr($ans[2],$k,1));
- }
- }
- }
- #print "Codon $c -> @codons\n";
-
- return \@codons;
-}
-
-sub makeCodons {
- my %codon;
- $codon{GCT} = $codon{GCC} = $codon{GCA} = $codon{GCG} = 'A';
- $codon{TTA} = $codon{TTG} = $codon{CTT} = $codon{CTC} = $codon{CTA} = $codon{CTG} = 'L';
- $codon{CGT} = $codon{CGC} = $codon{CGA} = $codon{CGG} = $codon{AGA} = $codon{AGG} = 'R';
- $codon{AAA} = $codon{AAG} = 'K';
- $codon{AAT} = $codon{AAC} = 'N';
- $codon{ATG} = 'M';
- $codon{GAT} = $codon{GAC} = 'D';
- $codon{TTT} = $codon{TTC} = 'F';
- $codon{TGT} = $codon{TGC} = 'C';
- $codon{CCT} = $codon{CCC} = $codon{CCA} = $codon{CCG} = 'P';
- $codon{CAA} = $codon{CAG} = 'Q';
- $codon{TCT} = $codon{TCC} = $codon{TCA} = $codon{TCG} = $codon{AGT} = $codon{AGC} = 'S';
- $codon{GAA} = $codon{GAG} = 'E';
- $codon{ACT} = $codon{ACC} = $codon{ACA} = $codon{ACG} = 'T';
- $codon{GGT} = $codon{GGC} = $codon{GGA} = $codon{GGG} = 'G';
- $codon{TGG} = 'W';
- $codon{CAT} = $codon{CAC} = 'H';
- $codon{TAT} = $codon{TAC} = 'Y';
- $codon{ATT} = $codon{ATC} = $codon{ATA} = 'I';
- $codon{GTT} = $codon{GTC} = $codon{GTA} = $codon{GTG} = 'V';
- #$codon{ATG} = '+';
- $codon{TAG} = $codon{TGA} = $codon{TAA} = '*';
- return \%codon;
-}
-sub reverseCodons {
- my $codon = shift;
- my %a2c;
- #$a2c{A} = [GCT, GCC, GCA, GCG]
- foreach my $cod (keys(%$codon)) {
- push(@{$a2c{$$codon{$cod}}},$cod);
- }
- return \%a2c;
-}
-
-sub revComp {
- my $seq = $_[0];
- my $seqRC = "";
- my $I;
-
- for($I = length($seq)-1; $I >= 0; $I--) {
- my $base = substr($seq, $I, 1);
- if($base eq "A") {
- $seqRC .= "T";
- } elsif($base eq "C") {
- $seqRC .= "G";
- } elsif($base eq "G") {
- $seqRC .= "C";
- } elsif($base eq "T") {
- $seqRC .= "A";
- } else {
- $seqRC .= $base;
- }
- }
- return($seqRC);
-}
-
-
-sub trans {
- my $x = shift;
- if ($x eq 'A') {
- return 0;
- } elsif ($x eq 'C') {
- return 1;
- } elsif ($x eq 'G') {
- return 2;
- } else {
- return 3;
- }
-}
-
-sub NW {
- # be careful if using... .fixes went into SW
- my $a = shift;
- my $b = shift;
- my $d;
- my $S;
- if (@_ > 0) {
- $d = shift; #linear gap
- $S = shift; #[][] = costs
- } else {
- $d = -400;
- $S = [[91,-114,-31,-123],[-114,100,-125,-31],[-31,-125,100,-114],[-123,-31,-114,91]];
- }
-
- my @A = split('', $a);
- my @B = split('', $b);
- my ($i, $j, $choice1, $choice2, $choice3);
- my $AlignmentA;
- my $AlignmentB;
- my @F;
-
- foreach $i (0..@A) {
- $F[$i][0] = $d * $i;
- }
- foreach $j (0..@B) {
- $F[0][$j] = $d * $j;
- }
-
- foreach $i (1..@A) {
- foreach $j (1..@B) {
- $choice1 = $F[$i-1][$j-1] + $S->[trans($A[$i-1])][trans($B[$j-1])];
- $choice2 = $F[$i-1][$j] + $d;
- $choice3 = $F[$i][$j-1] + $d;
- $F[$i][$j] = max($choice1,$choice2,$choice3);
- }
- }
-
- #foreach $i (0..@A) {
- # foreach $j (0..@B) {
- # print "$F[$i][$j] ";
- # }
- # print "\n";
- #}
-
- $AlignmentA = '';
- $AlignmentB = '';
- $i = @A;
- $j = @B;
-
- while ($i > 0 and $j > 0) {
- my $Score = $F[$i][$j];
- my $ScoreDiag = $F[$i - 1][$j - 1];
- my $ScoreUp = $F[$i][$j - 1];
- my $ScoreLeft = $F[$i - 1][$j];
- my $Ai = trans($A[$i-1]);
- my $Bj = trans($B[$j-1]);
-
- if ($Score == $ScoreDiag + $S->[$Ai][$Bj]) {
- $AlignmentA = $A[$i-1] . $AlignmentA;
- $AlignmentB = $B[$j-1] . $AlignmentB;
- $i--;
- $j--;
- } elsif ($Score == $ScoreLeft + $d) {
- $AlignmentA = $A[$i-1] . $AlignmentA;
- $AlignmentB = '-' . $AlignmentB;
- $i--;
- } else { # (Score == ScoreUp + d)
- $AlignmentA = '-' . $AlignmentA;
- $AlignmentB = $B[$j-1] . $AlignmentB;
- $j--;
- }
- }
- while ($i > 0) {
- $AlignmentA = $A[$i-1] . $AlignmentA;
- $AlignmentB = '-' . $AlignmentB;
- $i--;
- }
- while ($j > 0) {
- $AlignmentA = '-' . $AlignmentA;
- $AlignmentB = $B[$j-1] . $AlignmentB;
- $j--;
- }
- return ($AlignmentA,$AlignmentB);
-}
-
-
-sub SW {
-
- #Smith Waterman local alignment
- # assume that first sequence is the short one
- my $a = shift;
- my $b = shift;
- my $d;
- my $S;
- if (@_ > 0) {
- $d = shift; #linear gap
- $S = shift; #[][] = costs
- } else {
- #$d = -100;
- #$S = [[91,-114,-31,-123],[-114,100,-125,-31],[-31,-125,100,-114],[-123,-31,-114,91]];
- $d = -4;
- $S = [[4,-4,-4,-4],[-4,4,-4,-4],[-4,-4,4,-4],[-4,-4,-4,4]];
- }
- my $DEBUG = 0;
- if (@_ > 0) {
- $DEBUG = shift;
- }
-
-
- my @A = split('', $a);
- my @B = split('', $b);
- my @tA = map{trans($_)} @A;
- my @tB = map{trans($_)} @B;
- #my @tA = map {trans($_)} split ('', $a);
- #my @tB = map {trans($_)} split ('', $b);
- my ($i, $j);
- my $AlignmentA;
- my $AlignmentB;
- my @F;
- my ($largestF, $largesti,$largestj);
- $largestF = 1;
-
- foreach $i (0..@A) {
- $F[$i][0] = 0;
- }
- foreach $j (0..@B) {
- $F[0][$j] = 0;
- }
-
-
- my ($c1, $c2, $c3, $m);
-
- foreach $i (1..@A) {
- foreach $j (1..@B) {
- $c1 = $F[$i-1][$j-1] + $S->[$tA[$i-1]][$tB[$j-1]];
- $c2 = $F[$i-1][$j] + $d;
- $c3 = $F[$i][$j-1] + $d;
- $m = $c1 > $c2 ? ($c1 > $c3 ? $c1 : $c3) : ($c2 > $c3 ? $c2 : $c3);
- $F[$i][$j] = $m > 0 ? $m : 0;
- #$F[$i][$j] = max($choice1,$choice2,$choice3,0);
-
- #$F[$i][$j] = max($F[$i-1][$j-1] + $S->[$tA[$i-1]][$tB[$j-1]],
- # $F[$i-1][$j] + $d, $F[$i][$j-1] + $d, 0);
-
- if ($F[$i][$j] >= $largestF) {
- #print "max at $i $j -- $F[$i][$j]\n";
- $largestF = $F[$i][$j];
- $largesti = $i;
- $largestj = $j;
- }
- }
- }
- if ($DEBUG) {
- foreach my $row (@F) {
- print join("\t", @$row), "\n";
- }
- }
-
-
- #now trace back from the largest value in the matrix and build up the alignment
-
- $AlignmentA = '';
- $AlignmentB = '';
- $i = $largesti;
- $j = $largestj;
- #print "Largest F at $i, $j\n";
- #
- # my $gapend = (length($a) - $largesti);
- # if ($gapend > 0) {
- # print "adding $gapend gaps at end\n";
- # $AlignmentA = 'x' x $gapend;
- # #foreach my $x ($j .. ($j + $gapend - 1)) {
- # # print "$x $B[$x]\n";
- # #}
- # #$AlignmentB = lc(join('',@B[$j .. ($j + $gapend-1)]));
- # $AlignmentB = 'x' x $gapend;
- # }
- #
- while ($F[$i][$j] > 0) {
- if ($DEBUG) {
- print "$F[$i-1][$j-1] $F[$i-1][$j]\n$F[$i][$j-1] $F[$i][$j]\n";
- }
- if ($F[$i][$j] == $F[$i-1][$j-1] + $S->[$tA[$i-1]][$tB[$j-1]]) {
- $AlignmentA = $A[$i-1] . $AlignmentA;
- $AlignmentB = $B[$j-1] . $AlignmentB;
- $i--; $j--;
- } elsif ($F[$i][$j] == $F[$i-1][$j] + $d) {
- $AlignmentA = $A[$i-1] . $AlignmentA;
- $AlignmentB = '-' . $AlignmentB;
- $i--;
- } else {
- $AlignmentA = '-' . $AlignmentA;
- $AlignmentB = $B[$j-1] . $AlignmentB;
- $j--;
- }
- if ($DEBUG) {
- print "$AlignmentA\n$AlignmentB\n";
- print '-' x 20, "\n";
- }
- }
-
- if (0) {
-
- while ($F[$i][$j] > 0) {
- if ($F[$i][$j] == $F[$i-1][$j] + $d) {
- $AlignmentA = $A[$i-1] . $AlignmentA;
- $AlignmentB = '-' . $AlignmentB;
- $i--;
- } elsif ($F[$i][$j] == $F[$i][$j-1] + $d) {
- $AlignmentA = '-' . $AlignmentA;
- $AlignmentB = $B[$j-1] . $AlignmentB;
- $j--;
- } elsif ($F[$i][$j] == $F[$i-1][$j-1] + $S->[$tA[$i-1]][$tB[$j-1]]) {
- $AlignmentA = $A[$i-1] . $AlignmentA;
- $AlignmentB = $B[$j-1] . $AlignmentB;
- $i--; $j--;
- }
- print "$AlignmentA\n$AlignmentB\n";
- print '-' x 20, "\n";
- }
-}
- #while ($i > 0) {
- # $AlignmentA = '-' . $AlignmentA;
- # $AlignmentB = $B[$i] . $AlignmentB;
- # #$AlignmentB = 'x' . $AlignmentB;
- # $i--;
- #}
-
- return ($AlignmentA,$AlignmentB, $j);
-}
-
-sub generalizedSW {
-
- #Smith Waterman local alignment
- #align a sequence to a quasi-sequence:
- #
- # so $a = 'AACCTT'
- # and $b = [ {A => .5, C => .5}, ... ]
- # assume that first sequence is the short one
- # b is a list of lists, actually, in order ACGT
- # b = [ [.5, .5, 0, 0], ... ]
- my $a = shift;
- my $b = shift;
- # ltrIdx is hash A => 0, ... T => 3
- # alphabet is list [A C G T]
- my $LtrIdx = shift;
- my $alphabet = shift;
- my $d;
- my $S;
- if (@_ > 0) {
- $d = shift; #linear gap
- $S = shift; #[][] = costs
- } else {
- #$d = -100;
- #$S = [[91,-114,-31,-123],[-114,100,-125,-31],[-31,-125,100,-114],[-123,-31,-114,91]];
- $d = -4;
- #$S = [[4,-8,-8,-8],[-8,4,-8,-8],[-8,-8,4,-8],[-8,-8,-8,4]];
- $S = [[4,-4,-4,-4],[-4,4,-4,-4],[-4,-4,4,-4],[-4,-4,-4,4]];
- }
- my $DEBUG = 0;
- if (@_ > 0) {
- $DEBUG = shift;
- }
-
-
- if ($DEBUG) {
- print "Aligning\n$a\n";
- print "[@$_], " foreach (@$b);
- print "\n";
- }
- my @A = split('', $a);
- my @tA = map{$LtrIdx->{$_}} @A;
-
- my $numB = @$b;
- #my @B = split('', $b);
- #my @tB = map{trans($_)} @B;
-
- my @B = map {$alphabet->[maxind($_)]} @$b;
-
- my ($i, $j);
- my $AlignmentA;
- my $AlignmentB;
- my @F;
- my ($largestF, $largesti,$largestj);
- $largestF = 1;
-
- foreach $i (0..@A) {
- $F[$i][0] = 0;
- }
- foreach $j (0..$numB) {
- $F[0][$j] = 0;
- }
-
-
- my ($c1, $c2, $c3, $m);
-
- foreach $i (1..@A) {
- foreach $j (1..$numB) {
- #$c1 = $F[$i-1][$j-1] + $S->[$tA[$i-1]][$tB[$j-1]];
- #$c1 = F[i-1][j-1] + sum_alphabet pr(b_j-1 = K)*S->[a_i-1][K]
- $c1 = $F[$i-1][$j-1] + score($tA[$i-1], $b->[$j-1], $S);
-
- $c2 = $F[$i-1][$j] + $d;
- $c3 = $F[$i][$j-1] + $d;
- #print "$i $j = max($c1 $c2 $c3)\n";
- $m = $c1 > $c2 ? ($c1 > $c3 ? $c1 : $c3) : ($c2 > $c3 ? $c2 : $c3);
- $F[$i][$j] = $m > 0 ? $m : 0;
- #$F[$i][$j] = max($choice1,$choice2,$choice3,0);
-
- #$F[$i][$j] = max($F[$i-1][$j-1] + $S->[$tA[$i-1]][$tB[$j-1]],
- # $F[$i-1][$j] + $d, $F[$i][$j-1] + $d, 0);
-
- if ($F[$i][$j] >= $largestF) {
- #print "max at $i $j -- $F[$i][$j]\n";
- $largestF = $F[$i][$j];
- $largesti = $i;
- $largestj = $j;
- }
- }
- }
- if ($DEBUG) {
- foreach my $row (@F) {
- print join("\t", @$row), "\n";
- }
- }
-
-
- #now trace back from the largest value in the matrix and build up the alignment
-
- $AlignmentA = '';
- $AlignmentB = '';
- $i = $largesti;
- $j = $largestj;
- $i = @A;
- $j = $numB;
- #print "Largest F at $i, $j\n";
- #
- # my $gapend = (length($a) - $largesti);
- # if ($gapend > 0) {
- # print "adding $gapend gaps at end\n";
- # $AlignmentA = 'x' x $gapend;
- # #foreach my $x ($j .. ($j + $gapend - 1)) {
- # # print "$x $B[$x]\n";
- # #}
- # #$AlignmentB = lc(join('',@B[$j .. ($j + $gapend-1)]));
- # $AlignmentB = 'x' x $gapend;
- # }
- #
- while ($F[$i][$j] > 0) {
- if ($DEBUG) {
- print "$F[$i-1][$j-1] $F[$i-1][$j]\n$F[$i][$j-1] $F[$i][$j]\n";
- }
- if ($F[$i][$j] == $F[$i-1][$j-1] + score($tA[$i-1], $b->[$j-1], $S)) {
- $AlignmentA = $A[$i-1] . $AlignmentA;
- $AlignmentB = $B[$j-1] . $AlignmentB;
- $i--; $j--;
- } elsif ($F[$i][$j] == $F[$i-1][$j] + $d) {
- $AlignmentA = $A[$i-1] . $AlignmentA;
- $AlignmentB = '-' . $AlignmentB;
- $i--;
- } else {
- $AlignmentA = '-' . $AlignmentA;
- $AlignmentB = $B[$j-1] . $AlignmentB;
- $j--;
- }
- if ($DEBUG) {
- print "$AlignmentA\n$AlignmentB\n";
- print '-' x 20, "\n";
- }
- }
-
- return ($AlignmentA,$AlignmentB, $j);
-}
-
-sub blastAlign {
-#s2 should be a subsequence of s1
- my $s1 = shift;
- my $s2 = shift;
- system("echo $s1 > /tmp/seq1");
- system("echo $s2 > /tmp/seq2");
- system("bl2seq -i /tmp/seq1 -j /tmp/seq2 -p blastn |grep -A 2 Query:");
-}
-
-sub score {
- my $a = shift;
- my $b = shift;
- my $S = shift;
- my $ans = 0;
- foreach my $baseInd (0 .. @$b-1) {
- $ans += $S->[$a][$baseInd] * $b->[$baseInd];
- }
- #print "Score $a, [@$b] = $ans\n";
- return $ans;
-}
-
-sub correctGaps {
- my $reads = shift;
-
- my @AAlist;
- foreach my $rd (@$reads) {
- $rd->{seq} =~ s/N/-/g;
- foreach my $pos ($$rd{st} .. $$rd{end}-1) {
- ${$AAlist[$pos]}{substr($$rd{seq},$pos - $$rd{st}, 1)}++;
- }
- }
- my @corInd;
- my @cons;
-
- foreach my $i (0..@AAlist-1) {
- #my @chars = keys(%{$AAlist[$i]});
- if ((defined ${$AAlist[$i]}{'-'}) and (keys(%{$AAlist[$i]}) == 2)) {
- #foreach (sort{$AAlist[$i]{$b} <=> $AAlist[$i]{$a}}keys(%{$AAlist[$i]})) {
- delete ${$AAlist[$i]}{'-'};
- push(@corInd, $i);
- #my @c = keys(%{$AAlist[$i]});
- my @c = sort {$AAlist[$i]{$b} <=> $AAlist[$i]{$a}} keys(%{$AAlist[$i]});
- push(@cons, $c[0]);
- }
- }
-
- foreach my $r (@$reads) {
- foreach my $i (0 .. @corInd-1) {
- #if r->st <= corInd[i] <= r->end
- #and r->seq . substr(i-start) == '-'
- #correct to cons
- if (($r->{st} <= $corInd[$i]) and ($r->{end} >= $corInd[$i])) {
- if (substr($r->{seq},$corInd[$i] - $r->{st},1) eq '-') {
- substr($r->{seq},$corInd[$i] - $r->{st},1) = $cons[$i];
- }
- }
- }
- }
-}
-
-
-
-sub correctGaps2 {
- #take a set of reads (given as two lists, @head and @seq)
- #and correct all gaps for which there is only one choice
- my $head = shift;
- my $seq = shift;
- my $starts;
- my $ends;
- #fix starts and ends
- my $n = @$head;
- my $num = 0;
- foreach my $i (0 .. $n-1) {
- my ($left, $right) = split /\$/, $head->[$i];
- my ($id, $start, undef) = split / /, trim($left);
- my $len = length($seq->[$i]);
- push(@$starts, $start);
- push(@$ends, $start + $len);
- }
-
- my @AAlist;
- foreach my $rd (0 .. $n-1) {
- $seq->[$rd] =~ s/N/-/g;
- #if ($seq->[$rd] =~ / /) {
- # print "SPACE: read $rd == .$seq->[$rd].\n";
- #}
- #print "read $rd starts at $starts->[$rd] ends $ends->[$rd]\n";
- #print "length ", length($seq->[$rd]), "\n";
- foreach my $pos ($starts->[$rd] .. $ends->[$rd] - 1) {
- ${$AAlist[$pos]}{substr($seq->[$rd],$pos - $starts->[$rd], 1)}++;
- }
- }
- my @corInd;
- my @cons;
-
- foreach my $i (0..@AAlist-1) {
- #my @chars = keys(%{$AAlist[$i]});
- #print "pos $i, keys = [", join(',', keys(%{$AAlist[$i]})), "]\n";
- if ((defined ${$AAlist[$i]}{'-'}) and (keys(%{$AAlist[$i]}) == 2)) {
- #foreach (sort{$AAlist[$i]{$b} <=> $AAlist[$i]{$a}}keys(%{$AAlist[$i]}))
- delete ${$AAlist[$i]}{'-'};
- push(@corInd, $i);
- my @c = sort {$AAlist[$i]{$b} <=> $AAlist[$i]{$a}} keys(%{$AAlist[$i]});
- push(@cons, $c[0]);
- }
- }
- #print "correcting at @corInd\n";
-
- foreach my $r (0 .. $n-1) {
- foreach my $i (0 .. @corInd-1) {
- #if r->st <= corInd[i] <= r->end
- #and r->seq . substr(i-start) == '-'
- #correct to cons
- if (($starts->[$r] <= $corInd[$i]) and ($ends->[$r] >= $corInd[$i])) {
- if (substr($seq->[$r],$corInd[$i] - $starts->[$r],1) eq '-') {
- substr($seq->[$r],$corInd[$i] - $starts->[$r],1) = $cons[$i];
- #print "Correct at $corInd[$i]\n";
- $num++;
- }
- }
- }
- }
- return $num;
-}
-
-sub ambigString {
- #input: $a = asdf[123]
- #output @b = ({a=>1} {s=>1} {d=>1} {f=>1} {1=>1, 2=>1, 3=>1})
- #
- my $a = shift;
- my @b;
-
- my $i = 0;
-
- while ($i < length($a)) {
- my $chunk = '';
- if (substr($a,$i,1) ne '[') {
- $chunk = substr($a,$i,1);
- } else {
- # find end of [...] block
- while (substr($a,$i,1) ne ']') {
- $chunk .= substr($a,$i,1);
- $i++;
- }
- }
- $i++;
- my %tmp = map {$_ => 1.0/length($chunk)} split '', $chunk;
- push(@b, \%tmp);
- }
- return \@b;
-}
-
-
-1;
diff --git a/src/perl/NikoUtil.pm b/src/perl/NikoUtil.pm
deleted file mode 100755
index 665d2f3..0000000
--- a/src/perl/NikoUtil.pm
+++ /dev/null
@@ -1,473 +0,0 @@
-#
-# NikoUtil.pm - Utilities
-# Copyright 2007, 2008, 2009
-# Niko Beerenwinkel,
-# Nicholas Eriksson,
-# Moritz Gerstung,
-# Lukas Geyrhofer,
-# Osvaldo Zagordi,
-# ETH Zurich
-
-# This file is part of ShoRAH.
-# ShoRAH is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-
-# ShoRAH is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-
-# You should have received a copy of the GNU General Public License
-# along with ShoRAH. If not, see .
-
-package NikoUtil;
-
-use strict;
-use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION);
-use Exporter;
-
-$VERSION = 1.00;
-@ISA = qw(Exporter);
-
-# autoexport:
-@EXPORT = qw(
- trim
- is_numeric
- round
- intrand discr_rand
- concat_list
- read_fasta read_fastas print_fasta
- print_matrix transpose exp_matrix
- where_clause_list
- hamming
- hamming_distance alphabet
- makeDistMatrix
- random
- Nmin Nmax
-);
-
-# export on request:
-@EXPORT_OK = qw();
-
-sub concat_list { # concatenate lists
-
- my @lists = @_; # list of refs. to lists to be concatenated
-
-# concatenate all lists
- my @all;
- for (@lists) {
- push @all, @{$_};
- }
-
-# count occurences in concatenation
- my (@union, %count);
- for (@all) {
- $count{"@{$_}"}++;
- push @union, $_ unless ($count{"@{$_}"} > 1);
- }
-
- return \@union; # multiple occurences appear only once
-}
-
-
-
-sub is_numeric { # Cookbook p. 44
- my $val = shift || ''; # return FALSE if $val is FALSE
-
- return ($val =~ /^-?(?:\d+(?:\.\d*)?|\.\d+)$/);
-}
-
-
-
-sub round { # round to nearest integer
-
- my @input = @_;
-
- for (@input) {
- $_ = int($_ + 0.5);
- }
-
- return wantarray ? @input : $input[0];
-}
-
-
-
-sub intrand {
-
-# random integer number between and including $a and $b
-
- my $a = shift;
- my $b = shift;
-
- return int($a) + int(rand($b-$a+1));
-}
-
-
-
-sub discr_rand {
-
-# random integers in given proportion
-
- my @proportion = @_;
- my $n = scalar(@proportion);
-
- my @cum;
- my $i;
-
- my $sum = 0.0;
- for $i (0 .. $n - 1) {
- $sum += $proportion[$i];
- }
- $cum[0] = ($proportion[0] / $sum);
- for $i (1 .. $n - 1) {
- $cum[$i] = $cum[$i-1] + ($proportion[$i] / $sum);
- }
-
- my $r = rand();
-#print "@cum ---> $r\n";
-
- $i = 0;
- while (($i < $n-1) && ($cum[$i] < $r)) {
- $i++;
- }
-
- return $i;
-}
-
-
-
-sub trim { # remove white spaces to the left and right
-
- my @input = @_;
-
- for (@input) {
- s/^\s+//;
- s/\s+$//;
- }
-
- return wantarray ? @input : $input[0];
-}
-
-
-
-sub read_fasta {
-
- my $filename = shift;
-
- my $sep = chr(13); # CR
-
- open FASTA, $filename
- or die "Can't open fasta file $filename!\n";
- my @lines = ;
- close FASTA;
-
- for my $i (0 .. $#lines) {
- splice (@lines, $i, 1, (split /$sep/, $lines[$i]));
- }
-
- return (substr($lines[0], 1), join('', @lines[1..$#lines]));
-}
-
-
-
-sub read_fastas {
-
- my $filename = shift;
-
- my (@header, @seq);
- my $i = -1;
-
- open FASTA, $filename
- or die "Can't open fasta file -- $filename!\n";
-
- while (my $line = ) {
- next if ($line =~ /^#/);
- if (substr($line, 0, 1) eq ">") { # header
- $i++;
- $header[$i] = trim(substr($line, 1));
- }
- else { # sequence
- die "Error - fasta file without header lines @ $line\n" if ($i == -1);
- $seq[$i] .= trim($line);
- }
- }
-
- close FASTA;
-
- return (\@header, \@seq);
-
-}
-
-
-
-sub print_fasta {
-
- my $head = shift;
- my $seq = shift;
-
- my $width = 60;
-
- my @seq = split //, $seq;
-
- my $fasta = ">$head\n";
- while (scalar(@seq) > 0) {
- $fasta .= join('', splice(@seq, 0, $width)) . "\n";
- }
-
-#return $fasta . "\n";
- return $fasta;
-}
-
-
-
-sub print_matrix {
-
-# print matrix (tab- and newline-separated)
-
- my @mat = @{shift @_}; # data im $mat[][]
- my $row_label = shift; # row labels == first column to print
- my $col_label = shift; # column labels == first row to print
- my $mat_label = shift || "\\"; # upper leftmost cell
-
- print $mat_label, "\t" if ($row_label);
- if ($col_label) {
- print join("\t", @$col_label), "\n";
- }
-
- for my $i (0 .. $#mat) {
- print $row_label->[$i], "\t" if ($row_label);
- for my $j (0 .. $#{$mat[$i]}) {
- if (defined($mat[$i][$j])) {
- print $mat[$i][$j], "\t";
- }
- else {
- print "undef\t";
- }
- }
- print "\n";
- }
-}
-
-
-
-sub transpose {
-
-# transpose matrix
-
- my @M = @{shift @_};
-
- my @Mt;
- for my $i (0 .. $#M) {
- for my $j (0 .. $#{$M[0]}) {
- $Mt[$j][$i] = $M[$i][$j];
- }
- }
-
- return \@Mt;
-}
-
-
-
-sub exp_matrix {
-
- my $mat = shift;
- my $base = shift || 10;
-
- my $emat;
-
- for my $i (0 .. scalar(@$mat)-1) {
- for my $j (0 .. scalar(@{$mat->[0]})-1) {
- $emat->[$i][$j] = $base ** $mat->[$i][$j];
- }
- }
-
- return $emat;
-}
-
-
-
-sub where_clause_list {
-
- my $field = shift;
- my @values = @_;
-
- my @cond = ();
-
- for (@values) {
- if (is_numeric($_)) {
- push @cond, sprintf "%s=%d", $field, $_;
- }
- else {
- push @cond, sprintf "%s='%s'", $field, $_;
- }
- }
-
- return join(' or ', @cond);
-}
-
-
-
-
-sub Nmax {
-
-# Maximum of a vector containing undef's
-
- my @input = @_;
-
- my ($max, $argmax);
-
- for my $i (0 .. $#input) {
- if (defined($input[$i])) {
- if (! defined($argmax)) {
- $max = $input[$i];
- $argmax = $i;
- }
- elsif ($input[$i] > $max) { # same action, but avoids warning
- $max = $input[$i];
- $argmax = $i;
- }
- }
- }
-
- return wantarray ? ($max, $argmax) : $max;
-}
-
-
-sub Nmin {
-
-# Minimum of a vector containing undef's
-
- my @input = @_;
-
- my ($min, $argmin);
-
- for my $i (0 .. $#input) {
- if (defined($input[$i])) {
- if (! defined($argmin)) {
- $min = $input[$i];
- $argmin = $i;
- }
- elsif ($input[$i] < $min) { # same action, but avoids warning
- $min = $input[$i];
- $argmin = $i;
- }
- }
- }
- return wantarray ? ($min, $argmin) : $min;
-}
-
-sub hamming_distance {
-
- my $X = uc(shift);
- my $Y = uc(shift);
-
- my $min;
- if (length($X) > length($Y)) {
- $min = length($Y);
- } else {
- $min = length($X);
- }
-
- my $comparisons = 0;
- my $dist = 0;
- my ($x, $y);
- for my $pos (0 .. $min - 1) {
- $x = substr($X,$pos,1);
- $y = substr($Y,$pos,1);
- if (($x ne '.') && ($y ne '.')) {
- $comparisons++;
- if ($x ne $y) {
- $dist++;
- }
- }
- }
-
- #die "Incomparable strings:\n$X\n$Y\n" if ($comparisons == 0);
- if ($comparisons == 0) {
- #print "Error: incomparable strings:\n$X\n$Y\n";
- return 100;
- }
-
- return ($dist / $comparisons);
-}
-
-sub makeDistMatrix {
- my $seqs = shift;
- my @ans;
- my $N = @$seqs;
- foreach my $i (0..$N-1) {
- foreach my $j (0..$N-1) {
- $ans[$i][$j] = hamming($$seqs[$i],$$seqs[$j]);
- }
- #print "$i\n" if ($i % 10 == 0);
- }
- #print "distance matrix:\n";
- #print_matrix(\@ans);
- #print "\n\n";
- return \@ans;
-}
-
-sub hamming {
- #hamming distance, absolute, not caring about '.'
- my $s = shift;
- my $t = shift;
- my $diff = 0;
- # compute numer of differences between $s and $t
- #die "ERROR in hamming $s\n$t\n" if (length($s) != length($t));
- my $len = Nmin(length($s), length($t));
- foreach my $i (0.. $len-1) {
- if (!(substr($s,$i,1) eq substr($t,$i,1))) {
- $diff++;
- }
- }
- return $diff;
-}
-
-
-
-sub alphabet {
-
- my $type = uc(shift);
-
- my (@Alphabet, @Special);
-
- if ($type eq 'AA') {
- @Alphabet = qw(A C D E F G H I K L M N P Q R S T V W Y);
- @Special = ("-", ".", "*");
- }
- elsif ($type eq 'NT') {
- @Alphabet = qw(A C G T);
- @Special = ("-", ".", "N");
- }
- elsif ($type eq '01') {
- @Alphabet = qw(0 1);
- @Special = ("-", ".");
- }
- else {
- die "Unknown alphabet -- $type!\n";
- }
-
- # indices:
- my %LtrIdx;
- for my $k (0 .. $#Special) {
- $LtrIdx{$Special[$k]} = -$k - 1;
- }
- for my $k (0 .. $#Alphabet) {
- $LtrIdx{$Alphabet[$k]} = $k;
- }
-
- return (\@Alphabet, \@Special, \%LtrIdx);
-}
-
-sub random { # generate an integer random number
-
- my $x = shift; # between $x
- my $y = shift; # and $y (including both)
-
- return int(rand($y - $x + 1)) + $x;
-}
-
-
-1;
diff --git a/src/perl/ReadUtil.pm b/src/perl/ReadUtil.pm
deleted file mode 100755
index 644db85..0000000
--- a/src/perl/ReadUtil.pm
+++ /dev/null
@@ -1,219 +0,0 @@
-# utilities for reading files of reads
-#
-# Copyright 2007, 2008, 2009
-# Niko Beerenwinkel,
-# Nicholas Eriksson,
-# Moritz Gerstung,
-# Lukas Geyrhofer,
-# Osvaldo Zagordi,
-# ETH Zurich
-
-# This file is part of ShoRAH.
-# ShoRAH is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-
-# ShoRAH is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-
-# You should have received a copy of the GNU General Public License
-# along with ShoRAH. If not, see .
-
-package ReadUtil;
-
-use strict;
-use NikoUtil;
-use DNA;
-use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION);
-use Exporter;
-
-$VERSION = 1.00;
-@ISA = qw(Exporter);
-
-# autoexport:
-@EXPORT = qw(read_ReadSim
-readReadFile readFastaFile
-compareToCons compareReadToCons
-);
-# export on request:
-@EXPORT_OK = qw();
-
-
-
-sub read_ReadSim {
- my $ReadSimOut = shift;
- my @readList;
-
- my ($h, $s) = read_fastas($ReadSimOut);
- foreach my $i (0 .. @$s-1) {
- # process $h->[$i] = read_0001|beg|2|length|92|forward|HIV
- # and $s->[$i] = TCAAATCACTCTTTGGCAACGACCCCTCGTCGACAATAA
-
- #split on '|' doesn't work well, so first substitute spaces
- $h->[$i] =~ s/\|/ /g;
- my @head = split ' ', $h->[$i];
-
- die "missing header $h->[$i]\n" if (@head <= 1);
- my $seqID = $head[0];
- my $startpos = $head[2];
- my $dir = $head[5];
- my $source = $head[6];
-
- my %read;
- $read{id} = $seqID;
- $read{st} = $startpos;
- if ($dir eq 'forward') {
- $read{seq} = $s->[$i];
- } else {
- $read{seq} = revComp($s->[$i]);
- }
- $read{len} = length($s->[$i]);
- $read{end} = $startpos + $read{len} - 1;
- $read{dir} = $dir;
- $read{source} = $source;
- push(@readList, \%read);
- }
-
- return \@readList;
-}
-
-sub readReadFile {
- # read a file of reads given by
- # startpos sequence
- # startpos sequence
- # startpos sequence
- # ...
- my $file = $_[0];
- my @readList;
-
- if (! defined $file) {
- die "Error: No fasta file specified\n";
- }
-
- if (! -r $file) {
- die "Fasta file \"$file\" not readable\n";
- }
- open(IN, "<$file") or die "Can't open $file: $!";
- while () {
- chomp;
- next if (/^#/);
- my ($start, $seq) = split;
- my %read;
- $read{st} = $start;
- $read{seq} = $seq;
- $read{len} = length($seq);
- $read{end} = $start + $read{len} - 1;
- push(@readList, \%read);
- }
- return \@readList;
-}
-
-
-
-
-sub readFastaFile {
- my $file = $_[0];
- my $seqID;
- my $startpos;
- my @readList;
- my $seq = '';
- my @head;
-
- if (! defined $file) {
- die "Error: No fasta file specified\n";
- }
-
- if (! -r $file) {
- die "Fasta file \"$file\" not readable\n";
- }
- open(IN, "<$file") or die "Can't open $file: $!";
- while () {
- chomp;
- if (/^\#/) {
- next;
- } elsif ($_ eq '') {
- next;
- } elsif (/^>/) {
- # header looks like ">seqID startpos ..."
- if (length($seq) > 0) {
- #we've found a sequence, so store it
-
- my %read;
- $read{id} = $seqID;
- $read{st} = $startpos;
- $read{seq} = $seq;
- $read{len} = length($seq);
- $read{end} = $startpos + $read{len} - 1;
- #my @tmp = @head;
- #$read{qs} = \@tmp;
- push(@readList, \%read);
- $seq = '';
- }
- @head = split;
- if (@head > 1) {
- $seqID = shift @head; #$head[0];
- $seqID =~ s/>//;
- $startpos = shift @head; #$head[1];
- } else {
- print "Error - @head missing characters\n";
- }
- } else {
- #a sequence line
- #remove spaces at beginning and end
- s/^\s+//;
- s/\s+$//;
- $seq .= $_;
- }
-
- }
- #still have sequence in the queue, need to put the last one in
- #this should be done in a better fashion...
- my %read;
- $read{id} = $seqID;
- $read{st} = $startpos;
- $read{seq} = $seq;
- $read{len} = length($seq);
- $read{end} = $startpos + $read{len} - 1;
-# $read{qs} = \@head;
- push(@readList, \%read);
-
- close IN;
- return \@readList;
-}
-
-sub compareToCons {
- my $seq = shift;
- my $cons = shift;
- my @ans;
- #die "$seq\n$cons\n" if (length($seq) != length($cons));
- my $minl = Nmin(length($seq), length($cons));
- #input: two strings of same length
- #output: list of differences in format A90I
- foreach my $i (1..$minl) {
- my $c1 = substr($seq,$i-1,1);
- my $c0 = substr($cons,$i-1,1);
- if (!($c1 eq $c0)) {
- push(@ans, $c0 . $i . $c1);
- }
- }
- return \@ans;
-}
-
-sub compareReadToCons {
- # not really right, since compareToCons
- # prints in local coordinates...
- my $read = shift;
- my $cons = shift;
- #input a read %r = {id:AHAHA seq : ACGTAT, len : 6, start : 80}
- #and a string
- #output: %r = {id: AHAHA, ..., muts = join(',', compareToCons(read, cons) }
- #my %r;
- #$r{id} = $read->{id};
- #$r{len} = $read->{len};
- $read->{muts} = join(',', compareToCons($read->{seq}, substr($cons, $read->{st}, $read->{len})));
-}
-
-1;
diff --git a/src/perl/Util.pm b/src/perl/Util.pm
deleted file mode 100755
index 0b7b3a1..0000000
--- a/src/perl/Util.pm
+++ /dev/null
@@ -1,255 +0,0 @@
-package Util;
-# various statistical utilities
-
-# Copyright 2007, 2008, 2009
-# Niko Beerenwinkel,
-# Nicholas Eriksson,
-# Moritz Gerstung,
-# Lukas Geyrhofer,
-# Osvaldo Zagordi,
-# ETH Zurich
-
-# This file is part of ShoRAH.
-# ShoRAH is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-
-# ShoRAH is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-
-# You should have received a copy of the GNU General Public License
-# along with ShoRAH. If not, see .
-
-use strict;
-use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION);
-use Exporter;
-use NikoUtil;
-
-$VERSION = 1.00;
-@ISA = qw(Exporter);
-
-# autoexport:
-@EXPORT = qw(dec2bin findProg Psystem average median countChar
-normalize maxind makeRandomDist makeRandomDistMin binomial fact rand_perm
-computeInDelMut);
-# export on request:
-@EXPORT_OK = qw();
-
-sub dec2bin {
- my $arg = shift;
- my $N = shift;
- substr(unpack("B32", pack('C', $arg)),-$N);
-}
-
-sub findProg {
- my $name = shift;
- my $ans;
-
- my @pathList = split(':',$ENV{'PATH'});
- push(@pathList,"$ENV{'HOME'}/bin");
- push(@pathList,'.');
-
- my $found = 0;
- foreach my $dir (@pathList) {
- if (-r "$dir/$name") {
- $ans = "$dir/$name";
- $found = 1;
- print "found $name as $ans\n";
- last;
- }
- }
- if ($found == 0) {
- print "error, can't find $name in PATH = @pathList\n";
- exit 1;
- }
- return $ans;
-}
-
-sub Psystem {
- my $s = shift;
- print "exec : $s\n";
- system ("$s");
-}
-
-sub Fsystem {
- my $s = shift;
- print "$s\n";
-}
-
-sub average {
- my $l = shift;
- if (@$l == 0) {
- return undef;
- }
- my $ave = 0;
- foreach (@$l) {
- $ave += $_;
- }
- $ave /= @$l;
- return $ave;
-}
-
-sub median {
- #actually computes the mode
- my $l = shift;
- my %c;
- foreach (@$l) {
- $c{$_}++;
- }
- my $maxcnt = 0;
- my $med;
- foreach (keys(%c)) {
- if ($c{$_} > $maxcnt) {
- $maxcnt = $c{$_};
- $med = $_;
- }
- }
- return $med;
-}
-
-sub countChar {
- #count occurances of $char in $s
- #
- my $s = shift;
- my $char = shift;
- #print "counting $char in $s\n";
- # this little trick doesn't work if char = '.'
- #$_ = $s;
- #return tr/$char/$char/;
-
- my $count = 0;
- foreach (0.. length($s)) {
- if (substr($s,$_,1) eq $char) {
- $count++;
- }
- }
- #print "get $count\n";
- return $count;
-}
-
-sub normalize {
- #input: list of positive integers
- #output: list / sum
- my $l = shift;
- #print "Calling norm with @$l\n";
- my $sum = 0;
- foreach my $a (@$l) {
- if (defined $a) {
- $sum += $a;
- }
- }
- if ($sum > 0) {
- foreach my $a (@$l) {
- if (defined $a) {
- $a = $a / $sum;
- } else {
- $a = 0;
- }
- }
- }
-}
-
-sub maxind {
- my $l = shift;
- my $max = -1e100;
- my $maxind = 0;
- foreach my $i (0..@$l-1) {
- if ($l->[$i] > $max) {
- $max = $l->[$i];
- $maxind = $i;
- }
- }
- return $maxind;
-}
-
-sub makeRandomDist {
- my $n = int(shift);
- return 0 if ($n < 1);
- my @ans;
-
- foreach my $i (1 .. $n) {
- push(@ans, int(rand(1000)));
- }
- normalize(\@ans);
- return \@ans;
-}
-
-sub makeRandomDistMin {
- my $n = int(shift);
- my $minFreq = shift;
- return 0 if ($n < 1);
- die "$n * $minFreq > 1\n" if ($n * $minFreq > 1);
- my @ans;
-
- push @ans, 0;
- foreach my $i (1 .. $n-1) {
- push(@ans, int(rand(1000)));
- }
- # normalize so that sum @ans = 1 - $minFreq*$n
- my $num = 1 - $minFreq*$n;
- my $sum = 0;
- $sum += $_ foreach (@ans);
- $_ *= $num/$sum foreach (@ans);
- $_ += $minFreq foreach (@ans);
- return \@ans;
-}
-
-sub binomial {
- my ($n, $k) = round(@_);
- if ($k > $n) {
- return 0;
- }
- return (fact($n) / (fact($k) * fact($n - $k)));
-}
-
-sub fact {
- my $n = shift;
- if ($n == 0) {
- return 1;
- } else {
- my $ans = 1;
- foreach (1..$n) {
- $ans *= $_;
- }
- return $ans;
- }
-}
-
-
-sub rand_perm {
-
- # returns a random permutation of the $N integers 0, ..., $N-1
-
- my $N = shift;
-
- my @list = (0 .. $N-1);
- my @perm;
-
- while (@list) {
- my $r = random(0, $#list);
- push @perm, splice(@list, $r, 1);
- }
-
- return @perm;
-}
-
-sub computeInDelMut {
- my $a = shift;
- my $b = shift;
- my $del = countChar($a, '-');
- my $ins = countChar($b, '-');
- my $Mut = 0;
- foreach my $i (0 .. length($a) - 1) {
- my $x = substr($a,$i,1);
- my $y = substr($b,$i,1);
- if (($x ne $y) and ($x ne '-') and ($y ne '-')) {
- $Mut++;
- }
- }
- return ($ins, $del, $Mut);
-}
-
-1;
diff --git a/src/python/shorah_matching.py b/src/python/shorah_matching.py
deleted file mode 100755
index 6b36ef8..0000000
--- a/src/python/shorah_matching.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# Hopcroft-Karp bipartite max-cardinality matching and max independent set
-# David Eppstein, UC Irvine, 27 Apr 2002
-# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/123641
-
-# Example from Cormen, Leiserson, and Rivest, section 27.3:
-#a = bipartiteMatch({ 0:[0], 1:[0,2], 2:[1,2,3], 3:[2], 4:[2] })
-
-
-def bipartiteMatch(graph):
- '''Find maximum cardinality matching of a bipartite graph (U,V,E).
- The input format is a dictionary mapping members of U to a list
- of their neighbors in V. The output is a triple (M,A,B) where M is a
- dictionary mapping members of V to their matches in U, A is the part
- of the maximum independent set in U, and B is the part of the MIS in V.
- The same object may occur in both U and V, and is treated as two
- distinct vertices if this happens.'''
-
- # initialize greedy matching (redundant, but faster than full search)
- matching = {}
- for u in graph:
- for v in graph[u]:
- if v not in matching:
- matching[v] = u
- break
-
- while 1:
- # structure residual graph into layers
- # pred[u] gives the neighbor in the previous layer for u in U
- # preds[v] gives a list of neighbors in the previous layer for v in V
- # unmatched gives a list of unmatched vertices in final layer of V,
- # and is also used as a flag value for pred[u] when u is in the first
- # layer
- preds = {}
- unmatched = []
- pred = dict([(u, unmatched) for u in graph])
- for v in matching:
- del pred[matching[v]]
- layer = list(pred)
-
- # repeatedly extend layering structure by another pair of layers
- while layer and not unmatched:
- newLayer = {}
- for u in layer:
- for v in graph[u]:
- if v not in preds:
- newLayer.setdefault(v, []).append(u)
- layer = []
- for v in newLayer:
- preds[v] = newLayer[v]
- if v in matching:
- layer.append(matching[v])
- pred[matching[v]] = v
- else:
- unmatched.append(v)
-
- # did we finish layering without finding any alternating paths?
- if not unmatched:
- unlayered = {}
- for u in graph:
- for v in graph[u]:
- if v not in preds:
- unlayered[v] = None
- return (matching, list(pred), list(unlayered))
-
- # recursively search backward through layers to find alternating paths
- # recursion returns true if found path, false otherwise
- def recurse(v):
- if v in preds:
- L = preds[v]
- del preds[v]
- for u in L:
- if u in pred:
- pu = pred[u]
- del pred[u]
- if pu is unmatched or recurse(pu):
- matching[v] = u
- return 1
- return 0
-
- for v in unmatched:
- recurse(v)
diff --git a/src/python/shorah_mm.py b/src/python/shorah_mm.py
deleted file mode 100755
index 9944e4e..0000000
--- a/src/python/shorah_mm.py
+++ /dev/null
@@ -1,778 +0,0 @@
-# Copyright 2007, 2008, 2009
-# Niko Beerenwinkel,
-# Nicholas Eriksson,
-# Moritz Gerstung,
-# Lukas Geyrhofer,
-# Osvaldo Zagordi,
-# Susana Posada Cespedes,
-# ETH Zurich
-
-# This file is part of ShoRAH.
-# ShoRAH is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-
-# ShoRAH is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-
-# You should have received a copy of the GNU General Public License
-# along with ShoRAH. If not, see .
-
-'''
-
-usage: mm.py filename [max_haplos]
-
-------------
-Input: filename should consist of lines like
-0 ACAGACGA
-1 CAGACGA
-4 AAAAAAAAAA
-
-where the first column is the start position and the second column is the read.
-The start positions must be indexed from 0.
-
-Any alphabet is allowed for the reads.
-
-Duplicate reads or reads which are redundant might cause unexpected results.
-For example, the reads
-0 ACGT
-1 CGT
-are redundant (since the first contains the second exactly).
-
-------------
-Output:
- the last 4 characters of filename are deleted and replaced with geno
- so if input is file.read then output is file.geno
-------------
-parameters (see the beginning of main() in the code to change these)
-
-MAXGENO = 10000 (can be changed with max_haplos on command line)
- - maximum number of haplotypes we output
-
-MAXITER = 500
- - maximum number of times we rerun the matching algorithm to try to find new
- haplotypes
-
-ONEPATH = True
- - extend chains to just one path in the graph? If false, chains get extended
- to all possible paths.
- This can be a problem if the chain is small, you can get too many paths.
-
-TODO
- ONEPATH should have an option to output up to MAXPATHS paths
-'''
-
-from __future__ import print_function
-import random
-import shorah_matching
-
-MAXGENO_DEFAULT = 10000
-MAXITER_DEFAULT = 50
-MINOVERLAP_DEFAULT = 1
-ONEPATH_DEFAULT = True
-OUTPUTGRAPH_DEFAULT = False
-
-
-def parse_com_line():
-
- import argparse
-
- parser = argparse.ArgumentParser(description="Global reconstruction: minimal set of haplotypes that\
- explains all reads in the dataset",
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- opts = main.__defaults__
-
- parser.add_argument("filename", nargs=1,
- help="Input file containing starting position and sequences\
- for each of the reads")
-
- parser.add_argument("-m", "--maxhaplo", action="store", metavar='INT', type=int,
- required=False, dest="MAXGENO", default=opts[0],
- help="keep searching until we find at least MAXGENO \
- haplotypes")
-
- parser.add_argument("-i", "--maxiter", action="store", metavar='INT', type=int,
- required=False, dest="MAXITER", default=opts[1],
- help="but only look up to MAXITER rounds")
-
- parser.add_argument("-l", "--length", action="store", metavar='INT', type=int,
- required=False, dest="MINOVERLAP", default=opts[2],
- help="minimum overlap to include in graph")
-
- # logical options
- parser.add_argument("-o", "--one", action="store_true", required=False,
- dest="ONEPATH", default=ONEPATH_DEFAULT,
- help="search one path only")
-
- parser.add_argument("-a", "--all", action="store_true", required=False,
- dest="MULTIPATH", help="extend search to many paths")
-
- parser.add_argument("-g", "--graph", action="store_true", required=False,
- dest="OUTPUTGRAPH", default=OUTPUTGRAPH_DEFAULT,
- help="output a graph?")
-
- args = parser.parse_args()
- if args.MULTIPATH:
- args.ONEPATH = False
-
- return args
-
-
-def permuteList(l, perm):
- '''apply a permutation to a list'''
- return [perm[i] for i in l]
-
-
-def permuteGraph(graph, perm):
- '''apply a permutation to a graph'''
- return dict([(perm[g], permuteList(child, perm))
- for (g, child) in graph.items()])
-
-
-def permute(graph, reads, descList):
- ''' pick a random elt of S_{len(graph) - 2} and apply the permutation
- (fixing the source and sink) to graph, reads, and descList
- '''
- # build permutation in S_n
- n = len(graph) - 2
- perm = list(range(n))
- random.shuffle(perm)
- # and then make sure to fix the source and sink
- perm.append(source)
- perm.append(sink)
- if verbose:
- print('permuting with', perm)
- newGraph = permuteGraph(graph, perm)
- #newReads = [reads[i] for i in perm[:-2]]
- # DOESN'T WORK - NEED INVERSE OF PERM
-
- newReads = [[] for i in range(n)]
- for i in range(n):
- newReads[perm[i]] = reads[i]
- # permute labels in descList
- tmpdescList = list(
- map(lambda l: permuteList(l, perm), map(list, descList)))
- # also permute places ### VIA INVERSE
- newdescList = [[] for i in tmpdescList]
- for i in range(n + 2):
- newdescList[perm[i]] = tmpdescList[i]
- return newGraph, newReads, newdescList
-
-
-def readFasta(filename):
- '''name says it'''
- try:
- f = open(filename, 'r')
- except IOError:
- return [], 0
- ans = []
- maxEnd = 0
- for line in f:
- pos, seq = line.split()
- pos = int(pos)
- end = pos + len(seq)
- if end > maxEnd:
- maxEnd = end
- ans.append([pos, seq])
- return ans, maxEnd
-
-
-def printFasta(seqs, lineLen=80):
- '''simple enough'''
- start = 0
- totalLen = len(seqs[0])
-
- while start < totalLen:
- length = min(lineLen, totalLen - start)
- print()
- for i in seqs:
- print(i[start:start + length])
- start += length
- return 0
-
-
-def addSourceSink(graph, descList, reads, seqLen):
- """
- adds a source and sink onto the graph
- and into descList
- """
- source = len(reads)
- sink = len(reads) + 1
- maxstart = 0
- for r in reads:
- if r[0] > maxstart:
- maxstart = r[0]
- # reads.append([-1,''])
- #reads.append([seqLen + 1,''])
-
- # need edges source -> r whenever r has no parents
- # and r -> sink whenever it has no children
-
- # find everything which is not a child
-
- allChildren = set([])
- list(map(allChildren.update, [graph[i] for i in graph]))
- initialReads = set(list(range(len(graph)))).difference(allChildren)
- graph.setdefault(source, list(initialReads))
-
- # find everything without children and create edge to sink
- for r in range(len(graph)):
- if graph[r] == []:
- graph[r].append(sink)
- graph.setdefault(sink, [])
-
- # print(graph)
- if OUTPUTGRAPH:
- print('BEGIN')
- for r in graph:
- #print('node name', r)
- #print('node start', reads[r][0])
- #print('children', graph[r])
- if r == sink:
- print(1000, ':', r, ':', [])
- if r == source:
- print(-1, ':', r, ':', graph[r])
- if (r != sink and r != source):
- print(reads[r][0], ':', r, ':', graph[r])
- print('END')
- print(graph)
- # fix up descList
- # everything as sink as desc
- for i in descList:
- i.add(sink)
- # push source and then sink on descList
- # source has everthing as desc
- sourceDesc = set(list(range(len(descList))))
- sourceDesc.add(sink)
- sinkDesc = set([])
- descList.append(sourceDesc)
- descList.append(sinkDesc)
-
-
-def makeGraph(reads):
- '''missing'''
- def readsMatch(r, s):
- '''check if two reads agree on their overlap
- called with r starting before s
- '''
- overlap = r[0] + len(r[1]) - s[0]
- if overlap < MINOVERLAP:
- return 0
- if overlap > len(s[1]):
- overlap = len(s[1])
- #if overlap < 0: overlap = 0
- return r[1][-overlap:] == s[1][:overlap]
-
- graph = {}
- # initialize graph
- for r in range(len(reads)):
- graph.setdefault(r, [])
-
- # set edges between reads
- for r in range(len(reads)):
- if r % 100 == 0:
- print(r)
- for s in range(len(reads)):
- if reads[r][0] < reads[s][0]:
- if readsMatch(reads[r], reads[s]):
- # edge from r to s
- graph.setdefault(r, []).append(s)
- return graph
-
-
-def findDescendants(graph):
- '''missing'''
- descList = []
- for i in range(len(graph)):
- descList.append(set([]))
-
- def getDesc(node):
- '''missing'''
- #print('getDesc called on', node)
- # node is an int,
- # descList is a list of lists of ints
- # set descList[node] by:
- # union of all descList[x] for x in graph[node]
- if len(descList[node]) > 0:
- #print(node, descList[node])
- return descList[node]
- else:
- #descList[node] = descList[node].union(set(graph[node]))
- descList[node].update(set(graph[node]))
- for x in graph[node]:
- # doesn't speed it up
- # if len(descList[x]) > 0:
- # descList[node] = descList[node].union(descList[x])
- # else:
- #descList[node] = descList[node].union(getDesc(x))
- descList[node].update(getDesc(x))
- return descList[node]
-
- # must call getDesc on the entire graph
- # but most of the calls are trivial
- for i in range(len(graph)):
- getDesc(i)
- #print('desc of', i, 'are', getDesc(i))
- return descList
-
-
-def removeCycles(graph, descList):
- '''missing'''
- for i in graph.keys():
- ##print('checking the',len(graph[i]),'children of node',i)
- # python is a pain if we modify graph[i] while looping over it
- # so need a temporary variable
- tmp = []
- tmp.extend(graph[i])
- for j in tmp:
- for k in tmp:
- if j != k:
- # print('checking',j,k)
- if j in descList[k]:
- #print('removing node',j)
- #print('graph[i] =',graph[i])
- graph[i].remove(j)
- break
- ###print('reduced to',len(graph[i]), 'children')
- return graph
-
-
-def matchToChain(match, n):
- '''takes a matching and creates the
- corresponding chain in the poset
- Who knows how it works...'''
- startpoints = set(list(range(n))).difference(list(match.values()))
- allChains = []
- for i in startpoints:
- curChain = []
- curChain.append(i)
- tmp = i
- while True:
- if tmp in match.keys():
- tmp = match[tmp]
- curChain.append(tmp)
- else:
- break
- allChains.append(curChain)
- return allChains
-
-
-def onePath(x, y):
- ''' find one path between x and y
- using descList
- currently unused, but might be nice
- '''
- path = [x]
- if y not in descList[x]:
- return None
- if x == y:
- return path
- done = 0
- while not done:
- for child in graph[path[-1]]:
- if y in descList[child]:
- path.append(child)
- break
- if y == child:
- path.append(y)
- done = 1
- break
- return path
-
-
-def find_all_paths(GRAPH, start, end, path=[]):
- '''from Guido's essay'''
- path = path + [start]
- if start == end:
- return [path]
-# if not GRAPH.has_key(start):
- if start not in GRAPH:
- return []
- paths = []
- for node in GRAPH[start]:
- if node not in path:
- newpaths = find_all_paths(GRAPH, node, end, path)
- for newpath in newpaths:
- paths.append(newpath)
- return paths
-
-
-def allPaths(x, y):
- '''find all paths from x to y in the graph'''
- ##print('allPaths', x, y)
- paths = [[x]]
- if y not in descList[x]:
- return None
- if x == y:
- return paths
- finishedpaths = []
- while len(paths) > 0:
- # for last element of each list in paths
- # loop over children, making new list of paths
- newpaths = []
- # for every path
- for i in range(len(paths)):
- # try to extend it
- for child in graph[paths[i][-1]]:
- # if we can extend along this child
- if y in descList[child]:
- # y is later, so create a new, longer path
- # FIXME: test this, possibly just use find_all_paths
- #tmppath = []
- # tmppath.extend(paths[i])
- # tmppath.append(child)
- newpaths.append(paths[i] + [child])
- if y == child:
- # done, push on final
- tmppath = []
- tmppath.extend(paths[i])
- # don't include y
- # tmppath.append(child)
- finishedpaths.append(tmppath)
- #print('newpaths = ', newpaths)
- paths = newpaths
- #print('returns', len(finishedpaths), 'paths')
- return finishedpaths
-
-
-def pathToGeno(path):
- '''
- first get rid of sink and source
- (this has the side effect of modifying the paths)
- '''
- geno = ''
- if len(path) == 0:
- return None
- if path[0] == source:
- path.pop(0)
- if len(path) == 0:
- return None
- if path[-1] == sink:
- path.pop(-1)
- for i in range(len(path) - 1):
- r = path[i]
- s = path[i + 1]
- # nonoverlap is how much of read r occurs before read s starts
- nonoverlap = reads[s][0] - reads[r][0]
- geno = geno + reads[r][1][:nonoverlap]
- # geno.append(reads[r][1][:nonoverlap]
- # add the last read
- geno = geno + reads[path[-1]][1]
- # geno.append(reads[path[-1]][1])
-
- # pad with .'s at beginning and end to make right length
- # countBegin = 0
- # countEnd = 0
-
- for i in range(reads[path[0]][0]):
- # geno.append('.')
- geno = '.' + geno
- # countBegin += 1
-
- offsetEnd = seqLen - reads[path[-1]][0] - len(reads[path[-1]][1])
- for i in range(offsetEnd):
- geno = geno + '.'
- # countEnd += 1
-
-# print('path of length',len(path), 'has', countBegin, '. and', countEnd)
-# if countBegin > .5 * len(geno):
-# geno = None
-# if countEnd > .5 * len(geno):
-# geno = None
-
- return geno
-
-
-def extendChainToOne(chain):
- '''extend chain to ONE path in the graph
- input: a chain (list of ints)
- output: a list containing a path that extends the chain
- '''
- ans = []
- if len(chain) == 0:
- return ans
- if chain[0] == source:
- chain.pop(0)
- if len(chain) == 0:
- return None
- if chain[-1] == sink:
- chain.pop(-1)
- # deal with paths from source to chain[0] but
- # if chain started as [source, sink] then it is of
- # length 0 now
- if len(chain) == 0:
- ans = onePath(source, sink)
- else:
- ans = onePath(source, chain[0])
- for i in range(len(chain)):
- if i != len(chain) - 1:
- nxt = chain[i + 1]
- else:
- nxt = sink
- if nxt not in graph[chain[i]]:
- ans.extend(onePath(chain[i], nxt))
- else:
- ans.append(chain[i])
- return [ans]
-
-
-def extendChain(chain):
- '''extend chain to all possible paths in graph
- input: a chain (list of ints)
- output: a list of all paths that extend the chain
- print 'input:',chain
- '''
- ans = [[]]
- newans = []
- if len(chain) == 0:
- return ans
- if chain[0] == source:
- chain.pop(0)
- if len(chain) == 0:
- return None
- if chain[-1] == sink:
- chain.pop(-1)
- # deal with paths from source to chain[0] but
- # if chain started as [source, sink] then it is of
- # length 0 now
- if len(chain) == 0:
- ans = allPaths(source, sink)
- else:
- ans = allPaths(source, chain[0])
- for i in range(len(chain)):
- #print('currently have', len(ans),'chains')
- if i != len(chain) - 1:
- nxt = chain[i + 1]
- else:
- nxt = sink
- if nxt not in graph[chain[i]]:
- for p in allPaths(chain[i], nxt):
- for j in ans:
- newans.append(j + p)
- ans = newans
- newans = []
- else:
- for j in ans:
- j.append(chain[i])
- ##print('output:', ans)
- return ans
-
-
-def minimalCover(graph, reads, descList):
- """ the main code: takes a graph and creates a matching
- problem. then solves the matching to get chains in
- the graph, which are extended to paths in all ways """
-
- #print('making bipartite graph')
- # make bipartiteGraph
- biGraph = {}
- for i in range(len(graph)):
- biGraph.setdefault(i, list(descList[i]))
-
- print('...finding matching')
- # find matching
- a = shorah_matching.bipartiteMatch(biGraph)
- # a[0] is the matching, a[1] and a[2] the parts of
- # the maximal independent sets in the two parts of biGraph
- print('...Reversing matching')
- # reverse the matching
- match = dict([(v, k) for (k, v) in a[0].items()])
- # free some memory, perhaps
- del a
- #print('matching =', match)
-
- # transform to chains in poset
- chainList = matchToChain(match, len(graph))
- if OUTPUTGRAPH:
- print('chains=', chainList)
- maximalAntiChain = len(chainList)
- print('...largest antichain of size', maximalAntiChain)
- if verbose:
- for zz in range(len(chainList)):
- print(zz, '==', chainList[zz])
-
- # now that we're done with the matching, add the source and sink into graph
- # this is needed for the extendChain and pathToGeno functions
- # now done before the matching
- #addSourceSink(graph, descList, reads, seqLen)
-
- print('...transforming chains to paths')
- # transform chains to paths in all possible ways
- if ONEPATH:
- paths = list(map(extendChainToOne, chainList))
- else:
- paths = list(map(extendChain, chainList))
-
- print(len(chainList), 'chains give', sum(map(len, paths)),
- 'paths (', list(map(len, paths)), ')')
- if verbose:
- print('paths=', paths)
-
- print('...translating paths->genotypes')
- genos = []
- for i in range(len(paths)):
- genos.extend(map(pathToGeno, paths[i]))
- del(reads)
-
- return genos, maximalAntiChain
-
-
-def countPaths(graph, source, sink):
- '''missing'''
- # numPaths[i] = # of paths from i to the sink in graph
- numPaths = [-1 for i in range(len(graph))]
-
- numPaths[sink] = 1
-
- def getPaths(node):
- ''' count paths from node to sink '''
- if numPaths[node] < 0:
- numPaths[node] = sum(map(getPaths, graph[node]))
- #print('node', node, 'has', numPaths[node], 'paths to sink')
- return numPaths[node]
- getPaths(source)
- #print('paths =', numPaths)
- return numPaths[source]
-
-
-###########main code#########
-
-def main(readfile, maxhaplo=MAXGENO_DEFAULT, maxiter=MAXITER_DEFAULT,
- minoverlap=MINOVERLAP_DEFAULT, onepath=ONEPATH_DEFAULT,
- outputgraph_local=OUTPUTGRAPH_DEFAULT):
- '''This can be called from shorah.py directly
- '''
- # want to do outfile = readfile; outfile ~= s/rest/geno$/
- if readfile[-4:] == 'rest' or readfile[-4:] == 'read':
- #outFileName = readfile[:-4] + str(MAXGENO) + '.geno'
- outFileName = readfile[:-4] + 'geno'
- else:
- outFileName = readfile + '.geno'
-
- # This use of global variables should definitely be changed
- global verbose, seqLen, descList, graph, reads, source, sink
- global MAXGENO
- MAXGENO = maxhaplo
-
- global MAXITER
- MAXITER = maxiter
-
- global ONEPATH
- ONEPATH = onepath
-
- global MINOVERLAP
- MINOVERLAP = minoverlap
-
- global OUTPUTGRAPH
- OUTPUTGRAPH = outputgraph_local
-
- print('inputfile =', readfile)
- print('outputfile=', outFileName)
- print('Creating up to', MAXGENO, 'haplotypes')
-
- print('Permuting up to', MAXITER, 'times')
- print('ONEPATH = ', ONEPATH)
- print('minimum overlap in graph:', MINOVERLAP)
-
- # get reads
- reads, seqLen = readFasta(readfile)
- if seqLen == 0:
- print('No reads in file (or couldn\'t open file)')
- return 0
- source = len(reads)
- sink = len(reads) + 1
- if len(reads) < 51:
- verbose = 1
- else:
- verbose = 0
- print(len(reads), 'reads, sequence length =', seqLen)
- if verbose:
- for r in reads:
- print(reads.index(r), r)
-
- # Build read graph
- graph = makeGraph(reads)
- if verbose:
- print('graph = ', graph)
- print('finished graph')
-
- # take transitive closure of graph
- descList = findDescendants(graph)
- if verbose:
- print('desc =', list(map(list, descList)))
- print('...done with transitive closure')
-
- print('...removing cycles')
- graph = removeCycles(graph, descList)
- if verbose:
- print('graph \ cycles = ', graph)
-
- # add on a source and sink to the graph
- addSourceSink(graph, descList, reads, seqLen)
- if verbose:
- print('graph with SS', graph)
- print('new Desclist', list(map(list, descList)))
- # count paths in the graph
-
- totalPaths = countPaths(graph, source, sink)
- print('There are', totalPaths, 'paths in the graph')
-
- #global maximalAntiChain
-
- allGenos = set([])
- DONE = False
-
- iteration = 0
- if totalPaths < MAXGENO:
- print('This is less than MAXGENO =', MAXGENO)
- print('So matching is not run')
- paths = extendChain([source, sink])
- genos = list(map(pathToGeno, paths))
- allGenos.update(genos)
- DONE = True
- genos, maximalAntiChain = minimalCover(graph, reads, descList)
- else:
- print('starting matching')
- print()
- while not DONE and (len(allGenos) < MAXGENO) and (iteration < MAXITER):
- # Done with preproccessing. Now permute
- # graph, reads, and descList and get a new set of genotypes
- # then find a minimal chain decomposition of the graph
- # and return all genotypes obtained by extending
- # these chains
- if iteration > 1:
- print('...permuting')
- graph, reads, descList = permute(graph, reads, descList)
- genos, maximalAntiChain = minimalCover(graph, reads, descList)
- sizeBefore = len(allGenos)
- allGenos.update(genos)
- sizeAfter = len(allGenos)
- print('found', len(allGenos), 'genotypes out of', MAXGENO)
- print('gained', sizeAfter - sizeBefore, 'in round', iteration)
- print()
- # printFasta(genos,1000)
- iteration += 1
-
- outfile = open(outFileName, 'w')
-
- for i in list(allGenos):
- # check if i == None
- if i:
- outfile.write(i)
- outfile.write('\n')
- outfile.close()
- if verbose:
- printFasta(list(allGenos), 1000)
-
- # summary
- print('graph has', totalPaths, 'total paths')
- print('minimal path cover is of size', maximalAntiChain)
- print('ran matching algorithm', iteration, 'times')
- print('output', len(allGenos), 'haplotypes')
-
- print('% TOTALPATHS =', totalPaths)
- print('% ANTICHAIN =', maximalAntiChain)
- print('% HAPLO =', len(allGenos))
diff --git a/src/samtools/meson.build b/src/samtools/meson.build
new file mode 100644
index 0000000..cb7b458
--- /dev/null
+++ b/src/samtools/meson.build
@@ -0,0 +1,45 @@
+bam_sources = files([
+ 'bam.c',
+ 'bam_aux.c',
+ 'bam_cat.c',
+ 'bam_import.c',
+ 'bam_index.c',
+ 'bam_lpileup.c',
+ 'bam_md.c',
+ 'bam_pileup.c',
+ 'bam_reheader.c',
+ 'bam_sort.c',
+ 'bedidx.c',
+ 'bgzf.c',
+ 'faidx.c',
+ 'knetfile.c',
+ 'kprobaln.c',
+ 'kstring.c',
+ 'razf.c',
+ 'sam.c',
+ 'sam_header.c',
+ 'bam.h',
+ 'bam_endian.h',
+ 'bgzf.h',
+ 'faidx.h',
+ 'kaln.h',
+ 'khash.h',
+ 'knetfile.h',
+ 'kprobaln.h',
+ 'kseq.h',
+ 'ksort.h',
+ 'kstring.h',
+ 'razf.h',
+ 'sam.h',
+ 'sam_header.h'])
+
+zlib_dep = dependency('zlib')
+
+bam_library = static_library(
+ 'libbam',
+ bam_sources,
+ dependencies : zlib_dep,
+ c_args : ['-D_USE_KNETFILE'],
+ install : false)
+
+bam_inc = include_directories('.')
diff --git a/src/scripts/dec.py.in b/src/scripts/dec.py.in
deleted file mode 100755
index f75a24e..0000000
--- a/src/scripts/dec.py.in
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/usr/bin/env @PYTHON@
-
-from __future__ import print_function
-import argparse
-import os
-import shorah_dec
-
-# parse command line
-parser = argparse.ArgumentParser(description="Local haplotype recontruction: Dirichlet process mixture",
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-# set the defaults (see http://bit.ly/2hCTQl)
-opts = shorah_dec.main.__defaults__
-
-group1 = parser.add_argument_group('Input files', 'Required input')
-group2 = parser.add_argument_group('Run options',
- 'Parameters that can (maybe should) be changed \
- according to the needs')
-group3 = parser.add_argument_group('More options',
- 'Do you really want to change this?')
-
-
-group1.add_argument("-b", "--bam", metavar='BAM', required=True, type=str,
- dest="in_bam", help="file with aligned reads in .bam format")
-
-group1.add_argument("-f", "--fasta", metavar='REF', required=True, type=str,
- dest="in_fasta", help="reference genome in fasta format")
-
-group2.add_argument("-w", "--windowsize", metavar='INT', required=False, type=int,
- default=opts[0], dest="win_length", help="window size")
-
-group2.add_argument("-r", "--region", metavar='chrm:start-stop', required=False, type=str,
- default=opts[2], dest="region", help="region in format 'chrm:start-stop', \
- e.g. 'ch3:1000-3000'")
-
-group2.add_argument("-a", "--alpha", metavar='FLOAT', required=False, type=float,
- default=opts[4], dest="alpha", help="alpha in dpm sampling")
-
-group2.add_argument("-S", "--seed", metavar='INT', required=False, type=int,
- default=opts[6], dest="seed", help="set seed for reproducible results")
-
-group3.add_argument("-s", "--winshifts", metavar='INT', required=False, type=int,
- default=opts[1], dest="win_shifts", help="number of window shifts")
-
-group3.add_argument("-x", "--maxcov", metavar='INT', required=False, type=int,
- default=opts[3], dest="max_coverage", help="approximate max coverage allowed")
-
-group3.add_argument("-k", "--keep_files", required=False, action='store_true',
- default=opts[5], dest="keep_files", help="keep all intermediate files")
-
-args = parser.parse_args()
-
-supported_formats = {
- 'bam': 'aligned reads',
- 'fasta': 'reference genome'
-}
-# check the input file is in supported format
-try:
- tmp_filename = os.path.split(args.in_bam)[1]
- [in_stem, in_format] = [tmp_filename.split('.')[0],
- tmp_filename.split('.')[-1]]
- t = supported_formats[in_format]
-except IndexError:
- shorah_dec.declog.error('The input file must be filestem.format')
- print('The input file must be filestem.format')
- print('Supported formats are')
- for sf in supported_formats.items():
- print(sf[0], ':', sf[1])
- sys.exit()
-except KeyError:
- shorah_dec.declog.error('format unknown')
- print(in_format, 'format unknown')
- print('Supported formats are')
- for sf in supported_formats.items():
- print(sf[0], ':', sf[1])
- sys.exit()
-
-shorah_dec.main(args.in_bam, args.in_fasta, args.win_length, args.win_shifts,
- args.region, args.max_coverage, args.alpha, args.keep_files, args.seed)
diff --git a/src/scripts/fas2read.pl.in b/src/scripts/fas2read.pl.in
deleted file mode 100755
index f948849..0000000
--- a/src/scripts/fas2read.pl.in
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/usr/bin/env perl
-#translate a fasta file to read format
-
-use lib "@perldir@";
-use strict;
-use Getopt::Long;
-use DNA;
-use ReadUtil;
-
-my ($HELP, $VERBOSE, $FASTAFILE);
-
-GetOptions(
- "help" => \$HELP,
- "fasta=s" => \$FASTAFILE,
- );
-help() if (($HELP) or (! defined $FASTAFILE));
-my $outfile = $FASTAFILE;
-$outfile =~ s/fas$/read/;
-print "Reading $FASTAFILE\nOutput to $outfile\n";
-open OUT, ">$outfile";
-my $reads = readFastaFile($FASTAFILE);
-
-foreach my $rd (@$reads) {
- print OUT "$rd->{st} $rd->{seq}\n";
-}
-close OUT;
-
-sub help {
- print <.
-''' shorah.py is the program that performs the global reconstruction.
- It runs error correction, global reconstruction and frequency estimation
- by calling the relevant programs. Additionally, it performs SNV discovery
- by calling the program snv.py
-'''
-from __future__ import division
-import os
-import os.path
-import sys
-
-import logging
-import logging.handlers
-
-# Make a global logging object.
-sholog = logging.getLogger(__name__)
-
-dn = os.path.dirname(__file__)
-
-
-def run_child(exe_name, arg_string):
- '''use subrocess to run an external program with arguments'''
- import subprocess
-
- if not arg_string.startswith(' '):
- arg_string = ' ' + arg_string
-
- try:
- sholog.debug("running %s" % (exe_name + arg_string))
- retcode = subprocess.call(exe_name + arg_string, shell=True)
- if retcode < 0:
- sholog.error(exe_name + arg_string)
- sholog.error("Child %s terminated by signal" % exe_name, -retcode)
- else:
- sholog.debug("Child %s returned %i" % (exe_name, retcode))
- except OSError as ee:
- sholog.error("Execution of %s failed: %s" % (exe_name, ee))
-
- return retcode
-
-
-if __name__ == "__main__":
-
- import argparse
- import shutil
-
- from Bio import SeqIO
-
- import shorah_dec
- import shorah_mm
-
- # parse command line
- parser = argparse.ArgumentParser(description="ShoRAH: Short Read Assembly into Haplotypes",
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-
- # First define all option groups
- group1 = parser.add_argument_group('Input files', 'Required input')
- group2 = parser.add_argument_group('Run options',
- "Parameters that can (maybe should) be changed \
- according to the needs")
- group3 = parser.add_argument_group(
- 'More options', 'Do you really want to change this?')
-
- group1.add_argument("-b", "--bam", metavar='BAM', required=True, type=str,
- dest="b", help="sorted bam format alignment file")
-
- group1.add_argument("-f", "--fasta", metavar='REF', required=True, type=str,
- dest="f", help="reference genome in fasta format")
-
- group2.add_argument("-a", "--alpha", metavar='FLOAT', required=False, type=float,
- dest="a", default=0.1, help="alpha in dpm sampling")
-
- group2.add_argument("-w", "--windowsize", metavar='INT', required=False, type=int,
- dest="w", default=201, help="window size")
-
- group2.add_argument("-r", "--region", metavar='chrm:start-stop', required=False, type=str,
- dest="r", default='', help="region in format 'chr:start-stop', e.g.\
- 'chrm:1000-3000'")
-
- group2.add_argument("-S", "--seed", metavar='INT', required=False, type=int,
- dest="S", default=None, help="set seed for reproducible results")
-
- group3.add_argument("-s", "--winshifts", metavar='INT', required=False, type=int,
- dest="s", default=3, help="number of window shifts")
-
- group3.add_argument("-i", "--sigma", metavar='FLOAT', required=False, type=float,
- dest="i", default=0.01, help="value of sigma to use when calling SNVs")
-
- group3.add_argument("-x", "--maxcov", metavar='INT', required=False, type=int,
- dest="x", default=10000, help="approximate max coverage allowed")
-
- group3.add_argument("-k", "--keep_files", required=False, action="store_true", default=True,
- dest="k", help="keep intermediate files")
-
- args = parser.parse_args()
-
- # set logging level
- sholog.setLevel(logging.DEBUG)
- # This handler writes everything to a file.
- LOG_FILENAME = './shorah.log'
- h = logging.handlers.RotatingFileHandler(LOG_FILENAME, 'w',
- maxBytes=100000, backupCount=5)
- f = logging.Formatter("%(levelname)s %(asctime)s %(funcName)s\
- %(lineno)d %(message)s")
- h.setFormatter(f)
- sholog.addHandler(h)
- sholog.info(' '.join(sys.argv))
-
- in_stem = '.'.join(os.path.split(args.b)[1].split('.')[:-1])
-
- # 1. run main from shorah_dec; it will also launch snv.py
- if not os.path.exists('%s.cor.fas' % in_stem):
- sholog.debug('running dec.py')
- shorah_dec.main(in_bam=args.b, in_fasta=args.f, win_length=args.w,
- win_shifts=args.s, max_coverage=args.x, region=args.r,
- alpha=args.a, keep_files=args.k, seed=args.S)
-
- # 2. copy file and run fas2read to convert from fasta
- shutil.move('%s.cor.fas' % in_stem, '%s_cor.fas' % in_stem)
- sholog.debug('running fas2reads')
- my_prog = 'perl -I %s %s' % (dn + '/perllib',
- os.path.join(dn, 'fas2read.pl'))
- my_arg = " -f %s_cor.fas" % in_stem
- assert os.path.isfile("%s_cor.fas" % in_stem), \
- 'File %s_cor.fas not found' % in_stem
- retcode_f2r = run_child(my_prog, my_arg)
- if retcode_f2r:
- sholog.error('fas2read did not return 0')
- sys.exit('Something went wrong in fas2read')
- else:
- sholog.debug('fas2read exited successfully')
-
- # 3. run contain to eliminate redundant reads
- # output: filein.rest
- sholog.debug('running contain')
- my_prog = os.path.join(dn, 'contain')
- my_arg = " -f %s_cor" % in_stem
- assert os.path.isfile('%s_cor.fas' % in_stem), \
- 'File %s_cor.fas not found' % in_stem
- retcode_c = run_child(my_prog, my_arg)
- if retcode_c:
- sholog.error('contain did not return 0')
- sys.exit('Something went wrong while running contain')
- else:
- sholog.debug('contain exited successfully')
-
- # 4. run main from mm.py to find minimal matching
- sholog.debug('running mm.py')
- shorah_mm.main('%s_cor.rest' % in_stem, maxhaplo=200)
-
- # 5. run EM freqEst output: sample.$nr.popl
- sholog.debug('running freqEst')
- my_prog = os.path.join(dn, 'freqEst')
- my_arg = " -f %s_cor" % in_stem
- assert os.path.isfile('%s_cor.rest' % in_stem), \
- 'File %s_cor.rest not found' % in_stem
- retcode_em = run_child(my_prog, my_arg)
- if retcode_em:
- sholog.error('freqEst did not return 0')
- sys.exit('Something went wrong in the EM step')
- else:
- sholog.debug('freqEst exited successfully')
-
- # 6. copy high frequency haps from .popl to global_haps.fasta
- n_rest = len(list(open('%s_cor.rest' % in_stem)))
- min_freq = 1.0 / n_rest
- g_recs = []
- for s in SeqIO.parse('%s_cor.popl' % in_stem, 'fasta'):
- f_here = float(s.id.split('_')[1])
- if f_here >= min_freq:
- s.description = '| global_haplotype frequency=%f\n' % f_here
- g_recs.append(s)
- SeqIO.write(g_recs, '%s_global_haps.fasta' % in_stem, 'fasta')
diff --git a/src/scripts/snv.py.in b/src/scripts/snv.py.in
deleted file mode 100755
index 7fbdbed..0000000
--- a/src/scripts/snv.py.in
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/env @PYTHON@
-
-import argparse
-import shorah_snv
-
-# parse command line
-parser = argparse.ArgumentParser(description="Single nucleotide variant calling",
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-
-# set the defaults (see http://bit.ly/2hCTQl)
-opts = shorah_snv.main.__defaults__
-
-requiredNamed = parser.add_argument_group('required named arguments')
-
-requiredNamed.add_argument("-r", "--ref", metavar='REF', type=str, required=True,
- dest="reference", help="reference file")
-
-requiredNamed.add_argument("-b", "--bam", metavar='BAM', type=str, required=True,
- dest="bam_file", help="sorted bam format alignment file")
-
-parser.add_argument("-s", "--sigma", metavar='FLOAT', default=opts[0], type=float,
- required=False, dest="sigma", help="value of sigma to use when calling\
- SNVs")
-
-parser.add_argument("-i", "--increment", metavar='INT', default=opts[1], type=int,
- required=False, dest="increment",
- help="value of increment to use when calling \
- SNVs (1 used by amplian.py)")
-
-parser.add_argument("-x", "--maxcov", metavar='INT', default=opts[2], type=int,
- dest="max_coverage", help="Maximum coverage allowed")
-
-args = parser.parse_args()
-
-shorah_snv.main(args.reference, args.bam_file, args.sigma,
- args.increment, args.max_coverage)
diff --git a/src/shorah/__main__.py b/src/shorah/__main__.py
new file mode 100644
index 0000000..42e6c65
--- /dev/null
+++ b/src/shorah/__main__.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+
+"""Entrypoint module, in case you use `python -mshorah`.
+
+Why does this file exist, and why __main__? For more info, read:
+
+- https://www.python.org/dev/peps/pep-0338/
+- https://docs.python.org/2/using/cmdline.html#cmdoption-m
+- https://docs.python.org/3/using/cmdline.html#cmdoption-m
+"""
+from shorah.cli import main
+
+if __name__ == "__main__":
+ main()
diff --git a/src/scripts/amplian.py.in b/src/shorah/amplicon.py
old mode 100755
new mode 100644
similarity index 67%
rename from src/scripts/amplian.py.in
rename to src/shorah/amplicon.py
index 81e1365..aef34e5
--- a/src/scripts/amplian.py.in
+++ b/src/shorah/amplicon.py
@@ -1,6 +1,6 @@
-#!/usr/bin/env @PYTHON@
+#!/usr/bin/env python3
-# Copyright 2007-2012
+# Copyright 2007-2018
# Niko Beerenwinkel,
# Nicholas Eriksson,
# Moritz Gerstung,
@@ -32,20 +32,30 @@
import os
import os.path
import sys
+import shlex
import logging
import logging.handlers
+from pkg_resources import resource_filename
from Bio import SeqIO
-# Make a global logging object.
-amplog = logging.getLogger(__name__)
+dn_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+if __name__ == '__main__':
+ if __package__ is None:
+ os.sys.path.insert(1, dn_dir)
+ mod = __import__('shorah')
+ sys.modules["shorah"] = mod
+ import shorah_snv
+else:
+ from . import shorah_snv
+
+diri_exe = resource_filename(__name__, 'bin/diri_sampler')
+b2w_exe = resource_filename(__name__, 'bin/b2w')
win_min_ext = 0.95
cutoff_depth = 10000
-dn = os.path.dirname(__file__)
-
def run_child(exe_name, arg_string):
'''use subrocess to run an external program with arguments'''
@@ -54,17 +64,17 @@ def run_child(exe_name, arg_string):
if not arg_string.startswith(' '):
arg_string = ' ' + arg_string
- amplog.debug(exe_name + arg_string)
+ logging.debug(exe_name + arg_string)
try:
retcode = subprocess.call(exe_name + arg_string, shell=True)
if retcode > 0:
- amplog.error(exe_name + arg_string)
- amplog.error("Child %s terminated by signal" % exe_name, retcode)
+ logging.error(exe_name + arg_string)
+ logging.error("Child %s terminated by signal %s", exe_name, retcode)
else:
- amplog.debug("Child %s returned %i" % (exe_name, retcode))
+ logging.debug("Child %s returned %i", exe_name, retcode)
except OSError as ee:
- amplog.error("Execution of %s failed:" % exe_name, ee)
+ logging.error("Execution of %s failed: %s", exe_name, ee)
return retcode
@@ -103,15 +113,15 @@ def run_diagnostics(window_file, reads):
untouched.append(unt)
theta.append(the)
gamma.append(gam)
- del(it)
+ del it
- amplog.info('sample has %d reads' % reads)
+ logging.info('sample has %d reads', reads)
untouched_hst = untouched[-2000:]
unt_mean = sum(untouched_hst) / len(untouched_hst)
unt_ratio = 100 * unt_mean / q
unt_msg = '%3.1f %% of untouched objects ' % \
unt_ratio
- amplog.info(unt_msg)
+ logging.info(unt_msg)
if unt_ratio < 90.0:
warnings.warn(unt_msg)
@@ -123,7 +133,7 @@ def matchremove(matchobj):
c = int(match[1:])
else:
c = 0
- del(c)
+ del c
return ''
@@ -145,7 +155,7 @@ def plot_entropy(pos_ent, pos_coords, ave_ent, win_coords):
try:
import matplotlib.pyplot as plt
except ImportError:
- amplog.error('could not import matplotlib, no pdf produced')
+ logging.error('could not import matplotlib, no pdf produced')
return
high_start, high_stop = win_coords
@@ -173,7 +183,7 @@ def plot_entropy(pos_ent, pos_coords, ave_ent, win_coords):
plt.title('chosen entropy window is %d-%d' % (high_start, high_stop))
plt.savefig('entropy.pdf')
-
+ del fig
def highest_entropy(bam_file, fasta_file, ent_sel='relative'):
'''Parse reads to have their length distribution and compute the
@@ -189,13 +199,13 @@ def highest_entropy(bam_file, fasta_file, ent_sel='relative'):
os.remove('rl.txt')
read_len = sorted(read_len)
n_reads = len(read_len)
- amplog.info('n_reads: %d' % n_reads)
+ logging.info('n_reads: %d', n_reads)
# max_len = max(read_len)
trimmed_mean = sum([read_len[i] for i in range(int(0.1 * n_reads),
int(0.9 * n_reads))])
trimmed_mean /= (0.8 * n_reads)
trimmed_mean = int(round(trimmed_mean, 0))
- amplog.info('trimmed_mean: %d' % trimmed_mean)
+ logging.info('trimmed_mean: %d', trimmed_mean)
# Build the mpileup and compute the entropy per position
ref_seq = list(SeqIO.parse(fasta_file, 'fasta'))[0]
entropy = [None] * (len(ref_seq) + 1)
@@ -232,7 +242,7 @@ def highest_entropy(bam_file, fasta_file, ent_sel='relative'):
stop = i
break
stop = i
- amplog.info('start: %d, stop: %d' % (start, stop))
+ logging.info('start: %d, stop: %d', start, stop)
# mean entropy
ent_mean = [None] * (len(ref_seq) + 1)
@@ -246,7 +256,7 @@ def highest_entropy(bam_file, fasta_file, ent_sel='relative'):
if entropy[i] > max_ent_per_pos:
max_ent_per_pos = entropy[i]
highest_ent_pos = i
- amplog.info('highest entropy found at position %d' % highest_ent_pos)
+ logging.info('highest entropy found at position %d', highest_ent_pos)
# the window is chosen as the absolute max mean_entropy or as the
# max mean entropy covering the position with max entropy
@@ -293,26 +303,24 @@ def highest_entropy(bam_file, fasta_file, ent_sel='relative'):
return high_ent_start, high_ent_stop
-def main(in_bam, in_fasta, min_overlap=0.95, max_coverage=50000,
- alpha=0.5, s=0.01, region='', diversity=False):
+#def main(in_bam, in_fasta, min_overlap=0.95, max_coverage=50000,
+# alpha=0.5, s=0.01, region='', diversity=False):
+def main(args):
'''
Performs the amplicon analysis, running diri_sampler
and analyzing the result
'''
-
- import shorah_snv
-
- # set logging level
- amplog.setLevel(logging.DEBUG)
- # This handler writes everything to a file.
- LOG_FILENAME = './amplian.log'
- hl = logging.handlers.RotatingFileHandler(LOG_FILENAME, 'w',
- maxBytes=100000, backupCount=5)
- f = logging.Formatter("%(levelname)s %(asctime)s %(funcName)s\
- %(lineno)d %(message)s")
- hl.setFormatter(f)
- amplog.addHandler(hl)
- amplog.info(' '.join(sys.argv))
+ in_bam = args.b
+ in_fasta = args.f
+ region = args.r
+ max_coverage = args.max_coverage
+ alpha = args.a
+ seed = args.seed
+ sigma = args.sigma
+ diversity = args.diversity
+ min_overlap = args.min_overlap
+
+ logging.info(' '.join(sys.argv))
# info on reference and region if given, or discover high entropy one
ref_seq = list(SeqIO.parse(in_fasta, 'fasta'))[0]
ref_name = ref_seq.id
@@ -329,101 +337,29 @@ def main(in_bam, in_fasta, min_overlap=0.95, max_coverage=50000,
ref_length = len(ref_seq)
reg_stop = ref_length
- amplog.info('analysing region from %d to %d' % (reg_start, reg_stop))
+ logging.info('analysing region from %d to %d', reg_start, reg_stop)
# output the reads, aligned to the amplicon
- b2w_exe = os.path.join(dn, 'b2w')
+
b2w_args = ' -i 0 -w %d -m %d -x %d %s %s %s' % \
(ref_length, int(min_overlap * ref_length),
max_coverage, in_bam, in_fasta, region)
- ret_b2w = run_child(b2w_exe, b2w_args)
- amplog.debug('b2w returned %d' % ret_b2w)
+ ret_b2w = run_child(shlex.quote(b2w_exe), b2w_args)
+ logging.debug('b2w returned %d', ret_b2w)
# run diri_sampler on the aligned reads
win_file = 'w-%s-%d-%d.reads.fas' % (ref_name, reg_start, reg_stop)
h = list(open('coverage.txt'))[0]
n_reads = int(h.split()[-1])
assert os.path.exists(win_file), 'window file %s not found' % win_file
- diri_exe = os.path.join(dn, 'diri_sampler')
+
iterations = min(30000, n_reads * 20)
diri_args = '-i %s -j %d -a %f -t 2000' % (win_file, iterations, alpha)
- ret_diri = run_child(diri_exe, diri_args)
- amplog.debug('diri_sampler returned %d' % ret_diri)
+ ret_diri = run_child(shlex.quote(diri_exe), diri_args)
+ logging.debug('diri_sampler returned %d', ret_diri)
# diagnostics on the convergence
run_diagnostics(win_file, n_reads)
# run snv.py to parse single nucleotide variants
- shorah_snv.main(reference=in_fasta, bam_file=in_bam, sigma=s, increment=1)
-
-
-if __name__ == "__main__":
-
- import argparse
- # parse command line
- parser = argparse.ArgumentParser(description="Local haplotype reconstruction - amplicon mode",
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- opts = main.__defaults__ # set the defaults (see http://bit.ly/2hCTQl)
-
- # First define all option groups
- group1 = parser.add_argument_group("Input files", "Required input")
-
- group2 = parser.add_argument_group("Type of run",
- "You can specify a region, or look for the highest diversity\
- region")
- group3 = parser.add_argument_group("Run options", "Fine tuning")
- group4 = parser.add_argument_group("More options",
- "Do you really want to change this?")
-
- group1.add_argument("-b", "--bam", metavar='BAM', required=True, type=str,
- dest="in_bam", help="file with aligned reads in .bam format")
-
- group1.add_argument("-f", "--fasta", metavar='REF', required=True, type=str,
- dest="in_fasta", help="reference genome in fasta format")
-
- group2.add_argument("-r", "--region", metavar='chrm:start-stop', default=opts[4], type=str,
- dest="region", help="region in format 'chrm:start-stop' e.g.\
- 'ch3:1000-1300'")
-
- group2.add_argument("-d", "--diversity", action="store_true", default=opts[5],
- dest="diversity", help="if set, automatically detects the highest entropy\
- region and runs there")
-
- group3.add_argument("-m", "--min_overlap", metavar='FLOAT', default=opts[0], type=float,
- dest="min_overlap", help="fraction of read overlap to be included")
-
- group3.add_argument("-a", "--alpha", metavar='FLOAT', default=opts[2], type=float,
- dest="alpha", help="alpha in dpm sampling")
-
- group4.add_argument("-x", "--maxcov", metavar='INT', default=opts[1], type=int,
- dest="max_coverage", help="approximate max coverage allowed")
-
- group4.add_argument("-s", "--sigma", metavar='FLOAT', default=opts[3], type=float,
- dest="s", help="sigma value to use when calling SNVs")
-
- args = parser.parse_args()
-
- supported_formats = {
- 'bam': 'aligned reads',
- 'fasta': 'reference genome'
- }
- # check the input file is in supported format
- try:
- tmp_filename = os.path.split(args.in_bam)[1]
- [in_stem, in_format] = [tmp_filename.split('.')[0],
- tmp_filename.split('.')[-1]]
- t = supported_formats[in_format]
- except IndexError:
- print('The input file must be filestem.format')
- print('Supported formats are')
- for sf in supported_formats.items():
- print(sf[0], ':', sf[1])
- sys.exit()
- except KeyError:
- print('usage: amplian.py -b bam_file -f fasta_reference')
- sys.exit('Please run with -h for all options')
- if args.diversity and args.region != '':
- sys.exit('Either detect the highest entropy region, or specify one')
-
- main(args.in_bam, args.in_fasta, args.min_overlap, args.max_coverage, args.alpha,
- args.s, args.region, args.diversity)
+ shorah_snv.main(reference=in_fasta, bam_file=in_bam, sigma=sigma, increment=1)
diff --git a/src/shorah/bin/.gitignore b/src/shorah/bin/.gitignore
new file mode 100644
index 0000000..e69de29
diff --git a/src/shorah/cli.py b/src/shorah/cli.py
new file mode 100644
index 0000000..f2ab9a4
--- /dev/null
+++ b/src/shorah/cli.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+
+# Copyright 2007-2018
+# Niko Beerenwinkel,
+# Nicholas Eriksson,
+# Moritz Gerstung,
+# Lukas Geyrhofer,
+# Kerensa McElroy,
+# Osvaldo Zagordi,
+# ETH Zurich
+
+# This file is part of ShoRAH.
+# ShoRAH is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# ShoRAH is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with ShoRAH. If not, see .
+"""
+Module that contains the command line app.
+
+Why does this file exist, and why not put this in __main__?
+
+ You might be tempted to import things from __main__ later, but that will cause
+ problems: the code will get executed twice:
+
+ - When you run `python -mminvar` python will execute
+ ``__main__.py`` as a script. That means there won't be any
+ ``minvar.__main__`` in ``sys.modules``.
+ - When you import __main__ it will get executed again (as a module) because
+ there's no ``minvar.__main__`` in ``sys.modules``.
+
+ Also see (1) from http://click.pocoo.org/5/setuptools/#setuptools-integration
+"""
+import os
+import sys
+
+from pkg_resources import (get_distribution, DistributionNotFound)
+
+try:
+ __version__ = get_distribution('shorah').version
+except DistributionNotFound:
+ # probably installed using Autotools
+ with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), '.version'), 'r') as version_file:
+ __version__ = version_file.read()
+
+# manipulate path to import functions
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+if __name__ == '__main__':
+ if __package__ is None:
+ os.sys.path.insert(1, parent_dir)
+ mod = __import__('shorah')
+ sys.modules["shorah"] = mod
+ #from common import hcv_map, hiv_map, org_dict, wobbles
+ #from stats import (genome_coverage, start_stop_coverage)
+#else:
+ #from .common import hcv_map, hiv_map, org_dict, wobbles
+ #from .stats import (genome_coverage, start_stop_coverage)
+
+# each subcommand takes one of these functions as default
+
+def shotgun_run(args):
+ """Default function for command line parser."""
+ from shorah import shotgun
+ shotgun.main(args)
+
+def amplicon_run(args):
+ """Default function for command line parser."""
+ from shorah import amplicon
+ amplicon.main(args)
+
+def main():
+ """Parse command line, run default functions."""
+ import argparse
+ # parse command line
+ # create the top-level parser
+ parser = argparse.ArgumentParser(usage='%(prog)s [options]',
+ epilog="Run `shorah subcommand -h` for more help")
+ parser.add_argument('-v', '--version', action='version', version=__version__)
+
+ parser.add_argument("-b", "--bam", metavar='BAM', required=True, type=str,
+ dest="b", help="sorted bam format alignment file")
+
+ parser.add_argument("-f", "--fasta", metavar='REF', required=True, type=str,
+ dest="f", help="reference genome in fasta format")
+
+ parser.add_argument("-a", "--alpha", metavar='FLOAT', required=False, type=float,
+ dest="a", default=0.1, help="alpha in dpm sampling")
+
+ parser.add_argument("-r", "--region", metavar='chrm:start-stop', required=False, type=str,
+ dest="r", default='', help="region in format 'chr:start-stop', e.g. 'chrm:1000-3000'")
+
+ parser.add_argument("-R", "--seed", metavar='INT', required=False, type=int,
+ dest="seed", default=None, help="set seed for reproducible results")
+
+ parser.add_argument("-x", "--maxcov", metavar='INT', required=False, type=int,
+ default=10000, dest="max_coverage", help="approximate max coverage allowed")
+
+ parser.add_argument("-S", "--sigma", metavar='FLOAT', default=0.01, type=float,
+ dest="sigma", help="sigma value to use when calling SNVs")
+
+ subparsers = parser.add_subparsers(help='available sub-commands')
+
+ # create the parser for command "shotgun"
+ parser_shotgun = subparsers.add_parser('shotgun', help='run local analysis in shotgun mode')
+
+ parser_shotgun.add_argument("-w", "--windowsize", metavar='INT', required=False, type=int,
+ dest="w", default=201, help="window size")
+
+ parser_shotgun.add_argument("-s", "--winshifts", metavar='INT', required=False, type=int,
+ default=3, dest="win_shifts", help="number of window shifts")
+
+ parser_shotgun.add_argument("-k", "--keep_files", required=False, action='store_true',
+ default=True, dest="keep_files", help="keep all intermediate files")
+
+ parser_shotgun.set_defaults(func=shotgun_run)
+
+ parser_amplicon = subparsers.add_parser('amplicon', help='run local analysis in amplicon mode')
+
+ parser_amplicon.add_argument("-d", "--diversity", action="store_true", default=False,
+ dest="diversity", help="detect the highest entropy region and run there")
+
+ parser_amplicon.add_argument("-m", "--min_overlap", metavar='FLOAT', default=0.95, type=float,
+ dest="min_overlap", help="fraction of read overlap to be included")
+
+ parser_amplicon.set_defaults(func=amplicon_run)
+
+ # exit so that log file is not written
+ if len(sys.argv) == 1 or sys.argv[1] == '-h' or sys.argv[1] == '--help':
+ parser.print_help()
+ sys.exit()
+
+ # logging configuration
+ import logging
+ import logging.handlers
+ logging.basicConfig(filename='shorah.log', level=logging.DEBUG,
+ format='%(levelname)s %(asctime)s %(filename)s: %(funcName)s() %(lineno)d: \t%(message)s',
+ datefmt='%Y/%m/%d %H:%M:%S')
+
+ logging.info(' '.join(sys.argv))
+ logging.info('shorah version:%s', __version__)
+ # parse the args
+ args = parser.parse_args()
+ args.func(args)
+
+if __name__ == "__main__": # and __package__ is None:
+ main()
diff --git a/src/shorah/contain.cpp b/src/shorah/contain.cpp
deleted file mode 100644
index 219d0f8..0000000
--- a/src/shorah/contain.cpp
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * contain.cpp -- eliminates redundant reads
- *
- * usage: contain -f basename
- * input: basename.read
- * output: basename.rest
- * both in the format "start_pos read_string"
- *
- # Copyright 2007-2012
- # Niko Beerenwinkel,
- # Nicholas Eriksson,
- # Moritz Gerstung,
- # Lukas Geyrhofer,
- # Kerensa McElroy
- # Osvaldo Zagordi,
- # ETH Zurich
-
- # This file is part of ShoRAH.
- # ShoRAH is free software: you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
-
- # ShoRAH is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
-
- # You should have received a copy of the GNU General Public License
- # along with ShoRAH. If not, see .
-*/
-
-#include
-#include
-#include
-#include
-#include
-#include