Skip to content

Commit

Permalink
BayesTyper (v1.5)
Browse files Browse the repository at this point in the history
  • Loading branch information
jonassibbesen committed Apr 1, 2019
1 parent 62888d6 commit 6f2c7cf
Show file tree
Hide file tree
Showing 20 changed files with 402 additions and 243 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
Expand Up @@ -26,13 +26,13 @@ SET(BUILD_STATIC 0 CACHE BOOL "Build static")

if(${BUILD_STATIC} EQUAL 0)

SET(CMAKE_CXX_FLAGS "--std=c++11 -lpthread -g -O3 -DBT_KMER_SIZE='${KMER_SIZE}' -DBT_VERSION='\"v1.5 beta ${GIT_LAST_COMMIT_HASH}\"'")
SET(CMAKE_CXX_FLAGS "--std=c++11 -lpthread -g -O3 -DBT_KMER_SIZE='${KMER_SIZE}' -DBT_VERSION='\"v1.5 ${GIT_LAST_COMMIT_HASH}\"'")

else(${BUILD_STATIC} EQUAL 0)

message(STATUS "Building BayesTyper static")

SET(CMAKE_CXX_FLAGS "--std=c++11 -pthread -O3 -static -static-libgcc -static-libstdc++ -DBT_KMER_SIZE='${KMER_SIZE}' -DBT_VERSION='\"v1.5 beta\"'")
SET(CMAKE_CXX_FLAGS "--std=c++11 -pthread -O3 -static -static-libgcc -static-libstdc++ -DBT_KMER_SIZE='${KMER_SIZE}' -DBT_VERSION='\"v1.5\"'")
SET(CMAKE_EXE_LINKER_FLAGS "-Wl,--whole-archive -lpthread -Wl,--no-whole-archive")

SET(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
Expand Down
2 changes: 1 addition & 1 deletion include/bayesTyper/GenotypeWriter.hpp
Expand Up @@ -62,7 +62,7 @@ class GenotypeWriter {
GenotypeWriter(const string &, const ushort, const vector<Sample> &, const Chromosomes &, const Filters &);

void addGenotypes(vector<Genotypes *> *);
void finalise(const string &, const Chromosomes &, const string &, const OptionsContainer &, const Filters &);
uint finalise(const string &, const Chromosomes &, const string &, const OptionsContainer &, const Filters &);

private:

Expand Down
16 changes: 9 additions & 7 deletions include/bayesTyper/InferenceEngine.hpp
Expand Up @@ -58,8 +58,10 @@ class InferenceEngine {

InferenceEngine(const vector<Sample> &, const ChromosomePloidy &, const OptionsContainer &);

void estimateNoiseParameters(CountDistribution *, InferenceUnit *, KmerCountsHash *, const string &);
void genotypeVariantClusterGroups(InferenceUnit *, KmerCountsHash *, const CountDistribution &, const Filters & filters, GenotypeWriter *);
void estimateNoise(CountDistribution *, InferenceUnit *, KmerCountsHash *, const string &);
void estimateGenotypes(InferenceUnit *, KmerCountsHash *, const CountDistribution &, const Filters & filters, GenotypeWriter *);
void estimateNoiseAndGenotypes(InferenceUnit *, CountDistribution *, KmerCountsHash *, const Filters &, GenotypeWriter *, const string &);


private:

Expand All @@ -74,7 +76,6 @@ class InferenceEngine {
const ushort num_gibbs_chains;
const float kmer_subsampling_rate;
const uint max_haplotype_variant_kmers;
const bool min_cover_haplotype_init;

struct VariantClusterGroupBatch {

Expand All @@ -88,11 +89,12 @@ class InferenceEngine {
VariantClusterGroupBatch(const uint first_variant_cluster_group_idx_in, const uint num_variants_in, const vector<VariantClusterGroup*>::iterator start_it_in, const vector<VariantClusterGroup*>::iterator end_it_in) : first_variant_cluster_group_idx(first_variant_cluster_group_idx_in), num_variants(num_variants_in), start_it(start_it_in), end_it(end_it_in) {}
};

void initNoiseEstimationGroupsCallback(vector<VariantClusterGroup *> *, const vector<uint> &, KmerCountsHash *, const ushort, const ushort, const uint);
void sampleNoiseCountsCallback(vector<VariantClusterGroup *> *, const vector<uint> &, CountAllocation *, mutex *, const CountDistribution &, const ushort, const uint);
void resetNoiseEstimationGroupsCallback(vector<VariantClusterGroup *> *, const vector<uint> &, const ushort, const uint);
void initGenotypersCallback(vector<VariantClusterGroup *> *, const vector<uint> &, const uint, KmerCountsHash *, const ushort, const ushort);
void sampleGenotypesCallback(vector<VariantClusterGroup *> *, const vector<uint> &, const uint, const CountDistribution &, const bool, CountAllocation *, mutex *, const ushort);
void resetGroupsCallback(vector<VariantClusterGroup *> *, const vector<uint> &, const uint, const ushort);
void collectGenotypesCallback(vector<VariantClusterGroup *> *, const vector<uint> &, const uint, const Filters & filters, GenotypeWriter *, const ushort);

void genotypeVariantClusterGroupsCallback(ProducerConsumerQueue<VariantClusterGroupBatch> *, KmerCountsHash *, const CountDistribution &, const Filters & filters, GenotypeWriter *, uint *, mutex *);
void estimateGenotypesCallback(ProducerConsumerQueue<VariantClusterGroupBatch> *, KmerCountsHash *, const CountDistribution &, const Filters & filters, GenotypeWriter *, uint *, mutex *);
};

#endif
8 changes: 4 additions & 4 deletions include/bayesTyper/KmerCounter.hpp
Expand Up @@ -57,8 +57,8 @@ class KmerCounter {
KmerCounter(const vector<Sample> &, const OptionsContainer &);

void findVariantClusterPaths(InferenceUnit *, const ushort);
void countPathMultigroupKmers(BooleanKmerHash *, ThreadedKmerBloom<Utils::kmer_size> *, InferenceUnit *);
void countInterclusterParameterKmers(BooleanKmerHash *, const vector<VariantFileParser::InterClusterRegion> &, const Chromosomes &, const ThreadedKmerBloom<Utils::kmer_size> &, const float);
void countPathMultigroupKmers(KmerHash<bool> *, ThreadedKmerBloom<Utils::kmer_size> *, InferenceUnit *);
void countInterclusterParameterKmers(KmerHash<bool> *, const vector<VariantFileParser::InterClusterRegion> &, const Chromosomes &, const ThreadedKmerBloom<Utils::kmer_size> &, const float);

void countPathKmers(ThreadedKmerBloom<Utils::kmer_size> *, InferenceUnit *);
void countInterclusterKmers(KmerCountsHash *, ThreadedKmerBloom<Utils::kmer_size> *, const string &, const Chromosomes &, const ChromosomePloidy &);
Expand All @@ -85,8 +85,8 @@ class KmerCounter {
};

void findVariantClusterPathsCallback(vector<VariantClusterGroup *> *, KmerBloom<Utils::kmer_size> *, const ushort, const ushort, const ushort);
void countPathMultigroupKmersCallback(BooleanKmerHash *, ThreadedKmerBloom<Utils::kmer_size> *, vector<VariantClusterGroup *> *, mutex *, ulong *, const ushort);
void countInterclusterParameterKmersCallback(BooleanKmerHash *, const vector<VariantFileParser::InterClusterRegion> &, const Chromosomes &, const ThreadedKmerBloom<Utils::kmer_size> &, const float, const ushort);
void countPathMultigroupKmersCallback(KmerHash<bool> *, ThreadedKmerBloom<Utils::kmer_size> *, vector<VariantClusterGroup *> *, mutex *, ulong *, const ushort);
void countInterclusterParameterKmersCallback(KmerHash<bool> *, const vector<VariantFileParser::InterClusterRegion> &, const Chromosomes &, const ThreadedKmerBloom<Utils::kmer_size> &, const float, const ushort);

void countPathKmersCallback(ThreadedKmerBloom<Utils::kmer_size> *, vector<VariantClusterGroup *> *, const ushort);
void countInterclusterKmersCallback(KmerCountsHash *, ThreadedKmerBloom<Utils::kmer_size> *, const vector<VariantFileParser::InterClusterRegion> &, const Chromosomes &, const ChromosomePloidy &, const ushort);
Expand Down
17 changes: 9 additions & 8 deletions include/bayesTyper/KmerHash.hpp
Expand Up @@ -46,27 +46,28 @@ THE SOFTWARE.

using namespace std;

class BooleanKmerHash {
template<class T>
class KmerHash {

private:

ThreadedHybridHash<bool, Utils::kmer_size * 2> * _hash;
ThreadedHybridHash<T, Utils::kmer_size * 2> * _hash;

public:

BooleanKmerHash(const ulong, const ushort);
~BooleanKmerHash();
KmerHash(const ulong, const ushort);
~KmerHash();

unique_lock<mutex> getKmerLock(const bitset<Utils::kmer_size * 2> &);
pair<bool *, bool> addKmer(const bitset<Utils::kmer_size * 2> &);
bool * findKmer(const bitset<Utils::kmer_size * 2> &);
pair<T *, bool> addKmer(const bitset<Utils::kmer_size * 2> &);
T * findKmer(const bitset<Utils::kmer_size * 2> &);

void shuffle(const uint);
ulong size();

void writeRootSizeDistribution(const string &);
ulong writeKmersToFasta(const string &, const bool, const uint);
ulong addKmersToBloomFilter(KmerBloom<Utils::kmer_size> *, const bool);
ulong writeKmersToFasta(const string &, bool (*)(T), const uint);
ulong addKmersToBloomFilter(KmerBloom<Utils::kmer_size> *, bool (*)(T));
};

class KmerCountsHash {
Expand Down
6 changes: 2 additions & 4 deletions include/bayesTyper/VariantClusterGenotyper.hpp
Expand Up @@ -69,7 +69,8 @@ class VariantClusterGenotyper {
VariantClusterGenotyper(VariantClusterGraph *, KmerCountsHash *, const vector<Sample> &, const uint, const uchar);
~VariantClusterGenotyper();

void reset(const float, const uint, const bool);
void reset(const float, const uint);
void clearCache();

void sampleDiplotypes(const CountDistribution &, const vector<VariantClusterHaplotypes::NestedVariantClusterInfo> &, const bool);
void getNoiseCounts(CountAllocation *, const CountDistribution &);
Expand Down Expand Up @@ -101,9 +102,6 @@ class VariantClusterGenotyper {

VariantClusterHaplotypes variant_cluster_haplotypes;

SparsityEstimator sparsity_estimator;
Utils::RowVectorXbool non_zero_kmer_counts;

vector<VariantInfo> variant_cluster_info;
vector<vector<AlleleKmerStats> > allele_kmer_stats;

Expand Down
5 changes: 2 additions & 3 deletions include/bayesTyper/VariantClusterGroup.hpp
Expand Up @@ -122,9 +122,8 @@ class VariantClusterGroup {

bool hasExcludedKmers() const;

void initGenotyper(KmerCountsHash *, const vector<Sample> &, const uint, const uchar, const float, const uint, const bool);

void resetGenotyper(const float, const uint, const bool);
void initGenotyper(KmerCountsHash *, const vector<Sample> &, const uint, const uchar, const float, const uint);
void clearGenotyperCache();
void resetGroup();

void shuffleBranchOrdering(const uint);
Expand Down
2 changes: 1 addition & 1 deletion include/bayesTyperTools/ConvertAllele.hpp
Expand Up @@ -38,7 +38,7 @@ THE SOFTWARE.

namespace ConvertAllele {

void convertAllele(const string &, const string &, const string &, const string &, const bool);
void convertAllele(const string &, const string &, const string &, const string &, const string &, const bool, const bool);
}

#endif
64 changes: 31 additions & 33 deletions src/bayesTyper/CountDistribution.cpp
Expand Up @@ -40,10 +40,11 @@ THE SOFTWARE.
#include "Combinator.hpp"


static const uint min_nb_kmer_count = 1000;
static const uint max_nb_kmer_multiplicity = 32;
static const uint min_nb_kmer_count = 10000;

static const uchar num_genomic_rate_gc_bias_bins = 1;
static const uchar max_multiplicity = Utils::uchar_overflow;
static const uchar max_kmer_multiplicity = Utils::uchar_overflow;
static const uchar max_kmer_count = Utils::uchar_overflow;


Expand All @@ -54,15 +55,15 @@ CountDistribution::CountDistribution(const vector<Sample> & samples_in, const Op
genomic_count_distributions = vector<vector<NegativeBinomialDistribution> >(samples.size(), vector<NegativeBinomialDistribution>(num_genomic_rate_gc_bias_bins));

noise_rates = vector<double>(samples.size(), 0);
genomic_count_log_pmf_cache = vector<vector<vector<vector<double> > > >(samples.size(), vector<vector<vector<double> > >(num_genomic_rate_gc_bias_bins, vector<vector<double> >(max_multiplicity + 1, vector<double>(max_kmer_count + 1))));
genomic_count_log_pmf_cache = vector<vector<vector<vector<double> > > >(samples.size(), vector<vector<vector<double> > >(num_genomic_rate_gc_bias_bins, vector<vector<double> >(max_kmer_multiplicity + 1, vector<double>(max_kmer_count + 1))));
noise_count_log_pmf_cache = vector<vector<double> >(samples.size(), vector<double>(max_kmer_count + 1));

resetNoiseRates();
updateGenomicCache();
}


void CountDistribution::setGenomicCountDistributions(const vector<vector<vector<KmerStats> > > & intercluster_diploid_kmer_stats, const string & output_prefix) {
void CountDistribution::setGenomicCountDistributions(const vector<vector<vector<KmerStats> > > & intercluster_kmer_stats, const string & output_prefix) {

cout << "[" << Utils::getLocalTime() << "] Estimating genomic haploid kmer count distribution(s) from parameter kmers ...\n" << endl;

Expand All @@ -76,61 +77,58 @@ void CountDistribution::setGenomicCountDistributions(const vector<vector<vector<

genomic_outfile << "Sample\tMean\tVariance" << endl;

assert(intercluster_diploid_kmer_stats.size() == samples.size());
assert(intercluster_diploid_kmer_stats.size() == genomic_count_distributions.size());
assert(intercluster_kmer_stats.size() == samples.size());
assert(intercluster_kmer_stats.size() == genomic_count_distributions.size());

for (ushort sample_idx = 0; sample_idx < samples.size(); sample_idx++) {
assert(num_genomic_rate_gc_bias_bins == 1);
assert(max_nb_kmer_multiplicity <= Utils::uchar_overflow);

assert(num_genomic_rate_gc_bias_bins == 1);
for (ushort sample_idx = 0; sample_idx < samples.size(); sample_idx++) {

assert(intercluster_diploid_kmer_stats.at(sample_idx).size() == num_genomic_rate_gc_bias_bins);
assert(intercluster_diploid_kmer_stats.at(sample_idx).size() == genomic_count_distributions.at(sample_idx).size());
assert(intercluster_kmer_stats.at(sample_idx).size() == num_genomic_rate_gc_bias_bins);
assert(intercluster_kmer_stats.at(sample_idx).size() == genomic_count_distributions.at(sample_idx).size());

for (ushort bias_idx = 0; bias_idx < num_genomic_rate_gc_bias_bins; bias_idx++) {

assert(intercluster_diploid_kmer_stats.at(sample_idx).at(bias_idx).size() == static_cast<uint>(Utils::Ploidy::PLOIDY_SIZE));
assert(intercluster_kmer_stats.at(sample_idx).at(bias_idx).size() == (Utils::uchar_overflow + 1));

uint max_kmer_count = 0;
ushort max_kmer_multiplicity = 0;
uint max_genomic_kmer_count = 0;
ushort max_genomic_kmer_multiplicity = 0;

for (ushort kmer_multiplicity = 1; kmer_multiplicity < static_cast<uint>(Utils::Ploidy::PLOIDY_SIZE); kmer_multiplicity++) {
for (ushort kmer_genomic_multiplicity = 1; kmer_genomic_multiplicity <= max_nb_kmer_multiplicity; kmer_genomic_multiplicity++) {

auto kmer_count = intercluster_diploid_kmer_stats.at(sample_idx).at(bias_idx).at(kmer_multiplicity).getCount();
auto kmer_genomic_count = intercluster_kmer_stats.at(sample_idx).at(bias_idx).at(kmer_genomic_multiplicity).getCount();

if (kmer_count > max_kmer_count) {
if (kmer_genomic_count > max_genomic_kmer_count) {

max_kmer_count = kmer_count;
max_kmer_multiplicity = kmer_multiplicity;
max_genomic_kmer_count = kmer_genomic_count;
max_genomic_kmer_multiplicity = kmer_genomic_multiplicity;
}
}

if (max_kmer_count < min_nb_kmer_count) {

cerr << "\nERROR: Insufficient number of kmers used for negative binomial parameters estimation for sample " << samples.at(sample_idx).name << " (" << max_kmer_count << " < " << min_nb_kmer_count << "); the genome used is likely too small and/or too repetitive\n" << endl;
exit(1);

} else if (max_kmer_count < (min_nb_kmer_count * 10)) {
if (max_genomic_kmer_count < min_nb_kmer_count) {

cout << "\nWARNING: Low number of kmers used for negative binomial parameters estimation for sample " << samples.at(sample_idx).name << " (" << max_kmer_count << " < " << min_nb_kmer_count * 10 << "); mean and variance estimate might be biased\n" << endl;
cout << "\nWARNING: Low number of kmers used for negative binomial parameters estimation for sample " << samples.at(sample_idx).name << " (" << max_genomic_kmer_count << " < " << min_nb_kmer_count << ")" << endl;
cout << "WARNING: The mean and variance estimates might be biased due to the genome used being too small, too variant dense and/or too repetitive\n" << endl;
}

assert(max_kmer_multiplicity > 0);
assert(max_genomic_kmer_multiplicity > 0);

auto kmer_mean = intercluster_diploid_kmer_stats.at(sample_idx).at(bias_idx).at(max_kmer_multiplicity).getMean();
auto kmer_mean = intercluster_kmer_stats.at(sample_idx).at(bias_idx).at(max_genomic_kmer_multiplicity).getMean();
assert(kmer_mean.second);

auto kmer_var = intercluster_diploid_kmer_stats.at(sample_idx).at(bias_idx).at(max_kmer_multiplicity).getVariance();
auto kmer_var = intercluster_kmer_stats.at(sample_idx).at(bias_idx).at(max_genomic_kmer_multiplicity).getVariance();
assert(kmer_var.second);

auto nb_para = NegativeBinomialDistribution::momentsToParameters(kmer_mean.first, kmer_var.first);
nb_para.second /= max_kmer_multiplicity;
nb_para.second /= max_genomic_kmer_multiplicity;

genomic_count_distributions.at(sample_idx).at(bias_idx).setParameters(nb_para);

auto nb_mean = genomic_count_distributions.at(sample_idx).at(bias_idx).mean();
auto nb_var = genomic_count_distributions.at(sample_idx).at(bias_idx).var();

cout << "[" << Utils::getLocalTime() << "] Estimated negative binomial (mean = " << nb_mean << ", var = " << nb_var << ") for sample " << samples.at(sample_idx).name << " using " << max_kmer_count << " parameter kmers (ploidy = " << max_kmer_multiplicity << ")" << endl;
cout << "[" << Utils::getLocalTime() << "] Estimated negative binomial (mean = " << nb_mean << ", var = " << nb_var << ") for sample " << samples.at(sample_idx).name << " using " << max_genomic_kmer_count << " parameter kmers (multiplicity = " << max_genomic_kmer_multiplicity << ")" << endl;

genomic_outfile << samples.at(sample_idx).name << "\t" << nb_mean << "\t" << nb_var << endl;
}
Expand All @@ -139,7 +137,7 @@ void CountDistribution::setGenomicCountDistributions(const vector<vector<vector<
genomic_outfile.close();
updateGenomicCache();

cout << "\n[" << Utils::getLocalTime() << "] Wrote parameters to " << output_prefix << ".txt" << endl;
cout << "\n[" << Utils::getLocalTime() << "] Wrote genomic parameters to " << output_prefix << ".txt" << endl;
}

const vector<vector<NegativeBinomialDistribution> > & CountDistribution::getGenomicCountDistributions() const {
Expand Down Expand Up @@ -224,9 +222,9 @@ void CountDistribution::updateGenomicCache() {

for (ushort bias_idx = 0; bias_idx < num_genomic_rate_gc_bias_bins; bias_idx++) {

assert(genomic_count_log_pmf_cache.at(sample_idx).at(bias_idx).size() == static_cast<uint>(max_multiplicity + 1));
assert(genomic_count_log_pmf_cache.at(sample_idx).at(bias_idx).size() == static_cast<uint>(max_kmer_multiplicity + 1));

for (ushort kmer_multiplicity = 0; kmer_multiplicity <= max_multiplicity; kmer_multiplicity++) {
for (ushort kmer_multiplicity = 0; kmer_multiplicity <= max_kmer_multiplicity; kmer_multiplicity++) {

assert(genomic_count_log_pmf_cache.at(sample_idx).at(bias_idx).at(kmer_multiplicity).size() == static_cast<uint>(max_kmer_count + 1));

Expand Down
10 changes: 6 additions & 4 deletions src/bayesTyper/GenotypeWriter.cpp
Expand Up @@ -349,7 +349,7 @@ void GenotypeWriter::addGenotypes(vector<Genotypes *> * variant_genotypes) {
genotypes_queue->push(variant_genotypes);
}

void GenotypeWriter::finalise(const string & output_prefix, const Chromosomes & chromosomes, const string & graph_options_header, const OptionsContainer & options_container, const Filters & filters) {
uint GenotypeWriter::finalise(const string & output_prefix, const Chromosomes & chromosomes, const string & graph_options_header, const OptionsContainer & options_container, const Filters & filters) {

genotypes_queue->pushedLast();

Expand All @@ -364,7 +364,7 @@ void GenotypeWriter::finalise(const string & output_prefix, const Chromosomes &
assert(!tmp_outfile.is_open());


cout << "[" << Utils::getLocalTime() << "] Sorting genotyped variants ..." << endl;
cout << "\n[" << Utils::getLocalTime() << "] Sorting genotyped variants ..." << endl;

assert(tmp_filename.substr(tmp_filename.size() - 3, 3) == ".gz");

Expand Down Expand Up @@ -453,7 +453,7 @@ void GenotypeWriter::finalise(const string & output_prefix, const Chromosomes &

variants_outfile_fstream << generateHeader(options_container.getValue<string>("genome-file"), chromosomes, graph_options_header, options_container.getHeader(), filters);

ulong num_genotyped_variants = 0;
uint num_genotyped_variants = 0;

auto chromosomes_it = chromosomes.cbegin();

Expand Down Expand Up @@ -486,7 +486,9 @@ void GenotypeWriter::finalise(const string & output_prefix, const Chromosomes &
variants_outfile_fstream.reset();
assert(!variants_outfile.is_open());

cout << "[" << Utils::getLocalTime() << "] Wrote " << num_genotyped_variants << " genotyped variants to " << genotype_filename << endl;
cout << "[" << Utils::getLocalTime() << "] Wrote genotyped variants to " << genotype_filename << endl;

return num_genotyped_variants;
}

string GenotypeWriter::generateHeader(const string & genome_filename, const Chromosomes & chromosomes, const string & graph_options_header, const string & genotype_options_header, const Filters & filters) {
Expand Down

0 comments on commit 6f2c7cf

Please sign in to comment.