Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AAHash #88

Merged
merged 38 commits into from
May 1, 2023
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
5eb33f0
Add aaHash code
parham-k Apr 7, 2023
346324e
Add tests
parham-k Apr 7, 2023
9ef6ae8
Fix copy constructor
parham-k Apr 9, 2023
7f093c4
Update wrappers
parham-k Apr 9, 2023
f4e7602
aahash: enable different levels of hash
jwcodee Apr 9, 2023
e3daed2
Separate low-level functions from aaHash class
parham-k Apr 10, 2023
099c986
Fix wrapper issues
parham-k Apr 10, 2023
b0f5ed5
update aaHash and make seq_reader and seq_writer compatible with amin…
Apr 11, 2023
1d672d8
tests: expand aaHash level tests
Apr 11, 2023
271337b
aaHash: update wrappers
Apr 11, 2023
6a9387d
aaHash: fix errors
Apr 11, 2023
704fdd3
add seedAAHash
Apr 12, 2023
f15fd8d
add AAHash and seedAAHash unit tests
Apr 12, 2023
c29f3ea
update wrappers
Apr 12, 2023
0699b4c
update docs
Apr 12, 2023
9b89c09
renamed seedAAHash to SeedAAHash
Apr 15, 2023
eebb2ae
wrappers: regenerated with swig 4.1.1
Apr 15, 2023
310fa2a
SeedAAHash: verify seeds only contains 0, 1, 2, or 3
Apr 15, 2023
61da14d
SeedAAHash: verify seeds are the same length as k
Apr 15, 2023
4a60d42
SeedAAHash: add more seed documentation
Apr 15, 2023
8ff6e08
tests: add non length 1 seq equivalence tests
Apr 16, 2023
8496712
tests: add aahash invariance tests
Apr 19, 2023
584d229
aahash: refactor static const to constexpr inline
Apr 26, 2023
d277d46
aahash: regenerate wrappers
Apr 26, 2023
ed8b5c6
meson.build: use c++17
Apr 26, 2023
2ffaf09
NOLINT(modernize-make-unique) unique_ptr functions
Apr 26, 2023
3611f1d
aahash_consts.hpp: localize clang-format off
Apr 26, 2023
851767d
fixed AAHash python and added python wrapper tests
Apr 28, 2023
e8deb5e
fix SeedAAHash
May 1, 2023
7bfd3ce
add SeedAAHash tests
May 1, 2023
6f6d569
remove include for bugfixing
May 1, 2023
c4fe101
add back python wrapper fix
May 1, 2023
8299659
regenerated docs
May 1, 2023
c5df381
Resolved merge conflict in wrapper files
May 1, 2023
83647a9
remove get_forward_hash
May 1, 2023
80ede90
expand python tests
May 1, 2023
8c92ee9
use std::string_view to fix dangling pointers issue with python wrappers
May 1, 2023
a18251e
regenerate docs
May 1, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
252 changes: 252 additions & 0 deletions include/btllib/aahash.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
#ifndef BTLLIB_AAHASH_HPP
#define BTLLIB_AAHASH_HPP

#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <string>
#include <vector>

#include "aahash_consts.hpp"
#include "nthash_lowlevel.hpp"

namespace btllib {

inline uint64_t
aahash_base(const char* kmer_seq, unsigned k, unsigned level = 1)
{
uint64_t hash_value = 0;
for (unsigned i = 0; i < k; i++) {
hash_value = srol(hash_value);
hash_value ^= LEVEL_X_AA_SEED_TABLE[level][(unsigned char)kmer_seq[i]];
}
return hash_value;
}

inline uint64_t
aahash_roll(uint64_t hash_value,
unsigned k,
unsigned char char_out,
unsigned char char_in,
unsigned level = 1)
{
hash_value = srol(hash_value);
hash_value ^= LEVEL_X_AA_SEED_TABLE[level][char_in];
hash_value ^= AA_ROLL_TABLE(char_out, level, k);
return hash_value;
}

inline uint64_t
aa_modify_base_with_seed(uint64_t hash_value,
const SpacedSeed& seed,
const char* kmer_seq,
const unsigned k)
{
for (unsigned i = 0; i < k; i++) {
if (seed[i] != 1) {
hash_value ^=
(LEVEL_X_AA_SEED_LEFT_31BITS_ROLL_TABLE[1][(unsigned char)kmer_seq[i]]
[(k - 1 - i) % 31] | // NOLINT
LEVEL_X_AA_SEED_RIGHT_33BITS_ROLL_TABLE[1][(unsigned char)kmer_seq[i]]
[(k - 1 - i) % 33]); // NOLINT
if (seed[i] != 0) {
unsigned level = seed[i];
hash_value ^=
(LEVEL_X_AA_SEED_LEFT_31BITS_ROLL_TABLE[level][(
unsigned char)kmer_seq[i]][(k - 1 - i) % 31] | // NOLINT
LEVEL_X_AA_SEED_RIGHT_33BITS_ROLL_TABLE[level][(
unsigned char)kmer_seq[i]][(k - 1 - i) % 33]); // NOLINT
}
}
}
return hash_value;
}

std::vector<SpacedSeed>
aa_parse_seeds(const std::vector<std::string>& seeds);
class seedAAHash;

class AAHash
{
static constexpr const char* HASH_FN_NAME = "aahash1";

private:
friend class seedAAHash;

/** Initialize internal state of iterator */
bool init();

const char* seq;
size_t seq_len;
const uint8_t hash_num;
const uint16_t k;
unsigned level;

size_t pos;
bool initialized = false;
std::unique_ptr<uint64_t[]> hashes_array;

public:
/**
* Constructor.
* @param seq String of DNA sequence to be hashed.
* @param hash_num Number of hashes to produce per k-mer.
* @param k K-mer size.
* @param pos Position in seq to start hashing from.
* @param level seed level to generate hash.
*/
AAHash(const std::string& seq,
uint8_t hash_num,
uint16_t k,
unsigned level,
size_t pos = 0)
: seq(seq.data())
, seq_len(seq.size())
, hash_num(hash_num)
, k(k)
, level(level)
, pos(pos)
, hashes_array(new uint64_t[hash_num])
{
}

AAHash(const AAHash& aahash)
: seq(aahash.seq)
, seq_len(aahash.seq_len)
, hash_num(aahash.hash_num)
, k(aahash.k)
, level(aahash.level)
, pos(aahash.pos)
, initialized(aahash.initialized)
, hashes_array(new uint64_t[hash_num])
{
std::memcpy(hashes_array.get(),
aahash.hashes_array.get(),
hash_num * sizeof(uint64_t));
}

AAHash(AAHash&&) = default;

/**
* Calculate the hash values of current k-mer and advance to the next.
* AAHash advances one amino acid at a time until it finds a k-mer with valid
* characters and skips over those with invalid characters.
* This method must be called before hashes() is accessed, for
* the first and every subsequent hashed kmer. get_pos() may be called at any
* time to obtain the position of last hashed k-mer or the k-mer to be hashed
* if roll() has never been called on this AAHash object. It is important to
* note that the number of roll() calls is NOT necessarily equal to get_pos(),
* if there are N or invalid characters in the hashed sequence.
*
* @return true on success and false otherwise.
*/
bool roll();

const uint64_t* hashes() const { return hashes_array.get(); }
size_t get_pos() const { return pos; }
unsigned get_hash_num() const { return hash_num; }
unsigned get_k() const { return k; }
uint64_t get_forward_hash() const { return hashes_array[0]; }
};

class seedAAHash
jwcodee marked this conversation as resolved.
Show resolved Hide resolved
{
private:
AAHash aahash;
const unsigned hash_num_per_seed;
std::unique_ptr<uint64_t[]> hashes_array;
std::vector<SpacedSeed> seeds;

public:
/**
* Constructor.
* @param seq String of DNA sequence to be hashed.
* @param seeds Multi-level seeds to use for hashing.
jwcodee marked this conversation as resolved.
Show resolved Hide resolved
* @param hash_num_per_seed Number of hashes to produce per seed.
* @param k K-mer size.
* @param pos Position in seq to start hashing from.
*/
seedAAHash(const char* seq,
const std::vector<SpacedSeed>& seeds,
unsigned hash_num_per_seed,
unsigned k,
size_t pos = 0)
: aahash(seq, 1, k, 1, pos)
, hash_num_per_seed(hash_num_per_seed)
, hashes_array(new uint64_t[hash_num_per_seed * seeds.size()])
, seeds(seeds)
{
}
seedAAHash(const std::string& seq,
const std::vector<SpacedSeed>& seeds,
unsigned hash_num_per_seed,
unsigned k,
size_t pos = 0)
: aahash(seq, 1, k, 1, pos)
, hash_num_per_seed(hash_num_per_seed)
, hashes_array(new uint64_t[hash_num_per_seed * seeds.size()])
, seeds(seeds)
{
}
seedAAHash(const char* seq,
const std::vector<std::string>& seeds,
unsigned hash_num_per_seed,
unsigned k,
size_t pos = 0)
: aahash(seq, 1, k, 1, pos)
, hash_num_per_seed(hash_num_per_seed)
, hashes_array(new uint64_t[hash_num_per_seed * seeds.size()])
, seeds(aa_parse_seeds(seeds))
{
}
seedAAHash(const std::string& seq,
const std::vector<std::string>& seeds,
unsigned hash_num_per_seed,
unsigned k,
size_t pos = 0)
: aahash(seq, 1, k, 1, pos)
, hash_num_per_seed(hash_num_per_seed)
, hashes_array(new uint64_t[hash_num_per_seed * seeds.size()])
, seeds(aa_parse_seeds(seeds))
{
}

seedAAHash(const seedAAHash& seed_aahash)
: aahash(seed_aahash.aahash)
, hash_num_per_seed(seed_aahash.hash_num_per_seed)
, hashes_array(new uint64_t[hash_num_per_seed * seed_aahash.seeds.size()])
, seeds(seed_aahash.seeds)
{
std::memcpy(hashes_array.get(),
seed_aahash.hashes_array.get(),
hash_num_per_seed * seeds.size() * sizeof(uint64_t));
}
seedAAHash(seedAAHash&&) = default;

/**
* Calculate the hash values of current k-mer based on the seed(s) and advance
* to the next. seedAAHash advances one amino acid at a time until it finds a
* k-mer with valid characters and skips over those with invalid characters.
* This method must be called before hashes() is accessed, for
* the first and every subsequent hashed kmer. get_pos() may be called at any
* time to obtain the position of last hashed k-mer or the k-mer to be hashed
* if roll() has never been called on this AAHash object. It is important to
* note that the number of roll() calls is NOT necessarily equal to get_pos(),
* if there are N or invalid characters in the hashed sequence.
*
* @return true on success and false otherwise.
*/
bool roll();

const uint64_t* hashes() const { return hashes_array.get(); }

size_t get_pos() const { return aahash.get_pos(); }
unsigned get_hash_num() const { return aahash.get_hash_num(); }
unsigned get_hash_num_per_seed() const { return hash_num_per_seed; }
unsigned get_k() const { return aahash.get_k(); }
};

} // namespace btllib

#endif