This repository has been archived by the owner on Dec 16, 2022. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
12 changed files
with
403 additions
and
267 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
#include "genotype.h" | ||
#include "genotype_encoding.h" | ||
|
||
namespace gamgee { | ||
|
||
using namespace std; | ||
|
||
Genotype::Genotype(const std::shared_ptr<bcf1_t>& body, const bcf_fmt_t* const format_ptr, const uint8_t* data_ptr) : | ||
m_body {body}, | ||
m_format_ptr {format_ptr}, | ||
m_data_ptr {data_ptr} { | ||
} | ||
|
||
Genotype::Genotype(Genotype&& other) noexcept : | ||
m_body {move(other.m_body)}, | ||
m_format_ptr {other.m_format_ptr}, | ||
m_data_ptr {other.m_data_ptr} | ||
{} | ||
|
||
bool Genotype::is_het() const { | ||
return (allele(0) != allele(1)); | ||
} | ||
|
||
bool Genotype::is_hom_var() const { | ||
return (allele(0) != 0) && (allele(0) == allele(1)); | ||
} | ||
|
||
bool Genotype::is_hom_ref() const { | ||
return (allele(0) == 0 && allele(1) == 0); | ||
} | ||
|
||
GenotypeEncoding Genotype::value() const { | ||
return GenotypeEncoding{static_cast<uint32_t>(allele(0) << 16) | allele(1)}; | ||
} | ||
|
||
GenotypeEncodingValue Genotype::operator [](const uint32_t index) const { | ||
const auto const_this = const_cast<Genotype *>(this); | ||
const auto allele_int = const_this->allele(index); | ||
return GenotypeEncodingValue{const_this->allele_string(allele_int)}; | ||
} | ||
|
||
uint32_t Genotype::size() const { | ||
return m_format_ptr->n; | ||
} | ||
|
||
int32_t Genotype::allele(const uint32_t allele_index) const { | ||
switch (m_format_ptr->type) { | ||
case BCF_BT_INT8: | ||
return allele<int8_t>(allele_index); | ||
case BCF_BT_INT16: | ||
return allele<int16_t>(allele_index); | ||
case BCF_BT_INT32: | ||
return allele<int32_t>(allele_index); | ||
default: | ||
throw "unknown GT field type"; | ||
} | ||
} | ||
|
||
template<class TYPE> | ||
int32_t Genotype::allele(const uint32_t allele_index) const { | ||
return (((TYPE*)m_data_ptr)[allele_index]>>1)-1; | ||
} | ||
|
||
vector<string> Genotype::alleles() const { | ||
switch (m_format_ptr->type) { | ||
case BCF_BT_INT8: | ||
return alleles<int8_t>(bcf_int8_missing, bcf_int8_vector_end); | ||
case BCF_BT_INT16: | ||
return alleles<int16_t>(bcf_int16_missing, bcf_int16_vector_end); | ||
case BCF_BT_INT32: | ||
return alleles<int32_t>(bcf_int32_missing, bcf_int32_vector_end); | ||
default: | ||
throw "unknown GT field type"; | ||
} | ||
} | ||
|
||
template<class TYPE> | ||
vector<string> Genotype::alleles(TYPE missing, TYPE vector_end) const { | ||
const auto p = (TYPE*) m_data_ptr; | ||
// mostly copied from htslib | ||
auto results = vector<string>{}; // TODO: reserve {m_format_ptr->p_len}; | ||
bcf_unpack(m_body.get(), BCF_UN_STR); | ||
for (int ial=0; ial<m_format_ptr->size; ial++) | ||
{ | ||
if ( p[ial]==vector_end ) break; /* smaller ploidy */ | ||
if ( !(p[ial]>>1) || p[ial]==missing ) { | ||
/* missing allele */ | ||
results.push_back(""); | ||
continue; | ||
} | ||
if ( (p[ial]>>1)-1 >= m_body.get()->n_allele ) { | ||
results.push_back(""); | ||
continue; | ||
} | ||
results.push_back(allele_string((p[ial]>>1)-1)); | ||
} | ||
return results; | ||
} | ||
|
||
string Genotype::allele_string(int32_t allele_int) const { | ||
bcf_unpack(m_body.get(), BCF_UN_STR); | ||
return string{m_body.get()->d.allele[allele_int]}; | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
#ifndef __gamgee__genotype__ | ||
#define __gamgee__genotype__ | ||
|
||
#include "utils/hts_memory.h" | ||
#include "genotype_encoding_value.h" | ||
#include "genotype_encoding.h" | ||
|
||
#include "utils/utils.h" | ||
#include "utils/format_field_type.h" | ||
|
||
#include "htslib/vcf.h" | ||
|
||
#include <memory> | ||
|
||
namespace gamgee { | ||
|
||
/** | ||
* @brief Encodes a genotype. | ||
*/ | ||
class Genotype { | ||
public: | ||
/** | ||
* @brief Creates a genotype. | ||
* @param body The shared memory variant "line" from a vcf, or bcf. | ||
* @param format_ptr The GT field from the line. | ||
* @param data_ptr The GT for this sample. | ||
*/ | ||
Genotype(const std::shared_ptr<bcf1_t>& body, const bcf_fmt_t* const format_ptr, const uint8_t* data_ptr); | ||
|
||
/** | ||
* @brief Moves a genotype. | ||
* @param other The other genotype object to move. | ||
*/ | ||
Genotype(Genotype&& other) noexcept; | ||
|
||
// only for diploids | ||
/** | ||
* @brief Checks if this genotype is any type of heterozygous call. | ||
* @return True if this GT is a het. | ||
* @note only for diploids. | ||
*/ | ||
bool is_het() const; | ||
/** | ||
* @brief Checks if this genotype is a homozygous call that is non-reference. | ||
* @return True if this GT is a hom var. | ||
* @note only for diploids. | ||
*/ | ||
bool is_hom_var() const; | ||
/** | ||
* @brief Checks if this genotype is a homozygous call that is reference. | ||
* @return True if this GT is a hom ref. | ||
* @note only for diploids. | ||
*/ | ||
bool is_hom_ref() const; | ||
|
||
/** | ||
* @brief Returns a bit encoding of the first two alleles. | ||
* @return A bit encoding of the first two alleles. | ||
* @note only for diploids. | ||
*/ | ||
GenotypeEncoding value() const; // returns the number of the genotype. => 0/1 = 1 | 0/0 = 0 | 1/1 = 2 ... | 0/2 | ||
|
||
/** | ||
* @brief Returns a string decoding of each allele. | ||
* @param index From [0,size()) | ||
* @return A string decoding of the allele at index. | ||
*/ | ||
GenotypeEncodingValue operator[](const uint32_t index) const; // getter | ||
|
||
/** | ||
* @brief Number of alleles in this genotype. | ||
* @return Number of alleles in this genotype. | ||
*/ | ||
uint32_t size() const; | ||
|
||
/** | ||
* @brief The alleles in this genotype. | ||
* @return The alleles in this genotype as strings. | ||
*/ | ||
vector<string> alleles() const; // returns the actual alleles (e.g. A/T or TG/C or T/T/C or T/T/T/T/T ... ) | ||
|
||
private: | ||
const std::shared_ptr<bcf1_t> m_body; | ||
const bcf_fmt_t* const m_format_ptr; | ||
const uint8_t* m_data_ptr; | ||
|
||
int32_t allele(const uint32_t allele_index) const; | ||
|
||
template<class TYPE> | ||
int32_t allele(const uint32_t allele_index) const; | ||
|
||
template<class TYPE> | ||
vector<string> alleles(TYPE missing, TYPE vector_end) const; | ||
|
||
string allele_string(int32_t allele_int) const; | ||
}; | ||
|
||
} | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#ifndef __gamgee__genotype_encoding__ | ||
#define __gamgee__genotype_encoding__ | ||
|
||
#include "utils/hts_memory.h" | ||
#include "utils/utils.h" | ||
#include "utils/format_field_type.h" | ||
|
||
#include "htslib/vcf.h" | ||
|
||
#include <memory> | ||
#include <utility> | ||
|
||
namespace gamgee { | ||
|
||
/** | ||
* @brief Encodes a diploid genotype as bits. | ||
* @note Missing genotypes are not implemented yet. | ||
*/ | ||
class GenotypeEncoding { | ||
|
||
public: | ||
/** | ||
* @brief Constructs a genotype encoding. | ||
* @param A bit encoding for the first two alleles. | ||
*/ | ||
GenotypeEncoding(const uint32_t bits) : | ||
m_bits {bits} { | ||
} | ||
|
||
/** | ||
* @brief A bit encoding for the first two alleles. | ||
* @return A bit encoding for the first two alleles. | ||
*/ | ||
uint32_t bits() const { | ||
return m_bits; | ||
} | ||
|
||
/** | ||
* @brief Checks if another genotype encoding does not equal this encoding. | ||
* @param other The other genotype encoding to compare. | ||
* @return False if other genotype encoding equals this encoding. | ||
*/ | ||
bool operator!=(const GenotypeEncoding& other) const { | ||
return m_bits != other.m_bits; | ||
} | ||
|
||
/** | ||
* @brief Checks if another genotype encoding equals this encoding. | ||
* @param other The other genotype encoding to compare. | ||
* @return True if other genotype encoding equals this encoding. | ||
*/ | ||
bool operator==(const GenotypeEncoding& other) const { | ||
return m_bits == other.m_bits; | ||
} | ||
|
||
private: | ||
const uint32_t m_bits; | ||
}; | ||
|
||
} | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
#ifndef __gamgee__genotype_encoding_value__ | ||
#define __gamgee__genotype_encoding_value__ | ||
|
||
#include "utils/hts_memory.h" | ||
#include "utils/utils.h" | ||
#include "utils/format_field_type.h" | ||
|
||
#include "htslib/vcf.h" | ||
|
||
#include <memory> | ||
#include <string> | ||
|
||
namespace gamgee { | ||
|
||
using namespace std; | ||
|
||
/** | ||
* @brief Decodes a genotype allele. | ||
*/ | ||
class GenotypeEncodingValue { | ||
public: | ||
GenotypeEncodingValue(const string allele): m_allele{allele} {} | ||
|
||
/** | ||
* @brief A single allele from a genotype. | ||
*/ | ||
string allele() const // returns the actual allele (e.g. A or T or AT or ATCAG...) returns empty string when is missing. | ||
{ | ||
return m_allele; | ||
} | ||
|
||
private: | ||
const string m_allele; | ||
}; | ||
|
||
} | ||
|
||
#endif |
Oops, something went wrong.