Skip to content
This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Commit

Permalink
Commit for Jul 17
Browse files Browse the repository at this point in the history
  • Loading branch information
kshakir committed Jul 17, 2014
1 parent 04b03aa commit 3b0c2d5
Show file tree
Hide file tree
Showing 12 changed files with 403 additions and 267 deletions.
12 changes: 0 additions & 12 deletions gamgee/diploid_pl_genotype.h

This file was deleted.

105 changes: 105 additions & 0 deletions gamgee/genotype.cpp
@@ -0,0 +1,105 @@
#include "genotype.h"
#include "genotype_encoding.h"

namespace gamgee {

using namespace std;

Genotype::Genotype(const std::shared_ptr<bcf1_t>& body, const bcf_fmt_t* const format_ptr, const uint8_t* data_ptr) :
m_body {body},
m_format_ptr {format_ptr},
m_data_ptr {data_ptr} {
}

Genotype::Genotype(Genotype&& other) noexcept :
m_body {move(other.m_body)},
m_format_ptr {other.m_format_ptr},
m_data_ptr {other.m_data_ptr}
{}

bool Genotype::is_het() const {
return (allele(0) != allele(1));
}

bool Genotype::is_hom_var() const {
return (allele(0) != 0) && (allele(0) == allele(1));
}

bool Genotype::is_hom_ref() const {
return (allele(0) == 0 && allele(1) == 0);
}

GenotypeEncoding Genotype::value() const {
return GenotypeEncoding{static_cast<uint32_t>(allele(0) << 16) | allele(1)};
}

GenotypeEncodingValue Genotype::operator [](const uint32_t index) const {
const auto const_this = const_cast<Genotype *>(this);
const auto allele_int = const_this->allele(index);
return GenotypeEncodingValue{const_this->allele_string(allele_int)};
}

uint32_t Genotype::size() const {
return m_format_ptr->n;
}

int32_t Genotype::allele(const uint32_t allele_index) const {
switch (m_format_ptr->type) {
case BCF_BT_INT8:
return allele<int8_t>(allele_index);
case BCF_BT_INT16:
return allele<int16_t>(allele_index);
case BCF_BT_INT32:
return allele<int32_t>(allele_index);
default:
throw "unknown GT field type";
}
}

template<class TYPE>
int32_t Genotype::allele(const uint32_t allele_index) const {
return (((TYPE*)m_data_ptr)[allele_index]>>1)-1;
}

vector<string> Genotype::alleles() const {
switch (m_format_ptr->type) {
case BCF_BT_INT8:
return alleles<int8_t>(bcf_int8_missing, bcf_int8_vector_end);
case BCF_BT_INT16:
return alleles<int16_t>(bcf_int16_missing, bcf_int16_vector_end);
case BCF_BT_INT32:
return alleles<int32_t>(bcf_int32_missing, bcf_int32_vector_end);
default:
throw "unknown GT field type";
}
}

template<class TYPE>
vector<string> Genotype::alleles(TYPE missing, TYPE vector_end) const {
const auto p = (TYPE*) m_data_ptr;
// mostly copied from htslib
auto results = vector<string>{}; // TODO: reserve {m_format_ptr->p_len};
bcf_unpack(m_body.get(), BCF_UN_STR);
for (int ial=0; ial<m_format_ptr->size; ial++)
{
if ( p[ial]==vector_end ) break; /* smaller ploidy */
if ( !(p[ial]>>1) || p[ial]==missing ) {
/* missing allele */
results.push_back("");
continue;
}
if ( (p[ial]>>1)-1 >= m_body.get()->n_allele ) {
results.push_back("");
continue;
}
results.push_back(allele_string((p[ial]>>1)-1));
}
return results;
}

string Genotype::allele_string(int32_t allele_int) const {
bcf_unpack(m_body.get(), BCF_UN_STR);
return string{m_body.get()->d.allele[allele_int]};
}

}
100 changes: 100 additions & 0 deletions gamgee/genotype.h
@@ -0,0 +1,100 @@
#ifndef __gamgee__genotype__
#define __gamgee__genotype__

#include "utils/hts_memory.h"
#include "genotype_encoding_value.h"
#include "genotype_encoding.h"

#include "utils/utils.h"
#include "utils/format_field_type.h"

#include "htslib/vcf.h"

#include <memory>

namespace gamgee {

/**
* @brief Encodes a genotype.
*/
class Genotype {
public:
/**
* @brief Creates a genotype.
* @param body The shared memory variant "line" from a vcf, or bcf.
* @param format_ptr The GT field from the line.
* @param data_ptr The GT for this sample.
*/
Genotype(const std::shared_ptr<bcf1_t>& body, const bcf_fmt_t* const format_ptr, const uint8_t* data_ptr);

/**
* @brief Moves a genotype.
* @param other The other genotype object to move.
*/
Genotype(Genotype&& other) noexcept;

// only for diploids
/**
* @brief Checks if this genotype is any type of heterozygous call.
* @return True if this GT is a het.
* @note only for diploids.
*/
bool is_het() const;
/**
* @brief Checks if this genotype is a homozygous call that is non-reference.
* @return True if this GT is a hom var.
* @note only for diploids.
*/
bool is_hom_var() const;
/**
* @brief Checks if this genotype is a homozygous call that is reference.
* @return True if this GT is a hom ref.
* @note only for diploids.
*/
bool is_hom_ref() const;

/**
* @brief Returns a bit encoding of the first two alleles.
* @return A bit encoding of the first two alleles.
* @note only for diploids.
*/
GenotypeEncoding value() const; // returns the number of the genotype. => 0/1 = 1 | 0/0 = 0 | 1/1 = 2 ... | 0/2

/**
* @brief Returns a string decoding of each allele.
* @param index From [0,size())
* @return A string decoding of the allele at index.
*/
GenotypeEncodingValue operator[](const uint32_t index) const; // getter

/**
* @brief Number of alleles in this genotype.
* @return Number of alleles in this genotype.
*/
uint32_t size() const;

/**
* @brief The alleles in this genotype.
* @return The alleles in this genotype as strings.
*/
vector<string> alleles() const; // returns the actual alleles (e.g. A/T or TG/C or T/T/C or T/T/T/T/T ... )

private:
const std::shared_ptr<bcf1_t> m_body;
const bcf_fmt_t* const m_format_ptr;
const uint8_t* m_data_ptr;

int32_t allele(const uint32_t allele_index) const;

template<class TYPE>
int32_t allele(const uint32_t allele_index) const;

template<class TYPE>
vector<string> alleles(TYPE missing, TYPE vector_end) const;

string allele_string(int32_t allele_int) const;
};

}

#endif
62 changes: 62 additions & 0 deletions gamgee/genotype_encoding.h
@@ -0,0 +1,62 @@
#ifndef __gamgee__genotype_encoding__
#define __gamgee__genotype_encoding__

#include "utils/hts_memory.h"
#include "utils/utils.h"
#include "utils/format_field_type.h"

#include "htslib/vcf.h"

#include <memory>
#include <utility>

namespace gamgee {

/**
* @brief Encodes a diploid genotype as bits.
* @note Missing genotypes are not implemented yet.
*/
class GenotypeEncoding {

public:
/**
* @brief Constructs a genotype encoding.
* @param A bit encoding for the first two alleles.
*/
GenotypeEncoding(const uint32_t bits) :
m_bits {bits} {
}

/**
* @brief A bit encoding for the first two alleles.
* @return A bit encoding for the first two alleles.
*/
uint32_t bits() const {
return m_bits;
}

/**
* @brief Checks if another genotype encoding does not equal this encoding.
* @param other The other genotype encoding to compare.
* @return False if other genotype encoding equals this encoding.
*/
bool operator!=(const GenotypeEncoding& other) const {
return m_bits != other.m_bits;
}

/**
* @brief Checks if another genotype encoding equals this encoding.
* @param other The other genotype encoding to compare.
* @return True if other genotype encoding equals this encoding.
*/
bool operator==(const GenotypeEncoding& other) const {
return m_bits == other.m_bits;
}

private:
const uint32_t m_bits;
};

}

#endif
38 changes: 38 additions & 0 deletions gamgee/genotype_encoding_value.h
@@ -0,0 +1,38 @@
#ifndef __gamgee__genotype_encoding_value__
#define __gamgee__genotype_encoding_value__

#include "utils/hts_memory.h"
#include "utils/utils.h"
#include "utils/format_field_type.h"

#include "htslib/vcf.h"

#include <memory>
#include <string>

namespace gamgee {

using namespace std;

/**
* @brief Decodes a genotype allele.
*/
class GenotypeEncodingValue {
public:
GenotypeEncodingValue(const string allele): m_allele{allele} {}

/**
* @brief A single allele from a genotype.
*/
string allele() const // returns the actual allele (e.g. A or T or AT or ATCAG...) returns empty string when is missing.
{
return m_allele;
}

private:
const string m_allele;
};

}

#endif

0 comments on commit 3b0c2d5

Please sign in to comment.