Skip to content
This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Commit

Permalink
Add ReferenceIterator class: loads one chromosome at a time
Browse files Browse the repository at this point in the history
- Add tests
- Update ReferenceMap tests
  • Loading branch information
jmthibault79 committed Sep 22, 2014
1 parent 8d73279 commit d891e5b
Show file tree
Hide file tree
Showing 6 changed files with 170 additions and 37 deletions.
18 changes: 18 additions & 0 deletions gamgee/exceptions.h
Expand Up @@ -26,6 +26,24 @@ class HtslibException : public std::runtime_error {
std::runtime_error{(boost::format("Error: htslib failed with error code %d. See stderr for details.") % error_code).str()} { }
};

/**
* @brief an exception class for the case where a chromosome is not found in the reference
*/
class ChromosomeNotFoundException : public std::runtime_error {
public:
ChromosomeNotFoundException(const std::string& chrom_name) :
std::runtime_error{(boost::format("Error: chromosome %s was not found in the given reference") % chrom_name).str()} { }
};

/**
* @brief an exception class for the case where a chromosome is not found in the reference
*/
class ChromosomeSizeException : public std::runtime_error {
public:
ChromosomeSizeException(const std::string& chrom_name, const size_t chrom_size, const int desired_location) :
std::runtime_error{(boost::format("Error: chromosome %s is of size %d but location %d was requested") % chrom_name % chrom_size % desired_location).str()} { }
};

} // end of namespace gamgee

#endif // end of gamgee__exceptions__guard
Expand Down
22 changes: 22 additions & 0 deletions gamgee/reference_iterator.cpp
@@ -0,0 +1,22 @@
#include "reference_iterator.h"

#include "exceptions.h"
#include "fastq_iterator.h"

namespace gamgee {

const char ReferenceIterator::ref_base(const std::string& chromosome, const int one_based_location) {
while (m_iterator != FastqIterator{} && chromosome != m_sequence.name())
m_sequence = ++m_iterator;

if (m_iterator == FastqIterator{})
throw new ChromosomeNotFoundException{chromosome};

if (one_based_location > m_sequence.sequence().size())
throw new ChromosomeSizeException{chromosome, m_sequence.sequence().size(), one_based_location};

return m_sequence.sequence()[one_based_location-1];
}

} // namespace gamgee

35 changes: 35 additions & 0 deletions gamgee/reference_iterator.h
@@ -0,0 +1,35 @@
#ifndef gamgee__reference_iterator__guard
#define gamgee__reference_iterator__guard

#include "fastq_iterator.h"
#include "fastq_reader.h"

namespace gamgee {

/**
* @brief Utility class to access reference bases in a FastA-formatted reference genome
*
* @warn the chromosomes in this reference must be accessed in order, or an exception will be thrown
*/
class ReferenceIterator {
public:
ReferenceIterator(const std::string& filename) :
m_iterator {FastqReader{filename}.begin()},
m_sequence {*m_iterator}
{}

/**
* @brief return the reference base character at the desired location
* @param chromosome the chromosome of the desired base
* @param one_based_location the one-based genomic location of the base
*/
const char ref_base(const std::string& chromosome, const int one_based_location);

private:
FastqIterator m_iterator; ///< @brief the current state of the iterator through the FastA input file
Fastq& m_sequence; ///< @brief a reference to the iterator's current sequence
};

} // namespace gamgee

#endif /* gamgee__reference_iterator__guard */
37 changes: 0 additions & 37 deletions test/reference_map_test.cpp

This file was deleted.

91 changes: 91 additions & 0 deletions test/reference_test.cpp
@@ -0,0 +1,91 @@
#include "reference_map.h"
#include "reference_iterator.h"

#include "utils/utils.h"

#include <boost/test/unit_test.hpp>

#include <vector>
#include <string>
#include <unordered_map>

using namespace std;
using namespace gamgee;

// truth data for test reference (if file changes, this has to change too)

const auto FILE1 = string{"testdata/test_reference.fa"};
const auto CHROMOSOMES1 = vector<string>{
"chrA", "chrB", "chrC", "chrD", "chrE", "chrF", "chrG", "chrH",
"chrI", "chrJ", "chrK", "chrL", "chrM", "chrN", "chrO", "chrQ", "chrR", "chrS", "chrT", "chrU"
};
const auto SEQ1 = string{"AGGGTAGAGAGATAGAGATCCCCCCCCCCAGTACCNNNNAGTT"};

const auto FILE2 = string{"testdata/test_reference2.fa"};
const auto CHROMOSOMES2 = vector<string>{ "chr1", "chr2" };
const auto CHR1_SEQ = string{"AGGGATCCCCCCCCCCAGTACCNNNNAGTT"};
const auto CHR2_SEQ = string{"NNNNNNAGGGATCCCNCCCCCCCAGTACCNNNNAGTT"};

BOOST_AUTO_TEST_CASE( reference_map_constructor_test )
{
auto ref1 = ReferenceMap {FILE1};
for (const auto& chr_seq : ref1) {
BOOST_CHECK_EQUAL(chr_seq.first.substr(0,3), "chr");
BOOST_CHECK_EQUAL(chr_seq.second, SEQ1);
}

auto ref2 = ReferenceMap {FILE2};
BOOST_CHECK_EQUAL(ref2["chr1"], CHR1_SEQ);
BOOST_CHECK_EQUAL(ref2["chr2"], CHR2_SEQ);
// verify that it can access in arbitrary order
BOOST_CHECK_EQUAL(ref2["chr1"], CHR1_SEQ);
}

BOOST_AUTO_TEST_CASE( reference_map_get_sequence_test )
{
auto reference_map1 = ReferenceMap{FILE1};
for (auto start = 1u; start != SEQ1.length(); ++start) {
for (auto len = 1u; len <= SEQ1.length() - start; ++len) {
const auto interval = Interval{"chrA", start, start+len-1};
BOOST_CHECK_EQUAL(reference_map1.get_sequence(interval), SEQ1.substr(start-1, len));
BOOST_CHECK_EQUAL(reference_map1.get_sequence(interval, true), gamgee::utils::complement(SEQ1.substr(start-1, len)));
}
}

auto reference_map2 = ReferenceMap{FILE2};
for (auto start = 1u; start != CHR1_SEQ.length(); ++start) {
for (auto len = 1u; len <= CHR1_SEQ.length() - start; ++len) {
const auto interval = Interval{"chr1", start, start+len-1};
BOOST_CHECK_EQUAL(reference_map2.get_sequence(interval), CHR1_SEQ.substr(start-1, len));
BOOST_CHECK_EQUAL(reference_map2.get_sequence(interval, true), gamgee::utils::complement(CHR1_SEQ.substr(start-1, len)));
}
}
for (auto start = 1u; start != CHR2_SEQ.length(); ++start) {
for (auto len = 1u; len <= CHR2_SEQ.length() - start; ++len) {
const auto interval = Interval{"chr2", start, start+len-1};
BOOST_CHECK_EQUAL(reference_map2.get_sequence(interval), CHR2_SEQ.substr(start-1, len));
BOOST_CHECK_EQUAL(reference_map2.get_sequence(interval, true), gamgee::utils::complement(CHR2_SEQ.substr(start-1, len)));
}
}
}

BOOST_AUTO_TEST_CASE( reference_iterator_test ) {
auto reference1 = ReferenceIterator{FILE1};
for (const auto chr : CHROMOSOMES1) {
for (auto counter = 1; counter <= SEQ1.size(); counter++) {
const char truth = SEQ1[counter - 1];
BOOST_CHECK(truth == reference1.ref_base(chr, counter));
}
}

auto reference2 = ReferenceIterator{FILE2};
for (auto counter = 1; counter <= CHR1_SEQ.size(); counter++) {
const char truth = CHR1_SEQ[counter - 1];
BOOST_CHECK(truth == reference2.ref_base("chr1", counter));
}
for (auto counter = 1; counter <= CHR2_SEQ.size(); counter++) {
const char truth = CHR2_SEQ[counter - 1];
BOOST_CHECK(truth == reference2.ref_base("chr2", counter));
}
}

4 changes: 4 additions & 0 deletions testdata/test_reference2.fa
@@ -0,0 +1,4 @@
>chr1 text comments 1 2 3 4 5
AGGGATCCCCCCCCCCAGTACCNNNNAGTT
>chr2 more text comments
NNNNNNAGGGATCCCNCCCCCCCAGTACCNNNNAGTT

0 comments on commit d891e5b

Please sign in to comment.