Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
183 changes: 183 additions & 0 deletions include/bio/ann_io/header.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
// -----------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
// Copyright (c) 2020-2021, deCODE Genetics
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
// -----------------------------------------------------------------------------------------------------

/*!\file
* \brief Provides bio::ann_io::header.
* \author Hannes Hauswedell <hannes.hauswedell AT decode.is>
*/

#pragma once

#include <cctype>
#include <iterator>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include <bio/detail/charconv.hpp>
#include <bio/detail/views_eager_split.hpp>
#include <bio/exception.hpp>
#include <bio/misc.hpp>
#include <bio/ann_io/misc.hpp>

namespace bio::ann_io
{
class header
{
public:
std::vector<std::pair<std::string, std::string>> browser_values{};
std::vector<std::pair<std::string, std::string>> track_values{};

/*!\name Constructors, destructor and assignment
* \{
*/
//!\brief Default construction.
header() = default; //!< Defaulted.
header(header const &) = default; //!< Defaulted.
header(header &&) = default; //!< Defaulted.
~header() = default; //!< Defaulted.
header & operator=(header const &) = default; //!< Defaulted.
header & operator=(header &&) = default; //!< Defaulted.

//!\brief Construct from a header given as plaintext.
explicit header(std::string_view plaintext_header)
{
if (plaintext_header.ends_with("\r\n"))
plaintext_header = plaintext_header.substr(0, plaintext_header.size() - 2);
else if (plaintext_header.ends_with("\n"))
plaintext_header = plaintext_header.substr(0, plaintext_header.size() - 1);

for (std::string_view const line : plaintext_header | detail::eager_split('\n'))
parse_line(line);
}

/*!\name Convert to plaintext ("raw") header
* \{
*/
//!\brief Converts the header to plaintext (includes IDX entries).
std::string to_plaintext() const { return to_plaintext_impl(); }
//!\}

private:
void parse_line(std::string_view const l)
{
if (l.starts_with("browser"))
{
auto pair_split = l.substr(8) | detail::eager_split(' ');
auto it1 = pair_split.begin();
auto it2 = std::ranges::next(it1);
auto it3 = std::ranges::next(it2); // TODO whats going on here?

if (it1 == std::default_sentinel || it2 == std::default_sentinel) //|| it3 != std::default_sentinel)
{
throw format_error{std::string{"Could not parse the following string into a dictionary: "} +
std::string{l}};
}

browser_values.emplace_back(static_cast<std::string>(*it1), static_cast<std::string>(*it2));
}
else if (l.starts_with("track"))
{
for (std::string_view const pair : l.substr(6) | detail::eager_split(' ', true))
{
auto pair_split = pair | detail::eager_split('=');
auto it1 = pair_split.begin();
auto it2 = std::ranges::next(it1);

if (it1 == std::default_sentinel || it2 == std::default_sentinel) //|| it3 != std::default_sentinel)
{
throw format_error{std::string{"Could not parse the following string into a dictionary: "} +
std::string{pair}};
}

track_values.emplace_back(static_cast<std::string>(*it1), static_cast<std::string>(strip_quotes(*it2)));
}
}
}

//!\brief Return a substring from the argument that does not contain enclosing quotes (if present).
static inline std::string_view strip_quotes(std::string_view const in)
{
return (in.size() < 2 || in.front() != '"' || in.back() != '"') ? in : in.substr(1, in.size() - 2);
}

/*!\name Functions for converting to text
* \{
*/
//!\brief Implementation function for creating the plaintext header.
std::string to_plaintext_impl() const
{
std::string raw_data;

constexpr auto quote_wrap = [](std::string in)
{
if (in.size() == 0)
in = "\"\"";
else if (in.front() != '\"')
in.insert(in.begin(), '\"');

if (in.size() == 1 || in.back() != '\"')
in.push_back('\"');

return in;
};

constexpr auto is_number = [](const std::string& s)
{
std::string::const_iterator it = s.begin();
while (it != s.end() && std::isdigit(*it)) ++it;
return !s.empty() && it == s.end();
};

/* First print out browser settings one per line */

for (auto const & e : browser_values)
{
(((((raw_data += "browser ") += e.first) += ' ') += e.second) += '\n');
}
/* Then print out track settings all on the same line. */
raw_data += "track ";
for (auto const & e : track_values)
{
((((raw_data += e.first) += '=') += is_number(e.second) ? e.second : quote_wrap(e.second)) += ' ');
}

return raw_data.substr(0, raw_data.size() - 1) + '\n';
}
//
// //!\brief Turn bio::value_type_id into string.
// static std::string unparse_type(value_type_id const id)
// {
// // TODO replace with string_view
// switch (id)
// {
// case value_type_id::int8:
// case value_type_id::vector_of_int8:
// case value_type_id::int16:
// case value_type_id::vector_of_int16:
// case value_type_id::int32:
// case value_type_id::vector_of_int32:
// return "Integer";
// case value_type_id::float32:
// case value_type_id::vector_of_float32:
// return "Float";
// case value_type_id::char8:
// case value_type_id::vector_of_char8:
// return "Character";
// case value_type_id::string:
// case value_type_id::vector_of_string:
// return "String";
// case value_type_id::flag:
// return "Flag";
// default:
// throw format_error{"Illegal type in INFO or FILTER header line."};
// }
// return "";
// }
//!\}
};
}
67 changes: 67 additions & 0 deletions include/bio/ann_io/misc.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// -----------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
// Copyright (c) 2020-2021, deCODE Genetics
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
// -----------------------------------------------------------------------------------------------------

/*!\file
* \brief Provides the default field IDs and field types of bio::ann_io.
* \author Joshua Kim <kim_j AT molgen.mpg.de>
*/

#pragma once

#include <bio/misc.hpp>
#include <bio/record.hpp>

namespace bio::ann_io
{
//-----------------------------------------------------------------------------
// default_field_ids
//-----------------------------------------------------------------------------

//!\brief Default fields for bio::ann_io::reader_options: chrom, chromStart and chromEnd (in this order).
// NOTE(review): the group below still names var_io; switch it once a documentation group for ann_io exists.
//!\ingroup var_io
inline constinit auto default_field_ids = vtag<field::chrom,
field::chromStart,
field::chromEnd>;

//-----------------------------------------------------------------------------
// Pre-defined field types (reader)
//-----------------------------------------------------------------------------

/*!\name Pre-defined field types
* \brief Type lists that match bio::ann_io::default_field_ids element by element.
* \details NOTE(review): presumably selected via bio::ann_io::reader_options -- confirm in reader_options.hpp.
* \{
*/
/*!\brief The default field types for annotation io.
*!\ingroup var_io
*
* \details
*
* The list holds one type per default field: the chrom field is a std::string_view (std::string in the
* bio::ownership::deep specialisation), the chromStart and chromEnd fields are uint32_t.
*
* With the default bio::ownership::shallow, the chrom field does not own its characters.
*
* NOTE(review): the group above still names var_io; switch it once a documentation group for ann_io exists.
*/
template <ownership own = ownership::shallow>
inline constinit auto field_types =
ttag<std::string_view, // field::chrom,
uint32_t, // field::chromStart,
uint32_t>; // field::chromEnd

//!\brief Deep version of bio::ann_io::field_types; the chrom field is an owning std::string.
//!\ingroup var_io
template <>
inline constinit auto field_types<ownership::deep> =
ttag<std::string, // field::chrom,
uint32_t, // field::chromStart,
uint32_t>; // field::chromEnd
//!\}

}
92 changes: 92 additions & 0 deletions include/bio/ann_io/reader.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
// -----------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
// Copyright (c) 2020-2021, deCODE Genetics
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
// -----------------------------------------------------------------------------------------------------

/*!\file
* \brief Provides bio::ann_io::reader.
* \author Joshua Kim <kim_j AT molgen.mpg.de>
*/

#pragma once

#include <filesystem>

#include <bio/detail/reader_base.hpp>
#include <bio/ann_io/reader_options.hpp>
#include <bio/ann_io/header.hpp>
#include <bio/format/bed_input_handler.hpp>

namespace bio::ann_io
{
/*!\brief Reader of the bio::ann_io module; adds access to the file's header on top of bio::reader_base.
 * \tparam option_args_t Template arguments that are passed on to bio::ann_io::reader_options.
 */
template <typename... option_args_t>
class reader : public reader_base<reader<option_args_t...>, reader_options<option_args_t...>>
{
private:
    //!\brief The base class.
    using base_t      = reader_base<reader<option_args_t...>, reader_options<option_args_t...>>;
    //!\brief Inherit the format_type definition.
    using format_type = typename base_t::format_type;
    /* Implementation note
     * format_type is "inherited" as private here to avoid appearing twice in the documentation.
     * Its actual visibility is public because it is public in the base class.
     */
    //!\brief Make the format handler visible.
    using base_t::format_handler;

    //!\brief Points to the header stored inside the format handler; stays nullptr until header() is called.
    bio::ann_io::header const * header_ptr = nullptr;

public:
    // clang-format off
    //!\copydoc bio::reader_base::reader_base(std::filesystem::path const & filename, format_type const & fmt, options_t const & opt = options_t{})
    // clang-format on
    reader(std::filesystem::path const &            filename,
           format_type const &                      fmt,
           reader_options<option_args_t...> const & opt = reader_options<option_args_t...>{}) :
      base_t{filename, fmt, opt}
    {}

    //!\overload
    explicit reader(std::filesystem::path const &            filename,
                    reader_options<option_args_t...> const & opt = reader_options<option_args_t...>{}) :
      base_t{filename, opt}
    {}

    // clang-format off
    //!\copydoc bio::reader_base::reader_base(std::istream & str, format_type const & fmt, options_t const & opt = options_t{})
    // clang-format on
    reader(std::istream &                           str,
           format_type const &                      fmt,
           reader_options<option_args_t...> const & opt = reader_options<option_args_t...>{}) :
      base_t{str, fmt, opt}
    {}

    //!\overload
    template <movable_istream temporary_stream_t>
        //!\cond REQ
        requires(!std::is_lvalue_reference_v<temporary_stream_t>)
    //!\endcond
    reader(temporary_stream_t &&                    str,
           format_type const &                      fmt,
           reader_options<option_args_t...> const & opt = reader_options<option_args_t...>{}) :
      base_t{std::move(str), fmt, opt}
    {}

    /*!\brief Access the header.
     * \returns A reference to the header held by the active format handler.
     * \details The first call invokes begin() and caches the header's address; later calls return the
     *          cached header directly.
     */
    bio::ann_io::header const & header()
    {
        // Already looked up by an earlier call.
        if (header_ptr != nullptr)
            return *header_ptr;

        // ensure that the format_handler is created
        this->begin();

        auto const address_of_header = [](auto const & active_handler) { return &active_handler.get_header(); };
        header_ptr                   = std::visit(address_of_header, format_handler);

        return *header_ptr;
    }
};
}
Loading