diff --git a/include/bio/ann_io/header.hpp b/include/bio/ann_io/header.hpp new file mode 100644 index 0000000..aab05fe --- /dev/null +++ b/include/bio/ann_io/header.hpp @@ -0,0 +1,183 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides bio::var_io::header and various auxiliary classes. + * \author Hannes Hauswedell + */ + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace bio::ann_io +{ +class header +{ +public: + std::vector> browser_values{}; + std::vector> track_values{}; + + /*!\name Constructors, destructor and assignment + * \{ + */ + //!\brief Default construction. + header() = default; //!< Defaulted. + header(header const &) = default; //!< Defaulted. + header(header &&) = default; //!< Defaulted. + ~header() = default; //!< Defaulted. + header & operator=(header const &) = default; //!< Defaulted. + header & operator=(header &&) = default; //!< Defaulted. + + //!\brief Construct from a header given as plaintext. 
+    explicit header(std::string_view plaintext_header)
+    {
+        // Drop a single trailing line break (either "\r\n" or "\n").
+        if (plaintext_header.ends_with("\r\n"))
+            plaintext_header.remove_suffix(2);
+        else if (plaintext_header.ends_with("\n"))
+            plaintext_header.remove_suffix(1);
+
+        for (std::string_view line : plaintext_header | detail::eager_split('\n'))
+        {
+            // With Windows line endings every inner line still ends in '\r' after splitting on '\n';
+            // remove it so that it does not become part of a parsed value.
+            if (line.ends_with('\r'))
+                line.remove_suffix(1);
+
+            parse_line(line);
+        }
+    }
+    //!\}
+
+    /*!\name Convert to plaintext ("raw") header
+     * \{
+     */
+    //!\brief Converts the header to plaintext ("browser" and "track" lines).
+    std::string to_plaintext() const { return to_plaintext_impl(); }
+    //!\}
+
+private:
+    /*!\brief Parse a single header line and store its content in #browser_values or #track_values.
+     * \param[in] l The line to parse (without line break).
+     * \throws format_error If a "browser" line provides fewer than two fields, or if splitting a "track"
+     *                      attribute on '=' yields fewer than two parts.
+     *
+     * \details
+     *
+     * Lines whose first word is neither "browser" nor "track" are ignored.
+     */
+    void parse_line(std::string_view const l)
+    {
+        // A keyword only matches if it is the complete first word. Together with the size checks below this
+        // guarantees that substr() is never called with a start position behind the end of the line (which
+        // would throw std::out_of_range instead of a format_error).
+        auto const has_keyword = [l](std::string_view const keyword)
+        { return l.starts_with(keyword) && (l.size() == keyword.size() || l[keyword.size()] == ' '); };
+
+        // Creates the exception that is thrown for every malformed input in this function.
+        auto const parse_error = [](std::string_view const what)
+        {
+            return format_error{std::string{"Could not parse the following string into a dictionary: "} +
+                                std::string{what}};
+        };
+
+        if (has_keyword("browser"))
+        {
+            // "browser" with nothing behind it carries no setting.
+            if (l.size() < 8)
+                throw parse_error(l);
+
+            auto pair_split = l.substr(8) | detail::eager_split(' ');
+
+            // Every iterator is compared with the sentinel before it is advanced or dereferenced.
+            auto it1 = pair_split.begin();
+            if (it1 == std::default_sentinel)
+                throw parse_error(l);
+
+            auto it2 = std::ranges::next(it1);
+            if (it2 == std::default_sentinel)
+                throw parse_error(l);
+
+            // NOTE(review): fields after the second one are silently dropped (unchanged behaviour) — confirm
+            // that this is intended for browser lines with more than two words.
+            browser_values.emplace_back(std::string{*it1}, std::string{*it2});
+        }
+        else if (has_keyword("track"))
+        {
+            // A bare "track" line has no attributes; there is nothing to store.
+            if (l.size() < 6)
+                return;
+
+            for (std::string_view const pair : l.substr(6) | detail::eager_split(' ', true))
+            {
+                auto pair_split = pair | detail::eager_split('=');
+
+                auto it1 = pair_split.begin();
+                if (it1 == std::default_sentinel)
+                    throw parse_error(pair);
+
+                auto it2 = std::ranges::next(it1);
+                if (it2 == std::default_sentinel)
+                    throw parse_error(pair);
+
+                // Values are stored without their enclosing quotes.
+                track_values.emplace_back(std::string{*it1}, std::string{strip_quotes(*it2)});
+            }
+        }
+    }
+
+    //!\brief Return a substring from the argument that does not contain enclosing quotes (if present).
+    static inline std::string_view strip_quotes(std::string_view const in)
+    {
+        // Only strip if the string both starts and ends with a double quote; anything shorter than two
+        // characters cannot be enclosed in quotes.
+        return (in.size() < 2 || in.front() != '"' || in.back() != '"') ? in : in.substr(1, in.size() - 2);
+    }
+
+    /*!\name Functions for converting to text
+     * \{
+     */
+    /*!\brief Implementation function for creating the plaintext header.
+     * \returns One "browser KEY VALUE" line per element of #browser_values, followed by a single
+     *          "track KEY=VALUE ..." line if #track_values is not empty. Every line ends in '\n'.
+     *
+     * \details
+     *
+     * Track values that consist only of decimal digits are written verbatim; all other track values are
+     * written enclosed in double quotes.
+     */
+    std::string to_plaintext_impl() const
+    {
+        std::string raw_data;
+
+        // Adds a leading and/or trailing double quote unless the respective quote is already present.
+        constexpr auto quote_wrap = [](std::string in)
+        {
+            if (in.size() == 0)
+                in = "\"\"";
+            else if (in.front() != '\"')
+                in.insert(in.begin(), '\"');
+
+            // A string of size 1 is a lone quote character at this point and still needs its closing quote.
+            if (in.size() == 1 || in.back() != '\"')
+                in.push_back('\"');
+
+            return in;
+        };
+
+        // True for non-empty strings that consist only of decimal digits.
+        constexpr auto is_number = [](std::string const & s)
+        {
+            if (s.empty())
+                return false;
+
+            for (char const c : s)
+            {
+                // std::isdigit has undefined behaviour for negative values, so convert to unsigned char first.
+                if (!std::isdigit(static_cast<unsigned char>(c)))
+                    return false;
+            }
+
+            return true;
+        };
+
+        /* First print out browser settings one per line */
+        for (auto const & e : browser_values)
+        {
+            raw_data += "browser ";
+            raw_data += e.first;
+            raw_data += ' ';
+            raw_data += e.second;
+            raw_data += '\n';
+        }
+
+        /* Then print out track settings all on the same line.
+         * Without any track values no line is written: a bare "track" line carries no information and is
+         * not accepted by the parsing constructor of this class.
+         */
+        if (!track_values.empty())
+        {
+            raw_data += "track";
+            for (auto const & e : track_values)
+            {
+                raw_data += ' ';
+                raw_data += e.first;
+                raw_data += '=';
+                raw_data += is_number(e.second) ? e.second : quote_wrap(e.second);
+            }
+            raw_data += '\n';
+        }
+
+        return raw_data;
+    }
+    //
+    // //!\brief Turn bio::value_type_id into string.
+ // static std::string unparse_type(value_type_id const id) + // { + // // TODO replace with string_view + // switch (id) + // { + // case value_type_id::int8: + // case value_type_id::vector_of_int8: + // case value_type_id::int16: + // case value_type_id::vector_of_int16: + // case value_type_id::int32: + // case value_type_id::vector_of_int32: + // return "Integer"; + // case value_type_id::float32: + // case value_type_id::vector_of_float32: + // return "Float"; + // case value_type_id::char8: + // case value_type_id::vector_of_char8: + // return "Character"; + // case value_type_id::string: + // case value_type_id::vector_of_string: + // return "String"; + // case value_type_id::flag: + // return "Flag"; + // default: + // throw format_error{"Illegal type in INFO or FILTER header line."}; + // } + // return ""; + // } + //!\} +}; +} diff --git a/include/bio/ann_io/misc.hpp b/include/bio/ann_io/misc.hpp new file mode 100644 index 0000000..78887cb --- /dev/null +++ b/include/bio/ann_io/misc.hpp @@ -0,0 +1,67 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides the bio::var_io::tag_dictionary class and auxiliaries. 
+ * \author Joshua Kim + */ + + #pragma once + + #include + #include + +namespace bio::ann_io +{ +//----------------------------------------------------------------------------- +// default_field_ids +//----------------------------------------------------------------------------- + +//!\brief Default fields for bio::var_io::reader_options. +//!\ingroup var_io +inline constinit auto default_field_ids = vtag; + +//----------------------------------------------------------------------------- +// Pre-defined field types (reader) +//----------------------------------------------------------------------------- + +/*!\name Pre-defined field types +* \brief These can be used to configure the behaviour of the bio::var_io::reader via bio::var_io::reader_options. +* \{ +*/ +/*!\brief The default field types for variant io. +*!\ingroup var_io +* +* \details +* +* These traits define a record type with minimal memory allocations for all input formats. +* It is the recommended record type when iterating ("streaming") over files that ca be any variant IO format. +* +* The "style" of the record resembles the VCF specification, i.e. contigs, FILTERs and INFO identifiers are +* represented as string/string_views. **However,** the genotypes are encoded by-genotype (BCF-style) and not by-sample +* (VCF-style) for performance reasons. +* +* See bio::var_io::genotypes_bcf_style for more information on the latter. +*/ +template +inline constinit auto field_types = +ttag; // field::chromEnd + +//!\brief Deep version of bio::var_io::field_types. 
+//!\ingroup var_io +template <> +inline constinit auto field_types = +ttag; // field::chromEnd + +} diff --git a/include/bio/ann_io/reader.hpp b/include/bio/ann_io/reader.hpp new file mode 100644 index 0000000..a35daec --- /dev/null +++ b/include/bio/ann_io/reader.hpp @@ -0,0 +1,92 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides bio::ann_io::reader. + * \author Joshua Kim + */ + + #pragma once + + #include + + #include + #include + #include + #include + +namespace bio::ann_io +{ +template +class reader : public reader_base, reader_options> +{ +private: + //!\brief The base class. + using base_t = reader_base, reader_options>; + //!\brief Inherit the format_type definition. + using format_type = typename base_t::format_type; + /* Implementation note + * format_type is "inherited" as private here to avoid appearing twice in the documentation. + * Its actual visibility is public because it is public in the base class. + */ + //!\brief Make the format handler visible. + using base_t::format_handler; + + //!\brief A pointer to the header inside the format. 
+ bio::ann_io::header const * header_ptr = nullptr; +public: + // clang-format off + //!\copydoc bio::reader_base::reader_base(std::filesystem::path const & filename, format_type const & fmt, options_t const & opt = options_t{}) + // clang-format on + reader(std::filesystem::path const & filename, + format_type const & fmt, + reader_options const & opt = reader_options{}) : + base_t{filename, fmt, opt} + {} + + //!\overload + explicit reader(std::filesystem::path const & filename, + reader_options const & opt = reader_options{}) : + base_t{filename, opt} + {} + + // clang-format off + //!\copydoc bio::reader_base::reader_base(std::istream & str, format_type const & fmt, options_t const & opt = options_t{}) + // clang-format on + reader(std::istream & str, + format_type const & fmt, + reader_options const & opt = reader_options{}) : + base_t{str, fmt, opt} + {} + + //!\overload + template + //!\cond REQ + requires(!std::is_lvalue_reference_v) + //!\endcond + reader(temporary_stream_t && str, + format_type const & fmt, + reader_options const & opt = reader_options{}) : + base_t{std::move(str), fmt, opt} + {} + + //!\brief Access the header. 
+ bio::ann_io::header const & header() + { + if (header_ptr == nullptr) + { + // ensure that the format_handler is created + this->begin(); + + header_ptr = std::visit([](auto const & handler) { return &handler.get_header(); }, format_handler); + } + + return *header_ptr; + } +}; +} diff --git a/include/bio/ann_io/reader_options.hpp b/include/bio/ann_io/reader_options.hpp new file mode 100644 index 0000000..a2a7676 --- /dev/null +++ b/include/bio/ann_io/reader_options.hpp @@ -0,0 +1,209 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides bio::ann_io::reader_options. + * \author Joshua Kim + */ + +#pragma once + +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include + +namespace bio::ann_io +{ + +//----------------------------------------------------------------------------- +// reader_options +//----------------------------------------------------------------------------- + +/*!\brief Options that can be used to configure the behaviour of bio::ann_io::reader. + * \tparam field_ids_t Type of the field_ids member (usually deduced). + * \tparam field_types_t Type of the field_types member (usually deduced). + * \tparam formats_t Type of the formats member (usually deduced). + * \ingroup ann_io + * + * \details + * + * This object can be configured in a similar way as bio::seq_io::reader_options. 
+ * If you are new to the way options are set in this library, have a look bio::seq_io::reader + * and bio::seq_io::reader_options first, as those are much simpler. + * + * ## Field types + * + * The internal representation of VCF and BCF are different. To be able to freely + * interchange between these formats, this library needs to choose one representation that + * everything is converted to when being read. + * + * Changing the field_types member configures the reader to return data in different types. + * One thing that is fixed for all configurations in this library is the layout of the GENOTYPES field + * which is always grouped "by-field" (BCF-style) and not "by-sample" (VCF-style). + * Another important choice is that **numbers are always 1-based,** because this is the default in VCF + * and all other tools that deal with VCF/BCF. + * + * Beyond that, a wide variety of types are supported per field (see below), but most users will be happy + * with one of the predefined sets. + * + * ### Pre-defined tags + * + * Two "styles" of field types are predefined: + * + * 1. bio::ann_io::field_types (the default) + * * All "strings" are represented as strings. + * 2. bio::ann_io::field_types_bcf_style (BCF-style) + * * Most "strings" are represented by their in-header IDX value (see the BCF spec for more details). + * * When reading and writing, you need to make sure that the IDX values in the output header are the same as in the + * input header, otherwise your record fields might change meaning or even become invalid. + * + * Both styles are "shallow" by default, but can be configured to be "deep": + * + * 1. shallow (bio::ownership::shallow) + * * The record contains light-weight data structures like views. + * * Record cannot be "stored"; it depends on internal caches and buffers, and it becomes invalid + * as soon as the next record is read from the file. + * 2. 
deep (bio::ownership::deep) + * * The record is self-contained; sequences and strings are stored in containers. + * * Record can be copied or stored and can "live on" independently of the reader. + * + * This example shows how to use deep records: + * + * \snippet test/snippet/ann_io/ann_io_reader_options.cpp field_types_deep + * + * Performance implications: + * * Shallow records imply fewer allocations and lower overhead during reading. + * * If you know that you need to copy your fields anyway, using a deep record can be faster than using a shallow + * record and copying the data "manually" out of that (because certain internal caches are re-used to create deep + * records). + * * field_types_bcf_style is faster than field_types, but for the shallow variants + * there is almost no difference. + * + * TODO some of this should be moved to a general documentation page on configuring records; shallow vs deep; etc + * + * ### Manual configuration + * + * This section is only relevant if you specify the #field_types member manually via + * a bio::ttag, i.e. if you change the field_types but do not use one of the predefined tags + * (see above). + * + * The following types are valid for the respective fields and you can mix-and-match shallow/deep and integral/text IDs: + * + * 1. bio::field::chrom + * * string or string_view: The chromosome string is returned. + * * `int32_t`: The IDX value for the chromosome is returned. + * 2. bio::field::pos + * * any integral: The position is returned as a number (`int32_t` recommended). + * 3. bio::field::id + * * string or string_view: The ID as a string. + * 4. bio::field::ref + * * string or string_view: plaintext. + * * back-insertable range over seqan3::alphabet (a container with converted elements). + * * `decltype(std::string_view{} | seqan3::views::char_strictly_to)`: A view + * over a SeqAn3 alphabet. Other alphabets and/or transform views are also possible. + * 5. 
bio::field::alt + * * back-insertable range of string or string_view: The ALTs as plaintext. + * * back-insertable range over views: similar views as for field::ref are supported but only + * use this if you are sure there are no breakpoint strings etc. in the file! + * 6. bio::field::qual + * * any arithmetic type: The quality as a number. + * 7. bio::field::filter + * * back-insertable range of string or string_view: The filters as strings. + * * back-insertable range of `int32_t`: The IDX values of the filters. + * 8. bio::field::info + * * back-insertable range of elements "similar" to bio::ann_io::info_element: + * * The elements must be decomposable into two subelements (`struct` or tuple). + * * The first subelement must be either a string[_view] (ID) or `int32_t` (IDX). + * * The second subelement must be bio::ann_io::info_element_value_type. + * 9. field::genotypes + * * back-insertable range of elements "similar" to bio::ann_io::genotype_element: + * * The elements must be decomposable into exactly two sub-elements (either `struct` or tuple). + * * The first subelement must be either a string[_view] (ID) or `int32_t` (IDX). + * * The second subelement must bio::ann_io::genotype_element_value_type. + * + * This example shows how to read only a subset of the available fields and manually specify their type: + * + * \snippet test/snippet/ann_io/ann_io_reader_options.cpp field_types_expert + * + * Reading fewer fields than available may provide a noticeable speed-up since only the + * requested fields are actually parsed. Any field may also be set to `std::span` which + * results in no parsing happening for that field. + * + */ +template ), + typename formats_t = seqan3::type_list> +struct reader_options +{ + //!\brief The fields that shall be contained in each record; a seqan3::tag over seqan3::field. + field_ids_t field_ids = default_field_ids; + + /*!\brief The types corresponding to each field; a bio::ttag over the types. 
+ * + * \details + * + * See bio::ann_io::reader_options for an overview of the supported field/type combinations. + */ + field_types_t field_types = bio::ann_io::field_types; + + /*!\brief The formats that input files can take; a bio::ttag over the types. + * + * \details + * + * See bio::ann_io::reader for an overview of the the supported formats. + */ + formats_t formats = ttag; + + //!\brief Whether to print non-critical file format warnings. + bool print_warnings = true; + + //!\brief Options that are passed on to the internal stream oject. + transparent_istream_options stream_options{}; + +private: + static_assert(detail::is_fields_tag, "field_ids must be a bio::vtag over bio::field."); + + static_assert(detail::is_type_list, "field_types must be a bio::ttag / seqan3::type_list."); + + static_assert(detail::is_type_list, "formats must be a bio::ttag / seqan3::type_list."); + + static_assert(field_ids_t::size == field_types_t::size(), "field_ids and field_types must have the same size."); + + //!\brief Type of the record. + using record_t = record; + + static_assert( + detail::lazy_concept_checker([](auto) requires( + !field_ids_t::contains(field::chrom) || + detail::back_insertable_with, char> || + detail::one_of, std::string_view>) { return std::true_type{}; }), + "Requirements for the field-type of the CHROM-field not met. See documentation for bio::ann_io::reader_options."); + + static_assert( + detail::lazy_concept_checker([](auto) requires( + !field_ids_t::contains(field::chromStart) || + std::integral>>) { return std::true_type{}; }), + "Requirements for the field-type of the chromStart-field not met. See documentation for bio::ann_io::reader_options."); + + static_assert( + detail::lazy_concept_checker([](auto) requires( + !field_ids_t::contains(field::chromEnd) || + std::integral>>) { return std::true_type{}; }), + "Requirements for the field-type of the chromEnd-field not met. 
See documentation for bio::ann_io::reader_options."); +}; + +} // namespace bio::ann_io diff --git a/include/bio/ann_io/writer.hpp b/include/bio/ann_io/writer.hpp new file mode 100644 index 0000000..fa75b80 --- /dev/null +++ b/include/bio/ann_io/writer.hpp @@ -0,0 +1,232 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides bio::var_io::writer. + * \author Joshua Kim + */ + +#pragma once + +#include +#include + +#include +#include +#include +#include + +namespace bio::ann_io +{ + +// ---------------------------------------------------------------------------- +// writer +// ---------------------------------------------------------------------------- + +/*!\brief A class for writing annotation files, e.g. BED, BigBED, WIG, BigWIG. + * \tparam option_args_t Arguments that are forwarded to bio::ann_io::writer_options. + * \ingroup ann_io + * + * \details + * + * ### Introduction + * + * Annotation files are files that contain sequence variation information. Well-known formats include + * BED and BigBED. + * + * The Annotation I/O writer supports writing the following fields: + * + * 1. bio::field::chrom + * 2. bio::field::chromStart + * 3. bio::field::chromEnd + * + * These fields correspond to the order and names defined in the BED specification. The value conventions + * also correspond to the BED specification (i.e. 1-based positions). 
+ * See below for the list of supported types and the semantic implications. + * + * This writer supports the following formats: + * + * 1. BED (see also bio::bed) + * + * If you only need to write BED and not BigBED and you have all your column data as strings, + * you can use bio::plain_io::writer instead of this writer (it will be easier to use and faster). + * + * ### Creating a writer + * + * This creates a writer with a valid header: + * + * \snippet test/snippet/ann_io/ann_io_writer.cpp creation + * + * **If you copy'n'paste from this example, make sure that columns in the last line are tab-separated and not + * space-separated!** + * + * This example is used as "prefix" for some of the following examples. + * + * ### Writing a record + * + * Create a record using the bio::ann_io::default_record and set its members: + * + * \snippet test/snippet/ann_io/ann_io_writer.cpp simple_usage_file + * + * ### Writing a record without having a record + * + * You can use #emplace_back() to write the fields directly without creating a record first: + * + * \snippet test/snippet/ann_io/ann_io_writer.cpp emplace_back + * + * This is especially helpful if your fields exist in other separate data structures already. + * Be aware that it is easier to mess up the order of the arguments this way. + * The order/composition can be specified by bio::vtag as first argument (see next example). + * If it is omitted, it is equal to bio::ann_io::default_field_ids. + * + * The #emplace_back() function can be used to write fewer fields: + * + * \snippet test/snippet/ann_io/ann_io_writer.cpp emplace_back2 + * + * These three fields are required; the missing fields are replaced with ".". + * + * ### Specifying options + * + * This snippet demonstrates how to create BED files with Windows line-endings: + * + * \snippet test/snippet/ann_io/ann_io_writer.cpp options + * + * For more advanced options, see bio::ann_io::writer_options. 
+ * + * ### Combining reading and writing + * + * This simple snippet demonstrates how to pipe from a reader into a writer (transparently converting): + * + * \snippet test/snippet/ann_io/ann_io_writer.cpp inout + * + * Views can be used to modify or filter the records before being written: + * + * \snippet test/snippet/ann_io/ann_io_writer.cpp inout2 + * + * A more "traditional" programming style with the same semantic would look like this: + * + * \snippet test/snippet/ann_io/ann_io_writer.cpp inout3 + * + * ### Field type requirements + * + * TODO + * + */ +template +class writer : public writer_base> +{ +private: + //!\brief The base class. + using base_t = writer_base>; + //!\brief Inherit the format_type definition. + using format_type = typename base_t::format_type; + /* Implementation note + * format_type is "inherited" as private here to avoid appearing twice in the documentation. + * Its actual visibility is public because it is public in the base class. + */ + + //!\brief Make the format handler visible. + using base_t::format_handler; + //!\brief Make the init_state handler visible. 
+ using base_t::init_state; + +public: + // TODO wrap this, so we don't return reference to base + using base_t::operator=; + + // clang-format off + //!\copydoc bio::writer_base::writer_base(std::filesystem::path const & filename, format_type const & fmt, options_t const & opt = options_t{}) + // clang-format on + writer(std::filesystem::path const & filename, + format_type const & fmt, + writer_options const & opt = writer_options{}) : + base_t{filename, fmt, opt} + {} + + //!\overload + explicit writer(std::filesystem::path const & filename, + writer_options const & opt = writer_options{}) : + base_t{filename, opt} + {} + + // clang-format off + //!\copydoc bio::writer_base::writer_base(std::ostream & str, format_type const & fmt, options_t const & opt = options_t{}) + // clang-format on + writer(std::ostream & str, + format_type const & fmt, + writer_options const & opt = writer_options{}) : + base_t{str, fmt, opt} + {} + + //!\overload + template + //!\cond REQ + requires(!std::is_lvalue_reference_v) + //!\endcond + writer(temporary_stream_t && str, + format_type const & fmt, + writer_options const & opt = writer_options{}) : + base_t{std::move(str), fmt, opt} + {} + + //!\brief Destructor which can potentially throw. + ~writer() noexcept(false) = default; + + // prevent the overload below from removing the overload from base_t + using base_t::emplace_back; + + /*!\brief Write a record to the file by passing individual fields. + * \param[in] args The fields to be written. + * + * \details + * + * This function is the same as bio::writer_base::emplace_back, except that the field_ids can be + * omitted. If the number of arguments 10, bio::var_io::default_field_ids is chosen; if it is + */ + void emplace_back(auto &&... 
args) + { + static_assert(sizeof...(args) == default_field_ids.size || sizeof...(args) == default_field_ids.size - 1, + "emplace_back() has to be called with 3 arguments."); + + if constexpr (sizeof...(args) == default_field_ids.size - 1) + { + // TODO replace this with some metaprogramming? + base_t::emplace_back(vtag, + args...); + } + else + { + base_t::emplace_back(default_field_ids, args...); + } + } + + //!\brief Get the header used by the format. + bio::ann_io::header const & header() + { + return std::visit( + detail::overloaded{[](std::monostate) {}, [](auto const & handler) { return handler.get_header(); }}, + format_handler); + } + + //!\brief Set the header to the given value. + template + requires std::same_as> + void set_header(header_t && hdr) + { + if (!init_state) + throw bio_error{"You cannot change the header after I/O has happened."}; + + std::visit(detail::overloaded{[](std::monostate) {}, + [&hdr](auto & handler) { handler.set_header(std::forward(hdr)); }}, + format_handler); + } +}; + +} // namespace bio::ann_io diff --git a/include/bio/ann_io/writer_options.hpp b/include/bio/ann_io/writer_options.hpp new file mode 100644 index 0000000..1b08631 --- /dev/null +++ b/include/bio/ann_io/writer_options.hpp @@ -0,0 +1,131 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides bio::ann_io::writer_options. 
+ * \author Joshua Kim + */ + +#pragma once + +#include +#include +#include + +// namespace bio::detail +// { +// +// template +// concept ann_io_legal_type_aux = +// std::same_as || std::signed_integral || std::floating_point || std::same_as < std::decay_t, +// char const * > ; +// +// /*!\interface bio::detail::ann_io_legal_type <> +// * \tparam t The type to check. +// * \brief A type that is similar to one of the alternatives of bio::ann_io::info_element_value_type +// */ +// //!\cond CONCEPT_DEF +// template +// concept ann_io_legal_type = ann_io_legal_type_aux> || std::same_as || +// (std::ranges::forward_range && (ann_io_legal_type_aux>> || +// (std::ranges::forward_range> && +// std::same_as const &>))); +// //!\endcond +// +// /*!\interface bio::detail::ann_io_legal_vector_type <> +// * \tparam t The type to check. +// * \brief A type that is similar to one of the alternatives of bio::ann_io::info_element_value_type +// */ +// //!\cond CONCEPT_DEF +// template +// concept ann_io_legal_vector_type = +// std::ranges::forward_range && ann_io_legal_type> && +// !std::same_as>; +// //!\endcond +// +// /*!\interface bio::detail::ann_io_legal_or_dynamic <> +// * \tparam t The type to check. +// * \brief A type that is similar to one of the alternatives of bio::ann_io::info_element_value_type +// */ +// //!\cond CONCEPT_DEF +// template +// concept ann_io_legal_or_dynamic = ann_io_legal_type || is_info_element_value_type; +// //!\endcond +// +// /*!\interface bio::detail::ann_io_vector_legal_or_dynamic <> +// * \tparam t The type to check. +// * \brief A type that is similar to one of the alternatives of bio::ann_io::info_element_value_type +// */ +// //!\cond CONCEPT_DEF +// template +// concept ann_io_vector_legal_or_dynamic = ann_io_legal_vector_type || is_genotype_element_value_type; +// //!\endcond +// +// /*!\interface bio::detail::info_element_writer_concept <> +// * \tparam t The type to check. 
+// * \brief Types "similar" to bio::ann_io::info_element / bio::ann_io::info_element_bcf. +// */ +// //!\cond CONCEPT_DEF +// template +// concept info_element_writer_concept = detail::decomposable_into_two && +// (detail::char_range_or_cstring> || +// std::same_as>)&&detail::ann_io_legal_or_dynamic>; +// //!\endcond +// +// /*!\interface bio::detail::genotype_writer_concept <> +// * \tparam t The type to check. +// * \brief Types "similar" to bio::ann_io::genotype_element / bio::ann_io::genotype_element_bcf. +// */ +// //!\cond CONCEPT_DEF +// template +// concept genotype_writer_concept = detail::decomposable_into_two && +// (detail::char_range_or_cstring> || +// std::same_as>)&&detail::ann_io_vector_legal_or_dynamic>; +// //!\endcond +// +// } // namespace bio::detail + +namespace bio::ann_io +{ + +/*!\brief Options that can be used to configure the behaviour of bio::ann_io::writer. + * \tparam formats_t Type of the formats member (usually deduced). + * \ingroup ann_io + * + * \details + * + * TODO describe how to easily initialise this + */ +template > +struct writer_options +{ + /*!\brief The formats that output files can take; a bio::ttag over the types. + * + * \details + * + * See bio::ann_io::writer for an overview of the the supported formats. + */ + formats_t formats = ttag; + + //!\brief Options that are passed on to the internal stream oject. + transparent_ostream_options stream_options{}; + + /*!\brief Write legacy Windows line-endings including carriage return. + * + * \details + * + * This option results in old Windows-style line-endings ("\r\n"). Since Windows supports the typical UNIX + * line-endigns ("\n") nowadays, this option is is highly discouraged. 
+ */ + bool windows_eol = false; + +private: + static_assert(detail::is_type_list, "formats must be a bio::ttag / seqan3::type_list."); +}; + +} // namespace bio::ann_io diff --git a/include/bio/io/detail/tuple_record.hpp b/include/bio/io/detail/tuple_record.hpp index a63ff15..78e349d 100644 --- a/include/bio/io/detail/tuple_record.hpp +++ b/include/bio/io/detail/tuple_record.hpp @@ -75,6 +75,18 @@ enum class field : uint64_t filter, //!< FILTER field in Var I/O. info, //!< INFO field in Var I/O. genotypes, //!< GENOTYPES in Var I/O. + + // Fields unique to annotation io + chromStart = pos, + chromEnd, + score, + strand, + thickStart, + thickEnd, //!< Ending position + itemRgb, //!< An RGB value to determine the color of the displayed track in the browser. + blockCount, //!< The number of blocks (exons) in the BED file. + blockSizes, //!< A list of the block sizes, corresponding to blockCount. + blockStarts, //!< A list of block starts, relative to offset. /*_private*/ // User defined field aliases @@ -249,6 +261,16 @@ struct tuple_record : bio::meta::transfer_template_args_onto_t + */ + +#pragma once + +#include +#include + +#include + +namespace bio +{ + +/*!\brief The Browser Extensible Data (BED) format. + * \ingroup format + * + * \details + * + * This is the BED format tag. If you want to read BED files, use bio::ann_io::reader, and if you want + * to write BED files, use bio::ann_io::writer. + * + * ### Fields + * + * The format consists of the following fields: + * + * 1. bio::field::chrom + * 2. bio::field::chromStart + * 3. bio::field::chromEnd + * + * See bio::ann_io::reader and bio::ann_io::writer for more details. + * + * ### Implementation + * + * The implementation target [version 4.3 of the VCF specification](https://samtools.github.io/hts-specs/VCFv4.3.pdf). + * However, reading version 4.2 should be possible, too. 
+ * Little testing has been done on handling structural variants and breakend strings, but in theory the values + * should be parsed correctly (as strings). + * + * No testing has been done on gVCF files, but in theory all values should be parsed correctly. + * + * Please report any issues you find. + */ +struct bed +{ + //!\brief The valid file extensions for this format; note that you can modify this value. + static inline std::vector file_extensions{{"bed"}}; +}; + +} // namespace bio diff --git a/include/bio/io/format/bed_input_handler.hpp b/include/bio/io/format/bed_input_handler.hpp new file mode 100644 index 0000000..70fe1a6 --- /dev/null +++ b/include/bio/io/format/bed_input_handler.hpp @@ -0,0 +1,168 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides bio::ann_io::reader. + * \author Joshua Kim + */ + +#pragma once + +#include + +namespace bio +{ +template <> +class format_input_handler : public format_input_handler_base> +{ +private: + /*!\name CRTP related entities + * \{ + */ + //!\brief The type of the CRTP base class. + using base_t = format_input_handler_base>; + using base_t::parse_field; + using base_t::parse_field_aux; + using base_t::stream; + + //!\brief Befriend the base class to enable CRTP. + friend base_t; + //!\} + + //!\brief Print an error message with current line number in diagnostic. + [[noreturn]] void error(auto const &... 
messages) const + { + std::string message = "[SeqAn3 BED format error in line " + detail::to_string(line) + "] "; + ((message += detail::to_string(messages)), ...); + + throw format_error{message}; + } + + /*!\name Options + * \{ + */ + //!\brief Whether to print warnings or not. + bool print_warnings = true; + //!\} + + /*!\name Raw record handling + * \{ + */ + //!\brief The fields that this format supports [the base class accesses this type]. + using format_fields = decltype(ann_io::default_field_ids); + //!\brief Type of the raw record. + using raw_record_type = + record>; + + //!\brief Type of the low-level iterator. + using lowlevel_iterator = detail::plaintext_input_iterator; + + //!\brief The raw record. + raw_record_type raw_record; + //!\brief The header. + ann_io::header header; + //!\brief Lowlevel stream iterator. + lowlevel_iterator file_it; + //!\brief Cache of the chromosome string. + std::string last_chrom; + //!\brief Current line number in file. + size_t line = 0; + + //!\brief Read the raw record [the base class invokes this function]. + void read_raw_record() + { + ++line; + ++file_it; + + if (size_t field_num = file_it->fields.size(); field_num < 3) + error("Expected at least 3 fields but got ", field_num); + + get(raw_record) = (*file_it).fields[0]; + get(raw_record) = (*file_it).fields[1]; + get(raw_record) = (*file_it).fields[2]; + } + //!\} + + /*!\name Parsed record handling + * \brief This is mostly done via the defaults in the base class. 
+ * \{ + */ + + // implementation after class + // template + // requires detail::is_info_element_value_type || detail::is_genotype_element_value_type + // static void init_element_value_type(ann_io::value_type_id const id, t & output); + + // implementation after class + // struct parse_element_value_type_fn; + + // implementation after class + // static size_t parse_element_value_type(var_io::value_type_id const id, + // std::string_view const input_string, + // detail::is_info_element_value_type auto & output); + + //!\brief Parse the CHROM field. Note there is no index, as BED files don't store them. + // void parse_field(vtag_t const & /**/, auto & parsed_field) + // { + // using parsed_field_t = std::remove_cvref_t; + // std::string_view raw_field = get(raw_record); + // + // if (raw_field != last_chrom) last_chrom = raw_field; + // parsed_field = static_cast(raw_field); + // } + + /* chromStart and chromEnd are handled correctly by default */ + //!\} +public: + /*!\name Constructors, destructor and assignment. + * \{ + */ + format_input_handler() = default; //!< Defaulted. + format_input_handler(format_input_handler const &) = delete; //!< Deleted. + format_input_handler(format_input_handler &&) = default; //!< Defaulted. + ~format_input_handler() = default; //!< Defaulted. + format_input_handler & operator=(format_input_handler const &) = delete; //!< Deleted. + format_input_handler & operator=(format_input_handler &&) = default; //!< Defaulted. + + /*!\brief Construct with an options object. + * \param[in,out] str The input stream. + * \param[in] options An object with options for the input handler. + * \details + * + * The options argument is typically bio::ann_io::reader_options, but any object with a subset of similarly named + * members is also accepted. See bio::format_input_handler for the supported options and defaults. 
+ */ + template + format_input_handler(std::istream & str, options_t const & options) : base_t{str}, file_it{str, false /*no_init!*/} + + { + // extract options + if constexpr (requires { (bool)options.print_warnings; }) + { + print_warnings = options.print_warnings; + } + + std::string header_string; + while (file_it != std::default_sentinel && (file_it.peak() == 't' || file_it.peak() == 'b')) + { + ++file_it; + ++line; + header_string += file_it->line; + header_string += "\n"; + } + header = ann_io::header{std::move(header_string)}; + } + + //!\brief Construct with only an input stream. + format_input_handler(std::istream & str) : format_input_handler{str, int{}} {} + //!\} + + //!\brief Return a reference to the header contained in the input handler. + ann_io::header const & get_header() const { return header; } +}; +} diff --git a/include/bio/io/format/bed_output_handler.hpp b/include/bio/io/format/bed_output_handler.hpp new file mode 100644 index 0000000..92a8ec8 --- /dev/null +++ b/include/bio/io/format/bed_output_handler.hpp @@ -0,0 +1,276 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides the bio::format_output_handler. + * \author Joshua Kim + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace bio +{ + +/*!\brief Format output handler for the BED format (bio::bed). 
+ * \ingroup format + * \details + * + * ### Attention + * + * Most users should not perform I/O through input/output handlers but should instead use the respective + * readers/writers. See the overview (TODO link) for more information. + * + * ### Options + * + * The following options are considered if the respective member variable is available in the object passed to + * the constructor: + * + * | Member | Type | Default | Description | + * |-----------------|---------|---------|-------------------------------------------------------------------| + * |`windows_eol` |`bool` | `false` | Whether old-Windows style carriage return characters are printed. | + * + * ### Performance + * + * TODO after genotype redesign + */ +template <> +class format_output_handler : public format_output_handler_base> +{ +private: + /*!\name CRTP related entities + * \{ + */ + //!\brief The base class. + using base_t = format_output_handler_base>; + //!\brief Befriend the base class so we can instantiate. + friend base_t; + + using base_t::it; + using base_t::stream; + using base_t::write_field; + using base_t::write_field_aux; + //!\} + + /*!\name State + * \{ + */ + //!\brief Can be used to infer if the object is in moved-from state. + detail::move_tracker move_tracker; + //!\brief Whether the header has been written or not. + bool header_has_been_written = false; + + //!\brief Pointer to header that can be owning or non-owning. + std::unique_ptr header = {nullptr, + [](ann_io::header const *) {}}; + //!\} + + /*!\name Options + * \{ + */ + //!\brief Write legacy Windows line-endings including carriage return. + bool windows_eol = false; + //!\} + + /*!\name Arbitrary helpers + * \{ + */ + //!\brief A range adaptor that gets the first element in a decomposable type. + static constexpr auto views_get_first = + std::views::transform([](auto & pair) -> decltype(auto) { return detail::get_first(pair); }); + + //!\brief A range adaptor that gets the second element in a decomposable type. 
+ static constexpr auto views_get_second = + std::views::transform([](auto & pair) -> decltype(auto) { return detail::get_second(pair); }); + + //!\brief Write the elements of the range or tuple, char-delimited. + void write_delimited(std::ranges::input_range auto && range, char const delim, auto && func) + { + if (std::ranges::empty(range)) + it = '.'; + else + { + auto b = std::ranges::begin(range); + auto e = std::ranges::end(range); + func(*b); + ++b; + for (; b != e; ++b) + { + it = delim; + func(*b); + } + } + } + + //!\overload + void write_delimited(std::ranges::input_range auto && range, char const delim) + { + if (std::ranges::empty(range)) + it = '.'; + else + { + auto b = std::ranges::begin(range); + auto e = std::ranges::end(range); + write_field_aux(*b); + ++b; + for (; b != e; ++b) + { + it = delim; + write_field_aux(*b); + } + } + } + + //!\overload + void write_delimited(auto && tup, char const delim, auto && func) + { + if constexpr (std::tuple_size_v> == 0) + it = '.'; + else + { + auto pack_for_each = [&](auto &&... args) + { + bool first_elem = true; + (((first_elem ? (first_elem = false, it) : it = delim), func(std::forward(args))), ...); + }; + std::apply(pack_for_each, std::forward(tup)); + } + } + //!\} + + //!\brief Write the header. + void write_header() + { + if (header != nullptr) + it->write_range(header->to_plaintext()); + + header_has_been_written = true; + } + + //!\brief Write the record (supports const and non-const lvalue ref). 
+ void write_record_impl(auto & record) + { + using field_ids = typename std::remove_cvref_t::field_ids; + + if (!header_has_been_written) + write_header(); + + static_assert(field_ids::contains(field::chrom), "The record must contain the chrom field."); + write_field(vtag, get(record)); + it = '\t'; + + static_assert(field_ids::contains(field::chromStart), "The record must contain the chromStart field."); + write_field(vtag, get(record)); + it = '\t'; + + static_assert(field_ids::contains(field::chromEnd), "The record must contain the chromEnd field."); + write_field(vtag, get(record)); + + it->write_end_of_line(windows_eol); + } + +public: + /*!\name Constructors, destructor and assignment. + * \brief Default construction and copying are deleted; move operations are defaulted. + * \{ + */ + format_output_handler() = delete; //!< Deleted. + format_output_handler(format_output_handler const &) = delete; //!< Deleted. + format_output_handler(format_output_handler &&) = default; //!< Defaulted. + format_output_handler & operator=(format_output_handler const &) = delete; //!< Deleted. + format_output_handler & operator=(format_output_handler &&) = default; //!< Defaulted. + + /*!\brief Construct with an options object. + * \param[in,out] str The output stream. + * \param[in] options An object with options for the output handler. + * \details + * + * The options argument is typically bio::ann_io::writer_options, but any object with a subset of similarly named + * members is also accepted. See bio::format_output_handler for the supported options and defaults. + */ + format_output_handler(std::ostream & str, auto const & options) : base_t{str} + { + // extract options + if constexpr (requires { (bool)options.windows_eol; }) + windows_eol = options.windows_eol; + } + + //!\brief Construct with only an output stream. + format_output_handler(std::ostream & str) : format_output_handler(str, 1) {} + + //!\brief The destructor writes the header if necessary and cleans up. 
+ ~format_output_handler() noexcept(false) + { + // never throw if the stack is unwinding + if (std::uncaught_exceptions() > 0) + return; + + // no cleanup is needed if we are in moved-from state + if (move_tracker.moved_from) + return; + + // if no records were written, the header also wasn't written, but needs to be: + if (!header_has_been_written) + write_header(); + } + //!\} + + //!\brief Get the header. + ann_io::header const & get_header() const + { + if (header == nullptr) + throw missing_header_error{"Attempting to read header, but no header was set."}; + + return *header; + } + + //!\brief Set the header. + void set_header(ann_io::header const & hdr) + { + header = {&hdr, [](ann_io::header const *) {}}; + } + //!\overload + void set_header(ann_io::header const && hdr) + { + header = {new ann_io::header(std::move(hdr)), [](ann_io::header const * ptr) { delete ptr; }}; + } + //!\overload + void set_header(ann_io::header & hdr) + { + set_header(std::as_const(hdr)); + } + //!\overload + void set_header(ann_io::header && hdr) + { + set_header(std::move(std::as_const(hdr))); + } + + //!\brief Write the record. 
+ template + void write_record(record const & record) + { + write_record_impl(record); + } + + //!\overload + template + void write_record(record & record) + { + write_record_impl(record); + } +}; + +} // namespace bio diff --git a/test/snippet/ann_io/ann_io_reader.cpp b/test/snippet/ann_io/ann_io_reader.cpp new file mode 100644 index 0000000..88b44a5 --- /dev/null +++ b/test/snippet/ann_io/ann_io_reader.cpp @@ -0,0 +1,39 @@ +#include + +#include +#include + +#include "../../unit/format/ann_data.hpp" + +int main() +{ +//================= PRE ========================== + { + std::ofstream os{"example.bed", std::ios::binary}; + os << full_example; + } + + std::ifstream in{"example.bed"}; + std::cin.rdbuf(in.rdbuf()); // rewire stdin + +//================= SNIPPETS ====================== + +{ +//![simple_usage_file] +bio::ann_io::reader reader{"example.bed"}; + +for (auto & rec : reader) +{ + seqan3::debug_stream << rec.chrom() << ':' + << rec.chromStart() << ':' + << rec.chromEnd() << '\n'; +} + +seqan3::debug_stream << reader.header().browser_values << '\n'; +seqan3::debug_stream << reader.header().track_values << '\n'; +//![simple_usage_file] +} + +//================= POST ========================== + std::filesystem::remove("example.bed"); +} diff --git a/test/snippet/ann_io/ann_io_reader.err b/test/snippet/ann_io/ann_io_reader.err new file mode 100644 index 0000000..d9ef1db --- /dev/null +++ b/test/snippet/ann_io/ann_io_reader.err @@ -0,0 +1,11 @@ +chr7:127471196:127472363 +chr7:127472363:127473530 +chr7:127473530:127474697 +chr7:127474697:127475864 +chr7:127475864:127477031 +chr7:127477031:127478198 +chr7:127478198:127479365 +chr7:127479365:127480532 +chr7:127480532:127481699 +[(position,chr7:127471196-127495720),(hide,all)] +[(name,ItemRGBDemo),(description,Item RGB demonstration),(visibility,2),(itemRgb,On)] diff --git a/test/unit/ann_io/CMakeLists.txt b/test/unit/ann_io/CMakeLists.txt new file mode 100644 index 0000000..5dbc4bc --- /dev/null +++ 
b/test/unit/ann_io/CMakeLists.txt @@ -0,0 +1,2 @@ +bio_test(ann_io_reader_test.cpp) +bio_test(ann_io_writer_test.cpp) diff --git a/test/unit/ann_io/ann_io_reader_test.cpp b/test/unit/ann_io/ann_io_reader_test.cpp new file mode 100644 index 0000000..385fbed --- /dev/null +++ b/test/unit/ann_io/ann_io_reader_test.cpp @@ -0,0 +1,244 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +#include + +#include +#include + +#include + +#include "../format/ann_data.hpp" + +TEST(ann_io_reader, concepts) +{ + using t = bio::ann_io::reader<>; + EXPECT_TRUE((std::ranges::input_range)); + + using ct = bio::ann_io::reader<> const; + // not const-iterable + EXPECT_FALSE((std::ranges::input_range)); +} + +void ann_io_reader_filename_constructor(bool ext_check, auto &&... 
args) +{ + /* just the filename */ + { + seqan3::test::tmp_filename filename{"ann_io_reader_constructor.bed"}; + std::ofstream filecreator{filename.get_path(), std::ios::out | std::ios::binary}; + + EXPECT_NO_THROW((bio::ann_io::reader{filename.get_path(), std::forward(args)...})); + } + + // correct format check is done by tests of that format + + /* non-existent file */ + { + EXPECT_THROW((bio::ann_io::reader{"/dev/nonexistant/foobarOOO", std::forward(args)...}), + bio::file_open_error); + } + + /* wrong extension */ + if (ext_check) + { + seqan3::test::tmp_filename filename{"ann_io_reader_constructor.xyz"}; + std::ofstream filecreator{filename.get_path(), std::ios::out | std::ios::binary}; + EXPECT_THROW((bio::ann_io::reader{filename.get_path(), std::forward(args)...}), + bio::unhandled_extension_error); + } +} + +TEST(ann_io_reader, constructor1_just_filename) +{ + ann_io_reader_filename_constructor(true); + EXPECT_TRUE((std::same_as>)); +} + +TEST(ann_io_reader, constructor1_with_opts) +{ + bio::ann_io::reader_options opt{.field_types = bio::ann_io::field_types<>}; + using control_t = bio::ann_io::reader), + seqan3::type_list>; + + ann_io_reader_filename_constructor(true, std::move(opt)); + EXPECT_TRUE((std::same_as)); +} + +TEST(ann_io_reader, constructor2_just_filename_direct_format) +{ + ann_io_reader_filename_constructor(false, bio::bed{}); + EXPECT_TRUE((std::same_as>)); +} + +TEST(ann_io_reader, constructor2_with_opts_direct_format) +{ + bio::ann_io::reader_options opt{.field_types = bio::ann_io::field_types<>}; + using control_t = bio::ann_io::reader), + seqan3::type_list>; + + ann_io_reader_filename_constructor(false, bio::bed{}, std::move(opt)); + EXPECT_TRUE((std::same_as)); +} + +TEST(ann_io_reader, constructor2_just_filename_format_variant) +{ + std::variant var{}; + + ann_io_reader_filename_constructor(false, var); + EXPECT_TRUE((std::same_as>)); +} + +TEST(ann_io_reader, constructor2_with_opts_format_variant) +{ + std::variant var{}; + 
bio::ann_io::reader_options opt{.field_types = bio::ann_io::field_types<>}; + using control_t = bio::ann_io::reader), + seqan3::type_list>; + + ann_io_reader_filename_constructor(false, var, std::move(opt)); + EXPECT_TRUE((std::same_as)); +} + +TEST(ann_io_reader, constructor3) +{ + std::istringstream str; + + EXPECT_NO_THROW((bio::ann_io::reader{str, bio::bed{}})); + EXPECT_TRUE((std::same_as>)); +} + +TEST(ann_io_reader, constructor3_with_opts) +{ + std::istringstream str; + bio::ann_io::reader_options opt{.field_types = bio::ann_io::field_types<>}; + using control_t = bio::ann_io::reader), + seqan3::type_list>; + + EXPECT_NO_THROW((bio::ann_io::reader{str, bio::bed{}, opt})); + EXPECT_TRUE((std::same_as)); +} + +TEST(ann_io_reader, constructor4) +{ + std::istringstream str; + + EXPECT_NO_THROW((bio::ann_io::reader{std::move(str), bio::bed{}})); + EXPECT_TRUE((std::same_as>)); +} + +TEST(ann_io_reader, constructor4_with_opts) +{ + std::istringstream str; + bio::ann_io::reader_options opt{.field_types = bio::ann_io::field_types<>}; + using control_t = bio::ann_io::reader), + seqan3::type_list>; + + EXPECT_NO_THROW((bio::ann_io::reader{std::move(str), bio::bed{}, opt})); + EXPECT_TRUE((std::same_as)); +} + +TEST(ann_io_reader, iteration) +{ + { + std::istringstream str{static_cast(minimal_example)}; + bio::ann_io::reader reader{str, bio::bed{}}; + + EXPECT_EQ(std::ranges::distance(reader), 9); + } + + { + std::istringstream str{static_cast(minimal_example)}; + bio::ann_io::reader reader{str, bio::bed{}}; + + size_t count = 0; + for (auto & rec : reader) + { + ++count; + EXPECT_EQ(rec.chrom(), "chr7"); + // only very basic check here, rest in format test + } + EXPECT_EQ(count, 9); + } +} + +TEST(ann_io_reader, empty_file) +{ + { + seqan3::test::tmp_filename filename{"ann_io_reader_constructor.bed"}; + std::ofstream filecreator{filename.get_path(), std::ios::out | std::ios::binary}; + + bio::ann_io::reader reader{filename.get_path()}; + + EXPECT_THROW(reader.begin(), 
bio::file_open_error); + } +} + +TEST(ann_io_reader, empty_stream) +{ + { + std::istringstream str{""}; + bio::ann_io::reader reader{str, bio::bed{}}; + + EXPECT_THROW(reader.begin(), bio::file_open_error); + } +} + +TEST(ann_io_reader, get_header) +{ + // get header before calling begin() + { + std::istringstream str{static_cast(minimal_example_with_header)}; + bio::ann_io::reader reader{str, bio::bed{}}; + + bio::ann_io::header const & hdr = reader.header(); + + EXPECT_EQ(hdr.to_plaintext(), minimal_example_header_regenerated); + } + + // get header after calling begin() + { + std::istringstream str{static_cast(minimal_example_with_header)}; + bio::ann_io::reader reader{str, bio::bed{}}; + + auto it = reader.begin(); + EXPECT_EQ(it->chrom(), "chr7"); + + bio::ann_io::header const & hdr = reader.header(); + + EXPECT_EQ(hdr.to_plaintext(), minimal_example_header_regenerated); + } +} + +TEST(ann_io_reader, custom_field_types) +{ + bio::ann_io::reader_options opt{.field_types = bio::ann_io::field_types}; + + std::istringstream str{static_cast(minimal_example)}; + bio::ann_io::reader reader{str, bio::bed{}, opt}; + + EXPECT_TRUE((std::same_as, + bio::record)>>)); +} + +TEST(ann_io_reader, custom_field_ids_structured_bindings) +{ + bio::ann_io::reader_options opt{.field_ids = bio::vtag, + .field_types = bio::ttag}; + + std::istringstream str{static_cast(minimal_example)}; + bio::ann_io::reader reader{str, bio::bed{}, opt}; + + for (auto & [chrom, start, end] : reader) + EXPECT_EQ(chrom, "chr7"); +} diff --git a/test/unit/ann_io/ann_io_writer_test.cpp b/test/unit/ann_io/ann_io_writer_test.cpp new file mode 100644 index 0000000..7a86d5a --- /dev/null +++ b/test/unit/ann_io/ann_io_writer_test.cpp @@ -0,0 +1,324 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// Copyright 
(c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +#include + +#include +#include +#include + +#include +#include + +#include "../format/ann_data.hpp" + +using custom_field_ids_t = bio::vtag_t; + +TEST(ann_io_writer, concepts) +{ + using rec_t = bio::record)>; + + using t = bio::ann_io::writer<>; + EXPECT_TRUE((std::ranges::output_range)); + + using ct = bio::ann_io::writer<> const; + // not const-iterable + EXPECT_FALSE((std::ranges::output_range)); +} + +void ann_io_writer_filename_constructor(bool ext_check, auto &&... args) +{ + using t = + decltype(bio::ann_io::writer{std::declval(), std::forward(args)...}); + [[maybe_unused]] t * ptr = nullptr; + + /* just the filename */ + { + seqan3::test::tmp_filename filename{"ann_io_writer_constructor.bed"}; + + // constructor + EXPECT_NO_THROW((ptr = new bio::ann_io::writer{filename.get_path(), std::forward(args)...})); + + // destructor + EXPECT_NO_THROW(delete ptr); + ptr = nullptr; + } + + /* wrong extension */ + if (ext_check) + { + seqan3::test::tmp_filename filename{"ann_io_writer_constructor.xyz"}; + EXPECT_THROW((ptr = new bio::ann_io::writer{filename.get_path(), std::forward(args)...}), + bio::unhandled_extension_error); + + // destructor, nothrow because already thrown during construction + EXPECT_NO_THROW(delete ptr); + } +} + +TEST(ann_io_writer, constructor1_just_filename) +{ + ann_io_writer_filename_constructor(true); + EXPECT_TRUE((std::same_as>)); +} + +TEST(ann_io_writer, constructor1_with_opts) +{ + bio::ann_io::writer_options opt{.formats = bio::ttag}; + using control_t = bio::ann_io::writer>; + + ann_io_writer_filename_constructor(true, std::move(opt)); + EXPECT_TRUE((std::same_as)); +} + +TEST(ann_io_writer, 
constructor2_just_filename_direct_format) +{ + ann_io_writer_filename_constructor(false, bio::bed{}); + EXPECT_TRUE((std::same_as>)); +} + +TEST(ann_io_writer, constructor2_with_opts_direct_format) +{ + bio::ann_io::writer_options opt{.formats = bio::ttag}; + using control_t = bio::ann_io::writer>; + + ann_io_writer_filename_constructor(false, bio::bed{}, std::move(opt)); + EXPECT_TRUE((std::same_as)); +} + +TEST(ann_io_writer, constructor2_just_filename_format_variant) +{ + std::variant var{}; + + ann_io_writer_filename_constructor(false, var); + EXPECT_TRUE((std::same_as>)); +} + +TEST(ann_io_writer, constructor2_with_opts_format_variant) +{ + std::variant var{}; + bio::ann_io::writer_options opt{.formats = bio::ttag}; + using control_t = bio::ann_io::writer>; + + ann_io_writer_filename_constructor(false, var, std::move(opt)); + EXPECT_TRUE((std::same_as)); +} + +void ann_io_writer_stream_constructor(auto &&... args) +{ + using t = decltype(bio::ann_io::writer{std::forward(args)...}); + [[maybe_unused]] t * ptr = nullptr; + + { + std::ostringstream str; + + // constructor + EXPECT_NO_THROW((ptr = new bio::ann_io::writer{std::forward(args)...})); + + // destructor + EXPECT_NO_THROW(delete ptr); + ptr = nullptr; + } +} + +TEST(ann_io_writer, constructor3) +{ + std::ostringstream str; + ann_io_writer_stream_constructor(str, bio::bed{}); + + EXPECT_TRUE((std::same_as>)); +} + +TEST(ann_io_writer, constructor3_with_opts) +{ + std::ostringstream str; + bio::ann_io::writer_options opt{.formats = bio::ttag}; + ann_io_writer_stream_constructor(str, bio::bed{}, opt); + + using control_t = bio::ann_io::writer>; + EXPECT_TRUE((std::same_as)); +} + +TEST(ann_io_writer, constructor4) +{ + std::ostringstream str; + ann_io_writer_stream_constructor(std::move(str), bio::bed{}); + + EXPECT_TRUE((std::same_as>)); +} + +TEST(ann_io_writer, constructor4_with_opts) +{ + std::ostringstream str; + bio::ann_io::writer_options opt{.formats = bio::ttag}; + 
ann_io_writer_stream_constructor(std::move(str), bio::bed{}, opt); + + using control_t = bio::ann_io::writer>; + EXPECT_TRUE((std::same_as)); +} + +template +void write_record_test_impl() +{ + bio::ann_io::header hdr{minimal_example_header_regenerated}; + + std::ostringstream stream{}; + bio::ann_io::writer writer{stream, bio::bed{}}; + + auto records = example_records_default_style(); + + writer.set_header(hdr); + + if constexpr (i == 0) + { + writer.push_back(records[0]); + writer.push_back(records[1]); + writer.push_back(records[2]); + writer.push_back(records[3]); + writer.push_back(records[4]); + writer.push_back(records[5]); + writer.push_back(records[6]); + writer.push_back(records[7]); + writer.push_back(records[8]); + } + else if constexpr (i == 1) + { + auto it = writer.begin(); + it = records[0]; + it = records[1]; + it = records[2]; + it = records[3]; + it = records[4]; + it = records[5]; + it = records[6]; + it = records[7]; + it = records[8]; + } + else if constexpr (i == 2) + { + auto it = writer.begin(); + *it = records[0]; + *it = records[1]; + *it = records[2]; + *it = records[3]; + *it = records[4]; + *it = records[5]; + *it = records[6]; + *it = records[7]; + *it = records[8]; + } + else if constexpr (i == 4) + { + auto fn = [&writer](auto &... 
args) { writer.emplace_back(args...); }; + std::apply(fn, records[0]); + std::apply(fn, records[1]); + std::apply(fn, records[2]); + std::apply(fn, records[3]); + std::apply(fn, records[4]); + std::apply(fn, records[5]); + std::apply(fn, records[6]); + std::apply(fn, records[7]); + std::apply(fn, records[8]); + } + + EXPECT_EQ(stream.str(), minimal_example_header_regenerated + minimal_example); +} + +TEST(ann_io_writer, push_back_record) +{ + write_record_test_impl<0>(); +} + +TEST(ann_io_writer, assign_to_iterator) +{ + write_record_test_impl<1>(); +} + +TEST(ann_io_writer, assign_to_deref_iterator) +{ + write_record_test_impl<2>(); +} + +TEST(ann_io_writer, emplace_back) +{ + write_record_test_impl<4>(); +} + +// TEST(ann_io_writer, minimal_fields) +// { +// std::ostringstream stream{}; +// bio::ann_io::writer writer{stream, bio::bed{}}; +// +// writer.set_header(bio::ann_io::header{minimal_example_header_regenerated}); +// +// writer.emplace_back(custom_field_ids_t{}, "20", 14370, "G"); +// writer.emplace_back(custom_field_ids_t{}, "20", 17330, "T"); +// writer.emplace_back(custom_field_ids_t{}, "20", 1110696, "A"); +// writer.emplace_back(custom_field_ids_t{}, "20", 1230237, "T"); +// writer.emplace_back(custom_field_ids_t{}, "20", 1234567, "GTC"); +// +// std::string compare = example_from_spec_header_regenerated_no_IDX; +// compare += minimal_field_rows; +// EXPECT_EQ(stream.str(), compare); +// } + +TEST(ann_io_writer, no_header1) // record contains header_ptr but this is == nullptr +{ + std::ostringstream stream{}; + auto * writer = new bio::ann_io::writer{stream, bio::bed{}}; + + auto records = example_records_default_style(); + + EXPECT_NO_THROW(writer->push_back(records[0])); + + // destructor + EXPECT_NO_THROW(delete writer); + writer = nullptr; +} + +TEST(var_io_writer, no_header2) // record does not contain header_ptr +{ + std::ostringstream stream{}; + auto * writer = new bio::ann_io::writer{stream, bio::bed{}}; + + 
EXPECT_NO_THROW(writer->emplace_back(custom_field_ids_t{}, 20, 1, 5)); + + // destructor + EXPECT_NO_THROW(delete writer); + writer = nullptr; +} + +// TEST(var_io_writer, compression) +// { +// std::ostringstream stream{}; +// +// { +// bio::var_io::writer writer{stream, +// bio::vcf{}, +// bio::var_io::writer_options{.stream_options = bio::transparent_ostream_options{ +// .compression = bio::compression_format::bgzf}}}; +// +// writer.set_header(bio::var_io::header{example_from_spec_header}); +// +// auto records = example_records_bcf_style(); +// +// writer.push_back(records[0]); +// writer.push_back(records[1]); +// writer.push_back(records[2]); +// writer.push_back(records[3]); +// writer.push_back(records[4]); +// } +// +// std::string str = stream.str(); +// EXPECT_TRUE(str.starts_with("\x1f\x8b\x08")); // Gzip header +// +// std::istringstream control_stream{str}; +// bio::transparent_istream decompressor{control_stream}; +// std::string buffer(std::istreambuf_iterator{decompressor}, std::istreambuf_iterator{}); +// EXPECT_RANGE_EQ(buffer, example_from_spec_header_regenerated_no_IDX + example_from_spec_records); +// } diff --git a/test/unit/format/ann_data.hpp b/test/unit/format/ann_data.hpp new file mode 100644 index 0000000..5ac5b3e --- /dev/null +++ b/test/unit/format/ann_data.hpp @@ -0,0 +1,80 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +#include +#include + +inline std::string const full_example = + 
R"(browser position chr7:127471196-127495720 +browser hide all +track name="ItemRGBDemo" description="Item RGB demonstration" visibility=2 itemRgb="On" +chr7 127471196 127472363 Pos1 0 + 127471196 127472363 255,0,0 +chr7 127472363 127473530 Pos2 0 + 127472363 127473530 255,0,0 +chr7 127473530 127474697 Pos3 0 + 127473530 127474697 255,0,0 +chr7 127474697 127475864 Pos4 0 + 127474697 127475864 255,0,0 +chr7 127475864 127477031 Neg1 0 - 127475864 127477031 0,0,255 +chr7 127477031 127478198 Neg2 0 - 127477031 127478198 0,0,255 +chr7 127478198 127479365 Neg3 0 - 127478198 127479365 0,0,255 +chr7 127479365 127480532 Pos5 0 + 127479365 127480532 255,0,0 +chr7 127480532 127481699 Neg4 0 - 127480532 127481699 0,0,255 +)"; + +inline std::string const minimal_example = + R"(chr7 127471196 127472363 +chr7 127472363 127473530 +chr7 127473530 127474697 +chr7 127474697 127475864 +chr7 127475864 127477031 +chr7 127477031 127478198 +chr7 127478198 127479365 +chr7 127479365 127480532 +chr7 127480532 127481699 +)"; + +inline std::string const minimal_example_with_header = + R"(browser position chr7:127471196-127495720 +browser hide all +track name="ItemRGBDemo" description="Item RGB demonstration" visibility=2 itemRgb="On" +chr7 127471196 127472363 +chr7 127472363 127473530 +chr7 127473530 127474697 +chr7 127474697 127475864 +chr7 127475864 127477031 +chr7 127477031 127478198 +chr7 127478198 127479365 +chr7 127479365 127480532 +chr7 127480532 127481699 +)"; + +inline std::string const minimal_example_header_regenerated = + R"(browser position chr7:127471196-127495720 +browser hide all +track name="ItemRGBDemo" description="Item RGB demonstration" visibility=2 itemRgb="On" +)"; + +template +auto example_records_default_style() +{ + using record_t = bio::record)>; + + // clang-format off + std::vector recs{ + {"chr7", 127471196, 127472363}, + {"chr7", 127472363, 127473530}, + {"chr7", 127473530, 127474697}, + {"chr7", 127474697, 127475864}, + {"chr7", 127475864, 127477031}, + {"chr7", 
127477031, 127478198}, + {"chr7", 127478198, 127479365}, + {"chr7", 127479365, 127480532}, + {"chr7", 127480532, 127481699} + }; + // clang-format on + + return recs; +}