From a6b66ae1b95e4bf701621d5fd2b3c7e02500e4e6 Mon Sep 17 00:00:00 2001 From: Hannes Hauswedell Date: Wed, 24 Nov 2021 17:14:54 +0100 Subject: [PATCH 1/2] [feature] seq_io::reader and lots of required code --- include/bio/detail/in_file_iterator.hpp | 166 +++++++++++ include/bio/detail/misc.hpp | 76 +++++ include/bio/detail/reader_base.hpp | 268 ++++++++++++++++++ include/bio/misc.hpp | 34 +++ include/bio/plain_io/all.hpp | 2 +- include/bio/plain_io/misc.hpp | 6 +- include/bio/record.hpp | 15 + include/bio/seq_io/all.hpp | 25 ++ include/bio/seq_io/misc.hpp | 27 ++ include/bio/seq_io/reader.hpp | 142 ++++++++++ include/bio/seq_io/reader_options.hpp | 193 +++++++++++++ test/snippet/seq_io/snippet_reader.cpp | 86 ++++++ test/snippet/seq_io/snippet_reader.err | 48 ++++ .../snippet/seq_io/snippet_reader_options.cpp | 49 ++++ test/unit/seq_io/CMakeLists.txt | 1 + test/unit/seq_io/data.hpp | 32 +++ test/unit/seq_io/seq_io_reader_test.cpp | 261 +++++++++++++++++ 17 files changed, 1429 insertions(+), 2 deletions(-) create mode 100644 include/bio/detail/in_file_iterator.hpp create mode 100644 include/bio/detail/misc.hpp create mode 100644 include/bio/detail/reader_base.hpp create mode 100644 include/bio/misc.hpp create mode 100644 include/bio/seq_io/all.hpp create mode 100644 include/bio/seq_io/misc.hpp create mode 100644 include/bio/seq_io/reader.hpp create mode 100644 include/bio/seq_io/reader_options.hpp create mode 100644 test/snippet/seq_io/snippet_reader.cpp create mode 100644 test/snippet/seq_io/snippet_reader.err create mode 100644 test/snippet/seq_io/snippet_reader_options.cpp create mode 100644 test/unit/seq_io/CMakeLists.txt create mode 100644 test/unit/seq_io/data.hpp create mode 100644 test/unit/seq_io/seq_io_reader_test.cpp diff --git a/include/bio/detail/in_file_iterator.hpp b/include/bio/detail/in_file_iterator.hpp new file mode 100644 index 0000000..707319f --- /dev/null +++ b/include/bio/detail/in_file_iterator.hpp @@ -0,0 +1,166 @@ +// 
----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides the bio::detail::in_file_iterator class template. + * \author Hannes Hauswedell + */ + +#pragma once + +#include +#include + +#include + +namespace bio::detail +{ + +/*!\brief Input iterator that provides a range-like interface for readers. + * \tparam file_type The data structure on which the iterator operates. + * \implements std::input_iterator + * \ingroup bio + * + * This iterator is a single-pass input iterator for input files. All member types are resolved + * via `file_type`'s member types, dereference returns the file's `record_buffer` member, + * and increment calls the `read_next_record()` member of file. + * + * Note that since this is a single-pass input iterator, post-increment returns void because + * previous iterators are always invalid (all iterators point to the current position in single-pass + * ranges). + * + * This iterator may be compared against std::default_sentinel_t; this check reads the + * `at_end` member of the file. + */ +template +class in_file_iterator +{ + static_assert(!std::is_const_v, + "You cannot iterate over const files, because the iterator changes the file."); + +public: + /*!\name Member types + * \brief The associated types are derived from the `file_type`. + * \{ + */ + + //!\brief The value type. + using value_type = typename file_type::record_type; + //!\brief The reference type. 
+ using reference = typename file_type::record_type &; + //!\brief The const reference type. + using const_reference = typename file_type::record_type &; + //!\brief The size type. + using size_type = size_t; + //!\brief The difference type. A signed integer type, usually std::ptrdiff_t. + using difference_type = ptrdiff_t; + //!\brief The pointer type. + using pointer = typename file_type::record_type *; + //!\brief Tag this class as an input iterator. + using iterator_category = std::input_iterator_tag; + //!\} + + /*!\name Constructors, destructor and assignment. + * \{ + */ + in_file_iterator() = default; //!< Defaulted. + in_file_iterator(in_file_iterator const &) = default; //!< Defaulted. + in_file_iterator(in_file_iterator &&) = default; //!< Defaulted. + ~in_file_iterator() = default; //!< Defaulted. + in_file_iterator & operator=(in_file_iterator const &) = default; //!< Defaulted. + in_file_iterator & operator=(in_file_iterator &&) = default; //!< Defaulted. + + //!\brief Construct with reference to host. + in_file_iterator(file_type & _host) noexcept : host{&_host} {} + //!\} + + /*!\name Iterator operations + * \{ + */ + //!\brief Move to the next record in the file and return a reference to it. + in_file_iterator & operator++() + { + assert(host != nullptr); + host->read_next_record(); + return *this; + } + + //!\brief Post-increment is the same as pre-increment, but returns void. + void operator++(int) + { + assert(host != nullptr); + ++(*this); + } + + //!\brief Dereference returns the currently buffered record. + reference operator*() noexcept + { + assert(host != nullptr); + return host->record_buffer; + } + + //!\brief Dereference returns the currently buffered record. + reference operator*() const noexcept + { + assert(host != nullptr); + return host->record_buffer; + } + + //!\brief Dereference returns the currently buffered record. 
+ value_type * operator->() noexcept + { + assert(host != nullptr); + return &host->record_buffer; + } + + //!\brief Dereference returns the currently buffered record. + value_type const * operator->() const noexcept + { + assert(host != nullptr); + return &host->record_buffer; + } + + //!\} + + /*!\name Comparison operators + * \brief Only (in-)equality comparison of iterator with end() is supported. + * \{ + */ + + //!\brief Checks whether `*this` is equal to the sentinel. + constexpr bool operator==(std::default_sentinel_t const &) const noexcept + { + assert(host != nullptr); + return host->at_end; + } + + //!\brief Checks whether `*this` is not equal to the sentinel. + constexpr bool operator!=(std::default_sentinel_t const &) const noexcept + { + assert(host != nullptr); + return !host->at_end; + } + + //!\brief Checks whether `it` is equal to the sentinel. + constexpr friend bool operator==(std::default_sentinel_t const &, in_file_iterator const & it) noexcept + { + return (it == std::default_sentinel); + } + + //!\brief Checks whether `it` is not equal to the sentinel. + constexpr friend bool operator!=(std::default_sentinel_t const &, in_file_iterator const & it) noexcept + { + return (it != std::default_sentinel); + } + //!\} + +private: + //!\brief Pointer to file host. 
+ file_type * host{}; +}; + +} // namespace bio::detail diff --git a/include/bio/detail/misc.hpp b/include/bio/detail/misc.hpp new file mode 100644 index 0000000..2bedcd0 --- /dev/null +++ b/include/bio/detail/misc.hpp @@ -0,0 +1,76 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/bio/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides miscellaneous utilities. + * \author Hannes Hauswedell + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +namespace bio::detail +{ + +/*!\addtogroup bio + * \{ + */ + +/*!\brief Sets the file format according to the file name extension. + * \param[out] format The format to set. + * \param[in] file_name The file name to extract the extension from. + * + * \throws seqan3::unhandled_extension_error If the extension in file_name does + * not occur in any valid extensions of the formats specified in the + * \p format_variant_type template argument list. + */ +void set_format(auto & format, std::filesystem::path const & file_name) +{ + using format_variant_type = std::remove_cvref_t; + using valid_formats = seqan3::detail::transfer_template_args_onto_t; + + bool format_found = false; + std::string extension = file_name.extension().string(); + if (extension.size() > 1) + { + extension = extension.substr(1); // drop leading "." 
+ seqan3::detail::for_each( + [&](auto fmt) + { + using fm_type = typename decltype(fmt)::type; // remove type_identity wrapper + + for (auto const & ext : fm_type::file_extensions) + { + if (std::ranges::equal(ext, extension)) + { + format.template emplace(); + format_found = true; + return; + } + } + }); + } + + if (!format_found) + throw unhandled_extension_error("No valid format found for this extension."); +} + +//!\} + +} // namespace bio::detail diff --git a/include/bio/detail/reader_base.hpp b/include/bio/detail/reader_base.hpp new file mode 100644 index 0000000..7b25d72 --- /dev/null +++ b/include/bio/detail/reader_base.hpp @@ -0,0 +1,268 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides seqan3::reader_base and corresponding traits classes. + * \author Hannes Hauswedell + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace bio +{ + +// ---------------------------------------------------------------------------- +// reader_base +// ---------------------------------------------------------------------------- + +/*!\brief This is a (non-CRTP) base-class for I/O readers. + * \tparam options_t Type of the reader options. + * \details + * + * Most I/O readers inherit from this class to reduce implementation overhead. It is not relevant for most users + * of the library. 
+ */ +template +class reader_base +{ +protected: + //!\privatesection + /*!\name Format handling + * \{ + */ + //!\brief A seqan3::type_list with the possible formats. + using valid_formats = std::remove_cvref_t; + //!\brief The seqan3::format_input_handler corresponding to the format. + using format_handler_type = + seqan3::detail::transfer_template_args_onto_t, + std::variant>; + //!\} + +public: + /*!\name Format handling + * \{ + */ + //!\brief Type of the format, a std::variant over the `valid_formats`. + using format_type = seqan3::detail::transfer_template_args_onto_t; + //!\brief The seqan3::format_input_handler corresponding to the format. + //!\} + + /*!\name Field types and record type + * \brief The exact type of the record depends on the options! + * \{ + */ + //!\brief The type of the record, a specialisation of bio::record; acts as a tuple of the selected field types. + using record_type = detail::record_from_typelist; + //!\brief The iterator type of this view (an input iterator). + using iterator = detail::in_file_iterator; + //!\brief The type returned by end(). + using sentinel = std::default_sentinel_t; + //!\} + + /*!\name Constructors, destructor and assignment + * \{ + */ + //!\brief Default constructor is explicitly deleted, you need to give a stream or file name. + reader_base() = delete; + //!\brief Copy construction is explicitly deleted, because you can't have multiple access to the same file. + reader_base(reader_base const &) = delete; + //!\brief Copy assignment is explicitly deleted, because you can't have multiple access to the same file. + reader_base & operator=(reader_base const &) = delete; + //!\brief Move construction is defaulted. + reader_base(reader_base &&) = default; + //!\brief Move assignment is defaulted. + reader_base & operator=(reader_base &&) = default; + //!\brief Destructor is defaulted. + ~reader_base() = default; + + /*!\brief Construct from filename. + * \param[in] filename Path to the file you wish to open. 
+ * \param[in] fmt The file format given as e.g. `fasta{}`. [optional] + * \param[in] opt Reader options (exact type depends on specialisation). [optional] + * \throws seqan3::file_open_error If the file could not be opened, e.g. non-existent, non-readable, unknown format. + * + * \details + * + * In addition to the file name, you may fix the format and/or provide options. + * + * ### Decompression + * + * This constructor transparently applies a decompression stream on top of the file stream in case + * the file is detected as being compressed. + * See the section on compression and decompression (TODO) for more information. + */ + reader_base(std::filesystem::path const & filename, format_type const & fmt, options_t const & opt = options_t{}) : + options{opt}, stream{filename, opt.stream_options}, format{fmt} + {} + + //!\overload + explicit reader_base(std::filesystem::path const & filename, options_t const & opt = options_t{}) : + options{opt}, stream{filename, opt.stream_options} + { + // initialise format handler or throw if format is not found + detail::set_format(format, stream.truncated_filename()); + } + + /*!\brief Construct from an existing stream and with specified format. + * \param[in] str The stream to operate on. + * \param[in] fmt The file format given as e.g. `fasta{}`. [required] + * \param[in] opt Reader options (exact type depends on specialisation). [optional] + * + * \details + * + * In addition to the stream, you must fix the format and you may optionally provide options. + * + * ### Decompression + * + * This constructor transparently applies a decompression stream on top of the stream in case + * it is detected as being compressed. + * See the section on compression and decompression (TODO) for more information. 
+ */ + reader_base(std::istream & str, format_type const & fmt, options_t const & opt = options_t{}) : + options{opt}, stream{str, opt.stream_options}, format{fmt} + {} + + //!\overload + template + //!\cond REQ + requires(!std::is_lvalue_reference_v) + //!\endcond + reader_base(temporary_stream_t && str, format_type const & fmt, options_t const & opt = options_t{}) : + options{opt}, stream{std::move(str), opt.stream_options}, format{fmt} + {} + //!\} + + /*!\name Range interface + * \brief Provides functions for record-based reading of the file. + * \{ + */ + /*!\brief Returns an iterator to the current position in the file. + * \returns An iterator pointing to the current position in the file. + * \throws seqan3::format_error + * + * It is safe to call this function repeatedly, but it will always return an iterator pointing to the current + * record in the file (and not seek back to the beginning). + * + * Equals end() if the file is at end. + * + * ### Complexity + * + * Constant. + * + * ### Exceptions + * + * Throws seqan3::format_error if the first record could not be read into the buffer. + */ + iterator begin() + { + // buffer first record + if (init_state) + { + // set format-handler + std::visit([&](auto f) { format_handler = format_input_handler{stream, options}; }, format); + + // read first record + read_next_record(); + init_state = false; + } + + return {*this}; + } + + /*!\brief Returns a sentinel for comparison with iterator. + * \returns A sentinel (std::default_sentinel_t) that compares equal to an iterator once the file is at end. + * + * This element acts as a placeholder; attempting to dereference it results in undefined behaviour. + * + * ### Complexity + * + * Constant. + * + * ### Exceptions + * + * No-throw guarantee. + */ + sentinel end() noexcept { return {}; } + + /*!\brief Return the record we are currently at in the file. + * \returns A reference to the currently buffered record. 
+ * + * This function returns a reference to the currently buffered record, it is identical to calling `*begin()`. 
+ * + * ### Complexity + * + * Constant. + * + * ### Exceptions + * + * No-throw guarantee. + */ + record_type & front() noexcept { return *begin(); } + //!\} + +protected: + //!\privatesection + + /*!\name State + * \{ + */ + //!\brief The object holding the options. + options_t options; + //!\brief The input stream. + transparent_istream stream; + //!\brief Buffer for a single record. + record_type record_buffer; + //!\brief Tracks whether the very first record is buffered when calling begin(). + bool init_state = true; + //!\brief File is at position 1 behind the last record. + bool at_end = false; + + //!\brief The actual std::variant holding a pointer to the detected/selected format. + format_type format; + //!\brief The respective input handler specialisation. + format_handler_type format_handler; + //!\} + + //!\brief Tell the format to move to the next record and update the buffer. + void read_next_record() + { + if (at_end) + return; + + // at end if we could not read further + if (std::istreambuf_iterator{stream} == std::istreambuf_iterator{}) + { + at_end = true; + return; + } + + assert(!format_handler.valueless_by_exception()); + std::visit([&](auto & f) { f.parse_next_record_into(record_buffer); }, format_handler); + } + + //!\brief Befriend iterator so it can access the buffers. 
+ friend iterator; +}; + +} // namespace bio diff --git a/include/bio/misc.hpp b/include/bio/misc.hpp new file mode 100644 index 0000000..bcf612d --- /dev/null +++ b/include/bio/misc.hpp @@ -0,0 +1,34 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/b.i.o./blob/master/LICENSE +// ----------------------------------------------------------------------------------------------------- + +#pragma once + +#include + +/*!\file + * \brief Provides various minor utilities. + * \author Hannes Hauswedell + */ + +namespace bio +{ + +/*!\brief An enum used as an argument for templates that switch between owning and non-owning behaviour. + * \details + * + * Typically used to configure a class template to have members that are vectors/strings VS members that are views. + * The "shallow" version of such a class is typically cheap to copy (no dynamic memory) while the "deep" version + * is expensive to copy (holds dynamic memory). + */ +enum class ownership +{ + shallow, //!< Cheap to copy. + deep //!< Expensive to copy. +}; + +} // namespace bio diff --git a/include/bio/plain_io/all.hpp b/include/bio/plain_io/all.hpp index b6c75ba..719c169 100644 --- a/include/bio/plain_io/all.hpp +++ b/include/bio/plain_io/all.hpp @@ -7,7 +7,7 @@ // ----------------------------------------------------------------------------------------------------- /*!\file - * \brief Meta-include that includes the whole library. + * \brief Meta-include that includes the whole Plain I/O module. 
* \author Hannes Hauswedell */ diff --git a/include/bio/plain_io/misc.hpp b/include/bio/plain_io/misc.hpp index b0c67f0..c84a0a3 100644 --- a/include/bio/plain_io/misc.hpp +++ b/include/bio/plain_io/misc.hpp @@ -26,7 +26,11 @@ namespace bio::plain_io * \{ */ -//!\brief The value type of bio::plaintext_file_input if every line is split into fields. +/*!\brief The value type of bio::plaintext_file_input if every line is split into fields. + * \details + * + * Plain I/O records are always shallow. + */ struct record { //!\brief The entire line (exluding EOL characters but including delimiters). diff --git a/include/bio/record.hpp b/include/bio/record.hpp index 02b12da..7af2966 100644 --- a/include/bio/record.hpp +++ b/include/bio/record.hpp @@ -237,6 +237,21 @@ struct record : std::tuple } // namespace bio +namespace bio::detail +{ + +//!\brief Implementation for bio::detail::record_from_typelist. +template +auto record_from_typelist_impl(field_ids_t const &, seqan3::type_list) + -> record; + +//!\brief Easy metaprogramming to get the type of a record from type_list of the field_types. 
+template +using record_from_typelist = + decltype(record_from_typelist_impl(std::declval(), std::declval())); + +} // namespace bio::detail + //------------------------------------------------------------------------------- // tuple traits //------------------------------------------------------------------------------- diff --git a/include/bio/seq_io/all.hpp b/include/bio/seq_io/all.hpp new file mode 100644 index 0000000..0e5823e --- /dev/null +++ b/include/bio/seq_io/all.hpp @@ -0,0 +1,25 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/b.i.o./blob/master/LICENSE +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Meta-include that includes the whole Seq I/O module. + * \author Hannes Hauswedell + */ + +#pragma once + +// TODO + +/*!\defgroup seq_io Seq I/O + * \ingroup bio + * \brief Reader and writer for sequence files, e.g. FASTA, FASTQ. + */ + +/*!\namespace bio::seq_io + * \brief Namespace for the Seq I/O submodule. 
+ */ diff --git a/include/bio/seq_io/misc.hpp b/include/bio/seq_io/misc.hpp new file mode 100644 index 0000000..f1f34e0 --- /dev/null +++ b/include/bio/seq_io/misc.hpp @@ -0,0 +1,27 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides miscellaneous content for sequence IO. + * \author Hannes Hauswedell + */ + +#pragma once + +#include + +#include + +namespace bio::seq_io +{ + +//!\brief Default fields for bio::seq_io::reader_options. 
+//!\ingroup seq_io +inline constexpr auto default_field_ids = seqan3::vtag; + +} // namespace bio::seq_io diff --git a/include/bio/seq_io/reader.hpp b/include/bio/seq_io/reader.hpp new file mode 100644 index 0000000..685e983 --- /dev/null +++ b/include/bio/seq_io/reader.hpp @@ -0,0 +1,142 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides bio::seq_io::reader and corresponding traits classes. + * \author Hannes Hauswedell + */ + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace bio::seq_io +{ + +// ---------------------------------------------------------------------------- +// reader +// ---------------------------------------------------------------------------- + +/*!\brief A class for reading sequence files, e.g. FASTA, FASTQ. + * \ingroup seq_io + * + * \details + * + * ### Introduction + * + * Sequence files are the most generic and common biological files. Well-known formats include + * FastA and FastQ, but sometimes you may also be interested in treating SAM or BAM files as sequence + * files, discarding the alignment. + * + * The Sequence I/O reader supports reading the following fields: + * + * 1. bio::field::seq + * 2. bio::field::id + * 3. bio::field::qual + * + * And it supports the following formats: + * + * 1. 
FASTA (see also bio::fasta) + * + * ### Simple usage + * + * Iterate over a sequence file via the reader and print the record's contents: + * + * \snippet test/snippet/seq_io/snippet_reader.cpp simple_usage_file + * + * Read from standard input instead of a file: + * + * \snippet test/snippet/seq_io/snippet_reader.cpp simple_usage_stream + * + * ### Decomposed iteration + * + * The records can be decomposed on-the-fly using + * [structured bindings](https://en.cppreference.com/w/cpp/language/structured_binding): + * + * \snippet test/snippet/seq_io/snippet_reader.cpp decomposed + * + * Note that the order of the fields is defined by bio::seq_io::default_field_ids and independent + * of the names you give to the binding. + * + * ### Views on files + * + * This iterates over the first five records where the sequence length is at least 10: + * + * \snippet test/snippet/seq_io/snippet_reader.cpp views + * + * ### Specifying options + * + * This snippet demonstrates how to read sequence data as protein data and have the IDs truncated + * at the first whitespace: + * \snippet test/snippet/seq_io/snippet_reader.cpp options + * + * For more advanced options, see bio::seq_io::reader_options. + */ +template +class reader : public reader_base> +{ +private: + //!\brief The base class. + using base_t = reader_base>; + //!\brief Inherit the format_type definition. + using format_type = typename base_t::format_type; + /* Implementation note + * format_type is "inherited" as private here to avoid appearing twice in the documentation. + * Its actual visibility is public because it is public in the base class. 
+ */ +public: + // clang-format off + //!\copydoc bio::reader_base::reader_base(std::filesystem::path const & filename, format_type const & fmt, options_t const & opt = options_t{}) + // clang-format on + reader(std::filesystem::path const & filename, + format_type const & fmt, + reader_options const & opt = reader_options{}) : + base_t{filename, fmt, opt} + {} + + //!\overload + explicit reader(std::filesystem::path const & filename, + reader_options const & opt = reader_options{}) : + base_t{filename, opt} + {} + + // clang-format off + //!\copydoc bio::reader_base::reader_base(std::istream & str, format_type const & fmt, options_t const & opt = options_t{}) + // clang-format on + reader(std::istream & str, + format_type const & fmt, + reader_options const & opt = reader_options{}) : + base_t{str, fmt, opt} + {} + + //!\overload + template + //!\cond REQ + requires(!std::is_lvalue_reference_v) + //!\endcond + reader(temporary_stream_t && str, + format_type const & fmt, + reader_options const & opt = reader_options{}) : + base_t{std::move(str), fmt, opt} + {} +}; + +} // namespace bio::seq_io diff --git a/include/bio/seq_io/reader_options.hpp b/include/bio/seq_io/reader_options.hpp new file mode 100644 index 0000000..2436eae --- /dev/null +++ b/include/bio/seq_io/reader_options.hpp @@ -0,0 +1,193 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides bio::seq_io::reader_options and various 
field type . + * \author Hannes Hauswedell + */ + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +// TODO replace seqan3::views::char_to with seqan3::views::char_strictly_to +namespace bio::seq_io +{ + +/*!\addtogroup seq_io + * \{ + */ + +/*!\brief The generic field types template. + * \tparam ownership Return shallow or deep types. + * \details + * + * You can use this to configure a deep record or one with custom alphabets. + * + * ### Example + * + * \snippet test/snippet/seq_io/snippet_reader_options.cpp example_simple + * + * Type of the ID will be std::string, type of the sequence will be std::vector and + * type of the qualities will be std::vector. + */ +template +inline constexpr auto field_types = []() +{ + if constexpr (ownership == bio::ownership::deep) + { + return seqan3::ttag, std::string, std::vector>, + std::conditional_t, std::string, std::vector>>; + } + else + { + return seqan3::ttag, + std::string_view, + decltype(std::string_view{} | seqan3::views::char_to)>, + std::conditional_t, + std::string_view, + decltype(std::string_view{} | seqan3::views::char_to)>>; + } +}(); + +/*!\brief The field types for reading DNA data. + * \details + * + * This is the default. + * + * Configures a shallow record where sequence data is seqan3::dna5 and quality data is seqan3::phred63. + */ +inline constexpr auto field_types_dna = field_types<>; + +/*!\brief The field types for reading protein data. + * \tparam ownership Return shallow or deep types. + * \details + * + * Configures a shallow record where sequence data is seqan3::aa27 and quality data is seqan3::phred63. + */ +inline constexpr auto field_types_protein = field_types; + +/*!\brief The field types for reading any data. + * \details + * + * Configures a shallow record where sequence and quality data are plain characters. 
+ */ +inline constexpr auto field_types_char = field_types; + +/*!\brief The field types for raw I/O. + * \details + * + * Every field is configured as a std::span of std::byte (this enables "raw" io). + * + * ATTENTION: The exact content of this byte-span depends on the format and is likely not + * compatible between formats. Use at your own risk! + */ +inline constexpr auto field_types_raw = + seqan3::ttag, std::span, std::span>; +// TODO use seqan3::list_traits::repeat as soon as available + +/*!\brief Options that can be used to configure the behaviour of bio::seq_io::reader. + * \tparam field_ids_t Type of the field_ids member (usually deduced). + * \tparam field_types_t Type of the field_types member (usually deduced). + * \tparam formats_t Type of the formats member (usually deduced). + * + * \details + * + * By default, the reader options assume DNA data. You can select bio::seq_io::field_types_protein to + * read protein data or bio::seq_io::field_types_char to store in an agnostic type. + * + * ### Example + * + * The reader options can be easily set via [designated + * initialisers](https://en.cppreference.com/w/cpp/language/aggregate_initialization). + * + * To switch from DNA reading (the default) to protein reading and activate truncating of IDs, do + * the following: + * + * \snippet test/snippet/seq_io/snippet_reader_options.cpp example_simple + * + * It is not required to specify all options; default values are documented. + * + * Please be aware that those options that you modify need to be set in the correct order -- **which is + * alphabetical order** for all option classes in this library. + * + * Typically, the options are set as part of the bio::seq_io::reader construction. See the respective + * documentation page for more information. 
+ * + * ### Example (advanced) + * + * This code switches from seqan3::dna5 to seqan3::dna4 alphabet, from seqan3::phred63 to + * seqan3::phred42, and reduces the amount of threads used: + * + * \snippet test/snippet/seq_io/snippet_reader_options.cpp example_advanced1 + * + * This code makes FASTA the only legal format and creates records with only the sequence field as + * a std::string: + * + * \snippet test/snippet/seq_io/snippet_reader_options.cpp example_advanced2 + */ +template , + typename field_types_t = std::remove_cvref_t, + typename formats_t = seqan3::type_list> +struct reader_options +{ + /*!\brief The fields that shall be contained in each record; a seqan3::tag over bio::field. + * \details + * + * It is usually not necessary to change this. + * **If you do, you need to adapt field_types, as well!** + */ + field_ids_t field_ids = default_field_ids; + + /*!\brief The types corresponding to each field; a seqan3::ttag over the types. + * + * \details + * + * See seqan3::am_io::reader for an overview of the supported field/type combinations. + */ + field_types_t field_types = field_types_dna; + + /*!\brief The formats that input files can take; a seqan3::ttag over the types. + * + * \details + * + * See seqan3::am_io::reader for an overview of the the supported formats. + */ + formats_t formats = seqan3::ttag; + + //!\brief Options that are passed on to the internal stream oject. + transparent_istream_options stream_options{}; + + //!\brief Truncate IDs at first whitespace. 
+ bool truncate_ids = false; + + // TODO static_assert +}; + +//!\} + +} // namespace bio::seq_io diff --git a/test/snippet/seq_io/snippet_reader.cpp b/test/snippet/seq_io/snippet_reader.cpp new file mode 100644 index 0000000..2604e73 --- /dev/null +++ b/test/snippet/seq_io/snippet_reader.cpp @@ -0,0 +1,86 @@ +#include + +#include +#include + +#include "../../unit/seq_io/data.hpp" + +int main() +{ + //================= PRE ========================== + { + std::ofstream os{"example.fasta", std::ios::binary}; + os << input; + } + + std::ifstream in{"example.fasta"}; + std::cin.rdbuf(in.rdbuf()); // rewire stdin + + //================= SNIPPETS ====================== +{ +//![simple_usage_file] +bio::seq_io::reader reader{"example.fasta"}; + +for (auto & rec : reader) +{ + seqan3::debug_stream << "ID: " << rec.id() << '\n'; + seqan3::debug_stream << "Seq: " << rec.seq() << '\n'; +} +//![simple_usage_file] +} + +{ +//![simple_usage_stream] +bio::seq_io::reader reader{std::cin, bio::fasta{}}; + +for (auto & rec : reader) +{ + seqan3::debug_stream << "ID: " << rec.id() << '\n'; + seqan3::debug_stream << "Seq: " << rec.seq() << '\n'; +} +//![simple_usage_stream] +} + +{ +//![decomposed] +bio::seq_io::reader reader{"example.fasta"}; + +for (auto & [ id, seq, qual ] : reader) +{ + seqan3::debug_stream << "ID: " << id << '\n'; + seqan3::debug_stream << "Seq: " << seq << '\n'; +} +//![decomposed] +} + +{ +//![views] +bio::seq_io::reader reader{"example.fasta"}; + +auto min_length = [](auto & rec) { return rec.seq().size() > 10; }; + +for (auto & rec : reader | std::views::filter(min_length) | std::views::take(5)) +{ + seqan3::debug_stream << "ID: " << rec.id() << '\n'; + seqan3::debug_stream << "Seq: " << rec.seq() << '\n'; +} +//![views] +} + +{ +//![options] +bio::seq_io::reader reader{"example.fasta", + bio::seq_io::reader_options{ .field_types = bio::seq_io::field_types_protein, + .truncate_ids = true }}; + +for (auto & rec : reader) +{ + seqan3::debug_stream << "ID: " << 
rec.id() << '\n'; + seqan3::debug_stream << "Seq: " << rec.seq() << '\n'; +} +//![options] +} + + //================= POST ========================== + std::filesystem::remove("example.fasta"); +} diff --git a/test/snippet/seq_io/snippet_reader.err b/test/snippet/seq_io/snippet_reader.err new file mode 100644 index 0000000..5c05c65 --- /dev/null +++ b/test/snippet/seq_io/snippet_reader.err @@ -0,0 +1,48 @@ +ID: ID1 +Seq: ACGTTTTTTTTTTTTTTT +ID: ID2 +Seq: ACGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT +ID: ID3 lala +Seq: ACGTTTAACGTTTTTTTT +ID: ID4 +Seq: ACGTTTA +ID: ID5 lala +Seq: ACGTTTAACGTTTTTTTT +ID: ID1 +Seq: ACGTTTTTTTTTTTTTTT +ID: ID2 +Seq: ACGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT +ID: ID3 lala +Seq: ACGTTTAACGTTTTTTTT +ID: ID4 +Seq: ACGTTTA +ID: ID5 lala +Seq: ACGTTTAACGTTTTTTTT +ID: ID1 +Seq: ACGTTTTTTTTTTTTTTT +ID: ID2 +Seq: ACGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT +ID: ID3 lala +Seq: ACGTTTAACGTTTTTTTT +ID: ID4 +Seq: ACGTTTA +ID: ID5 lala +Seq: ACGTTTAACGTTTTTTTT +ID: ID1 +Seq: ACGTTTTTTTTTTTTTTT +ID: ID2 +Seq: ACGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT +ID: ID3 lala +Seq: ACGTTTAACGTTTTTTTT +ID: ID5 lala +Seq: ACGTTTAACGTTTTTTTT +ID: ID1 +Seq: ACGTTTTTTTTTTTTTTT +ID: ID2 +Seq: ACGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT +ID: ID3 +Seq: ACGTTTAACGTTTTTTTT +ID: ID4 +Seq: ACGTTTA +ID: ID5 +Seq: ACGTTTAACGTTTTTTTT diff --git a/test/snippet/seq_io/snippet_reader_options.cpp b/test/snippet/seq_io/snippet_reader_options.cpp new file mode 100644 index 0000000..1379441 --- /dev/null +++ b/test/snippet/seq_io/snippet_reader_options.cpp @@ -0,0 +1,49 @@ +#include +#include +#include + +int main() +{ +{ +//![example_custom] +bio::seq_io::reader_options options +{ + .field_types = bio::seq_io::field_types, +}; +//![example_custom] +} + +{ +//![example_simple] 
+bio::seq_io::reader_options options +{ + .field_types = bio::seq_io::field_types_protein, + .truncate_ids = true +}; +//![example_simple] +} + +{ +//![example_advanced1] +bio::seq_io::reader_options options +{ + .field_types = bio::seq_io::field_types, + .stream_options = bio::transparent_istream_options{ .threads = 1 } +}; +//![example_advanced1] +} + +//TODO if https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101803 gets backported to GCC10 we can omit +// bio::transparent_istream_options from the above example + +{ +//![example_advanced2] +bio::seq_io::reader_options options +{ + .field_ids = seqan3::vtag, + .field_types = seqan3::ttag, + .formats = seqan3::ttag +}; +//![example_advanced2] +} +} diff --git a/test/unit/seq_io/CMakeLists.txt b/test/unit/seq_io/CMakeLists.txt new file mode 100644 index 0000000..692e753 --- /dev/null +++ b/test/unit/seq_io/CMakeLists.txt @@ -0,0 +1 @@ +bio_test(seq_io_reader_test.cpp) diff --git a/test/unit/seq_io/data.hpp b/test/unit/seq_io/data.hpp new file mode 100644 index 0000000..516badc --- /dev/null +++ b/test/unit/seq_io/data.hpp @@ -0,0 +1,32 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/b.i.o./blob/master/LICENSE +// ----------------------------------------------------------------------------------------------------- + +#include + +inline constexpr std::string_view input = + R"raw(>ID1 +ACGTTTTTTTTTTTTTTT +>ID2 +ACGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT +>ID3 lala +ACGTTTA +ACGTTTTTTTT + +>ID4 +ACGTTTA +>ID5 lala +ACGTTTA +ACGTTTTTTTT +)raw"; + +inline constexpr 
std::string_view input_bgzipped{ + "\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00\x47\x00\xb3\xf3\x74\x31\xe4\x72\x74\x76\x0f\x41\x05" + "\x5c\x76\x9e\x2e\x46\x58\xc4\x29\x05\x20\x73\x8d\x15\x72\x12\x73\x12\xa1\x86\x3b\x22\x5b\xc2\x05\x92\x36\x81\xcb\x00" + "\x39\xa6\xb8\xd5\x02\x00\xcd\x3b\x57\x80\xba\x00\x00\x00\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02" + "\x00\x1b\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00", + 100}; diff --git a/test/unit/seq_io/seq_io_reader_test.cpp b/test/unit/seq_io/seq_io_reader_test.cpp new file mode 100644 index 0000000..fc9f0b8 --- /dev/null +++ b/test/unit/seq_io/seq_io_reader_test.cpp @@ -0,0 +1,261 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/b.i.o./blob/master/LICENSE +// ----------------------------------------------------------------------------------------------------- + +#include +#include + +#include + +#include +#include +#include +#include + +#include +#include + +#include "data.hpp" + +TEST(seq_io_reader, concepts) +{ + using t = bio::seq_io::reader<>; + EXPECT_TRUE((std::ranges::input_range)); + + using ct = bio::seq_io::reader<> const; + // not const-iterable + EXPECT_FALSE((std::ranges::input_range)); +} + +void seq_io_reader_filename_constructor(bool ext_check, auto &&... 
args) +{ + /* just the filename */ + { + seqan3::test::tmp_filename filename{"seq_io_reader_constructor.fasta"}; + std::ofstream filecreator{filename.get_path(), std::ios::out | std::ios::binary}; + + EXPECT_NO_THROW((bio::seq_io::reader{filename.get_path(), std::forward(args)...})); + } + + // correct format check is done by tests of that format + + /* non-existent file */ + { + EXPECT_THROW((bio::seq_io::reader{"/dev/nonexistant/foobarOOO", std::forward(args)...}), + bio::file_open_error); + } + + /* wrong extension */ + if (ext_check) + { + seqan3::test::tmp_filename filename{"seq_io_reader_constructor.xyz"}; + std::ofstream filecreator{filename.get_path(), std::ios::out | std::ios::binary}; + EXPECT_THROW((bio::seq_io::reader{filename.get_path(), std::forward(args)...}), + bio::unhandled_extension_error); + } +} + +TEST(seq_io_reader, constructor1_just_filename) +{ + seq_io_reader_filename_constructor(true); + EXPECT_TRUE((std::same_as>)); +} + +TEST(seq_io_reader, constructor1_with_opts) +{ + bio::seq_io::reader_options opt{.field_types = bio::seq_io::field_types_protein}; + using control_t = bio::seq_io::reader, + std::remove_cvref_t, + seqan3::type_list>; + + seq_io_reader_filename_constructor(true, std::move(opt)); + EXPECT_TRUE((std::same_as)); +} + +TEST(seq_io_reader, constructor2_just_filename_direct_format) +{ + seq_io_reader_filename_constructor(false, bio::fasta{}); + EXPECT_TRUE((std::same_as>)); +} + +TEST(seq_io_reader, constructor2_with_opts_direct_format) +{ + bio::seq_io::reader_options opt{.field_types = bio::seq_io::field_types_dna}; + using control_t = bio::seq_io::reader, + std::remove_cvref_t, + seqan3::type_list>; + + seq_io_reader_filename_constructor(false, bio::fasta{}, std::move(opt)); + EXPECT_TRUE((std::same_as)); +} + +TEST(seq_io_reader, constructor2_just_filename_format_variant) +{ + std::variant var{}; + + seq_io_reader_filename_constructor(false, var); + EXPECT_TRUE((std::same_as>)); +} + +TEST(seq_io_reader, 
constructor2_with_opts_format_variant) +{ + std::variant var{}; + bio::seq_io::reader_options opt{.field_types = bio::seq_io::field_types_dna}; + using control_t = bio::seq_io::reader, + std::remove_cvref_t, + seqan3::type_list>; + + seq_io_reader_filename_constructor(false, var, std::move(opt)); + EXPECT_TRUE((std::same_as)); +} + +TEST(seq_io_reader, constructor3) +{ + std::istringstream str; + + EXPECT_NO_THROW((bio::seq_io::reader{str, bio::fasta{}})); + EXPECT_TRUE((std::same_as>)); +} + +TEST(seq_io_reader, constructor3_with_opts) +{ + std::istringstream str; + bio::seq_io::reader_options opt{.field_types = bio::seq_io::field_types_dna}; + using control_t = bio::seq_io::reader, + std::remove_cvref_t, + seqan3::type_list>; + + EXPECT_NO_THROW((bio::seq_io::reader{str, bio::fasta{}, opt})); + EXPECT_TRUE((std::same_as)); +} + +TEST(seq_io_reader, constructor4) +{ + std::istringstream str; + + EXPECT_NO_THROW((bio::seq_io::reader{std::move(str), bio::fasta{}})); + EXPECT_TRUE((std::same_as>)); +} + +TEST(seq_io_reader, constructor4_with_opts) +{ + std::istringstream str; + bio::seq_io::reader_options opt{.field_types = bio::seq_io::field_types_dna}; + using control_t = bio::seq_io::reader, + std::remove_cvref_t, + seqan3::type_list>; + + EXPECT_NO_THROW((bio::seq_io::reader{std::move(str), bio::fasta{}, opt})); + EXPECT_TRUE((std::same_as)); +} + +TEST(seq_io_reader, iteration) +{ + { + std::istringstream str{static_cast(input)}; + bio::seq_io::reader reader{str, bio::fasta{}}; + + EXPECT_EQ(std::ranges::distance(reader), 5); + } + + { + std::istringstream str{static_cast(input)}; + bio::seq_io::reader reader{str, bio::fasta{}}; + + size_t count = 0; + for (auto & rec : reader) + { + ++count; + EXPECT_TRUE(rec.id().starts_with("ID")); + // only very basic check here, rest in format test + } + EXPECT_EQ(count, 5); + } +} + +TEST(seq_io_reader, empty_file) +{ + { + seqan3::test::tmp_filename filename{"seq_io_reader_constructor.fasta"}; + std::ofstream 
filecreator{filename.get_path(), std::ios::out | std::ios::binary}; + + bio::seq_io::reader reader{filename.get_path()}; + + EXPECT_THROW(reader.begin(), bio::file_open_error); + } +} + +TEST(seq_io_reader, empty_stream) +{ + { + std::istringstream str{""}; + bio::seq_io::reader reader{str, bio::fasta{}}; + + EXPECT_THROW(reader.begin(), bio::file_open_error); + } +} + +TEST(seq_io_reader, custom_field_types) +{ + bio::seq_io::reader_options opt{.field_types = bio::seq_io::field_types}; + + std::istringstream str{static_cast(input)}; + bio::seq_io::reader reader{str, bio::fasta{}, opt}; + + EXPECT_TRUE((std::same_as &>)); + EXPECT_TRUE((std::same_as)); +} + +TEST(seq_io_reader, custom_field_ids_structured_bindings) +{ + bio::seq_io::reader_options opt{.field_ids = seqan3::vtag, + .field_types = seqan3::ttag}; + + std::istringstream str{static_cast(input)}; + bio::seq_io::reader reader{str, bio::fasta{}, opt}; + + for (auto & [seq, id] : reader) + EXPECT_TRUE(id.starts_with("ID")); +} + +TEST(seq_io_reader, decompression_filename) +{ + seqan3::test::tmp_filename filename{"seq_io_reader.fasta.gz"}; + + { + std::ofstream filecreator{filename.get_path(), std::ios::out | std::ios::binary}; + bio::detail::fast_ostreambuf_iterator it{filecreator}; + it.write_range(input_bgzipped); + } + + bio::seq_io::reader reader{filename.get_path()}; + + size_t count = 0; + for (auto & rec : reader) + { + ++count; + EXPECT_TRUE(rec.id().starts_with("ID")); + // only very basic check here, rest in format test + } + EXPECT_EQ(count, 5); +} + +TEST(seq_io_reader, decompression_stream) +{ + std::istringstream str{static_cast(input_bgzipped)}; + + bio::seq_io::reader reader{str, bio::fasta{}}; + + size_t count = 0; + for (auto & rec : reader) + { + ++count; + EXPECT_TRUE(rec.id().starts_with("ID")); + // only very basic check here, rest in format test + } + EXPECT_EQ(count, 5); +} From 29d46d4b52918fe6113a16108c9df7c3a1fac95b Mon Sep 17 00:00:00 2001 From: Hannes Hauswedell Date: Mon, 29 
Nov 2021 16:19:19 +0100 Subject: [PATCH 2/2] [misc] various things I would rather have in SeqAn --- include/bio/detail/range.hpp | 76 +++++++++--- include/bio/format/fasta_input_handler.hpp | 9 +- include/bio/format/format_input_handler.hpp | 15 ++- include/bio/misc.hpp | 115 +++++++++++++++++- include/bio/record.hpp | 13 +- include/bio/seq_io/misc.hpp | 4 +- include/bio/seq_io/reader.hpp | 2 +- include/bio/seq_io/reader_options.hpp | 30 ++--- submodules/seqan3 | 2 +- .../snippet/seq_io/snippet_reader_options.cpp | 6 +- test/snippet/snippet_tag.cpp | 28 +++++ test/unit/format/fasta_input_test.cpp | 6 +- test/unit/record_test.cpp | 10 +- test/unit/seq_io/seq_io_reader_test.cpp | 4 +- test/unit/stream/istream_test_template.hpp | 2 +- test/unit/stream/ostream_test_template.hpp | 2 +- 16 files changed, 247 insertions(+), 77 deletions(-) create mode 100644 test/snippet/snippet_tag.cpp diff --git a/include/bio/detail/range.hpp b/include/bio/detail/range.hpp index d555878..c6e5d73 100644 --- a/include/bio/detail/range.hpp +++ b/include/bio/detail/range.hpp @@ -13,8 +13,10 @@ #pragma once +#include +#include + #include -#include #include @@ -25,6 +27,57 @@ namespace bio::detail * \{ */ +// ---------------------------------------------------------------------------- +// concepts +// ---------------------------------------------------------------------------- + +/*!\interface bio::detail::back_insertable_with <> + * \extends std::ranges::output_range + * \tparam rng_t The container type. + * \tparam val_t The type to append to the container. + * \brief Describes range types that can grow in amortised constant time by appending an element of type val_t. + */ +//!\cond +template +concept back_insertable_with = std::ranges::output_range && requires(rng_t & v) +{ + v.push_back(std::declval()); +}; +//!\endcond + +/*!\interface bio::detail::back_insertable <> + * \extends std::ranges::output_range + * \extends std::ranges::input_range + * \tparam rng_t The container type. 
+ * \brief Describes range types that can grow in amortised constant time by appending an element. + */ +//!\cond +template +concept back_insertable = + std::ranges::input_range && back_insertable_with>; +//!\endcond + +//!\brief A seqan3::alphabet that is **not** a character or number (any std::integral). +template +concept deliberate_alphabet = seqan3::alphabet && !std::integral>; + +//!\brief A range whose value type is `char`. +template +concept char_range = std::ranges::range && std::same_as>>; + +//!\brief A range whose value type is an integral type other than `char`. +template +concept int_range = std::ranges::range && std::integral>> && + !std::same_as>>; + +//!\brief A type that is not std::span. +template +concept not_a_byte_span = !std::same_as>; + +// ---------------------------------------------------------------------------- +// copy functions +// ---------------------------------------------------------------------------- + /*!\brief Copy elements from the first range into the second range. * \param[in] in The range to copy from. * \param[out] out The range to copy to. @@ -36,8 +89,8 @@ namespace bio::detail * If the input range is sized and the target range offers a `.resize()` member, this function uses * resize and assignment instead of back-insertion. */ -void sized_range_copy(std::ranges::input_range auto && in, - seqan3::back_insertable_with> auto && out) +void sized_range_copy(std::ranges::input_range auto && in, + back_insertable_with> auto && out) { using in_t = decltype(in); using out_t = decltype(out); @@ -64,23 +117,6 @@ void string_copy(std::string_view const in, auto & out) sized_range_copy(in, out); } -//!\brief A seqan3::alphabet that is **not** a character or number (any std::integral). -template -concept deliberate_alphabet = seqan3::alphabet && !std::integral>; - -//!\brief A range whose value type is `char`. 
-template -concept char_range = std::ranges::range && std::same_as>>; - -//!\brief A range whose value type is an integral type other than `char`. -template -concept int_range = std::ranges::range && std::integral>> && - !std::same_as>>; - -//!\brief A type that is not std::span. -template -concept not_a_byte_span = !std::same_as>; - //!\} } // namespace bio::detail diff --git a/include/bio/format/fasta_input_handler.hpp b/include/bio/format/fasta_input_handler.hpp index 90fa492..7a5182b 100644 --- a/include/bio/format/fasta_input_handler.hpp +++ b/include/bio/format/fasta_input_handler.hpp @@ -100,7 +100,7 @@ class format_input_handler : public format_input_handler_base; + using format_fields = vtag_t; //!\brief Type of the raw record. using raw_record_type = record; //!\brief Type of the low-level iterator. @@ -180,13 +180,10 @@ class format_input_handler : public format_input_handler_base const & /**/, std::string & parsed_field) - { - std::swap(id_buffer, parsed_field); - } + void parse_field(vtag_t const & /**/, std::string & parsed_field) { std::swap(id_buffer, parsed_field); } //!\brief We can prevent another copy if the user wants a string. - void parse_field(seqan3::vtag_t const & /**/, std::string & parsed_field) + void parse_field(vtag_t const & /**/, std::string & parsed_field) { std::swap(seq_buffer, parsed_field); } diff --git a/include/bio/format/format_input_handler.hpp b/include/bio/format/format_input_handler.hpp index aa2d134..6ec8ef0 100644 --- a/include/bio/format/format_input_handler.hpp +++ b/include/bio/format/format_input_handler.hpp @@ -18,7 +18,6 @@ #include #include // TODO replace with char_strictly_to -#include #include #include @@ -105,7 +104,7 @@ class format_input_handler_base } //!\brief Parse into string-like types. 
- template + template requires detail::char_range static void parse_field_aux(std::string_view const in, parsed_field_t & parsed_field) { @@ -113,7 +112,7 @@ class format_input_handler_base } //!\brief Parse into containers of alphabets. - template + template requires detail::deliberate_alphabet> static void parse_field_aux(std::string_view const in, parsed_field_t & parsed_field) { @@ -146,7 +145,7 @@ class format_input_handler_base //!\brief Various target types have sane default implementations. template - void parse_field(seqan3::vtag_t const & /**/, parsed_field_t & parsed_field) requires(requires { + void parse_field(vtag_t const & /**/, parsed_field_t & parsed_field) requires(requires { derived_t::parse_field_aux(get(to_derived()->raw_record), parsed_field); }) { @@ -160,21 +159,21 @@ class format_input_handler_base */ //!\brief Only act on those fields that are present in the record and also provided by the format. template - void parse_record_impl(seqan3::vtag_t const & /**/, parsed_record_t & parsed_record) + void parse_record_impl(vtag_t const & /**/, parsed_record_t & parsed_record) { if constexpr (parsed_record_t::field_ids::contains(field_id)) { auto & parsed_field = get(parsed_record); - to_derived()->parse_field(seqan3::vtag, parsed_field); + to_derived()->parse_field(vtag, parsed_field); } // fields that are not in format or not in target record are simply ignored } //!\brief Splits the record into individual fields. 
template - void parse_record(seqan3::vtag_t const & /**/, parsed_record_t & parsed_record) + void parse_record(vtag_t const & /**/, parsed_record_t & parsed_record) { - (to_derived()->parse_record_impl(seqan3::vtag, parsed_record), ...); + (to_derived()->parse_record_impl(vtag, parsed_record), ...); } //!\} diff --git a/include/bio/misc.hpp b/include/bio/misc.hpp index bcf612d..1af22fd 100644 --- a/include/bio/misc.hpp +++ b/include/bio/misc.hpp @@ -8,6 +8,11 @@ #pragma once +#include +#include + +#include + #include /*!\file @@ -18,17 +23,121 @@ namespace bio { +//----------------------------------------------------------------------------- +// ownership +//----------------------------------------------------------------------------- + /*!\brief An enum used as an argument for templates that switch between owning and non-owning behaviour. + * \ingroup bio * \details * * Typically used to configure a class template to have members that are vectors/strings VS members that are views. * The "shallow" version of such a class is typically cheap to copy (no dynamic memory) while the "deep" version - * is exppensive to copy (holds dynamic memory). + * is expensive to copy (holds dynamic memory). */ enum class ownership { - shallow, //< Cheap to copy. - deep //< Expensive to copy. + shallow, //!< Cheap to copy. + deep //!< Expensive to copy. +}; + +//----------------------------------------------------------------------------- +// vtag +//----------------------------------------------------------------------------- + +/*!\brief The type of bio::vtag. [Default "specialisation" for 0 arguments.] + * \tparam more_vs Any number of values [only 0 arguments pick this specialisation]. + * \ingroup bio + * \see bio::vtag + */ +template +struct vtag_t +{ + //!\brief The number of values stored in the tag. + static constexpr size_t size = 0; + + //!\brief The tag converted to a tuple. 
+ static constexpr auto as_tuple = std::tuple{}; + + //!\brief A function that checks if a value is contained in the tag. + static constexpr bool contains(auto &&) { return false; } + + //!\brief A function that returns the index of a value or ((size_t)-1) if the value is not found. + static constexpr size_t index_of(auto &&) { return static_cast(-1ULL); } +}; + +/*!\brief The type of bio::vtag. [Specialisation for 1 or more arguments] + * \tparam v First value. + * \tparam more_vs More values. + * \ingroup bio + * \see bio::vtag + */ +template +struct vtag_t +{ + //!\brief The first value in the tag. + static constexpr auto first_value = v; + + //!\copybrief bio::vtag_t::size + static constexpr size_t size = sizeof...(more_vs) + 1; + + //!\copybrief bio::vtag_t::as_tuple + static constexpr auto as_tuple = std::tuple{v, more_vs...}; + + //!\brief Whether all values in the tag are unique. + static constexpr bool unique_values = ((v != more_vs) && ...); + + //!\copybrief bio::vtag_t::contains + static constexpr bool contains(auto && s) requires std::equality_comparable_with && + (std::equality_comparable_with &&...) + { + return s == v || ((s == more_vs) || ...); + } + + //!\copybrief bio::vtag_t::index_of + static constexpr size_t index_of(auto && s) requires std::equality_comparable_with && + (std::equality_comparable_with &&...) + { + size_t c = 0; + ((v != s && ++c) && ((more_vs != s && ++c) && ...)); + return c >= size ? static_cast(-1ULL) : c; + } }; +/*!\brief A value-tag template. + * \tparam vs The values to store in the tag. + * \ingroup bio + * \details + * + * Using this template, you can easily turn a value, e.g. a literal value, into a compile-time constant with a unique + * type. 
+ * + * ### Example + * + * \snippet test/snippet/snippet_tag.cpp vtag + */ +template +inline constexpr vtag_t vtag{}; + +//----------------------------------------------------------------------------- +// ttag +//----------------------------------------------------------------------------- + +/*!\brief A type-tag template. + * \tparam type The first type to store. + * \tparam more_types More types to store (optional). + * \ingroup bio + * \see seqan3::type_list + * + * \details + * + * Using this template, you can easily turn a type into a compile-time constant (value). + * + * ### Example + * + * \snippet test/snippet/snippet_tag.cpp ttag + */ +template +inline constexpr seqan3::type_list ttag{}; + } // namespace bio diff --git a/include/bio/record.hpp b/include/bio/record.hpp index 7af2966..65546a9 100644 --- a/include/bio/record.hpp +++ b/include/bio/record.hpp @@ -18,9 +18,10 @@ #include #include -#include #include +#include + namespace bio { @@ -89,7 +90,7 @@ enum class field : uint64_t * \implements seqan3::tuple_like * \ingroup bio * \tparam field_types The types of the fields in this record as a seqan3::type_list. - * \tparam field_ids A seqan3::vtag_t type with bio::field IDs corresponding to field_types. + * \tparam field_ids A vtag_t type with bio::field IDs corresponding to field_types. * \details * * This class template behaves just like an std::tuple, with the exception that it provides an additional @@ -346,8 +347,8 @@ auto const && get(record const && r) * TODO */ template -constexpr auto make_record(seqan3::vtag_t, field_type_ts &... fields) - -> record, field_type_ts...> +constexpr auto make_record(vtag_t, field_type_ts &... fields) + -> record, field_type_ts...> { return {fields...}; } @@ -364,8 +365,8 @@ constexpr auto make_record(seqan3::vtag_t, field_type_ts &... fiel * TODO */ template -constexpr auto tie_record(seqan3::vtag_t, field_type_ts &... fields) - -> record, field_type_ts &...> +constexpr auto tie_record(vtag_t, field_type_ts &... 
fields) + -> record, field_type_ts &...> { return {fields...}; } diff --git a/include/bio/seq_io/misc.hpp b/include/bio/seq_io/misc.hpp index f1f34e0..25ea591 100644 --- a/include/bio/seq_io/misc.hpp +++ b/include/bio/seq_io/misc.hpp @@ -13,7 +13,7 @@ #pragma once -#include +#include #include @@ -22,6 +22,6 @@ namespace bio::seq_io //!\brief Default fields for seqan3::seq_io::reader_options. //!\ingroup seq_io -inline constexpr auto default_field_ids = seqan3::vtag; +inline constexpr auto default_field_ids = vtag; } // namespace bio::seq_io diff --git a/include/bio/seq_io/reader.hpp b/include/bio/seq_io/reader.hpp index 685e983..72e79ae 100644 --- a/include/bio/seq_io/reader.hpp +++ b/include/bio/seq_io/reader.hpp @@ -16,12 +16,12 @@ #include #include +#include #include #include #include #include #include -#include #include #include diff --git a/include/bio/seq_io/reader_options.hpp b/include/bio/seq_io/reader_options.hpp index 2436eae..301ad27 100644 --- a/include/bio/seq_io/reader_options.hpp +++ b/include/bio/seq_io/reader_options.hpp @@ -16,13 +16,13 @@ #include #include +#include #include #include #include #include #include #include -#include #include #include @@ -58,19 +58,19 @@ inline constexpr auto field_types = []() { if constexpr (ownership == bio::ownership::deep) { - return seqan3::ttag, std::string, std::vector>, - std::conditional_t, std::string, std::vector>>; + return ttag, std::string, std::vector>, + std::conditional_t, std::string, std::vector>>; } else { - return seqan3::ttag, - std::string_view, - decltype(std::string_view{} | seqan3::views::char_to)>, - std::conditional_t, - std::string_view, - decltype(std::string_view{} | seqan3::views::char_to)>>; + return ttag, + std::string_view, + decltype(std::string_view{} | seqan3::views::char_to)>, + std::conditional_t, + std::string_view, + decltype(std::string_view{} | seqan3::views::char_to)>>; } }(); @@ -107,7 +107,7 @@ inline constexpr auto field_types_char = field_types, std::span, std::span>; 
+ ttag, std::span, std::span>; // TODO use seqan3::list_traits::repeat as soon as available /*!\brief Options that can be used to configure the behaviour of seqan3::am_io::reader. @@ -163,7 +163,7 @@ struct reader_options */ field_ids_t field_ids = default_field_ids; - /*!\brief The types corresponding to each field; a seqan3::ttag over the types. + /*!\brief The types corresponding to each field; a ttag over the types. * * \details * @@ -171,13 +171,13 @@ struct reader_options */ field_types_t field_types = field_types_dna; - /*!\brief The formats that input files can take; a seqan3::ttag over the types. + /*!\brief The formats that input files can take; a ttag over the types. * * \details * * See seqan3::am_io::reader for an overview of the the supported formats. */ - formats_t formats = seqan3::ttag; + formats_t formats = ttag; //!\brief Options that are passed on to the internal stream oject. transparent_istream_options stream_options{}; diff --git a/submodules/seqan3 b/submodules/seqan3 index ed921fb..b85ae3e 160000 --- a/submodules/seqan3 +++ b/submodules/seqan3 @@ -1 +1 @@ -Subproject commit ed921fbbd6c5c996a2df09d32b155808367ec617 +Subproject commit b85ae3ef90ba7ba7425af407aec04fc858f69f97 diff --git a/test/snippet/seq_io/snippet_reader_options.cpp b/test/snippet/seq_io/snippet_reader_options.cpp index 1379441..bb66c3f 100644 --- a/test/snippet/seq_io/snippet_reader_options.cpp +++ b/test/snippet/seq_io/snippet_reader_options.cpp @@ -40,9 +40,9 @@ bio::seq_io::reader_options options //![example_advanced2] bio::seq_io::reader_options options { - .field_ids = seqan3::vtag, - .field_types = seqan3::ttag, - .formats = seqan3::ttag + .field_ids = bio::vtag, + .field_types = bio::ttag, + .formats = bio::ttag }; //![example_advanced2] } diff --git a/test/snippet/snippet_tag.cpp b/test/snippet/snippet_tag.cpp new file mode 100644 index 0000000..c079b4d --- /dev/null +++ b/test/snippet/snippet_tag.cpp @@ -0,0 +1,28 @@ +#include + +//![vtag] +void foo(bio::vtag_t<1>) 
{ /* do one thing */ } + +void foo(bio::vtag_t<2>) { /* do another thing */ } + +void bar() +{ + foo(bio::vtag<1>); // calls first overload + foo(bio::vtag<2>); // calls second overload +} +//![vtag] + +//![ttag] +void bax(seqan3::type_list) { /* do one thing */ } + +void bax(seqan3::type_list) { /* do another thing */ } + +void bat() +{ + bax(bio::ttag); // calls first overload + bax(bio::ttag); // calls second overload +} +//![ttag] + +int main() +{} diff --git a/test/unit/format/fasta_input_test.cpp b/test/unit/format/fasta_input_test.cpp index 9c2594f..283e249 100644 --- a/test/unit/format/fasta_input_test.cpp +++ b/test/unit/format/fasta_input_test.cpp @@ -29,7 +29,7 @@ using std::literals::string_view_literals::operator""sv; struct read : public ::testing::Test { - using default_rec_t = bio::record, + using default_rec_t = bio::record, std::string_view, decltype(std::string_view{} | seqan3::views::char_to)>; @@ -58,7 +58,7 @@ struct read : public ::testing::Test bio::format_input_handler input_handler{istream}; - bio::record, id_t, seq_t> rec; + bio::record, id_t, seq_t> rec; for (unsigned i = 0; i < 3; ++i) { @@ -294,7 +294,7 @@ TEST_F(read, fail_no_seq) // // std::istringstream istream{input}; // bio::format_input_handler input_handler{istream}; -// using rec_t = bio::record, std::string_view, +// using rec_t = bio::record, std::string_view, // std::vector>; rec_t rec; // // EXPECT_THROW(input_handler.parse_next_record_into(rec), seqan3::invalid_char_assignment); diff --git a/test/unit/record_test.cpp b/test/unit/record_test.cpp index f9ee3a5..f1cfc61 100644 --- a/test/unit/record_test.cpp +++ b/test/unit/record_test.cpp @@ -14,14 +14,14 @@ #include #include #include -#include #include +#include #include using seqan3::operator""_dna4; -using default_fields = seqan3::vtag_t; +using default_fields = bio::vtag_t; // This is needed for EXPECT_RANGE_EQ: namespace seqan3 @@ -54,7 +54,7 @@ TEST(fields, usage) struct record : public ::testing::Test { - using ids = 
seqan3::vtag_t; + using ids = bio::vtag_t; using record_type = bio::record; }; @@ -111,7 +111,7 @@ TEST_F(record, make_record) std::string s = "MY ID"; auto vec = "ACGT"_dna4; - auto r = bio::make_record(seqan3::vtag, s, vec); + auto r = bio::make_record(bio::vtag, s, vec); EXPECT_TRUE((std::same_as)); } @@ -120,6 +120,6 @@ TEST_F(record, tie_record) std::string s = "MY ID"; auto vec = "ACGT"_dna4; - auto r = bio::tie_record(seqan3::vtag, s, vec); + auto r = bio::tie_record(bio::vtag, s, vec); EXPECT_TRUE((std::same_as &>>)); } diff --git a/test/unit/seq_io/seq_io_reader_test.cpp b/test/unit/seq_io/seq_io_reader_test.cpp index fc9f0b8..9673acb 100644 --- a/test/unit/seq_io/seq_io_reader_test.cpp +++ b/test/unit/seq_io/seq_io_reader_test.cpp @@ -212,8 +212,8 @@ TEST(seq_io_reader, custom_field_types) TEST(seq_io_reader, custom_field_ids_structured_bindings) { - bio::seq_io::reader_options opt{.field_ids = seqan3::vtag, - .field_types = seqan3::ttag}; + bio::seq_io::reader_options opt{.field_ids = bio::vtag, + .field_types = bio::ttag}; std::istringstream str{static_cast(input)}; bio::seq_io::reader reader{str, bio::fasta{}, opt}; diff --git a/test/unit/stream/istream_test_template.hpp b/test/unit/stream/istream_test_template.hpp index 1856d36..b9cd92c 100644 --- a/test/unit/stream/istream_test_template.hpp +++ b/test/unit/stream/istream_test_template.hpp @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include diff --git a/test/unit/stream/ostream_test_template.hpp b/test/unit/stream/ostream_test_template.hpp index 280fd38..2d2eb66 100644 --- a/test/unit/stream/ostream_test_template.hpp +++ b/test/unit/stream/ostream_test_template.hpp @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include