Skip to content

Commit

Permalink
add support for addressing barcode sequences in output template
Browse files Browse the repository at this point in the history
Molecular barcodes that have no been error corrected are now written to OX/BZ instead of RX/QX

Added support for s,c,m in the first literal of a token to address the decoded sample, cellular and molecular sequence. the decoded sequence is addressed as one, concatenated, sequence and not as a segmented sequence. For now the quality scores are unchanged from the observed sequence. This needs to be changed into something that makes more sense...

for instance `s::` will be the entire sample barcode sequence.

Fixed a bug in schema validation where template element was not validated.
  • Loading branch information
moonwatcher committed Nov 10, 2021
1 parent 544a58e commit 35966b8
Show file tree
Hide file tree
Showing 14 changed files with 5,260 additions and 5,152 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@
install.sh
/test/*/result
/test/*/*/result
/.vscode/**
46 changes: 20 additions & 26 deletions auxiliary.h
Original file line number Diff line number Diff line change
Expand Up @@ -249,13 +249,7 @@ class Auxiliary {
inline void set_RG(const string& rg) {
if(!rg.empty()) ks_put_string(rg, RG);
};
inline void update_sample_barcode(const Barcode& barcode) {
if(ks_not_empty(BC)) {
ks_put_character('-', BC);
}
barcode.encode_iupac_ambiguity(BC);
};
inline void update_sample_barcode(const Observation& observation) {
inline void update_raw_sample_barcode(const Observation& observation) {
if(ks_not_empty(BC)) {
ks_put_character('-', BC);
ks_put_character(' ', QT);
Expand All @@ -264,49 +258,49 @@ class Auxiliary {
observation.encode_phred_quality(QT, SAM_PHRED_DECODING_OFFSET);
};

inline void update_cellular_barcode(const Barcode& barcode) {
inline void update_raw_cellular_barcode(const Observation& observation) {
if(ks_not_empty(CR)) {
ks_put_character('-', CR);
ks_put_character(' ', CY);
}
observation.encode_iupac_ambiguity(CR);
observation.encode_phred_quality(CY, SAM_PHRED_DECODING_OFFSET);
};
inline void update_corrected_cellular_barcode(const Barcode& barcode) {
if(ks_not_empty(CB)) {
ks_put_character('-', CB);
}
barcode.encode_iupac_ambiguity(CB);
};
inline void update_cellular_barcode(const Observation& observation) {
inline void update_corrected_cellular_barcode(const Observation& observation) {
if(ks_not_empty(CB)) {
ks_put_character('-', CB);
}
observation.encode_iupac_ambiguity(CB);
};
inline void update_raw_cellular_barcode(const Observation& observation) {
if(ks_not_empty(CR)) {
ks_put_character('-', CR);
ks_put_character(' ', CY);

inline void update_raw_molecular_barcode(const Observation& observation) {
if(ks_not_empty(OX)) {
ks_put_character('-', OX);
ks_put_character(' ', BZ);
}
observation.encode_iupac_ambiguity(CR);
observation.encode_phred_quality(CY, SAM_PHRED_DECODING_OFFSET);
observation.encode_iupac_ambiguity(OX);
observation.encode_phred_quality(BZ, SAM_PHRED_DECODING_OFFSET);
};

inline void update_molecular_barcode(const Barcode& barcode) {
inline void update_corrected_molecular_barcode(const Barcode& barcode) {
if(ks_not_empty(RX)) {
ks_put_character('-', RX);
}
barcode.encode_iupac_ambiguity(RX);
};
inline void update_molecular_barcode(const Observation& observation) {
inline void update_corrected_molecular_barcode(const Observation& observation) {
if(ks_not_empty(RX)) {
ks_put_character('-', RX);
ks_put_character(' ', QX);
}
observation.encode_iupac_ambiguity(RX);
observation.encode_phred_quality(QX, SAM_PHRED_DECODING_OFFSET);
};
inline void update_raw_molecular_barcode(const Observation& observation) {
if(ks_not_empty(OX)) {
ks_put_character('-', OX);
ks_put_character(' ', BZ);
}
observation.encode_iupac_ambiguity(OX);
observation.encode_phred_quality(BZ, SAM_PHRED_DECODING_OFFSET);
};

inline void clear() {
/* FI and TC don't change during demultiplexing */
Expand Down
48 changes: 44 additions & 4 deletions configuration.json
Original file line number Diff line number Diff line change
Expand Up @@ -1060,12 +1060,12 @@
"$ref": "#/definitions/decoder",
"title": "Sample barcode decoder"
},
"template": {
"$ref": "#/definitions/template"
},
"threads": {
"$ref": "#/definitions/threads"
},
"transform": {
"$ref": "#/definitions/transform"
},
"working directory": {
"$ref": "#/definitions/url"
}
Expand Down Expand Up @@ -1373,11 +1373,51 @@
"Tile": "Template",
"properties": {
"transform": {
"$ref": "#/definitions/transform"
"$ref": "#/definitions/template_transform"
}
},
"type": "object"
},
"template_segment_token": {
"Description": "Token to extract from a segment for template composition",
"examples": [
"0:0:3",
"s:0:10",
"c:10:20",
"m::",
"0:0:",
"0:-4:",
"1:-5:-1",
"1:3:-2"
],
"format": "regex",
"pattern": "^(s|c|m|[0-9]+):(-?[0-9]+)?:(-?[0-9]+)?$",
"title": "Template segment token",
"type": "string"
},
"template_transform": {
"Tile": "Template transform",
"properties": {
"knit": {
"items": {
"$ref": "#/definitions/knit"
},
"minItems": 1,
"type": "array"
},
"token": {
"items": {
"$ref": "#/definitions/template_segment_token"
},
"minItems": 1,
"type": "array"
}
},
"required": [
"token"
],
"type": "object"
},
"threads": {
"description": "Number of parallel processing threads used by the pipeline.",
"minimum": 1,
Expand Down
4 changes: 4 additions & 0 deletions include.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
#include <unordered_map>
#include <vector>
#include <stddef.h>
#include <regex>

using std::cerr;
using std::condition_variable;
Expand Down Expand Up @@ -112,6 +113,9 @@ using std::uint8_t;
using std::unique_lock;
using std::unordered_map;
using std::vector;
using std::regex;
using std::smatch;
using std::regex_match;

/* zlib dependencies
Used for probing gzip compressed files */
Expand Down
9 changes: 6 additions & 3 deletions mdd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,8 @@ MdSampleDecoder::MdSampleDecoder(const Value& ontology) try :
};
void MdSampleDecoder::classify(const Read& input, Read& output) {
MdDecoder< Barcode >::classify(input, output);
output.update_sample_barcode(this->observation);
output.append_to_corrected_sample_barcode_sequence(*this->decoded, this->observation);
output.update_raw_sample_barcode(this->observation);
output.update_sample_distance(this->decoding_hamming_distance);
output.set_RG(this->rg_by_barcode_index[this->decoded->index]);
};
Expand All @@ -109,8 +110,9 @@ MdCellularDecoder::MdCellularDecoder(const Value& ontology) try :
};
void MdCellularDecoder::classify(const Read& input, Read& output) {
MdDecoder< Barcode >::classify(input, output);
output.append_to_corrected_cellular_barcode_sequence(*this->decoded, this->observation);
output.update_raw_cellular_barcode(this->observation);
output.update_cellular_barcode(*this->decoded);
output.update_corrected_cellular_barcode(*this->decoded);
if(this->decoded->is_classified()) {
output.update_cellular_distance(this->decoding_hamming_distance);
} else {
Expand All @@ -127,8 +129,9 @@ MdMolecularDecoder::MdMolecularDecoder(const Value& ontology) try :
};
void MdMolecularDecoder::classify(const Read& input, Read& output) {
MdDecoder< Barcode >::classify(input, output);
output.append_to_corrected_molecular_barcode_sequence(*this->decoded, this->observation);
output.update_raw_molecular_barcode(this->observation);
output.update_molecular_barcode(*this->decoded);
output.update_corrected_molecular_barcode(*this->decoded);
if(this->decoded->is_classified()) {
output.update_molecular_distance(this->decoding_hamming_distance);
} else {
Expand Down
2 changes: 1 addition & 1 deletion naive.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class NaiveMolecularDecoder : public Decoder< Barcode > {
inline void classify(const Read& input, Read& output) override {
this->observation.clear();
this->rule.apply(input, this->observation);
output.update_molecular_barcode(observation);
output.update_raw_molecular_barcode(observation);
Decoder< Barcode >::classify(input, output);
};
};
Expand Down
9 changes: 6 additions & 3 deletions pamld.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,8 @@ PamlSampleDecoder::PamlSampleDecoder(const Value& ontology) try :
};
void PamlSampleDecoder::classify(const Read& input, Read& output) {
PamlDecoder< Barcode >::classify(input, output);
output.update_sample_barcode(this->observation);
output.append_to_corrected_sample_barcode_sequence(*this->decoded, this->observation);
output.update_raw_sample_barcode(this->observation);
output.update_sample_distance(this->decoding_hamming_distance);
output.update_sample_decoding_confidence(this->decoding_confidence);
output.set_RG(this->rg_by_barcode_index[this->decoded->index]);
Expand All @@ -141,8 +142,9 @@ PamlCellularDecoder::PamlCellularDecoder(const Value& ontology) try :
};
void PamlCellularDecoder::classify(const Read& input, Read& output) {
PamlDecoder< Barcode >::classify(input, output);
output.append_to_corrected_cellular_barcode_sequence(*this->decoded, this->observation);
output.update_raw_cellular_barcode(this->observation);
output.update_cellular_barcode(*this->decoded);
output.update_corrected_cellular_barcode(*this->decoded);
if(this->decoded->is_classified()) {
output.update_cellular_decoding_confidence(this->decoding_confidence);
output.update_cellular_distance(this->decoding_hamming_distance);
Expand All @@ -161,8 +163,9 @@ PamlMolecularDecoder::PamlMolecularDecoder(const Value& ontology) try :
};
void PamlMolecularDecoder::classify(const Read& input, Read& output) {
PamlDecoder< Barcode >::classify(input, output);
output.append_to_corrected_molecular_barcode_sequence(*this->decoded, this->observation);
output.update_raw_molecular_barcode(this->observation);
output.update_molecular_barcode(*this->decoded);
output.update_corrected_molecular_barcode(*this->decoded);
if(this->decoded->is_classified()) {
output.update_molecular_decoding_confidence(this->decoding_confidence);
output.update_molecular_distance(this->decoding_hamming_distance);
Expand Down
52 changes: 36 additions & 16 deletions read.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,11 +157,19 @@ class Read : public SequenceArray< Segment > {
uint32_t cellular_distance;
double cellular_decoding_confidence;
double barcode_decoding_confidence;
ObservedSequence corrected_sample_barcode;
ObservedSequence corrected_cellular_barcode;
ObservedSequence corrected_molecular_barcode;

inline void clear() override {
for(auto& segment : segment_array) {
segment.clear();
}

corrected_sample_barcode.clear();
corrected_cellular_barcode.clear();
corrected_molecular_barcode.clear();

sample_distance = 0;
sample_decoding_confidence = 1;
molecular_distance = 0;
Expand Down Expand Up @@ -242,8 +250,13 @@ class Read : public SequenceArray< Segment > {
leader->auxiliary.set_RG(rg);
};

inline void update_sample_barcode(const Observation& observation) {
leader->auxiliary.update_sample_barcode(observation);
inline void append_to_corrected_sample_barcode_sequence(const Barcode& barcode, const Observation& observation) {
for(size_t i(0); i < observation.segment_cardinality(); ++i) {
corrected_sample_barcode.append_corrected(barcode[i], observation[i], 0, observation[i].length);
}
};
inline void update_raw_sample_barcode(const Observation& observation) {
leader->auxiliary.update_raw_sample_barcode(observation);
};
inline void update_sample_decoding_confidence(const double& confidence) {
if(sample_decoding_confidence == 1) {
Expand All @@ -262,11 +275,19 @@ class Read : public SequenceArray< Segment > {
sample_distance = distance;
};

inline void update_molecular_barcode(const Barcode& barcode) {
leader->auxiliary.update_molecular_barcode(barcode);
inline void append_to_corrected_molecular_barcode_sequence(const Barcode& barcode, const Observation& observation) {
for(size_t i(0); i < observation.segment_cardinality(); ++i) {
corrected_molecular_barcode.append_corrected(barcode[i], observation[i], 0, observation[i].length);
}
};
inline void update_molecular_barcode(const Observation& observation) {
leader->auxiliary.update_molecular_barcode(observation);
inline void update_raw_molecular_barcode(const Observation& observation) {
leader->auxiliary.update_raw_molecular_barcode(observation);
};
inline void update_corrected_molecular_barcode(const Barcode& barcode) {
leader->auxiliary.update_corrected_molecular_barcode(barcode);
};
inline void update_corrected_molecular_barcode(const Observation& observation) {
leader->auxiliary.update_corrected_molecular_barcode(observation);
};
inline void update_molecular_decoding_confidence(const double& confidence) {
if(molecular_decoding_confidence == 1) {
Expand All @@ -284,15 +305,17 @@ class Read : public SequenceArray< Segment > {
inline void set_molecular_distance(const uint32_t& distance) {
molecular_distance += distance;
};
inline void update_raw_molecular_barcode(const Observation& observation) {
leader->auxiliary.update_raw_molecular_barcode(observation);
};

inline void update_cellular_barcode(const Barcode& barcode) {
leader->auxiliary.update_cellular_barcode(barcode);
inline void append_to_corrected_cellular_barcode_sequence(const Barcode& barcode, const Observation& observation) {
for(size_t i(0); i < observation.segment_cardinality(); ++i) {
corrected_cellular_barcode.append_corrected(barcode[i], observation[i], 0, observation[i].length);
}
};
inline void update_cellular_barcode(const Observation& observation) {
leader->auxiliary.update_cellular_barcode(observation);
inline void update_raw_cellular_barcode(const Observation& observation) {
leader->auxiliary.update_raw_cellular_barcode(observation);
};
inline void update_corrected_cellular_barcode(const Barcode& barcode) {
leader->auxiliary.update_corrected_cellular_barcode(barcode);
};
inline void update_cellular_decoding_confidence(const double& confidence) {
if(cellular_decoding_confidence == 1) {
Expand All @@ -310,9 +333,6 @@ class Read : public SequenceArray< Segment > {
inline void set_cellular_distance(const uint32_t& distance) {
cellular_distance = distance;
};
inline void update_raw_cellular_barcode(const Observation& observation) {
leader->auxiliary.update_raw_cellular_barcode(observation);
};

Read(const int32_t& cardinality, const Platform& platform, int32_t leading_segment_index) :
SequenceArray< Segment >(cardinality),
Expand Down
20 changes: 20 additions & 0 deletions sequence.h
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,11 @@ template < class T > class SequenceArray {
segment_array[i].encode_iupac_ambiguity(buffer);
}
};
inline void encode_flat_iupac_ambiguity(kstring_t& buffer) const {
for(const auto& segment : segment_array) {
segment.encode_iupac_ambiguity(buffer);
}
};
inline void encode_bam(string& value) const {
for(const auto& segment : segment_array) {
for(int32_t i(0); i < segment.length; ++i) {
Expand Down Expand Up @@ -364,6 +369,16 @@ class ObservedSequence : public Sequence {
this->quality[length] = '\0';
}
};
inline void append_corrected(const Sequence& corrected, const ObservedSequence& other, const int32_t& start, const int32_t& size) {
if(size > 0 && start < other.length) {
increase_to_size(length + size);
memcpy(code + length, corrected.code + start, size);
memcpy(quality + length, other.quality + start, size);
/*TODO quality of corrected bases needs adjustment */
length += size;
terminate();
}
};
inline void encode_phred_quality(kstring_t& buffer, const uint8_t phred_offset) const {
if(length > 0) {
ks_increase_by_size(buffer, length + 2);
Expand Down Expand Up @@ -399,6 +414,11 @@ class Observation : public SequenceArray< ObservedSequence > {
segment_array[i].encode_phred_quality(buffer, phred_offset);
}
};
inline void encode_flat_phred_quality(kstring_t& buffer, const uint8_t phred_offset) const {
for(const auto& segment : segment_array) {
segment.encode_phred_quality(buffer, phred_offset);
}
};
inline double compensated_expected_error() const {
double y(0);
double t(0);
Expand Down
5 changes: 5 additions & 0 deletions test/BDGGG/valid/annotated.err
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@
}
}
],
"outgoing": {
"count": 2500,
"pf count": 2390,
"pf fraction": 0.956
},
"sample": {
"average classified confidence": 0.997941827740009,
"average classified distance": 0.028838342810722,
Expand Down

0 comments on commit 35966b8

Please sign in to comment.