Skip to content

Commit

Permalink
rewrite stream.h/cc to use InputFile
Browse files Browse the repository at this point in the history
also drop tag.h/cc since it was barely more than a typedef anyway
  • Loading branch information
mr-martian committed Jul 6, 2021
1 parent 798c4a4 commit 11c56c8
Show file tree
Hide file tree
Showing 20 changed files with 146 additions and 912 deletions.
2 changes: 0 additions & 2 deletions apertium/Makefile.am
Expand Up @@ -39,7 +39,6 @@ h_sources = a.h \
streamed_type.h \
string_to_wostream.h \
shell_utils.h \
tag.h \
tagger.h \
tagger_data.h \
tagger_data_hmm.h \
Expand Down Expand Up @@ -109,7 +108,6 @@ cc_sources = a.cc \
stream.cc \
stream_tagger.cc \
shell_utils.cc \
tag.cc \
tagger.cc \
tagger_data.cc \
tagger_data_hmm.cc \
Expand Down
3 changes: 1 addition & 2 deletions apertium/a.h
Expand Up @@ -18,7 +18,6 @@

#include "analysis.h"
#include "morpheme.h"
#include "tag.h"

#include <vector>

Expand All @@ -29,7 +28,7 @@ class a {
friend bool operator<(const a &a_, const a &b_);
a();
a(const Analysis &Analysis_);
std::vector<Tag> TheTags;
std::vector<UString> TheTags;
std::vector<Morpheme> TheMorphemes;
};
}
Expand Down
15 changes: 15 additions & 0 deletions apertium/analysis.cc
Expand Up @@ -51,4 +51,19 @@ Analysis::operator UString() const {

return UString_;
}

/**
 * Reads one analysis from `in`: a sequence of morphemes joined by '+'.
 *
 * Every morpheme read is appended to TheMorphemes. The first character
 * following a morpheme that is not '+' ends the analysis and is pushed
 * back onto the stream so the caller can inspect it.
 *
 * @param in stream positioned at the first character of the analysis
 * @throws Exception::Stream::UnexpectedEndOfFile if the input is at
 *         end-of-file, or the terminating character is NUL, once the
 *         last morpheme has been read
 */
void
Analysis::read(InputFile& in)
{
  UChar32 next;
  for (;;) {
    // Append an empty morpheme first, then let it parse itself in place.
    TheMorphemes.emplace_back();
    TheMorphemes.back().read(in);
    next = in.get();
    if (next != '+') {
      break;
    }
  }

  if (in.eof() || next == '\0') {
    throw Exception::Stream::UnexpectedEndOfFile("Unterminated lexical unit");
  }

  // Hand the terminator ($ or /) back to the caller.
  in.unget(next);
}
}
2 changes: 2 additions & 0 deletions apertium/analysis.h
Expand Up @@ -22,6 +22,7 @@
#include <string>
#include <vector>
#include <lttoolbox/ustring.h>
#include <lttoolbox/input_file.h>

namespace Apertium {
class Analysis {
Expand All @@ -31,6 +32,7 @@ class Analysis {
friend bool operator==(const Analysis &a, const Analysis &b);
friend bool operator<(const Analysis &a, const Analysis &b);
operator UString() const;
void read(InputFile& in);
std::vector<Morpheme> TheMorphemes;
};
}
Expand Down
9 changes: 2 additions & 7 deletions apertium/apertium_perceptron_trace.cc
Expand Up @@ -41,13 +41,8 @@ int perceptron_trace(int argc, char* argv[])
PerceptronTagger pt(flags);
pt.read_spec(argv[2]);

std::ifstream untagged_stream;
try_open_fstream("UNTAGGED_CORPUS", argv[3], untagged_stream);
Stream untagged(flags, untagged_stream, argv[3]);

std::ifstream tagged_stream;
try_open_fstream("TAGGED_CORPUS", argv[4], tagged_stream);
Stream tagged(flags, tagged_stream, argv[4]);
Stream untagged(flags, argv[3]);
Stream tagged(flags, argv[4]);

TrainingCorpus tc(tagged, untagged, false, false);

Expand Down
18 changes: 3 additions & 15 deletions apertium/deserialiser.h
Expand Up @@ -21,7 +21,6 @@
#include "i.h"
#include "lemma.h"
#include "morpheme.h"
#include "tag.h"
#include "apertium_config.h"

#include <lttoolbox/deserialiser.h>
Expand Down Expand Up @@ -62,14 +61,9 @@ template <> class Deserialiser<Morpheme> {
inline static Morpheme deserialise(std::istream &Stream_);
};

template <> class Deserialiser<Tag> {
public:
inline static Tag deserialise(std::istream &Stream_);
};

a Deserialiser<a>::deserialise(std::istream &Stream_) {
a StreamedType_;
StreamedType_.TheTags = Deserialiser<std::vector<Tag> >::deserialise(Stream_);
StreamedType_.TheTags = Deserialiser<std::vector<UString> >::deserialise(Stream_);
StreamedType_.TheMorphemes =
Deserialiser<std::vector<Morpheme> >::deserialise(Stream_);
return StreamedType_;
Expand All @@ -84,7 +78,7 @@ Analysis Deserialiser<Analysis>::deserialise(std::istream &Stream_) {

i Deserialiser<i>::deserialise(std::istream &Stream_) {
i StreamedType_;
StreamedType_.TheTags = Deserialiser<std::vector<Tag> >::deserialise(Stream_);
StreamedType_.TheTags = Deserialiser<std::vector<UString> >::deserialise(Stream_);
return StreamedType_;
}

Expand All @@ -98,13 +92,7 @@ Morpheme Deserialiser<Morpheme>::deserialise(std::istream &Stream_) {
Morpheme SerialisedType_;
SerialisedType_.TheLemma = Deserialiser<UString>::deserialise(Stream_);
SerialisedType_.TheTags =
Deserialiser<std::vector<Tag> >::deserialise(Stream_);
return SerialisedType_;
}

Tag Deserialiser<Tag>::deserialise(std::istream &Stream_) {
Tag SerialisedType_;
SerialisedType_.TheTag = Deserialiser<UString>::deserialise(Stream_);
Deserialiser<std::vector<UString> >::deserialise(Stream_);
return SerialisedType_;
}

Expand Down
3 changes: 1 addition & 2 deletions apertium/i.h
Expand Up @@ -18,7 +18,6 @@

#include "analysis.h"
#include "morpheme.h"
#include "tag.h"

#include <vector>

Expand All @@ -31,7 +30,7 @@ class i {
i();
i(const Analysis &Analysis_);
i(const Morpheme &Morpheme_);
std::vector<Tag> TheTags;
std::vector<UString> TheTags;
};
}

Expand Down
2 changes: 0 additions & 2 deletions apertium/lexical_unit.h
Expand Up @@ -17,8 +17,6 @@
#define TAGGING_EXPRESSION_H

#include "analysis.h"

#include <string>
#include <vector>

namespace Apertium {
Expand Down
52 changes: 50 additions & 2 deletions apertium/morpheme.cc
Expand Up @@ -33,7 +33,7 @@ std::ostream& operator<<(std::ostream& out, const Morpheme &morph) {
::operator<<(out, morph.TheLemma);
for (auto& it : morph.TheTags) {
out << '<';
::operator<<(out, it.TheTag);
::operator<<(out, it);
out << '>';
}
// namespace issue
Expand All @@ -54,9 +54,57 @@ Morpheme::operator UString() const {
UString ustring_ = TheLemma;

for (auto& Tag_ : TheTags) {
ustring_ += static_cast<UString>(Tag_);
ustring_ += '<';
ustring_ += Tag_;
ustring_ += '>';
}

return ustring_;
}

/**
 * Reads a single morpheme from `in`: a lemma, one or more `<tag>`
 * blocks, and an optional queue introduced by '#'.
 *
 * Lemma characters are appended to TheLemma; a backslash is kept and the
 * character after it is copied without being interpreted. Each tag is
 * stored in TheTags without its surrounding angle brackets. A queue, if
 * present, is appended to TheLemma (including the leading '#'). The
 * character that ends the morpheme is pushed back onto the stream for
 * the caller.
 *
 * @param in stream positioned at the first character of the lemma
 * @throws Exception::Stream::UnexpectedEndOfFile if a backslash is the
 *         last character available
 * @throws Exception::Morpheme::TheLemma_empty if no lemma characters
 *         precede the first delimiter
 * @throws Exception::Morpheme::TheTags_empty if a tag is `<>` or the
 *         morpheme has no tags at all
 * @throws Exception::Stream::UnexpectedCharacter if '<' follows a queue
 */
void
Morpheme::read(InputFile& in)
{
  // Characters that end a lemma (or queue) segment.
  auto ends_segment = [](UChar32 ch) {
    return ch == '<' || ch == '$' || ch == '/' || ch == '\0' || ch == '+';
  };

  // Appends characters to TheLemma, beginning with `ch`, until a segment
  // delimiter is reached, and returns that delimiter. `eof_msg` is the
  // message used when a backslash has nothing after it.
  // NOTE(review): end of input is only detected here via '\0'; confirm
  // that InputFile::get() cannot return some other end-of-file sentinel,
  // otherwise this loop would not stop on truncated input.
  auto read_segment = [&](UChar32 ch, const char* eof_msg) -> UChar32 {
    while (!ends_segment(ch)) {
      TheLemma += ch;
      if (ch == '\\') {
        if (in.eof() || in.peek() == '\0') {
          throw Exception::Stream::UnexpectedEndOfFile(eof_msg);
        }
        // Copy the escaped character verbatim so it is never treated as
        // a delimiter.
        TheLemma += in.get();
      }
      ch = in.get();
    }
    return ch;
  };

  UChar32 c = read_segment(in.get(), "Unterminated lexical unit");
  if (TheLemma.empty()) {
    throw Exception::Morpheme::TheLemma_empty("empty lemma");
  }

  while (c == '<') {
    // The block is handled as including both delimiters, so a size of
    // two means the tag was just "<>".
    UString tg = in.readBlock('<', '>');
    if (tg.size() == 2) {
      throw Exception::Morpheme::TheTags_empty("invalid tag <>");
    }
    TheTags.push_back(tg.substr(1, tg.size()-2));
    c = in.get();
  }
  if (TheTags.empty()) {
    throw Exception::Morpheme::TheTags_empty("morpheme has no tags");
  }

  if (c == '#') {
    // Queue after the tags: it is stored as part of the lemma.
    c = read_segment(c, "trailing backslash");
    if (c == '<') {
      throw Exception::Stream::UnexpectedCharacter("unexpected < after lemma queue");
    }
  }

  // Leave the terminating character for the caller.
  in.unget(c);
}
}
8 changes: 4 additions & 4 deletions apertium/morpheme.h
Expand Up @@ -16,9 +16,8 @@
#ifndef MORPHEME_H
#define MORPHEME_H

#include "tag.h"

#include <string>
#include <lttoolbox/ustring.h>
#include <lttoolbox/input_file.h>
#include <vector>
#include <iostream>

Expand All @@ -29,8 +28,9 @@ class Morpheme {
friend bool operator<(const Morpheme &a, const Morpheme &b);
friend std::ostream& operator<<(std::ostream& out, const Morpheme &morph);
operator UString() const;
void read(InputFile& in);
UString TheLemma;
std::vector<Tag> TheTags;
std::vector<UString> TheTags;
};
}

Expand Down
16 changes: 4 additions & 12 deletions apertium/perceptron_spec.cc
Expand Up @@ -61,9 +61,7 @@ static Morpheme make_sentinel_wordoid(
const UString &tag_str) {
Morpheme morpheme;
morpheme.TheLemma = lemma_str;
Tag tag;
tag.TheTag = tag_str;
morpheme.TheTags.push_back(tag);
morpheme.TheTags.push_back(tag_str);
return morpheme;
}

Expand Down Expand Up @@ -520,13 +518,7 @@ PerceptronSpec::Machine::execCommonOp(Opcode op)
stack.push(ambgset);
} break;
case EXTAGS: {
const std::vector<Tag> &tags = stack.top().wrd().TheTags;
/*std::vector<Tag>::const_iterator it = tags.begin();
std::cerr << "tags: ";
for (;it != tags.end(); it++) {
std::cerr << &(*it) << " " << it->TheTag << ", ";
}
std::cerr << "\n";*/
const std::vector<UString> &tags = stack.top().wrd().TheTags;
std::vector<std::string> *tags_str = new std::vector<std::string>;
tags_str->resize(tags.size());
transform(tags.begin(), tags.end(), tags_str->begin(), get_tag);
Expand Down Expand Up @@ -770,9 +762,9 @@ void PerceptronSpec::appendStr(UnaryFeatureVec::iterator begin,
}

std::string
PerceptronSpec::Machine::get_tag(const Tag &in) {
PerceptronSpec::Machine::get_tag(const UString &in) {
std::string result;
utf8::utf16to8(in.TheTag.begin(), in.TheTag.end(), std::back_inserter(result));
utf8::utf16to8(in.begin(), in.end(), std::back_inserter(result));
return result;
}

Expand Down
4 changes: 2 additions & 2 deletions apertium/perceptron_spec.h
Expand Up @@ -267,7 +267,7 @@ class PerceptronSpec
}
StackValue(const Morpheme &wordoid) {
/*std::cerr << "Before ";
std::vector<Tag>::const_iterator it = wordoid.TheTags.begin();
std::vector<UString>::const_iterator it = wordoid.TheTags.begin();
for (;it != wordoid.TheTags.end(); it++) {
std::cerr << &(*it) << " ";
}
Expand Down Expand Up @@ -459,7 +459,7 @@ class PerceptronSpec
int get_int_operand();
unsigned int get_uint_operand();
const std::string& get_str_operand();
static std::string get_tag(const Tag &in);
static std::string get_tag(const UString &in);
bool execCommonOp(Opcode op);
public:
void traceMachineState();
Expand Down
8 changes: 2 additions & 6 deletions apertium/sentence_stream.cc
Expand Up @@ -16,15 +16,11 @@ bool isSentenceEnd(StreamedType &token) {
if (morphemes.size() != 1) {
return false;
}
std::vector<Tag> &tags = morphemes.begin()->TheTags;
std::vector<UString> &tags = morphemes.begin()->TheTags;
if (tags.size() != 1) {
return false;
}
Tag &tag = *tags.begin();
if (tag.TheTag != "sent"_u) {
return false;
}
return true;
return (*tags.begin() == "sent"_u);
}

bool isSentenceEnd(StreamedType tok, Stream &in, bool sent_seg) {
Expand Down
12 changes: 0 additions & 12 deletions apertium/serialiser.h
Expand Up @@ -21,7 +21,6 @@
#include "i.h"
#include "lemma.h"
#include "morpheme.h"
#include "tag.h"
#include "apertium_config.h"

#include <lttoolbox/serialiser.h>
Expand Down Expand Up @@ -66,12 +65,6 @@ template <> class Serialiser<Morpheme> {
std::ostream &Output);
};

template <> class Serialiser<Tag> {
public:
inline static void serialise(const Tag &SerialisedType_,
std::ostream &Output);
};

}

void Serialiser<a>::serialise(const a &SerialisedType_, std::ostream &Output) {
Expand Down Expand Up @@ -99,11 +92,6 @@ void Serialiser<Morpheme>::serialise(const Morpheme &SerialisedType_,
::serialise(SerialisedType_.TheTags, Output);
}

void Serialiser<Tag>::serialise(const Tag &SerialisedType_,
std::ostream &Output) {
::serialise(SerialisedType_.TheTag, Output);
}

// [1] operator+ promotes its operand to a printable integral type.

#endif // SERIALISER_H

0 comments on commit 11c56c8

Please sign in to comment.