Skip to content

Commit

Permalink
AlphabetExe
Browse files Browse the repository at this point in the history
  • Loading branch information
mr-martian committed Aug 2, 2021
1 parent cbc3272 commit a8b0ace
Show file tree
Hide file tree
Showing 12 changed files with 199 additions and 26 deletions.
4 changes: 2 additions & 2 deletions lttoolbox/Makefile.am
@@ -1,11 +1,11 @@

h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h \
h_sources = alphabet.h alphabet_exe.h att_compiler.h buffer.h compiler.h compression.h \
deserialiser.h endian_util.h entry_token.h expander.h fst_processor.h input_file.h lt_locale.h \
match_exe.h match_node.h match_state.h match_state2.h my_stdio.h node.h \
pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h string_view.h string_writer.h \
transducer.h transducer_exe.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \
ustring.h
cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \
cc_sources = alphabet.cc alphabet_exe.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \
expander.cc fst_processor.cc input_file.cc lt_locale.cc match_exe.cc \
match_node.cc match_state.cc match_state2.cc node.cc pattern_list.cc \
regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc string_writer.cc transducer.cc transducer_exe.cc \
Expand Down
6 changes: 6 additions & 0 deletions lttoolbox/alphabet.cc
Expand Up @@ -316,3 +316,9 @@ Alphabet::createLoopbackSymbols(set<int32_t> &symbols, Alphabet &basis, Side s,
}
}
}

/**
 * Give access to the tag list held by this alphabet.
 * @return a mutable reference to the internal vector of tag strings
 *         (slexicinv); no copy is made
 */
vector<UString>&
Alphabet::getTags()
{
  return this->slexicinv;
}
5 changes: 5 additions & 0 deletions lttoolbox/alphabet.h
Expand Up @@ -197,6 +197,11 @@ class Alphabet
* @param nonTagsToo by default only tags are included, but if this is true we include all symbols
*/
void createLoopbackSymbols(set<int32_t> &symbols, Alphabet &basis, Side s = right, bool nonTagsToo = false);

/**
* Return a reference to the array of tags
*/
vector<UString>& getTags();
};

#endif
96 changes: 96 additions & 0 deletions lttoolbox/alphabet_exe.cc
@@ -0,0 +1,96 @@
/*
* Copyright (C) 2021 Apertium
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see <https://www.gnu.org/licenses/>.
*/

#include <lttoolbox/alphabet_exe.h>

#include <lttoolbox/compression.h>

AlphabetExe::AlphabetExe(StringWriter* sw_)
: sw(sw_), tag_count(0), tags(nullptr)
{}

/**
 * Free the tag array. `tags` starts out as nullptr, in which case
 * delete[] does nothing. The StringWriter pointed to by `sw` is not
 * touched here.
 */
AlphabetExe::~AlphabetExe()
{
delete[] tags;
}

/**
 * Load the tag table from a serialised alphabet.
 *
 * Reads the number of tags, then each tag name, wraps it in '<' ... '>'
 * and stores it in the StringWriter. Tag number i is given the symbol
 * code -(i + 1). After the tags, a count followed by two values per entry
 * is read and discarded.
 *
 * Any table loaded by an earlier call is released first, so calling
 * read() again neither leaks the old array nor keeps stale map entries.
 *
 * @param input stream positioned at the start of the alphabet data
 * @param mmap  when true nothing is read and the object is left unchanged
 *              (this path is not implemented)
 */
void
AlphabetExe::read(FILE* input, bool mmap)
{
  if (mmap) {
    return;
  }

  // Release whatever a previous read() left behind.
  delete[] tags;
  tags = nullptr;
  tag_count = 0;
  symbol_map.clear();

  // Only publish the count once the array really exists, so the object
  // stays consistent if the allocation throws.
  const uint64_t count = Compression::multibyte_read(input);
  tags = new StringRef[count];
  tag_count = count;

  for (uint32_t i = 0; i < tag_count; i++) {
    UString tg;
    tg += '<';
    tg += Compression::string_read(input);
    tg += '>';
    tags[i] = sw->add(tg);
  }
  // has to be a separate loop, otherwise the string_views get
  // invalidated when the StringWriter buffer expands
  for (uint32_t i = 0; i < tag_count; i++) {
    symbol_map[sw->get(tags[i])] = -static_cast<int32_t>(i) - 1;
  }

  // Skip the table that follows the tags: two values per entry.
  int pairs = Compression::multibyte_read(input);
  for (int i = 0; i < pairs; i++) {
    Compression::multibyte_read(input);
    Compression::multibyte_read(input);
  }
}

/**
 * Look up the symbol code of a tag.
 * @param sv tag text to search for in the symbol map
 * @return the code stored for sv, or 0 when sv is not in the map
 */
int32_t
AlphabetExe::operator()(UString_view sv)
{
  const auto found = symbol_map.find(sv);
  return (found == symbol_map.end()) ? 0 : found->second;
}

/**
 * Append the textual form of a symbol to a string.
 *
 * @param result    string the text is appended to
 * @param symbol    0 appends nothing; a negative value selects tag
 *                  number -(symbol + 1); a positive value is treated as
 *                  a character code
 * @param uppercase when true, a positive symbol is passed through
 *                  u_toupper before being appended (tags are unaffected)
 *
 * A negative symbol that does not correspond to a loaded tag (including
 * any negative symbol before read() has been called) appends nothing
 * instead of indexing outside the tag array.
 */
void
AlphabetExe::getSymbol(UString& result, int32_t symbol, bool uppercase) const
{
  if (symbol == 0) {
    return;
  }

  if (symbol < 0) {
    // -(symbol + 1) is safe for every negative value, whereas
    // -symbol - 1 overflows for INT32_MIN.
    const uint64_t index = static_cast<uint64_t>(-(symbol + 1));
    if (index < tag_count) {
      result.append(sw->get(tags[index]));
    }
    return;
  }

  if (uppercase) {
    result += u_toupper(static_cast<UChar32>(symbol));
  } else {
    result += static_cast<UChar32>(symbol);
  }
}

/**
 * Tell whether a symbol code denotes a tag.
 * @param symbol code to test
 * @return true when the code is negative, false otherwise
 */
bool
AlphabetExe::isTag(const int32_t symbol) const
{
  const bool negative = symbol < 0;
  return negative;
}

void
AlphabetExe::clearSymbol(const int32_t symbol)
{
if (symbol < 0) {
tags[-symbol-1].start = 0;
tags[-symbol-1].count = 0;
}
}
40 changes: 40 additions & 0 deletions lttoolbox/alphabet_exe.h
@@ -0,0 +1,40 @@
/*
* Copyright (C) 2021 Apertium
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see <https://www.gnu.org/licenses/>.
*/

#ifndef _LT_ALPHABET_EXE_
#define _LT_ALPHABET_EXE_

#include <lttoolbox/string_writer.h>
#include <map>

class AlphabetExe {
private:
StringWriter* sw;
uint64_t tag_count;
StringRef* tags;
std::map<UString_view, int32_t> symbol_map;
public:
AlphabetExe(StringWriter* sw_);
~AlphabetExe();
void read(FILE* in, bool mmap);
int32_t operator()(UString_view sv);
void getSymbol(UString& result, int32_t symbol, bool uppercase = false) const;
bool isTag(const int32_t symbol) const;
void clearSymbol(const int32_t symbol);
};

#endif
2 changes: 1 addition & 1 deletion lttoolbox/compiler.cc
Expand Up @@ -179,7 +179,7 @@ Compiler::procAlphabet()
bool space = true;
for(unsigned int i = 0; i < letters.length(); i++)
{
if(!u_isspace(letters.at(i)))
if(!u_isspace(letters[i]))
{
space = false;
break;
Expand Down
13 changes: 9 additions & 4 deletions lttoolbox/fst_processor.cc
Expand Up @@ -40,6 +40,7 @@ UString const FSTProcessor::WBLANK_FINAL = "[[/]]"_u;


FSTProcessor::FSTProcessor()
: alphabet(AlphabetExe(&str_write))
{
// escaped_chars chars
escaped_chars.insert('[');
Expand Down Expand Up @@ -956,13 +957,17 @@ FSTProcessor::load(FILE *input)
}

// symbols
alphabet.read(input);
fgetpos(input, &pos);
alphabet.read(input, false);
fsetpos(input, &pos);
Alphabet temp;
temp.read(input);

len = Compression::multibyte_read(input);

while(len > 0) {
UString name = Compression::string_read(input);
transducers[name].read(input, alphabet);
transducers[name].read(input, temp);
len--;
}
}
Expand Down Expand Up @@ -1067,7 +1072,7 @@ FSTProcessor::initDecompositionSymbols()
}
else if(!showControlSymbols)
{
alphabet.setSymbol(compoundOnlyLSymbol, ""_u);
alphabet.clearSymbol(compoundOnlyLSymbol);
}

if((compoundRSymbol=alphabet("<:co:R>"_u)) == 0
Expand All @@ -1080,7 +1085,7 @@ FSTProcessor::initDecompositionSymbols()
}
else if(!showControlSymbols)
{
alphabet.setSymbol(compoundRSymbol, ""_u);
alphabet.clearSymbol(compoundRSymbol);
}
}

Expand Down
10 changes: 8 additions & 2 deletions lttoolbox/fst_processor.h
Expand Up @@ -19,10 +19,11 @@
#define _FSTPROCESSOR_

#include <lttoolbox/ustring.h>
#include <lttoolbox/alphabet.h>
#include <lttoolbox/alphabet_exe.h>
#include <lttoolbox/buffer.h>
#include <lttoolbox/my_stdio.h>
#include <lttoolbox/state.h>
#include <lttoolbox/string_writer.h>
#include <lttoolbox/transducer_exe.h>
#include <lttoolbox/input_file.h>
#include <libxml/xmlreader.h>
Expand Down Expand Up @@ -134,10 +135,15 @@ class FSTProcessor
*/
int rcx_current_char;

/**
* String manager
*/
StringWriter str_write;

/**
* Alphabet
*/
Alphabet alphabet;
AlphabetExe alphabet;

/**
* Input buffer
Expand Down
14 changes: 7 additions & 7 deletions lttoolbox/state.cc
Expand Up @@ -452,7 +452,7 @@ State::NFinals(vector<pair<UString, double>> lf, int maxAnalyses, int maxWeightC

UString
State::filterFinals(const set<TransducerExe*>& finals,
Alphabet const &alphabet,
AlphabetExe const &alphabet,
set<UChar32> const &escaped_chars,
bool display_weights, int max_analyses, int max_weight_classes,
bool uppercase, bool firstupper, int firstchar) const
Expand Down Expand Up @@ -537,7 +537,7 @@ State::filterFinals(const set<TransducerExe*>& finals,

set<pair<UString, vector<UString> > >
State::filterFinalsLRX(const set<TransducerExe*>& finals,
Alphabet const &alphabet,
AlphabetExe const &alphabet,
set<UChar32> const &escaped_chars,
bool uppercase, bool firstupper, int firstchar) const
{
Expand Down Expand Up @@ -584,7 +584,7 @@ State::filterFinalsLRX(const set<TransducerExe*>& finals,

UString
State::filterFinalsSAO(const set<TransducerExe*>& finals,
Alphabet const &alphabet,
AlphabetExe const &alphabet,
set<UChar32> const &escaped_chars,
bool uppercase, bool firstupper, int firstchar) const
{
Expand Down Expand Up @@ -635,7 +635,7 @@ State::filterFinalsSAO(const set<TransducerExe*>& finals,

UString
State::filterFinalsTM(const set<TransducerExe*>& finals,
Alphabet const &alphabet,
AlphabetExe const &alphabet,
set<UChar32> const &escaped_chars,
queue<UString> &blankqueue, vector<UString> &numbers) const
{
Expand Down Expand Up @@ -749,12 +749,12 @@ State::pruneCompounds(int requiredSymbol, int separationSymbol, int compound_max

for(unsigned int i = 0; i<state.size(); i++)
{
vector<pair<int, double>> seq = *state.at(i).sequence;
vector<pair<int, double>> seq = *state[i].sequence;

if(lastPartHasRequiredSymbol(seq, requiredSymbol, separationSymbol))
{
int this_noOfCompoundElements = 0;
for (int j = seq.size()-2; j>0; j--) if ((seq.at(j)).first==separationSymbol) this_noOfCompoundElements++;
for (int j = seq.size()-2; j>0; j--) if ((seq[j]).first==separationSymbol) this_noOfCompoundElements++;
noOfCompoundElements[i] = this_noOfCompoundElements;
minNoOfCompoundElements = (minNoOfCompoundElements < this_noOfCompoundElements) ?
minNoOfCompoundElements : this_noOfCompoundElements;
Expand Down Expand Up @@ -862,7 +862,7 @@ State::restartFinals(const set<TransducerExe*>& finals, int requiredSymbol, Stat


UString
State::getReadableString(const Alphabet &a)
State::getReadableString(const AlphabetExe &a)
{
UString retval;
retval += '[';
Expand Down
12 changes: 6 additions & 6 deletions lttoolbox/state.h
Expand Up @@ -24,7 +24,7 @@
#include <queue>
#include <climits>

#include <lttoolbox/alphabet.h>
#include <lttoolbox/alphabet_exe.h>
#include <lttoolbox/node.h>
#include <lttoolbox/match_exe.h>
#include <lttoolbox/match_state.h>
Expand Down Expand Up @@ -259,7 +259,7 @@ class State
* @return the result of the transduction
*/
UString filterFinals(const set<TransducerExe*>& finals,
Alphabet const &a,
AlphabetExe const &a,
set<UChar32> const &escaped_chars,
bool display_weights = false,
int max_analyses = INT_MAX,
Expand All @@ -280,7 +280,7 @@ class State
* @return the result of the transduction
*/
UString filterFinalsSAO(const set<TransducerExe*>& finals,
Alphabet const &a,
AlphabetExe const &a,
set<UChar32> const &escaped_chars,
bool uppercase = false,
bool firstupper = false,
Expand All @@ -300,7 +300,7 @@ class State
*/

set<pair<UString, vector<UString> > > filterFinalsLRX(const set<TransducerExe*>& finals,
Alphabet const &a,
AlphabetExe const &a,
set<UChar32> const &escaped_chars,
bool uppercase = false,
bool firstupper = false,
Expand Down Expand Up @@ -332,10 +332,10 @@ class State
/**
* Return the full states string (to allow debugging...) using a Java ArrayList.toString style
*/
UString getReadableString(const Alphabet &a);
UString getReadableString(const AlphabetExe &a);

UString filterFinalsTM(const set<TransducerExe*>& finals,
Alphabet const &alphabet,
AlphabetExe const &alphabet,
set<UChar32> const &escaped_chars,
queue<UString> &blanks,
vector<UString> &numbers) const;
Expand Down

0 comments on commit a8b0ace

Please sign in to comment.