diff --git a/configure.ac b/configure.ac
index 4bcad76..15a9dae 100644
--- a/configure.ac
+++ b/configure.ac
@@ -54,6 +54,8 @@ AC_CHECK_FUNCS([setlocale strdup])
AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, fputc_unlocked, fputs_unlocked])
+AC_CHECK_HEADERS([string_view])
+
CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $LIBXML_CFLAGS $ICU_CFLAGS"
LIBS="$LIBS $LTTOOLBOX_LIBS $LIBXML_LIBS $ICU_LIBS"
diff --git a/python/setup.py.in b/python/setup.py.in
index 85973a7..2461036 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -3,42 +3,25 @@
'''
Setup for SWIG Python bindings for lex-tools
'''
-from os import path
from distutils.core import Extension, setup
-from distutils.command.build import build
-
-
-class CustomBuild(build):
- sub_commands = [
- ('build_ext', build.has_ext_modules),
- ('build_py', build.has_pure_modules),
- ('build_clib', build.has_c_libraries),
- ('build_scripts', build.has_scripts),
- ]
-
-
-def get_sources():
- sources = ['apertium_lex_tools.i']
- cc_sources = ['lrx_processor.cc']
- rel_path = '../src'
- sources.extend(path.join(rel_path, f) for f in cc_sources)
- return sources
-
-def get_include_dirs():
- # Remove '-I' from Flags, as python add '-I' on its own
- dirs = '@LTTOOLBOX_CFLAGS@'.replace('-I', '').split()
- dirs += '@LIBXML_CFLAGS@'.replace('-I', '').split()
- return dirs + ['../src']
+from sys import platform
+compile_args = '@CXXFLAGS@'.split() + '@LTTOOLBOX_CFLAGS@'.split() + '@ICU_CFLAGS@'.split()  # @VAR@ placeholders - presumably filled in by configure
+link_args = []
+if platform == 'darwin':  # extra compiler/linker flags on macOS only
+ compile_args += ['-stdlib=libc++', '-mmacosx-version-min=10.7']
+ link_args.append('-mmacosx-version-min=10.7')
apertium_lex_tools_module = Extension(
name='_apertium_lex_tools',
- sources=get_sources(),
- swig_opts=['-c++', '-I../src', '-Wall']+'@LTTOOLBOX_CFLAGS@'.split()+'@LIBXML_CFLAGS@'.split()+'@ICU_CFLAGS@'.split(),
- include_dirs=get_include_dirs(),
- library_dirs=['/usr/include/libxml2', '/usr/local/lib'],
- extra_compile_args='@CXXFLAGS@'.split(),
- extra_link_args=['-lxml2', '-llttoolbox3'],
+ language='c++',
+ sources=['apertium_lex_tools.i'],  # only the SWIG interface file is listed as a source
+ swig_opts=['-c++', '-I..', '-I@top_srcdir@/src', '-Wall'],
+ include_dirs=['@top_srcdir@', '@top_srcdir@/src'],
+ library_dirs=['@top_srcdir@/src/.libs'],
+ libraries=[],
+ extra_compile_args=compile_args,
+ extra_link_args=link_args,  # NOTE(review): the removed config passed -lxml2 -llttoolbox3; nothing names a library now - confirm the module still links
)
setup(
@@ -50,7 +33,7 @@ setup(
author_email='@PACKAGE_BUGREPORT@',
license='GPL-3.0+',
maintainer_email='@PACKAGE_BUGREPORT@',
- cmdclass={'build': CustomBuild},
ext_modules=[apertium_lex_tools_module],
py_modules=['apertium_lex_tools'],
+ data_files=[]
)
diff --git a/src/binary_header.h b/src/binary_header.h
new file mode 100644
index 0000000..8df71b4
--- /dev/null
+++ b/src/binary_header.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2021 Apertium
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef _LRX_BINARY_HEADER_
+#define _LRX_BINARY_HEADER_
+
+#include
+#include
+
+// Magic bytes and feature flags written at the start of a compiled LRX binary
+constexpr char HEADER_LRX[4]{'A', 'L', 'R', 'X'};
+enum LRX_FEATURES : uint64_t {
+ LRX_MMAP = (1ull << 0), // using mmap-compatible format rather than compressed format
+ LRX_UNKNOWN = (1ull << 1), // Features >= this are unknown, so throw an error; Inc this if more features are added
+ LRX_RESERVED = (1ull << 63), // If we ever reach this many feature flags, we need a flag to know how to extend beyond 64 bits
+};
+
+struct weight {
+ int32_t id; // number used to build the "<id>" key of the weights map when a legacy file is loaded
+ char _pad[4]{}; // four zero-initialised bytes declared between id and pisu
+ double pisu; // value stored in the weights map under that key
+};
+
+inline void weight_to_le(weight& w) {
+ // Convert both fields to little-endian directly in w's own storage; converting
+ // local copies (as before) left w untouched. Assumes to_le_32/to_le_64 take
+ // their argument by reference and convert it in place - TODO confirm.
+ to_le_32(*reinterpret_cast<uint32_t*>(&w.id));
+ to_le_64(*reinterpret_cast<uint64_t*>(&w.pisu));
+}
+
+inline void weight_from_le(weight& w) {
+ // Convert both fields from little-endian directly in w's own storage; converting
+ // local copies (as before) left w untouched. Assumes from_le_32/from_le_64 take
+ // their argument by reference and convert it in place - TODO confirm.
+ from_le_32(*reinterpret_cast<uint32_t*>(&w.id));
+ from_le_64(*reinterpret_cast<uint64_t*>(&w.pisu));
+}
+
+#endif
diff --git a/src/lrx_compiler.cc b/src/lrx_compiler.cc
index c9dd3aa..01c3427 100644
--- a/src/lrx_compiler.cc
+++ b/src/lrx_compiler.cc
@@ -16,12 +16,11 @@
*/
#include
-#include
#include
#include
-#include
#include
#include
+#include
using namespace std;
@@ -908,32 +907,45 @@ LRXCompiler::procSeq()
void
LRXCompiler::write(FILE *fst)
{
- alphabet.write(fst);
+ fwrite_unlocked(HEADER_LRX, 1, 4, fst);
+ uint64_t features = 0;
+ features |= LRX_MMAP;
+ write_le_64(fst, features);
- Compression::multibyte_write(recognisers.size(), fst);
- for(auto& it : recognisers)
- {
- Compression::string_write(it.first, fst);
+ StringWriter sw;
+ for (auto& it : alphabet.getTags()) {
+ sw.add(it);
+ }
+ for (auto& it : recognisers) {
+ sw.add(it.first);
+ }
+ sw.write(fst);
+
+ alphabet.write_mmap(fst, sw);
+
+ write_le_64(fst, recognisers.size());
+ for (auto& it : recognisers) {
+ StringRef loc = sw.add(it.first);
+ write_le_32(fst, loc.start);
+ write_le_32(fst, loc.count);
+ it.second.write_mmap(fst, alphabet);
debug("+ %d => %S\n", it.second.size(), it.first.c_str());
if (debugMode) {
it.second.show(alphabet, debug_output, 0, false);
}
- it.second.write(fst);
}
- Compression::string_write("main"_u, fst);
if(outputGraph)
{
transducer.show(alphabet, debug_output, 0, false);
}
- transducer.write(fst);
+ transducer.write_mmap(fst, alphabet);
+ write_le_64(fst, weights.size());
for(auto& it : weights)
{
debug("%.4f %d\n", it.second, it.first);
- weight record{it.first, "", it.second};
- weight_to_le(record);
- fwrite((void *)&record, 1, sizeof(weight), fst);
+ write_le_double(fst, it.second);
}
if(!outputGraph)
diff --git a/src/lrx_processor.cc b/src/lrx_processor.cc
index 8f6f100..8498e10 100644
--- a/src/lrx_processor.cc
+++ b/src/lrx_processor.cc
@@ -15,11 +15,14 @@
* along with this program; if not, see .
*/
-#include
+#include
#include
#include
#include
#include
+#include
+#include
+#include
using namespace std;
@@ -45,22 +48,15 @@ LRXProcessor::itow(int i)
LRXProcessor::LRXProcessor()
-{
-
- initial_state = new State();
-
- lineno = 1; // Used for rule tracing
- pos = 0;
-
- traceMode = false;
- debugMode = false;
- outOfWord = true;
- nullFlush = false;
-}
+ : alphabet(&str_write), initial_state(new State()) // flags and counters now get their defaults from in-class initialisers in lrx_processor.h
+{}
LRXProcessor::~LRXProcessor()
{
delete initial_state;
+ if (mmapping) { // true only when load() obtained a mapping from mmap_file()
+ munmap(mmap_pointer, mmap_len);
+ }
}
void
@@ -84,60 +80,119 @@ LRXProcessor::setDebugMode(bool m)
void
LRXProcessor::load(FILE *in)
{
- alphabet.read(in);
- any_char = alphabet(LRX_PROCESSOR_TAG_ANY_CHAR);
- any_tag = alphabet(LRX_PROCESSOR_TAG_ANY_TAG);
- any_upper = alphabet(LRX_PROCESSOR_TAG_ANY_UPPER);
- any_lower = alphabet(LRX_PROCESSOR_TAG_ANY_LOWER);
- word_boundary = alphabet(LRX_PROCESSOR_TAG_WORD_BOUNDARY);
-
- int len = Compression::multibyte_read(in);
-
- while(len > 0)
- {
- UString name = Compression::string_read(in);
- recognisers[name].read(in, alphabet);
- if(debugMode)
- {
- cerr << "Recogniser: " << name << ", [finals: " << recognisers[name].getFinals().size() << "]\n";
+ bool mmap = false; // becomes true when the file starts with the ALRX magic and has LRX_MMAP set
+ fpos_t pos; // NOTE(review): stays uninitialised if fgetpos() fails, yet is passed to fsetpos() in the legacy branch below
+ if (fgetpos(in, &pos) == 0) {
+ char header[4]{};
+ if (fread_unlocked(header, 1, 4, in) == 4 &&
+ strncmp(header, HEADER_LRX, 4) == 0) {
+ auto features = read_le_64(in);
+ if (features >= LRX_UNKNOWN) {
+ throw std::runtime_error("Rule file has features that are unknown to this version of apertium-lex-tools - upgrade!");
+ }
+ mmap = features & LRX_MMAP;
+ } else {
+ fsetpos(in, &pos); // no ALRX magic: rewind and parse as the legacy compressed format
}
- len--;
}
- if(debugMode)
- {
- cerr << "recognisers: " << recognisers.size() << endl;
- }
+ if(mmap) {
+ fgetpos(in, &pos);
+ rewind(in);
+ mmapping = mmap_file(in, mmap_pointer, mmap_len);
+ if (mmapping) {
+ void* ptr = mmap_pointer + 12; // skip the 4-byte magic and the 64-bit feature word
+ ptr = str_write.init(ptr);
+
+ ptr = alphabet.init(ptr);
+
+ uint64_t recognizer_count = reinterpret_cast<uint64_t*>(ptr)[0];
+ ptr += sizeof(uint64_t);
+ for (uint64_t i = 0; i < recognizer_count; i++) {
+ StringRef tn = reinterpret_cast<StringRef*>(ptr)[0];
+ ptr += sizeof(StringRef);
+ UString name = UString{str_write.get(tn)};
+ ptr = recognisers[name].init(ptr);
+ }
- UString name = Compression::string_read(in);
+ ptr = transducer.init(ptr);
- transducer.read(in, alphabet);
+ uint64_t weight_count = reinterpret_cast<uint64_t*>(ptr)[0];
+ ptr += sizeof(uint64_t);
+ double* weight_list = reinterpret_cast<double*>(ptr);
+ for (uint64_t i = 0; i < weight_count; i++) {
+ UString sid = "<"_u + itow(i + 1) + ">"_u;
+ weights[sid] = weight_list[i];
+ }
+ } else {
+ fsetpos(in, &pos); // mmap_file() failed: stream-read the same layout from just past the header
- // Now read in weights
- weight record;
- while(fread(&record, sizeof(weight), 1, in))
- {
- weight_from_le(record);
- UString sid = "<"_u + itow(record.id) + ">"_u;
- weights[sid] = record.pisu;
+ str_write.read(in);
- /*
- if(debugMode)
- {
- cerr << sid << " " << record.id << " weight(" << record.pisu << ")\n";
+ alphabet.read(in, true);
+
+ uint64_t recognizer_count = read_le_64(in);
+ for (uint64_t i = 0; i < recognizer_count; i++) {
+ uint32_t s = read_le_32(in);
+ uint32_t c = read_le_32(in);
+ UString name = UString{str_write.get(s, c)};
+ recognisers[name].read(in);
+ }
+
+ transducer.read(in);
+
+ uint64_t weight_count = read_le_64(in); // the count is written with write_le_64, so it must not be read as a double
+ for (uint64_t i = 0; i < weight_count; i++) {
+ UString sid = "<"_u + itow(i + 1) + ">"_u;
+ weights[sid] = read_le_double(in);
+ }
+ }
+ } else {
+ Alphabet temp_alpha;
+ temp_alpha.read(in);
+ fsetpos(in, &pos); // go back so the same alphabet is read a second time, into `alphabet`
+ alphabet.read(in, false);
+
+ int len = Compression::multibyte_read(in);
+
+ while(len > 0) {
+ UString name = Compression::string_read(in);
+ recognisers[name].read_compressed(in, temp_alpha);
+ if(debugMode) {
+ //cerr << "Recogniser: " << name << ", [finals: " << recognisers[name].getFinals().size() << "]\n";
+ }
+ len--;
+ }
+
+ if(debugMode) {
+ cerr << "recognisers: " << recognisers.size() << endl;
+ }
+
+ UString name = Compression::string_read(in);
+
+ transducer.read_compressed(in, temp_alpha);
+
+ // Now read in weights
+ weight record;
+ while(fread(&record, sizeof(weight), 1, in)) {
+ weight_from_le(record);
+ UString sid = "<"_u + itow(record.id) + ">"_u;
+ weights[sid] = record.pisu;
}
- */
}
- return;
+ any_char = alphabet(LRX_PROCESSOR_TAG_ANY_CHAR);
+ any_tag = alphabet(LRX_PROCESSOR_TAG_ANY_TAG);
+ any_upper = alphabet(LRX_PROCESSOR_TAG_ANY_UPPER);
+ any_lower = alphabet(LRX_PROCESSOR_TAG_ANY_LOWER);
+ word_boundary = alphabet(LRX_PROCESSOR_TAG_WORD_BOUNDARY);
}
void
LRXProcessor::init()
{
- initial_state->init(transducer.getInitial());
-
- anfinals.insert(transducer.getFinals().begin(), transducer.getFinals().end());
+ anfinals.insert(&transducer);
+ initial_state->init(anfinals);
escaped_chars.insert('[');
escaped_chars.insert(']');
@@ -162,13 +217,12 @@ LRXProcessor::recognisePattern(const UString lu, const UString op)
return false;
}
+ set exes;
+ exes.insert(&recognisers[op]);
State *first_state = new State();
- first_state->init(recognisers[op].getInitial());
+ first_state->init(exes);
State cur = *first_state;
- map end_states;
- end_states.insert(recognisers[op].getFinals().begin(), recognisers[op].getFinals().end());
-
bool readingTag = false;
UString tag;
int val = 0;
@@ -249,12 +303,7 @@ LRXProcessor::recognisePattern(const UString lu, const UString op)
cerr << ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n";
}
*/
- if(cur.isFinal(end_states))
- {
- return true;
- }
-
- return false;
+ return cur.isFinal(exes);
}
void
diff --git a/src/lrx_processor.h b/src/lrx_processor.h
index 1a03d86..a865777 100644
--- a/src/lrx_processor.h
+++ b/src/lrx_processor.h
@@ -23,11 +23,10 @@
#include
#include
-#include
-
-#include
+#include
#include
-#include
+#include
+#include
#include
using namespace std;
@@ -36,21 +35,22 @@ class LRXProcessor
{
private:
- Alphabet alphabet;
- TransExe transducer;
- map recognisers;
+ StringWriter str_write;
+ AlphabetExe alphabet;
+ TransducerExe transducer;
+ map recognisers;
map weights;
vector alive_states;
- map anfinals;
+ set anfinals;
set escaped_chars;
State *initial_state;
- bool traceMode;
- bool debugMode;
- bool nullFlush;
- bool outOfWord;
+ bool traceMode = false;
+ bool debugMode = false;
+ bool nullFlush = false;
+ bool outOfWord = true;
int32_t any_char;
int32_t any_upper;
@@ -58,8 +58,12 @@ class LRXProcessor
int32_t any_tag;
int32_t word_boundary;
- unsigned int pos;
- unsigned long lineno;
+ unsigned int pos = 0;
+ unsigned long lineno = 1; // Used for rule tracing
+
+ bool mmapping = false;
+ void* mmap_pointer = nullptr;
+ int mmap_len = 0;
UString itow(int i);
bool recognisePattern(const UString lu, const UString op);
diff --git a/src/weight.h b/src/weight.h
deleted file mode 100644
index cc12458..0000000
--- a/src/weight.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (C) 2011--2012 Universitat d'Alacant
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see .
- */
-
-#ifndef __WEIGHT_H__
-#define __WEIGHT_H__
-
-#include
-#include
-#include
-
-struct weight {
- int32_t id;
- char _pad[4]{};
- double pisu;
-};
-
-// This should all be optimized out on little-endian archs
-
-template
-inline uint64_t U64(T t) {
- return static_cast(t);
-}
-
-inline void weight_to_le(weight& w) {
- uint32_t id = static_cast(w.id);
- uint8_t *bytes = reinterpret_cast(&w.id);
- bytes[3] = (id >> 24) & 0xFF;
- bytes[2] = (id >> 16) & 0xFF;
- bytes[1] = (id >> 8) & 0xFF;
- bytes[0] = id & 0xFF;
-
- bytes = reinterpret_cast(&w.pisu);
- uint64_t pisu = *reinterpret_cast(&w.pisu);
- bytes[7] = (pisu >> 56) & 0xFF;
- bytes[6] = (pisu >> 48) & 0xFF;
- bytes[5] = (pisu >> 40) & 0xFF;
- bytes[4] = (pisu >> 32) & 0xFF;
- bytes[3] = (pisu >> 24) & 0xFF;
- bytes[2] = (pisu >> 16) & 0xFF;
- bytes[1] = (pisu >> 8) & 0xFF;
- bytes[0] = pisu & 0xFF;
-}
-
-inline void weight_from_le(weight& w) {
- uint32_t id = static_cast(w.id);
- uint8_t *bytes = reinterpret_cast(&id);
- id = (bytes[3] << 24) | (bytes[2] << 16) | (bytes[1] << 8) | bytes[0];
- w.id = static_cast(id);
-
- bytes = reinterpret_cast(&w.pisu);
- uint64_t pisu = (U64(bytes[7]) << 56ull) | (U64(bytes[6]) << 48ull) | (U64(bytes[5]) << 40ull) | (U64(bytes[4]) << 32ull) | (U64(bytes[3]) << 24ull) | (U64(bytes[2]) << 16ull) | (U64(bytes[1]) << 8ull) | U64(bytes[0]);
- w.pisu = *reinterpret_cast(&pisu);
-}
-
-#endif /* __WEIGHT_H__ */