Skip to content

Commit

Permalink
mmap-able files
Browse files Browse the repository at this point in the history
  • Loading branch information
mr-martian committed Aug 3, 2021
1 parent 4d13731 commit 743ed7f
Show file tree
Hide file tree
Showing 7 changed files with 225 additions and 190 deletions.
2 changes: 2 additions & 0 deletions configure.ac
Expand Up @@ -54,6 +54,8 @@ AC_CHECK_FUNCS([setlocale strdup])

AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, fputc_unlocked, fputs_unlocked])

AC_CHECK_HEADERS([string_view])

CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $LIBXML_CFLAGS $ICU_CFLAGS"
LIBS="$LIBS $LTTOOLBOX_LIBS $LIBXML_LIBS $ICU_LIBS"

Expand Down
47 changes: 15 additions & 32 deletions python/setup.py.in
Expand Up @@ -3,42 +3,25 @@
'''
Setup for SWIG Python bindings for lex-tools
'''
from os import path
from distutils.core import Extension, setup
from distutils.command.build import build


class CustomBuild(build):
sub_commands = [
('build_ext', build.has_ext_modules),
('build_py', build.has_pure_modules),
('build_clib', build.has_c_libraries),
('build_scripts', build.has_scripts),
]


def get_sources():
sources = ['apertium_lex_tools.i']
cc_sources = ['lrx_processor.cc']
rel_path = '../src'
sources.extend(path.join(rel_path, f) for f in cc_sources)
return sources

def get_include_dirs():
# Remove '-I' from Flags, as python add '-I' on its own
dirs = '@LTTOOLBOX_CFLAGS@'.replace('-I', '').split()
dirs += '@LIBXML_CFLAGS@'.replace('-I', '').split()
return dirs + ['../src']
from sys import platform

compile_args = '@CXXFLAGS@'.split() + '@LTTOOLBOX_CFLAGS@'.split() + '@ICU_CFLAGS@'.split()
link_args = []
if platform == 'darwin':
compile_args += ['-stdlib=libc++', '-mmacosx-version-min=10.7']
link_args.append('-mmacosx-version-min=10.7')

apertium_lex_tools_module = Extension(
name='_apertium_lex_tools',
sources=get_sources(),
swig_opts=['-c++', '-I../src', '-Wall']+'@LTTOOLBOX_CFLAGS@'.split()+'@LIBXML_CFLAGS@'.split()+'@ICU_CFLAGS@'.split(),
include_dirs=get_include_dirs(),
library_dirs=['/usr/include/libxml2', '/usr/local/lib'],
extra_compile_args='@CXXFLAGS@'.split(),
extra_link_args=['-lxml2', '-llttoolbox3'],
language='c++',
sources=['apertium_lex_tools.i'],
swig_opts=['-c++', '-I..', '-I@top_srcdir@/src', '-Wall'],
include_dirs=['@top_srcdir@', '@top_srcdir@/src'],
library_dirs=['@top_srcdir@/src/.libs'],
libraries=[],
extra_compile_args=compile_args,
extra_link_args=link_args,
)

setup(
Expand All @@ -50,7 +33,7 @@ setup(
author_email='@PACKAGE_BUGREPORT@',
license='GPL-3.0+',
maintainer_email='@PACKAGE_BUGREPORT@',
cmdclass={'build': CustomBuild},
ext_modules=[apertium_lex_tools_module],
py_modules=['apertium_lex_tools'],
data_files=[]
)
54 changes: 54 additions & 0 deletions src/binary_header.h
@@ -0,0 +1,54 @@
/*
* Copyright (C) 2021 Apertium
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see <https://www.gnu.org/licenses/>.
*/

#ifndef _LRX_BINARY_HEADER_
#define _LRX_BINARY_HEADER_

#include <cstdint>
#include <cstring>
#include <lttoolbox/endian_util.h>

// Four-byte magic tag identifying an LRX binary.
constexpr char HEADER_LRX[4] = {'A', 'L', 'R', 'X'};

// Feature bits for the 64-bit feature word of an LRX binary.
enum LRX_FEATURES : uint64_t {
	// The mmap-compatible format is used rather than the compressed format.
	LRX_MMAP = uint64_t{1} << 0,
	// Features >= this are unknown, so throw an error.
	// Increment this whenever more features are added.
	LRX_UNKNOWN = uint64_t{1} << 1,
	// If we ever reach this many feature flags, we need a flag to know
	// how to extend beyond 64 bits.
	LRX_RESERVED = uint64_t{1} << 63,
};

/**
 * Fixed-size weight record.
 *
 * Records of this type are copied to and from files as raw bytes
 * (sizeof(weight) at a time), so the in-memory layout is part of the
 * file format and must not change.
 *
 * Aggregate initialisation order is {id, _pad, pisu}.
 */
struct weight {
	int32_t id;      // integer identifier of the record
	char _pad[4]{};  // explicit zeroed padding so that pisu starts at byte 8
	double pisu;     // the weight value itself
};

// 4 (id) + 4 (_pad) + 8 (pisu): any other size means the compiler inserted
// padding, or double is not 64 bits wide, and the on-disk records would
// no longer line up.
static_assert(sizeof(weight) == 16,
              "weight must be exactly 16 bytes to match the binary file layout");

/**
 * Convert a weight record from host byte order to little-endian, in place.
 *
 * The previous implementation converted local copies of the fields and
 * then discarded them, leaving @p w untouched. The converted values are
 * now stored back into the record.
 *
 * @param w record whose id and pisu fields are rewritten in little-endian
 *          byte order; _pad is left as is.
 */
inline void weight_to_le(weight& w) {
	static_assert(sizeof(double) == sizeof(uint64_t),
	              "pisu is converted through a 64-bit integer");

	// memcpy moves the raw bytes without the strict-aliasing violation of
	// casting a double* to uint64_t*.
	uint32_t id;
	std::memcpy(&id, &w.id, sizeof(id));
	// NOTE(review): to_le_32/to_le_64 are called for their effect on the
	// argument, as in the original code - confirm they convert in place.
	to_le_32(id);
	std::memcpy(&w.id, &id, sizeof(id));

	uint64_t pisu;
	std::memcpy(&pisu, &w.pisu, sizeof(pisu));
	to_le_64(pisu);
	std::memcpy(&w.pisu, &pisu, sizeof(pisu));
}

/**
 * Convert a weight record from little-endian to host byte order, in place.
 *
 * The previous implementation converted local copies of the fields and
 * then discarded them, leaving @p w untouched. The converted values are
 * now stored back into the record.
 *
 * @param w record whose id and pisu fields hold little-endian bytes on
 *          entry and host-order values on return; _pad is left as is.
 */
inline void weight_from_le(weight& w) {
	static_assert(sizeof(double) == sizeof(uint64_t),
	              "pisu is converted through a 64-bit integer");

	// memcpy moves the raw bytes without the strict-aliasing violation of
	// casting a double* to uint64_t*.
	uint32_t id;
	std::memcpy(&id, &w.id, sizeof(id));
	// NOTE(review): from_le_32/from_le_64 are called for their effect on
	// the argument, as in the original code - confirm they convert in place.
	from_le_32(id);
	std::memcpy(&w.id, &id, sizeof(id));

	uint64_t pisu;
	std::memcpy(&pisu, &w.pisu, sizeof(pisu));
	from_le_64(pisu);
	std::memcpy(&w.pisu, &pisu, sizeof(pisu));
}

#endif
38 changes: 25 additions & 13 deletions src/lrx_compiler.cc
Expand Up @@ -16,12 +16,11 @@
*/

#include <lrx_compiler.h>
#include <weight.h>
#include <lttoolbox/string_utils.h>
#include <lttoolbox/xml_parse_util.h>
#include <lttoolbox/compression.h>
#include <iostream>
#include <limits>
#include <binary_header.h>

using namespace std;

Expand Down Expand Up @@ -908,32 +907,45 @@ LRXCompiler::procSeq()
void
LRXCompiler::write(FILE *fst)
{
alphabet.write(fst);
fwrite_unlocked(HEADER_LRX, 1, 4, fst);
uint64_t features = 0;
features |= LRX_MMAP;
write_le_64(fst, features);

Compression::multibyte_write(recognisers.size(), fst);
for(auto& it : recognisers)
{
Compression::string_write(it.first, fst);
StringWriter sw;
for (auto& it : alphabet.getTags()) {
sw.add(it);
}
for (auto& it : recognisers) {
sw.add(it.first);
}
sw.write(fst);

alphabet.write_mmap(fst, sw);

write_le_64(fst, recognisers.size());
for (auto& it : recognisers) {
StringRef loc = sw.add(it.first);
write_le_32(fst, loc.start);
write_le_32(fst, loc.count);
it.second.write_mmap(fst, alphabet);
debug("+ %d => %S\n", it.second.size(), it.first.c_str());
if (debugMode) {
it.second.show(alphabet, debug_output, 0, false);
}
it.second.write(fst);
}

Compression::string_write("main"_u, fst);
if(outputGraph)
{
transducer.show(alphabet, debug_output, 0, false);
}
transducer.write(fst);
transducer.write_mmap(fst, alphabet);

write_le_64(fst, weights.size());
for(auto& it : weights)
{
debug("%.4f %d\n", it.second, it.first);
weight record{it.first, "", it.second};
weight_to_le(record);
fwrite((void *)&record, 1, sizeof(weight), fst);
write_le_double(fst, it.second);
}

if(!outputGraph)
Expand Down

0 comments on commit 743ed7f

Please sign in to comment.