From 588cf6a80db0d55d84a356ee0efc1039eb9f0e97 Mon Sep 17 00:00:00 2001 From: Amr Keleg Date: Tue, 4 Jun 2019 23:09:16 +0200 Subject: [PATCH 1/4] Disjunct multiple FSTs encoded in the same at&t file Fixes #56 --- lttoolbox/att_compiler.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lttoolbox/att_compiler.cc b/lttoolbox/att_compiler.cc index a38b95ef..a3c3d005 100644 --- a/lttoolbox/att_compiler.cc +++ b/lttoolbox/att_compiler.cc @@ -146,6 +146,11 @@ AttCompiler::parse(string const &file_name, wstring const &dir) } split(line, L'\t', tokens); + if (tokens[0].find('-') == 0) + { + wcerr << "Warning: Multiple fsts in '" << file_name << "' will be disjuncted." << endl; + continue; + } from = stoi(tokens[0]); AttNode* source = get_node(from); From 0bfda152b3768584a4a026ba5f4abf283436af0a Mon Sep 17 00:00:00 2001 From: Amr Keleg Date: Thu, 6 Jun 2019 22:00:17 +0200 Subject: [PATCH 2/4] Properly Disjunct multiple FSTs encoded in the same at&t file Instead of just ignoring '--' lines, the at&t compiler will first detect whether the file has multiple FSTs encoded in it. In the case of multiple FSTs, the compiler will create a new starting state with epsilon transitions to the starting state of each FST. 
--- lttoolbox/att_compiler.cc | 59 +++++++++++++++++++++++++++++++++------ lttoolbox/att_compiler.h | 5 ++++ 2 files changed, 55 insertions(+), 9 deletions(-) diff --git a/lttoolbox/att_compiler.cc b/lttoolbox/att_compiler.cc index a3c3d005..3b90e8c9 100644 --- a/lttoolbox/att_compiler.cc +++ b/lttoolbox/att_compiler.cc @@ -111,6 +111,20 @@ AttCompiler::symbol_code(const wstring& symbol) } } +bool +AttCompiler::has_multiple_fsts(string const &file_name) +{ + wifstream infile(file_name.c_str()); // TODO: error checking + wstring line; + + while(getline(infile, line)){ + if (line.find('-') == 0) + return true; + } + + return false; +} + void AttCompiler::parse(string const &file_name, wstring const &dir) { @@ -119,8 +133,19 @@ AttCompiler::parse(string const &file_name, wstring const &dir) wifstream infile(file_name.c_str()); // TODO: error checking vector tokens; wstring line; - bool first_line = true; // First line -- see below + bool first_line_in_fst = true; // First line -- see below bool seen_input_symbol = false; + int state_id_offset = 0; + int largest_seen_state_id = 0; + + if (has_multiple_fsts(file_name)){ + wcerr << "Warning: Multiple fsts in '" << file_name << "' will be disjuncted." << endl; + + // Set the starting state to 0 (Epsilon transtions will be added later) + starting_state = 0; + state_id_offset = 1; + } + while (getline(infile, line)) { tokens.clear(); @@ -128,12 +153,12 @@ AttCompiler::parse(string const &file_name, wstring const &dir) wstring upper, lower; double weight; - if (line.length() == 0 && first_line) + if (line.length() == 0 && first_line_in_fst) { wcerr << "Error: empty file '" << file_name << "'." << endl; exit(EXIT_FAILURE); } - if (first_line && line.find(L"\t") == wstring::npos) + if (first_line_in_fst && line.find(L"\t") == wstring::npos) { wcerr << "Error: invalid format '" << file_name << "'." 
<< endl; exit(EXIT_FAILURE); @@ -148,17 +173,32 @@ AttCompiler::parse(string const &file_name, wstring const &dir) if (tokens[0].find('-') == 0) { - wcerr << "Warning: Multiple fsts in '" << file_name << "' will be disjuncted." << endl; + // Update the offset for the new FST + state_id_offset = largest_seen_state_id + 1; + first_line_in_fst = true; continue; } - from = stoi(tokens[0]); + + from = stoi(tokens[0]) + state_id_offset; + largest_seen_state_id = max(largest_seen_state_id, from); AttNode* source = get_node(from); /* First line: the initial state is of both types. */ - if (first_line) + if (first_line_in_fst) { - starting_state = from; - first_line = false; + // If the file has a single FST - No need for state id mapping + if (state_id_offset == 0) + starting_state = from; + else{ + AttNode * starting_node = get_node(starting_state); + + // Add an Epsilon transition from the new starting state + starting_node->transductions.push_back( + Transduction(from, L"", L"", + alphabet(symbol_code(L""), symbol_code(L"")), + default_weight)); + } + first_line_in_fst = false; } /* Final state. */ @@ -176,7 +216,8 @@ AttCompiler::parse(string const &file_name, wstring const &dir) } else { - to = stoi(tokens[1]); + to = stoi(tokens[1]) + state_id_offset; + largest_seen_state_id = max(largest_seen_state_id, to); if(dir == L"RL") { upper = tokens[3]; diff --git a/lttoolbox/att_compiler.h b/lttoolbox/att_compiler.h index 7b62146f..f22c9593 100644 --- a/lttoolbox/att_compiler.h +++ b/lttoolbox/att_compiler.h @@ -207,6 +207,11 @@ class AttCompiler * only) character otherwise. 
*/ int symbol_code(const wstring& symbol); + + /** + * Finds whether an at&t file contains multiple FSTs or not + */ + bool has_multiple_fsts(string const &file_name); }; #endif /* _MYATT_COMPILER_ */ From dab649317e79a5ace118e56255c7324be605c10c Mon Sep 17 00:00:00 2001 From: Amr Keleg Date: Sat, 8 Jun 2019 18:41:55 +0200 Subject: [PATCH 3/4] Add a unit test for disjuncting multiple fsts in at&t files --- tests/data/cat-multiple-fst.att | 16 ++++++++++++++++ tests/lt_proc/__init__.py | 5 +++++ 2 files changed, 21 insertions(+) create mode 100644 tests/data/cat-multiple-fst.att diff --git a/tests/data/cat-multiple-fst.att b/tests/data/cat-multiple-fst.att new file mode 100644 index 00000000..d843acd3 --- /dev/null +++ b/tests/data/cat-multiple-fst.att @@ -0,0 +1,16 @@ +0 1 c c +1 2 a a +2 3 t t +3 4 @0@ + +4 5 @0@ n +5 +4 5 @0@ v +-- +0 1 c c +1 2 a a +2 3 t t +3 4 @0@ + +4 5 @0@ n +5 6 @0@ + +6 7 s +7 \ No newline at end of file diff --git a/tests/lt_proc/__init__.py b/tests/lt_proc/__init__.py index 09590b00..ba77a215 100644 --- a/tests/lt_proc/__init__.py +++ b/tests/lt_proc/__init__.py @@ -125,5 +125,10 @@ class GardenPathMweNewlines(unittest.TestCase, ProcTest): ]""" ] +class CatMultipleFstsTransducer(unittest.TestCase, ProcTest): + procdix = "data/cat-multiple-fst.att" + inputs = ["cat", "cats"] + expectedOutputs = ["^cat/cat+n/cat+v$", "^cats/cat+n+$"] + # These fail on some systems: #from null_flush_invalid_stream_format import * From 0fd248f8fe00c77357cab6a1c59f8368ca0fc30f Mon Sep 17 00:00:00 2001 From: Lokendra Singh Date: Wed, 19 Jun 2019 15:44:46 +0530 Subject: [PATCH 4/4] Python wrapper in SWIG (#58) SWIG Python wrapper guarded by --enable-python-bindings. 
By Lokendra Singh --- .gitignore | 8 ++++++ .travis.yml | 2 ++ Makefile.am | 4 +++ configure.ac | 13 +++++++-- python/Makefile.am | 9 ++++++ python/lttoolbox.i | 69 ++++++++++++++++++++++++++++++++++++++++++++++ python/setup.py.in | 50 +++++++++++++++++++++++++++++++++ 7 files changed, 153 insertions(+), 2 deletions(-) create mode 100644 python/Makefile.am create mode 100644 python/lttoolbox.i create mode 100644 python/setup.py.in diff --git a/.gitignore b/.gitignore index a849096a..6353196a 100644 --- a/.gitignore +++ b/.gitignore @@ -71,3 +71,11 @@ /lttoolbox/lttoolbox_config.h /lttoolbox/lt-tmxproc /lttoolbox/lt-expand +/python/Makefile +/python/Makefile.in +/python/lttoolbox_wrap.cpp +/python/lttoolbox.py +/python/setup.py +/python/build* +*.egg-info/ +*.egg diff --git a/.travis.yml b/.travis.yml index 7812f25a..753e7358 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,8 @@ os: compiler: - clang - gcc +before_install: + - if [ $TRAVIS_OS_NAME = linux ]; then sudo apt-get install -y swig python3-setuptools; else brew install swig; fi script: - ./autogen.sh - ./configure diff --git a/Makefile.am b/Makefile.am index 02e7cce0..93b9ce2e 100644 --- a/Makefile.am +++ b/Makefile.am @@ -3,6 +3,10 @@ ACLOCAL_AMFLAGS=-I m4 SUBDIRS = $(GENERIC_LIBRARY_NAME) DIST_SUBDIRS = $(GENERIC_LIBRARY_NAME) +if HAVE_PYTHON_BINDINGS +SUBDIRS += python +endif + pkgconfigdir = $(libdir)/pkgconfig pkgconfig_DATA = lttoolbox.pc diff --git a/configure.ac b/configure.ac index 3dfc70d4..a10f9fe5 100644 --- a/configure.ac +++ b/configure.ac @@ -140,6 +140,15 @@ static_assert(!is_same::value, "size_t == uint32_t"); static_assert(!is_same::value, "size_t == uint64_t"); ]])], [AC_DEFINE([SIZET_NOT_CSTDINT], [1], [size_t != (uint32_t, uint64_t)])]) -AM_PATH_PYTHON([2], [], [AC_MSG_WARN([Can't run 'make test' without Python installed.])]) +AM_PATH_PYTHON([3.4], [], [AC_MSG_WARN([Can't generate SWIG wrapper or run tests without Python])]) -AC_OUTPUT([Makefile lttoolbox.pc lttoolbox/Makefile]) 
+AC_CONFIG_FILES([python/setup.py]) + +AC_ARG_ENABLE([python-bindings], + AS_HELP_STRING([--enable-python-bindings], + [build python bindings (default=disabled)]), + [enable_python_bindings=$enableval], + [enable_python_bindings=no]) +AM_CONDITIONAL([HAVE_PYTHON_BINDINGS], [test x$enable_python_bindings = xyes]) + +AC_OUTPUT([Makefile lttoolbox.pc lttoolbox/Makefile python/Makefile]) diff --git a/python/Makefile.am b/python/Makefile.am new file mode 100644 index 00000000..f5f85bcc --- /dev/null +++ b/python/Makefile.am @@ -0,0 +1,9 @@ +SWIG_INTERFACE = lttoolbox.i + +BUILT_SOURCES = lttoolbox_wrap.cpp lttoolbox.py + +lttoolbox_wrap.cpp: $(SWIG_INTERFACE) setup.py + $(PYTHON) setup.py build + +install-exec-local: + $(PYTHON) setup.py install diff --git a/python/lttoolbox.i b/python/lttoolbox.i new file mode 100644 index 00000000..ed1a660e --- /dev/null +++ b/python/lttoolbox.i @@ -0,0 +1,69 @@ +%module lttoolbox + +%{ +#define SWIG_FILE_WITH_INIT +#include +#include +#include +#include + +class FST: public FSTProcessor +{ +public: + /** + * Imitates functionality of lt-proc using file path + */ + void lt_proc(char arg, char *dictionary_path, char *input_path, char *output_path); +}; + + +void +FST::lt_proc(char arg, char *dictionary_path, char *input_path, char *output_path) +{ + FILE *in = fopen(dictionary_path, "rb"); + load(in); + FILE *input = fopen(input_path, "r"), *output = fopen(output_path, "w"); + switch(arg) + { + case 'g': + initGeneration(); + generation(input, output); + break; + case 'b': + initBiltrans(); + bilingual(input, output); + break; + case 'p': + initPostgeneration(); + intergeneration(input, output); + break; + case 'w': + setDictionaryCaseMode(true); + case 'a': + default: + initAnalysis(); + analysis(input, output); + break; + } + + fclose(in); + fclose(input); + fclose(output); +} + +%} + + +%include +%include +%include +%include + +class FST: public FSTProcessor +{ +public: + /** + * Imitates functionality of lt-proc using file path + 
*/ + void lt_proc(char arg, char *dictionary_path, char *input_path, char *output_path); +}; diff --git a/python/setup.py.in b/python/setup.py.in new file mode 100644 index 00000000..36976049 --- /dev/null +++ b/python/setup.py.in @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 + +""" +Setup for SWIG Python bindings for lttoolbox +""" +from os import path +from setuptools import Extension, setup +from distutils.command.build import build + + +class CustomBuild(build): + sub_commands = [ + ('build_ext', build.has_ext_modules), + ('build_py', build.has_pure_modules), + ('build_clib', build.has_c_libraries), + ('build_scripts', build.has_scripts), + ] + + +def get_sources(): + sources = ['lttoolbox.i'] + cc_sources = ['alphabet.cc', 'compression.cc', 'fst_processor.cc', 'lt_locale.cc', + 'node.cc', 'state.cc', 'trans_exe.cc', 'xml_parse_util.cc'] + rel_path = '@top_srcdir@/lttoolbox/' + sources.extend(path.join(rel_path, f) for f in cc_sources) + return sources + +lttoolbox_module = Extension( + name='_lttoolbox', + sources=get_sources(), + swig_opts = ["-c++", "-I@top_srcdir@", "-Wall"], + include_dirs=['@top_srcdir@', '/usr/include/libxml2'], + library_dirs=['/usr/include/libxml2'], + extra_compile_args='@CXXFLAGS@'.split(), + extra_link_args=['-lxml2'], +) + +setup( + name='@PACKAGE@', + version='@PACKAGE_VERSION@', + description='SWIG interface to @PACKAGE_NAME@', + long_description="SWIG interface to @PACKAGE_NAME@ for use in apertium-python", + # TODO: author, maintainer, url + author_email='@PACKAGE_BUGREPORT@', + license='GPL-3.0+', + maintainer_email='@PACKAGE_BUGREPORT@', + cmdclass={'build': CustomBuild}, + ext_modules=[lttoolbox_module], + py_modules=['lttoolbox'], +)