
Commit

Merge 533508c into 5e4f5f9
unhammer committed Feb 21, 2023
2 parents 5e4f5f9 + 533508c commit 3f380b2
Showing 19 changed files with 397 additions and 56 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -110,3 +110,6 @@ venv.bak/
 
 # pycharm conf files
 .idea/
+/ft-train/corpus/
+/ft-train/opus-100-corpus-v1.0.tar.gz
+/ft-train/output/
1 change: 1 addition & 0 deletions Pipfile
@@ -10,6 +10,7 @@ requests = "*"
 tornado = "==6.0.3"
 commentjson = "*"
 lxml = "*"
+fasttext = "==0.9.2"
 
 [dev-packages]
 coverage = "*"
7 changes: 7 additions & 0 deletions Pipfile.lock

(Generated file; diff not shown.)

87 changes: 45 additions & 42 deletions README.md
@@ -53,92 +53,95 @@ Usage
 Installation through `apt` or `pip` adds an `apertium-apy` executable:
 
     $ apertium-apy --help
 
-    usage: apertium-apy [-h] [-s NONPAIRS_PATH] [-l LANG_NAMES] [-f MISSING_FREQS]
-                        [-p PORT] [-c SSL_CERT] [-k SSL_KEY] [-t TIMEOUT]
-                        [-j [NUM_PROCESSES]] [-d] [-P LOG_PATH]
-                        [-i MAX_PIPES_PER_PAIR] [-n MIN_PIPES_PER_PAIR]
-                        [-u MAX_USERS_PER_PIPE] [-m MAX_IDLE_SECS]
-                        [-r RESTART_PIPE_AFTER] [-v VERBOSITY] [-V] [-S]
-                        [-M UNKNOWN_MEMORY_LIMIT] [-T STAT_PERIOD_MAX_AGE]
-                        [-wp WIKI_PASSWORD] [-wu WIKI_USERNAME] [-b]
-                        [-rs RECAPTCHA_SECRET] [-md MAX_DOC_PIPES] [-C CONFIG]
-                        [-ak API_KEYS_FILE]
-                        pairs_path
+    usage: apertium-apy [-h] [-s NONPAIRS_PATH] [-l LANG_NAMES] [-F FASTTEXT_MODEL]
+                        [-f MISSING_FREQS] [-p PORT] [-c SSL_CERT] [-k SSL_KEY]
+                        [-t TIMEOUT] [-j [NUM_PROCESSES]] [-d] [-P LOG_PATH]
+                        [-i MAX_PIPES_PER_PAIR] [-n MIN_PIPES_PER_PAIR]
+                        [-u MAX_USERS_PER_PIPE] [-m MAX_IDLE_SECS]
+                        [-r RESTART_PIPE_AFTER] [-v VERBOSITY] [-V] [-S]
+                        [-M UNKNOWN_MEMORY_LIMIT] [-T STAT_PERIOD_MAX_AGE]
+                        [-wp WIKI_PASSWORD] [-wu WIKI_USERNAME] [-b]
+                        [-rs RECAPTCHA_SECRET] [-md MAX_DOC_PIPES] [-C CONFIG]
+                        [-ak API_KEYS]
+                        pairs_path
 
     Apertium APY -- API server for machine translation and language analysis
 
     positional arguments:
-      pairs_path            path to Apertium installed pairs (all modes files in
+      pairs_path            path to Apertium installed pairs (all modes files in
                             this path are included)
 
-    optional arguments:
-      -h, --help            show this help message and exit
-      -s NONPAIRS_PATH, --nonpairs-path NONPAIRS_PATH
+    options:
+      -h, --help            show this help message and exit
+      -s NONPAIRS_PATH, --nonpairs-path NONPAIRS_PATH
                             path to Apertium tree (only non-translator debug modes
                             are included from this path)
-      -l LANG_NAMES, --lang-names LANG_NAMES
+      -l LANG_NAMES, --lang-names LANG_NAMES
                             path to localised language names sqlite database
                             (default = langNames.db)
-      -f MISSING_FREQS, --missing-freqs MISSING_FREQS
+      -F FASTTEXT_MODEL, --fasttext-model FASTTEXT_MODEL
+                            path to fastText language identification model (e.g.
+                            lid.release.ftz)
+      -f MISSING_FREQS, --missing-freqs MISSING_FREQS
                             path to missing word frequency sqlite database
                             (default = None)
-      -p PORT, --port PORT  port to run server on (default = 2737)
-      -c SSL_CERT, --ssl-cert SSL_CERT
+      -p PORT, --port PORT  port to run server on (default = 2737)
+      -c SSL_CERT, --ssl-cert SSL_CERT
                             path to SSL Certificate
-      -k SSL_KEY, --ssl-key SSL_KEY
+      -k SSL_KEY, --ssl-key SSL_KEY
                             path to SSL Key File
-      -t TIMEOUT, --timeout TIMEOUT
+      -t TIMEOUT, --timeout TIMEOUT
                             timeout for requests (default = 10)
-      -j [NUM_PROCESSES], --num-processes [NUM_PROCESSES]
+      -j [NUM_PROCESSES], --num-processes [NUM_PROCESSES]
                             number of processes to run (default = 1; use 0 to run
                             one http server per core, where each http server runs
                             all available language pairs)
-      -d, --daemon          daemon mode: redirects stdout and stderr to files
+      -d, --daemon          daemon mode: redirects stdout and stderr to files
                             apertium-apy.log and apertium-apy.err; use with --log-
                             path
-      -P LOG_PATH, --log-path LOG_PATH
+      -P LOG_PATH, --log-path LOG_PATH
                             path to log output files to in daemon mode; defaults
                             to local directory
-      -i MAX_PIPES_PER_PAIR, --max-pipes-per-pair MAX_PIPES_PER_PAIR
+      -i MAX_PIPES_PER_PAIR, --max-pipes-per-pair MAX_PIPES_PER_PAIR
                             how many pipelines we can spin up per language pair
                             (default = 1)
-      -n MIN_PIPES_PER_PAIR, --min-pipes-per-pair MIN_PIPES_PER_PAIR
+      -n MIN_PIPES_PER_PAIR, --min-pipes-per-pair MIN_PIPES_PER_PAIR
                             when shutting down pipelines, keep at least this many
                             open per language pair (default = 0)
-      -u MAX_USERS_PER_PIPE, --max-users-per-pipe MAX_USERS_PER_PIPE
+      -u MAX_USERS_PER_PIPE, --max-users-per-pipe MAX_USERS_PER_PIPE
                             how many concurrent requests per pipeline before we
                             consider spinning up a new one (default = 5)
-      -m MAX_IDLE_SECS, --max-idle-secs MAX_IDLE_SECS
+      -m MAX_IDLE_SECS, --max-idle-secs MAX_IDLE_SECS
                             if specified, shut down pipelines that have not been
                             used in this many seconds
-      -r RESTART_PIPE_AFTER, --restart-pipe-after RESTART_PIPE_AFTER
+      -r RESTART_PIPE_AFTER, --restart-pipe-after RESTART_PIPE_AFTER
                             restart a pipeline if it has had this many requests
                             (default = 1000)
-      -v VERBOSITY, --verbosity VERBOSITY
+      -v VERBOSITY, --verbosity VERBOSITY
                             logging verbosity
-      -V, --version         show APY version
-      -S, --scalemt-logs    generates ScaleMT-like logs; use with --log-path;
+      -V, --version         show APY version
+      -S, --scalemt-logs    generates ScaleMT-like logs; use with --log-path;
                             disables
-      -M UNKNOWN_MEMORY_LIMIT, --unknown-memory-limit UNKNOWN_MEMORY_LIMIT
+      -M UNKNOWN_MEMORY_LIMIT, --unknown-memory-limit UNKNOWN_MEMORY_LIMIT
                             keeps unknown words in memory until a limit is
                             reached; use with --missing-freqs (default = 1000)
-      -T STAT_PERIOD_MAX_AGE, --stat-period-max-age STAT_PERIOD_MAX_AGE
+      -T STAT_PERIOD_MAX_AGE, --stat-period-max-age STAT_PERIOD_MAX_AGE
                             How many seconds back to keep track request timing
                             stats (default = 3600)
-      -wp WIKI_PASSWORD, --wiki-password WIKI_PASSWORD
+      -wp WIKI_PASSWORD, --wiki-password WIKI_PASSWORD
                             Apertium Wiki account password for SuggestionHandler
-      -wu WIKI_USERNAME, --wiki-username WIKI_USERNAME
+      -wu WIKI_USERNAME, --wiki-username WIKI_USERNAME
                             Apertium Wiki account username for SuggestionHandler
-      -b, --bypass-token    ReCAPTCHA bypass token
-      -rs RECAPTCHA_SECRET, --recaptcha-secret RECAPTCHA_SECRET
+      -b, --bypass-token    ReCAPTCHA bypass token
+      -rs RECAPTCHA_SECRET, --recaptcha-secret RECAPTCHA_SECRET
                             ReCAPTCHA secret for suggestion validation
-      -md MAX_DOC_PIPES, --max-doc-pipes MAX_DOC_PIPES
+      -md MAX_DOC_PIPES, --max-doc-pipes MAX_DOC_PIPES
                             how many concurrent document translation pipelines we
                             allow (default = 3)
-      -C CONFIG, --config CONFIG
+      -C CONFIG, --config CONFIG
                             Configuration file to load options from
-      -ak, --api-keys       JSON file where API keys are stored. Comments are allowed
+      -ak API_KEYS, --api-keys API_KEYS
+                            Configuration file to load API keys

Contributing
------------
14 changes: 12 additions & 2 deletions apertium_apy/apy.py
@@ -187,6 +187,8 @@ def parse_args(cli_args=sys.argv[1:]):
     parser.add_argument('-s', '--nonpairs-path', help='path to Apertium tree (only non-translator debug modes are included from this path)')
     parser.add_argument('-l', '--lang-names',
                         help='path to localised language names sqlite database (default = langNames.db)', default='langNames.db')
+    parser.add_argument('-F', '--fasttext-model',
+                        help='path to fastText language identification model (e.g. lid.release.ftz)')
     parser.add_argument('-f', '--missing-freqs', help='path to missing word frequency sqlite database (default = None)', default=None)
     parser.add_argument('-p', '--port', help='port to run server on (default = 2737)', type=int, default=2737)
     parser.add_argument('-c', '--ssl-cert', help='path to SSL Certificate', default=None)
@@ -300,6 +302,10 @@ def setup_application(args):
 
         handlers.append((r'/suggest', SuggestionHandler))
 
+    if args.fasttext_model and importlib_util.find_spec('fasttext') is not None:
+        import fasttext
+        IdentifyLangHandler.fasttext = fasttext.FastText.load_model(args.fasttext_model)
+
     # TODO: fix mypy. Application expects List but List is invariant and we use subclasses
     return tornado.web.Application(handlers)  # type:ignore

@@ -334,8 +340,12 @@ def main():
     args = parse_args()
     setup_logging(args)  # before we start logging anything!
 
-    if importlib_util.find_spec('cld2full') is None:
-        logging.warning('Unable to import CLD2, continuing using naive method of language detection')
+    if importlib_util.find_spec('fasttext') is None:
+        logging.warning('Unable to import fastText, trying CLD2')
+        if importlib_util.find_spec('cld2full') is None:
+            logging.warning('Unable to import CLD2, continuing using naive method of language identification')
+    elif not args.fasttext_model:
+        logging.warning('Have fasttext lib, but started without --fasttext-model, not using fastText for language identification')
 
     if importlib_util.find_spec('chardet') is None:
         logging.warning('Unable to import chardet, assuming utf-8 encoding for all websites')
5 changes: 3 additions & 2 deletions apertium_apy/handlers/base.py
@@ -10,9 +10,10 @@
 from tornado.escape import utf8
 from tornado.locks import Semaphore
 
-from typing import Union, Dict, Optional, List, Any, Tuple  # noqa: F401
 from apertium_apy.utils import to_fallback_code
-from apertium_apy.utils.translation import FlushingPipeline, SimplePipeline
+# Typing imports that flake8 doesn't understand:
+from typing import Union, Dict, Optional, List, Any, Tuple  # noqa: F401
+from apertium_apy.utils.translation import FlushingPipeline, SimplePipeline  # noqa: F401
 
 
 def dump_json(data):
57 changes: 50 additions & 7 deletions apertium_apy/handlers/identify_lang.py
@@ -1,7 +1,12 @@
 from datetime import timedelta
+import re
 
 from tornado import gen
 
+try:
+    import fasttext
+except ImportError:
+    fasttext = None
 try:
     import cld2full as cld2  # type: ignore
 except ImportError:
@@ -11,20 +16,58 @@
 from apertium_apy.utils import get_coverages, to_alpha3_code
 
 
+def fasttext_strip_prefix(s):
+    """Remove the initial __label__ prefix"""
+    return s[9:]
+
+
+fasttext_max_input = 2048
+
+# there's no [:punct:] class in re module, include the most common here:
+fasttext_punct_class = re.compile(r'([`~!@#$%^&*()_=+\[\]{}\\\|;:\"\'<>.,/?—–-]+)')
+
+
+def fasttext_clean(s):
+    """Should clean as ft-train/clean does"""
+    return re.sub(fasttext_punct_class, r' \1 ', s.lower())
+
+
+def fasttext_identify(model, text):
+    cleaned = fasttext_clean(text[:fasttext_max_input])
+    # Grab a bunch of results since currently the model might predict stuff outside possible_langs – it's still fast:
+    results = model.predict(cleaned, k=200, threshold=0.001)
+    if results[0]:
+        possible_langs = zip(map(fasttext_strip_prefix, results[0]),
+                             results[1])
+        return {to_alpha3_code(possible_lang[0]): possible_lang[1]
+                for possible_lang in possible_langs}
+    else:
+        return {'nob': 1.0}  # TODO: better default
+
+
+def cld_identify(text):
+    cld_results = cld2.detect(text)
+    if cld_results[0]:
+        possible_langs = filter(lambda x: x[1] != 'un', cld_results[2])
+        return {to_alpha3_code(possible_lang[1]): possible_lang[2]
+                for possible_lang in possible_langs}
+    else:
+        return {'nob': 1.0}  # TODO: better default
+
+
 class IdentifyLangHandler(BaseHandler):
+    fasttext = None
+
     @gen.coroutine
     def get(self):
         text = self.get_argument('q')
         if not text:
             return self.send_error(400, explanation='Missing q argument')
 
-        if cld2:
-            cld_results = cld2.detect(text)
-            if cld_results[0]:
-                possible_langs = filter(lambda x: x[1] != 'un', cld_results[2])
-                self.send_response({to_alpha3_code(possible_lang[1]): possible_lang[2] for possible_lang in possible_langs})
-            else:
-                self.send_response({'nob': 100})  # TODO: Some more reasonable response
+        if self.fasttext is not None:
+            self.send_response(fasttext_identify(self.fasttext, text))
+        elif cld2:
+            self.send_response(cld_identify(text))
         else:
             try:
                 coverages = yield gen.with_timeout(
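As a sanity check of the helpers added above, here is a standalone re-creation of the cleaning and label-stripping logic (same regex and slicing, reimplemented outside the APY codebase for illustration):

```python
import re

# Same punctuation class as fasttext_punct_class above (common ASCII
# punctuation plus a few dashes); note the +, which pads whole runs:
FASTTEXT_PUNCT = re.compile(r'([`~!@#$%^&*()_=+\[\]{}\\|;:"\'<>.,/?—–-]+)')


def fasttext_clean(s):
    """Lowercase and space-pad punctuation runs, mirroring ft-train/clean."""
    return FASTTEXT_PUNCT.sub(r' \1 ', s.lower())


def fasttext_strip_prefix(label):
    """Drop the '__label__' prefix fastText puts on predicted labels."""
    return label[len('__label__'):]


print(repr(fasttext_clean('Hello, world!')))  # 'hello ,  world ! '
print(fasttext_strip_prefix('__label__nno'))  # nno
```

The handler then feeds the cleaned string to `model.predict` and maps the stripped labels through `to_alpha3_code`.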
6 changes: 4 additions & 2 deletions apertium_apy/handlers/translate.py
@@ -12,8 +12,10 @@
 from apertium_apy.handlers.base import BaseHandler
 from apertium_apy.keys import ApiKeys
 from apertium_apy.utils import to_alpha3_code, scale_mt_log
-from apertium_apy.utils.translation import parse_mode_file, make_pipeline, FlushingPipeline, SimplePipeline
-from typing import Union
+from apertium_apy.utils.translation import parse_mode_file, make_pipeline
+# Typing imports that flake8 doesn't understand:
+from apertium_apy.utils.translation import FlushingPipeline, SimplePipeline  # noqa: F401
+from typing import Union  # noqa: F401
 
 
 class TranslationInfo:
6 changes: 6 additions & 0 deletions ft-train/clean
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+set -euo pipefail
+
+sed -e "s/\([[:punct:]]\)/ \1 /g" \
+    | gawk '{print tolower($0)}'
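For comparison with the Python-side cleaning in identify_lang.py, the sed rule above pads every single punctuation character rather than runs; a Python rendering of that exact rule (illustrative only, spelling out `[[:punct:]]` as its ASCII ranges):

```python
import re

# ASCII [[:punct:]] expressed as ranges: !-/ :-@ [-` {-~
SHELL_PUNCT = re.compile(r'([!-/:-@\[-`{-~])')


def shell_style_clean(s):
    """Pad each punctuation character with spaces, then lowercase,
    like the sed|gawk pipeline in ft-train/clean."""
    return SHELL_PUNCT.sub(r' \1 ', s.lower())


print(repr(shell_style_clean('Hi!?')))  # 'hi !  ? '
```

Note the difference for adjacent punctuation: this pads `!?` into `! ?`, while the `+` in `fasttext_punct_class` keeps the run together.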
35 changes: 35 additions & 0 deletions ft-train/compare
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+set -euo pipefail
+cd "$(dirname "$0")"
+
+# fasttext test doesn't give correct recall numbers :(
+
+fasttext predict output/1Mmodel.ftz corpus/test >/tmp/predictions.1M
+
+fasttext predict ../lid.176.ftz corpus/test \
+    | awk -v tf=<(./iso639-3to2) '
+        BEGIN{OFS=FS="\t"; while(getline<tf){t[$2]=$1}; FS="__label__"; } $2 in t{print "__label__"t[$2];next }{print}' \
+    >/tmp/predictions.176
+
+for p in /tmp/predictions.176 /tmp/predictions.1M; do
+    echo "$p"
+    paste "$p" corpus/test \
+        | sed 's, .*,,' \
+        | awk '
+            BEGIN{OFS=FS="\t"; }
+            {n++; tp[$1]+=($1==$2); fp[$1]+=($1!=$2); tc[$2]++; pc[$1]++ }
+            $1==$2{c++; }
+            END{
+                smooth = 0.00001 # avoid div by zero
+                print "Correct:"c,"Total:"n;
+                print "label","truepos","falsepos","true count","pred count","P","R";
+                for(label in tp){
+                    p = sprintf("%0.2f", (tp[label] + smooth)/(pc[label] + smooth))
+                    r = sprintf("%0.2f", (tp[label] + smooth)/(tc[label] + smooth))
+                    print label,tp[label],fp[label],tc[label],pc[label],p,r
+                }
+            }
+        ' \
+        | column -ts $'\t'
+done
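The awk block above tallies per-label true/false positives and smooths to avoid division by zero; the same computation as a short Python sketch (hypothetical labels, and unlike the awk's `for(label in tp)` loop it also reports labels with no true positives):

```python
from collections import Counter


def per_label_pr(pairs, smooth=0.00001):
    """pairs: iterable of (predicted, gold) label pairs.
    Returns {label: (precision, recall)}, smoothed like the awk in compare."""
    tp, pred_count, true_count = Counter(), Counter(), Counter()
    for pred, gold in pairs:
        pred_count[pred] += 1
        true_count[gold] += 1
        if pred == gold:
            tp[pred] += 1
    return {label: ((tp[label] + smooth) / (pred_count[label] + smooth),
                    (tp[label] + smooth) / (true_count[label] + smooth))
            for label in set(pred_count) | set(true_count)}


# Toy data: 'nno' predicted once (correctly), 'nob' predicted twice (once correctly).
pr = per_label_pr([('nno', 'nno'), ('nob', 'nno'), ('nob', 'nob')])
print({label: (round(p, 2), round(r, 2)) for label, (p, r) in pr.items()})
```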
