Commit e218f78: Merge 3e697cf into 784ebd9

unhammer committed Jan 7, 2023 (2 parents: 784ebd9 + 3e697cf)
Showing 6 changed files with 57 additions and 13 deletions.
1 change: 1 addition & 0 deletions Pipfile
@@ -10,6 +10,7 @@ requests = "*"
 tornado = "==6.0.3"
 commentjson = "*"
 lxml = "*"
+fasttext = "==0.9.2"
 
 [dev-packages]
 coverage = "*"
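The new dependency is pinned to fasttext 0.9.2. A quick sketch (not part of the commit) to check that the pinned wheel is what the environment actually resolves, assuming `pipenv install` has already been run:

    # Sketch: confirm the installed fasttext matches the Pipfile pin.
    from importlib.metadata import version  # Python 3.8+

    print(version('fasttext'))  # expected: 0.9.2
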
14 changes: 12 additions & 2 deletions apertium_apy/apy.py
@@ -187,6 +187,8 @@ def parse_args(cli_args=sys.argv[1:]):
     parser.add_argument('-s', '--nonpairs-path', help='path to Apertium tree (only non-translator debug modes are included from this path)')
     parser.add_argument('-l', '--lang-names',
                         help='path to localised language names sqlite database (default = langNames.db)', default='langNames.db')
+    parser.add_argument('-F', '--fasttext-model',
+                        help='path to fastText language identification model')
     parser.add_argument('-f', '--missing-freqs', help='path to missing word frequency sqlite database (default = None)', default=None)
     parser.add_argument('-p', '--port', help='port to run server on (default = 2737)', type=int, default=2737)
     parser.add_argument('-c', '--ssl-cert', help='path to SSL Certificate', default=None)
@@ -300,6 +302,10 @@ def setup_application(args):
 
         handlers.append((r'/suggest', SuggestionHandler))
 
+    if args.fasttext_model and importlib_util.find_spec('fasttext') is not None:
+        import fasttext
+        IdentifyLangHandler.fasttext = fasttext.FastText.load_model(args.fasttext_model)
+
     # TODO: fix mypy. Application expects List but List is invariant and we use subclasses
     return tornado.web.Application(handlers)  # type:ignore
 
@@ -334,8 +340,12 @@ def main():
     args = parse_args()
     setup_logging(args)  # before we start logging anything!
 
-    if importlib_util.find_spec('cld2full') is None:
-        logging.warning('Unable to import CLD2, continuing using naive method of language detection')
+    if importlib_util.find_spec('fasttext') is None:
+        logging.warning('Unable to import fastText, trying CLD2')
+        if importlib_util.find_spec('cld2full') is None:
+            logging.warning('Unable to import CLD2, continuing using naive method of language identification')
+    elif not args.fasttext_model:
+        logging.warning('Have fasttext lib, but started without --fasttext-model, not using fastText for language identification')
 
     if importlib_util.find_spec('chardet') is None:
         logging.warning('Unable to import chardet, assuming utf-8 encoding for all websites')
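The wiring above keeps fastText strictly optional: the model is only loaded when the library imports cleanly and -F/--fasttext-model was given. A minimal standalone sketch of that behaviour (not part of the commit), assuming fasttext 0.9.2 is installed and the bundled lid.176.ftz sits in the working directory:

    # Sketch of the --fasttext-model wiring in setup_application (hypothetical standalone version).
    from importlib import util as importlib_util

    fasttext_model = 'lid.176.ftz'  # the value -F/--fasttext-model would supply

    if fasttext_model and importlib_util.find_spec('fasttext') is not None:
        import fasttext
        model = fasttext.FastText.load_model(fasttext_model)
        print(model.predict('Dette er en setning.', k=3))
    else:
        print('fastText unavailable; APy falls back to CLD2 or coverage-based identification')

When the model loads, it is attached to IdentifyLangHandler.fasttext, so the language-identification handler below picks it up without further configuration.
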
5 changes: 3 additions & 2 deletions apertium_apy/handlers/base.py
@@ -10,9 +10,10 @@
 from tornado.escape import utf8
 from tornado.locks import Semaphore
 
-from typing import Union, Dict, Optional, List, Any, Tuple  # noqa: F401
 from apertium_apy.utils import to_fallback_code
-from apertium_apy.utils.translation import FlushingPipeline, SimplePipeline
+# Typing imports that flake8 doesn't understand:
+from typing import Union, Dict, Optional, List, Any, Tuple  # noqa: F401
+from apertium_apy.utils.translation import FlushingPipeline, SimplePipeline  # noqa: F401
 
 
 def dump_json(data):
44 changes: 37 additions & 7 deletions apertium_apy/handlers/identify_lang.py
@@ -2,6 +2,10 @@
 
 from tornado import gen
 
+try:
+    import fasttext
+except ImportError:
+    fasttext = None
 try:
     import cld2full as cld2  # type: ignore
 except ImportError:
@@ -11,20 +15,46 @@
 from apertium_apy.utils import get_coverages, to_alpha3_code
 
 
+def fasttext_strip_prefix(s):
+    """Remove the initial __label__ prefix"""
+    return s[9:]
+
+
+def fasttext_identify(model, text):
+    # grab a bunch since currently the model might predict stuff outside possible_langs – it's still fast:
+    results = model.predict(text, k=200, threshold=0.001)
+    if results[0]:
+        possible_langs = zip(map(fasttext_strip_prefix, results[0]),
+                             results[1])
+        return {to_alpha3_code(possible_lang[0]): possible_lang[1]
+                for possible_lang in possible_langs}
+    else:
+        return {'nob': 1.0}  # TODO: better default
+
+
+def cld_identify(text):
+    cld_results = cld2.detect(text)
+    if cld_results[0]:
+        possible_langs = filter(lambda x: x[1] != 'un', cld_results[2])
+        return {to_alpha3_code(possible_lang[1]): possible_lang[2]
+                for possible_lang in possible_langs}
+    else:
+        return {'nob': 1.0}  # TODO: better default
+
+
 class IdentifyLangHandler(BaseHandler):
+    fasttext = None
+
     @gen.coroutine
     def get(self):
         text = self.get_argument('q')
         if not text:
             return self.send_error(400, explanation='Missing q argument')
 
-        if cld2:
-            cld_results = cld2.detect(text)
-            if cld_results[0]:
-                possible_langs = filter(lambda x: x[1] != 'un', cld_results[2])
-                self.send_response({to_alpha3_code(possible_lang[1]): possible_lang[2] for possible_lang in possible_langs})
-            else:
-                self.send_response({'nob': 100})  # TODO: Some more reasonable response
+        if self.fasttext is not None:
+            self.send_response(fasttext_identify(self.fasttext, text))
+        elif cld2:
+            self.send_response(cld_identify(text))
         else:
             try:
                 coverages = yield gen.with_timeout(
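fasttext_identify and cld_identify are plain module-level functions, so they can be exercised outside Tornado. A hedged sketch, assuming apertium_apy is importable and the bundled lid.176.ftz model is used:

    # Sketch: call the new helper directly (assumes apertium_apy is on the Python path).
    import fasttext
    from apertium_apy.handlers.identify_lang import fasttext_identify

    model = fasttext.FastText.load_model('lid.176.ftz')
    print(fasttext_identify(model, 'Dette er en norsk setning.'))
    # e.g. {'nob': 0.9, 'dan': 0.05, ...}: alpha-3 codes mapped to model probabilities

Note that the fastText scores are raw probabilities in [0, 1], while the CLD2 path still reports CLD2's integer percentages.
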
6 changes: 4 additions & 2 deletions apertium_apy/handlers/translate.py
@@ -12,8 +12,10 @@
 from apertium_apy.handlers.base import BaseHandler
 from apertium_apy.keys import ApiKeys
 from apertium_apy.utils import to_alpha3_code, scale_mt_log
-from apertium_apy.utils.translation import parse_mode_file, make_pipeline, FlushingPipeline, SimplePipeline
-from typing import Union
+from apertium_apy.utils.translation import parse_mode_file, make_pipeline
+# Typing imports that flake8 doesn't understand:
+from apertium_apy.utils.translation import FlushingPipeline, SimplePipeline  # noqa: F401
+from typing import Union  # noqa: F401
 
 
 class TranslationInfo:
Binary file added lid.176.ftz
Binary file not shown.
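
lid.176.ftz is the compressed language-identification model published by the fastText project, covering 176 languages. Its predictions carry a __label__ prefix, which is what the nine-character slice in fasttext_strip_prefix removes; a quick sketch (not part of the commit):

    # Sketch: inspect the raw label format returned by lid.176.ftz.
    import fasttext

    model = fasttext.FastText.load_model('lid.176.ftz')
    labels, probs = model.predict('¿Dónde está la biblioteca?', k=2)
    print(labels)  # e.g. ('__label__es', '__label__gl'); len('__label__') == 9, hence s[9:]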
