From 3f551deef1d9f1c9a77b13760c8003d69a3fc861 Mon Sep 17 00:00:00 2001 From: NickSchoelkopf Date: Tue, 9 Nov 2021 21:32:22 -0500 Subject: [PATCH 01/11] create pipeline model --- setup.py | 1 + summertime/model/__init__.py | 2 + summertime/model/single_doc/__init__.py | 1 + .../model/single_doc/multilingual/__init__.py | 1 + .../multilingual/base_multilingual_model.py | 42 +++++++----- .../translation_pipeline_model.py | 65 +++++++++++++++++++ 6 files changed, 96 insertions(+), 16 deletions(-) create mode 100644 summertime/model/single_doc/multilingual/translation_pipeline_model.py diff --git a/setup.py b/setup.py index 2a268609..abb7e1a7 100644 --- a/setup.py +++ b/setup.py @@ -43,6 +43,7 @@ "tqdm~=4.49.0", "tensorboard~=2.4.1", "fasttext~=0.9.2", + "easynmt~=2.0.1" "black", "flake8", "progressbar", diff --git a/summertime/model/__init__.py b/summertime/model/__init__.py index 45b083ed..9c2e987e 100644 --- a/summertime/model/__init__.py +++ b/summertime/model/__init__.py @@ -6,6 +6,7 @@ PegasusModel, TextRankModel, MT5Model, + TranslationPipelineModel, ) from .multi_doc import MultiDocJointModel, MultiDocSeparateModel from .dialogue import HMNetModel, FlattenDialogueModel @@ -16,6 +17,7 @@ BartModel, MBartModel, MT5Model, + TranslationPipelineModel, LexRankModel, LongformerModel, PegasusModel, diff --git a/summertime/model/single_doc/__init__.py b/summertime/model/single_doc/__init__.py index 91bb0a1d..f1e0d7d6 100644 --- a/summertime/model/single_doc/__init__.py +++ b/summertime/model/single_doc/__init__.py @@ -4,5 +4,6 @@ from .longformer_model import LongformerModel from .textrank_model import TextRankModel +from .multilingual import TranslationPipelineModel from .multilingual import MBartModel from .multilingual import MT5Model diff --git a/summertime/model/single_doc/multilingual/__init__.py b/summertime/model/single_doc/multilingual/__init__.py index f5ae191f..a4858319 100644 --- a/summertime/model/single_doc/multilingual/__init__.py +++ b/summertime/model/single_doc/multilingual/__init__.py @@ -1,2 +1,3 @@ from .mbart_model import MBartModel from .mt5_model import MT5Model +from .translation_pipeline_model import TranslationPipelineModel diff --git a/summertime/model/single_doc/multilingual/base_multilingual_model.py b/summertime/model/single_doc/multilingual/base_multilingual_model.py index 08cfab1b..d1b238c5 100644 --- a/summertime/model/single_doc/multilingual/base_multilingual_model.py +++ b/summertime/model/single_doc/multilingual/base_multilingual_model.py @@ -5,6 +5,31 @@ import fasttext +def fasttext_predict(corpus: Union[List[str], List[List[str]]]): + """ + Utility function to predict the language of input text + using fasttext classifier. + """ + url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz" + + filepath = get_cached_file_path("fasttext", "lid.176.ftz", url) + + # silence warning on loading model + fasttext.FastText.eprint = lambda x: None + classifier = fasttext.load_model(str(filepath)) + + if all([isinstance(ins, list) for ins in corpus]): + prediction = classifier.predict(corpus[0]) + + elif isinstance(corpus, list): + prediction = classifier.predict(corpus) + + label = prediction[0][0][0] + + label = label.replace("__label__", "") + + return label + class MultilingualSummModel(SingleDocSummModel): lang_tag_dict = None @@ -24,22 +49,7 @@ def __init__( @classmethod def assert_summ_input_language(cls, corpus, query): - url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz" - - filepath = get_cached_file_path("fasttext", "lid.176.ftz", url) - - fasttext.FastText.eprint = lambda x: None - classifier = fasttext.load_model(str(filepath)) - - if all([isinstance(ins, list) for ins in corpus]): - prediction = classifier.predict(corpus[0]) - - elif isinstance(corpus, list): - prediction = classifier.predict(corpus) - - label = prediction[0][0][0] - - label = label.replace("__label__", "") + label = fasttext_predict(corpus) if label in cls.lang_tag_dict: print(f"Supported language '{label}' detected.") diff --git a/summertime/model/single_doc/multilingual/translation_pipeline_model.py b/summertime/model/single_doc/multilingual/translation_pipeline_model.py new file mode 100644 index 00000000..a54a2149 --- /dev/null +++ b/summertime/model/single_doc/multilingual/translation_pipeline_model.py @@ -0,0 +1,65 @@ +from transformers import MT5ForConditionalGeneration, MT5Tokenizer +from .base_multilingual_model import MultilingualSummModel, fasttext_predict +from summertime.model.base_model import SummModel +from summertime.mode.single_doc import BartModel + +from easynmt import EasyNMT + +class TranslationPipelineModel(MultilingualSummModel): + """ + A class for multilingual summarization performed by first + translating into English then performing summarization in English. + """ + + model_name = 'Translation Pipeline' + is_multilingual = True + # TODO: change to Pegasus as default? + def __init__(self, model_backend: SummModel = BartModel, **kwargs): + model: SummModel = model_backend(**kwargs) + self.model = model + + super(MultiDocJointModel, self).__init__( + trained_domain=self.model.trained_domain, + max_input_length=self.model.max_input_length, + max_output_length=self.model.max_output_length, + ) + + # translation module + self.translator = easyNMT("opus-mt") + + def summarize(self, corpus, queries=None): + self.assert_summ_input_type(corpus, queries) + + src_lang = fasttext_predict(corpus) + # translate to English + corpus = self.translator.translate(corpus, source_lang=src_lang, target_lang="en", beam_size=4) + # TODO: translate each doc separately if provided multiple docs in corpus + if queries: + queries = self.translator.translate(queries, target_lang="en", beam_size=4) + + # summarize in English + english_summaries = self.model.summarize(corpus, queries) + + summaries = self.translator(english_summaries, source_lang="en", target_lang=src_lang, beam_size=4) + + return summaries + + @classmethod + def show_capability(cls) -> None: + basic_description = cls.generate_basic_description() + more_details = ( + "A simple pipeline model for multilingual translation. " + "Uses machine translation to translate input into English, " + "then performs summarization in English before translating results " + "back to the original language.\n" + "Strengths: \n - Massively multilingual: supports ~150 languages\n" + "Weaknesses: \n - Information loss from translation to and from English" + "Initialization arguments: \n " + " - model_backend: the monolingual model to use for summarization. Defaults to BART" + # TODO: if change to Pegasus, change this to reflect that!! + "- `device = 'cpu'` specifies the device the model is stored on and uses for computation. " + "Use `device='cuda'` to run on an Nvidia GPU." + ) + print(f"{basic_description} \n {'#'*20} \n {more_details}") + + From 2c8a5f8752759ab5cc4dbf609e2e4af338d67ffe Mon Sep 17 00:00:00 2001 From: NickSchoelkopf Date: Tue, 9 Nov 2021 21:32:37 -0500 Subject: [PATCH 02/11] update reqs --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 14c99b73..f2cae0b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,6 +24,7 @@ mpi4py==3.0.3 tqdm==4.49.0 tensorboard==2.4.1 fasttext==0.9.2 +easynmt==2.0.1 black flake8 progressbar From 5f905adb13eb07d2d07fce8bf818e5a124aa41fd Mon Sep 17 00:00:00 2001 From: NickSchoelkopf Date: Wed, 10 Nov 2021 10:36:48 -0500 Subject: [PATCH 03/11] fix typo in setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index abb7e1a7..e48825c7 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ "tqdm~=4.49.0", "tensorboard~=2.4.1", "fasttext~=0.9.2", - "easynmt~=2.0.1" + "easynmt~=2.0.1", "black", "flake8", "progressbar", From 08bb39fd3d016d17ae4c57390641e7be2ed80f3e Mon Sep 17 00:00:00 2001 From: NickSchoelkopf Date: Wed, 10 Nov 2021 11:00:45 -0500 Subject: [PATCH 04/11] fix imports --- .../model/single_doc/multilingual/base_multilingual_model.py | 1 + .../model/single_doc/multilingual/translation_pipeline_model.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/summertime/model/single_doc/multilingual/base_multilingual_model.py b/summertime/model/single_doc/multilingual/base_multilingual_model.py index d1b238c5..d1ea0fa2 100644 --- a/summertime/model/single_doc/multilingual/base_multilingual_model.py +++ b/summertime/model/single_doc/multilingual/base_multilingual_model.py @@ -3,6 +3,7 @@ get_cached_file_path, ) import fasttext +from typing import List, Union def fasttext_predict(corpus: Union[List[str], List[List[str]]]): diff --git a/summertime/model/single_doc/multilingual/translation_pipeline_model.py b/summertime/model/single_doc/multilingual/translation_pipeline_model.py index a54a2149..e2f4aeff 100644 --- a/summertime/model/single_doc/multilingual/translation_pipeline_model.py +++ b/summertime/model/single_doc/multilingual/translation_pipeline_model.py @@ -33,7 +33,7 @@ def summarize(self, corpus, queries=None): src_lang = fasttext_predict(corpus) # translate to English corpus = self.translator.translate(corpus, source_lang=src_lang, target_lang="en", beam_size=4) - # TODO: translate each doc separately if provided multiple docs in corpus + # TODO: translate each doc separately if provided multiple docs in corpus? if queries: queries = self.translator.translate(queries, target_lang="en", beam_size=4) From 96318ca45d9853e4ff9eff313f0fe2144e21e594 Mon Sep 17 00:00:00 2001 From: NickSchoelkopf Date: Wed, 10 Nov 2021 11:16:16 -0500 Subject: [PATCH 05/11] fix imports --- .../single_doc/multilingual/translation_pipeline_model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/summertime/model/single_doc/multilingual/translation_pipeline_model.py b/summertime/model/single_doc/multilingual/translation_pipeline_model.py index e2f4aeff..68cad920 100644 --- a/summertime/model/single_doc/multilingual/translation_pipeline_model.py +++ b/summertime/model/single_doc/multilingual/translation_pipeline_model.py @@ -1,7 +1,7 @@ from transformers import MT5ForConditionalGeneration, MT5Tokenizer from .base_multilingual_model import MultilingualSummModel, fasttext_predict from summertime.model.base_model import SummModel -from summertime.mode.single_doc import BartModel +from summertime.model.single_doc import BartModel from easynmt import EasyNMT @@ -18,14 +18,14 @@ def __init__(self, model_backend: SummModel = BartModel, **kwargs): model: SummModel = model_backend(**kwargs) self.model = model - super(MultiDocJointModel, self).__init__( + super(TranslationPipelineModel, self).__init__( trained_domain=self.model.trained_domain, max_input_length=self.model.max_input_length, max_output_length=self.model.max_output_length, ) # translation module - self.translator = easyNMT("opus-mt") + self.translator = EasyNMT("opus-mt") def summarize(self, corpus, queries=None): self.assert_summ_input_type(corpus, queries) From f058549e2f102c6edf170c845dccfd0460d14c5e Mon Sep 17 00:00:00 2001 From: NickSchoelkopf Date: Wed, 10 Nov 2021 13:55:15 -0500 Subject: [PATCH 06/11] fix bugs in summarize method --- .../model/single_doc/base_single_doc_model.py | 22 ++++++++++--------- .../translation_pipeline_model.py | 2 +- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/summertime/model/single_doc/base_single_doc_model.py b/summertime/model/single_doc/base_single_doc_model.py index 8cd00a5c..7b98cfe7 100644 --- a/summertime/model/single_doc/base_single_doc_model.py +++ b/summertime/model/single_doc/base_single_doc_model.py @@ -39,16 +39,18 @@ def assert_summ_input_type(cls, corpus, query): def assert_summ_input_language(cls, corpus, query): warning = "Warning: non-ASCII input corpus detected!\n\ - If this is not English, consider using \ - one of our multilingual models." - - if all([isinstance(ins, list) for ins in corpus]): - if not all([ins.isascii() for batch in corpus for ins in batch]): - print(warning) - - elif isinstance(corpus, list): - if not all([ins.isascii() for ins in corpus]): - print(warning) +If this is not English, consider using \ +one of our multilingual models." + + # isascii not supported in python 3.6 + try: + if all([isinstance(ins, list) for ins in corpus]): + [ins.encode('ascii') for batch in corpus for ins in batch] + + elif isinstance(corpus, list): + [ins.encode('ascii') for ins in corpus] + except UnicodeEncodeError: + print(warning) return "en" # ISO-639-1 code for English diff --git a/summertime/model/single_doc/multilingual/translation_pipeline_model.py b/summertime/model/single_doc/multilingual/translation_pipeline_model.py index 68cad920..9f43e4b9 100644 --- a/summertime/model/single_doc/multilingual/translation_pipeline_model.py +++ b/summertime/model/single_doc/multilingual/translation_pipeline_model.py @@ -40,7 +40,7 @@ def summarize(self, corpus, queries=None): # summarize in English english_summaries = self.model.summarize(corpus, queries) - summaries = self.translator(english_summaries, source_lang="en", target_lang=src_lang, beam_size=4) + summaries = self.translator.translate(english_summaries, source_lang="en", target_lang=src_lang, beam_size=4) return summaries From 36088a57522efa05a0f7cffce2fd98628ca92ff1 Mon Sep 17 00:00:00 2001 From: NickSchoelkopf Date: Wed, 10 Nov 2021 14:01:16 -0500 Subject: [PATCH 07/11] add to docs and reformat --- README.md | 1 + .../model/single_doc/base_single_doc_model.py | 4 ++-- .../multilingual/base_multilingual_model.py | 3 ++- .../translation_pipeline_model.py | 20 +++++++++++-------- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index fdf48544..0d9f2120 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,7 @@ SummerTime supports different models (e.g., TextRank, BART, Longformer) as well | LongformerModel | :heavy_check_mark: | | | | | | MBartModel | :heavy_check_mark: | | | | 50 languages (Arabic, Czech, German, English, Spanish, Estonian, Finnish, French, Gujarati, Hindi, Italian, Japanese, Kazakh, Korean, Lithuanian, Latvian, Burmese, Nepali, Dutch, Romanian, Russian, Sinhala, Turkish, Vietnamese, Chinese, Afrikaans, Azerbaijani, Bengali, Persian, Hebrew, Croatian, Indonesian, Georgian, Khmer, Macedonian, Malayalam, Mongolian, Marathi, Polish, Pashto, Portuguese, Swedish, Tamil, Telugu, Thai, Tagalog, Ukrainian, Urdu, Xhosa, Slovenian) | | MT5Model | :heavy_check_mark: | | | | 101 languages (full list [here](https://github.com/google-research/multilingual-t5#readme)) | +| TranslationPipelineModel | :heavy_check_mark: | | | | 150+ languages (full list [here](https://github.com/UKPLab/EasyNMT#Opus-MT)) | | MultiDocJointModel | | :heavy_check_mark: | | | | MultiDocSeparateModel | | :heavy_check_mark: | | | | PegasusModel | :heavy_check_mark: | | | | diff --git a/summertime/model/single_doc/base_single_doc_model.py b/summertime/model/single_doc/base_single_doc_model.py index 7b98cfe7..b50b0c90 100644 --- a/summertime/model/single_doc/base_single_doc_model.py +++ b/summertime/model/single_doc/base_single_doc_model.py @@ -45,10 +45,10 @@ def assert_summ_input_language(cls, corpus, query): # isascii not supported in python 3.6 try: if all([isinstance(ins, list) for ins in corpus]): - [ins.encode('ascii') for batch in corpus for ins in batch] + [ins.encode("ascii") for batch in corpus for ins in batch] elif isinstance(corpus, list): - [ins.encode('ascii') for ins in corpus] + [ins.encode("ascii") for ins in corpus] except UnicodeEncodeError: print(warning) diff --git a/summertime/model/single_doc/multilingual/base_multilingual_model.py b/summertime/model/single_doc/multilingual/base_multilingual_model.py index d1ea0fa2..358f7749 100644 --- a/summertime/model/single_doc/multilingual/base_multilingual_model.py +++ b/summertime/model/single_doc/multilingual/base_multilingual_model.py @@ -8,7 +8,7 @@ def fasttext_predict(corpus: Union[List[str], List[List[str]]]): """ - Utility function to predict the language of input text + Utility function to predict the language of input text using fasttext classifier. """ url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz" @@ -31,6 +31,7 @@ def fasttext_predict(corpus: Union[List[str], List[List[str]]]): return label + class MultilingualSummModel(SingleDocSummModel): lang_tag_dict = None diff --git a/summertime/model/single_doc/multilingual/translation_pipeline_model.py b/summertime/model/single_doc/multilingual/translation_pipeline_model.py index 9f43e4b9..302436ce 100644 --- a/summertime/model/single_doc/multilingual/translation_pipeline_model.py +++ b/summertime/model/single_doc/multilingual/translation_pipeline_model.py @@ -1,20 +1,22 @@ -from transformers import MT5ForConditionalGeneration, MT5Tokenizer from .base_multilingual_model import MultilingualSummModel, fasttext_predict from summertime.model.base_model import SummModel from summertime.model.single_doc import BartModel from easynmt import EasyNMT + class TranslationPipelineModel(MultilingualSummModel): """ - A class for multilingual summarization performed by first + A class for multilingual summarization performed by first translating into English then performing summarization in English. """ - model_name = 'Translation Pipeline' + model_name = "Translation Pipeline" is_multilingual = True # TODO: change to Pegasus as default? + def __init__(self, model_backend: SummModel = BartModel, **kwargs): + model: SummModel = model_backend(**kwargs) self.model = model @@ -29,10 +31,12 @@ def __init__(self, model_backend: SummModel = BartModel, **kwargs): def summarize(self, corpus, queries=None): self.assert_summ_input_type(corpus, queries) - + src_lang = fasttext_predict(corpus) # translate to English - corpus = self.translator.translate(corpus, source_lang=src_lang, target_lang="en", beam_size=4) + corpus = self.translator.translate( + corpus, source_lang=src_lang, target_lang="en", beam_size=4 + ) # TODO: translate each doc separately if provided multiple docs in corpus? if queries: queries = self.translator.translate(queries, target_lang="en", beam_size=4) @@ -40,7 +44,9 @@ def summarize(self, corpus, queries=None): # summarize in English english_summaries = self.model.summarize(corpus, queries) - summaries = self.translator.translate(english_summaries, source_lang="en", target_lang=src_lang, beam_size=4) + summaries = self.translator.translate( + english_summaries, source_lang="en", target_lang=src_lang, beam_size=4 + ) return summaries @@ -61,5 +67,3 @@ def show_capability(cls) -> None: "Use `device='cuda'` to run on an Nvidia GPU." ) print(f"{basic_description} \n {'#'*20} \n {more_details}") - - From 82f62b7158e8e29088902532e5fc2f6214979212 Mon Sep 17 00:00:00 2001 From: NickSchoelkopf Date: Sun, 14 Nov 2021 15:54:50 -0500 Subject: [PATCH 08/11] fix formatting --- .../model/single_doc/multilingual/base_multilingual_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/summertime/model/single_doc/multilingual/base_multilingual_model.py b/summertime/model/single_doc/multilingual/base_multilingual_model.py index e1514e8e..780b6700 100644 --- a/summertime/model/single_doc/multilingual/base_multilingual_model.py +++ b/summertime/model/single_doc/multilingual/base_multilingual_model.py @@ -3,7 +3,7 @@ get_cached_file_path, ) import fasttext -from typing import List, Union, Dict, List, Tuple +from typing import List, Union, Dict, Tuple def fasttext_predict(corpus: Union[List[str], List[List[str]]]): @@ -37,6 +37,7 @@ def fasttext_predict(corpus: Union[List[str], List[List[str]]]): return label + class MultilingualSummModel(SingleDocSummModel): # a dictionary of languages supported by the model. From 06278070c268023b6ed175b5b22a7e82d5f5ae62 Mon Sep 17 00:00:00 2001 From: NickSchoelkopf Date: Wed, 17 Nov 2021 14:51:41 -0500 Subject: [PATCH 09/11] add supported langs for translation --- .../translation_pipeline_model.py | 193 ++++++++++++++++++ 1 file changed, 193 insertions(+) diff --git a/summertime/model/single_doc/multilingual/translation_pipeline_model.py b/summertime/model/single_doc/multilingual/translation_pipeline_model.py index 302436ce..b290a257 100644 --- a/summertime/model/single_doc/multilingual/translation_pipeline_model.py +++ b/summertime/model/single_doc/multilingual/translation_pipeline_model.py @@ -14,6 +14,199 @@ class TranslationPipelineModel(MultilingualSummModel): model_name = "Translation Pipeline" is_multilingual = True # TODO: change to Pegasus as default? + # language codes from https://github.com/UKPLab/EasyNMT#Opus-MT documentation + # language codes not supported by https://fasttext.cc/docs/en/language-identification.html + # are commented out. + supported_langs = [ + # "aav", + "aed", + "af", + # "alv", + "am", + "ar", + # "art", + # "ase", + "az", + "bat", + "bcl", + "be", + # "bem", + # "ber", + "bg", + # "bi", + "bn", + # "bnt", + # "bzs", + "ca", + # "cau", + # "ccs", + "ceb", + # "cel", + # "chk", + # "cpf", + # "crs", + "cs", + # "csg", + # "csn", + # "cus", + "cy", + "da", + "de", + # "dra", + # "ee", + # "efi", + "el", + "en", + "eo", + "es", + "et", + "eu", + # "euq", + "fi", + # "fj", + "fr", + # "fse", + "ga", + # "gaa", + # "gil", + "gl", + # "grk", + # "guw", + "gv", + # "ha", + "he", + "hi", + # "hil", + # "ho", + "hr", + "ht", + "hu", + "hy", + "id", + # "ig", + "ilo", + "is", + # "iso", + "it", + "ja", + # "jap", + "ka", + # "kab", + # "kg", + # "kj", + # "kl", + "ko", + # "kqn", + # "kwn", + # "kwy", + # "lg", + # "ln", + # "loz", + "lt", + # "lu", + # "lua", + # "lue", + # "lun", + # "luo", + # "lus", + "lv", + # "map", + # "mfe", + # "mfs", + "mg", + # "mh", + "mk", + # "mkh", + "ml", + # "mos", + "mr", + "ms", + "mt", + # "mul", + # "ng", + # "nic", + # "niu", + "nl", + "no", + # "nso", + # "ny", + # "nyk", + # "om", + "pa", + # "pag", + # "pap", + # "phi", + # "pis", + "pl", + # "pon", + # "poz", + # "pqe", + # "pqw", + # "prl", + "pt", + # "rn", + # "rnd", + "ro", + # "roa", + "ru", + "run", + # "rw", + # "sal", + # "sg", + "sh", + # "sit", + "sk", + "sl", + # "sm", + # "sn", + "sq", + # "srn", + # "ss", + # "ssp", + # "st", + "sv", + "sw", + # "swc", + # "taw", + # "tdt", + "th", + # "ti", + # "tiv", + "tl", + # "tll", + # "tn", + # "to", + # "toi", + # "tpi", + "tr", + # "trk", + # "ts", + # "tum", + # "tut", + # "tvl", + # "tw", + # "ty", + # "tzo", + "uk", + # "umb", + "ur", + # "ve", + "vi", + # "vsl", + "wa", + # "wal", + "war", + # "wls", + # "xh", + # "yap", + "yo", + # "yua", + # "zai", + "zh", + # "zne", + ] + + lang_tag_dict = {lang: lang for lang in supported_langs} def __init__(self, model_backend: SummModel = BartModel, **kwargs): From 83f0740a4e1de9ecef4df474b362c8b42cdc51cb Mon Sep 17 00:00:00 2001 From: NickSchoelkopf Date: Wed, 17 Nov 2021 14:56:31 -0500 Subject: [PATCH 10/11] remove langs not supported by fasttext --- README.md | 2 +- .../translation_pipeline_model.py | 116 +----------------- 2 files changed, 2 insertions(+), 116 deletions(-) diff --git a/README.md b/README.md index 84f9bb20..55e8be3b 100644 --- a/README.md +++ b/README.md @@ -87,7 +87,7 @@ SummerTime supports different models (e.g., TextRank, BART, Longformer) as well | LongformerModel | :heavy_check_mark: | | | | | | MBartModel | :heavy_check_mark: | | | | 50 languages (Arabic, Czech, German, English, Spanish, Estonian, Finnish, French, Gujarati, Hindi, Italian, Japanese, Kazakh, Korean, Lithuanian, Latvian, Burmese, Nepali, Dutch, Romanian, Russian, Sinhala, Turkish, Vietnamese, Chinese, Afrikaans, Azerbaijani, Bengali, Persian, Hebrew, Croatian, Indonesian, Georgian, Khmer, Macedonian, Malayalam, Mongolian, Marathi, Polish, Pashto, Portuguese, Swedish, Tamil, Telugu, Thai, Tagalog, Ukrainian, Urdu, Xhosa, Slovenian) | | MT5Model | :heavy_check_mark: | | | | 101 languages (full list [here](https://github.com/google-research/multilingual-t5#readme)) | -| TranslationPipelineModel | :heavy_check_mark: | | | | 150+ languages (full list [here](https://github.com/UKPLab/EasyNMT#Opus-MT)) | +| TranslationPipelineModel | :heavy_check_mark: | | | | ~70 languages | | MultiDocJointModel | | :heavy_check_mark: | | | | MultiDocSeparateModel | | :heavy_check_mark: | | | | PegasusModel | :heavy_check_mark: | | | | diff --git a/summertime/model/single_doc/multilingual/translation_pipeline_model.py b/summertime/model/single_doc/multilingual/translation_pipeline_model.py index b290a257..81b5aee5 100644 --- a/summertime/model/single_doc/multilingual/translation_pipeline_model.py +++ b/summertime/model/single_doc/multilingual/translation_pipeline_model.py @@ -16,194 +16,80 @@ class TranslationPipelineModel(MultilingualSummModel): # TODO: change to Pegasus as default? # language codes from https://github.com/UKPLab/EasyNMT#Opus-MT documentation # language codes not supported by https://fasttext.cc/docs/en/language-identification.html - # are commented out. + # are removed supported_langs = [ - # "aav", "aed", "af", - # "alv", "am", "ar", - # "art", - # "ase", "az", "bat", "bcl", "be", - # "bem", - # "ber", "bg", - # "bi", "bn", - # "bnt", - # "bzs", "ca", - # "cau", - # "ccs", "ceb", - # "cel", - # "chk", - # "cpf", - # "crs", "cs", - # "csg", - # "csn", - # "cus", "cy", "da", "de", - # "dra", - # "ee", - # "efi", "el", "en", "eo", "es", "et", "eu", - # "euq", "fi", - # "fj", "fr", - # "fse", "ga", - # "gaa", - # "gil", "gl", - # "grk", - # "guw", "gv", - # "ha", "he", "hi", - # "hil", - # "ho", "hr", "ht", "hu", "hy", "id", - # "ig", "ilo", "is", - # "iso", "it", "ja", - # "jap", "ka", - # "kab", - # "kg", - # "kj", - # "kl", "ko", - # "kqn", - # "kwn", - # "kwy", - # "lg", - # "ln", - # "loz", "lt", - # "lu", - # "lua", - # "lue", - # "lun", - # "luo", - # "lus", "lv", - # "map", - # "mfe", - # "mfs", "mg", - # "mh", "mk", - # "mkh", "ml", - # "mos", "mr", "ms", "mt", - # "mul", - # "ng", - # "nic", - # "niu", "nl", "no", - # "nso", - # "ny", - # "nyk", - # "om", "pa", - # "pag", - # "pap", - # "phi", - # "pis", "pl", - # "pon", - # "poz", - # "pqe", - # "pqw", - # "prl", "pt", - # "rn", - # "rnd", "ro", - # "roa", "ru", "run", - # "rw", - # "sal", - # "sg", "sh", - # "sit", "sk", "sl", - # "sm", - # "sn", "sq", - # "srn", - # "ss", - # "ssp", - # "st", "sv", "sw", - # "swc", - # "taw", - # "tdt", "th", - # "ti", - # "tiv", "tl", - # "tll", - # "tn", - # "to", - # "toi", - # "tpi", "tr", - # "trk", - # "ts", - # "tum", - # "tut", - # "tvl", - # "tw", - # "ty", - # "tzo", "uk", - # "umb", "ur", - # "ve", "vi", - # "vsl", "wa", - # "wal", "war", - # "wls", - # "xh", - # "yap", "yo", - # "yua", - # "zai", "zh", - # "zne", ] lang_tag_dict = {lang: lang for lang in supported_langs} From 36ef1a5f8f79d30ad4448a1f0e102fc290f990e2 Mon Sep 17 00:00:00 2001 From: NickSchoelkopf Date: Wed, 17 Nov 2021 16:13:58 -0500 Subject: [PATCH 11/11] update mBART langs in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 55e8be3b..603554b5 100644 --- a/README.md +++ b/README.md @@ -85,7 +85,7 @@ SummerTime supports different models (e.g., TextRank, BART, Longformer) as well | HMNetModel | | | :heavy_check_mark: | | | | LexRankModel | :heavy_check_mark: | | | | | | LongformerModel | :heavy_check_mark: | | | | | -| MBartModel | :heavy_check_mark: | | | | 50 languages (Arabic, Czech, German, English, Spanish, Estonian, Finnish, French, Gujarati, Hindi, Italian, Japanese, Kazakh, Korean, Lithuanian, Latvian, Burmese, Nepali, Dutch, Romanian, Russian, Sinhala, Turkish, Vietnamese, Chinese, Afrikaans, Azerbaijani, Bengali, Persian, Hebrew, Croatian, Indonesian, Georgian, Khmer, Macedonian, Malayalam, Mongolian, Marathi, Polish, Pashto, Portuguese, Swedish, Tamil, Telugu, Thai, Tagalog, Ukrainian, Urdu, Xhosa, Slovenian) | +| MBartModel | :heavy_check_mark: | | | | 50 languages (full list [here](https://huggingface.co/facebook/mbart-large-50)) | | MT5Model | :heavy_check_mark: | | | | 101 languages (full list [here](https://github.com/google-research/multilingual-t5#readme)) | | TranslationPipelineModel | :heavy_check_mark: | | | | ~70 languages | | MultiDocJointModel | | :heavy_check_mark: | | |