Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add translation pipeline model #110

Merged
merged 12 commits into from Nov 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Expand Up @@ -85,8 +85,9 @@ SummerTime supports different models (e.g., TextRank, BART, Longformer) as well
| HMNetModel | | | :heavy_check_mark: | | |
| LexRankModel | :heavy_check_mark: | | | | |
| LongformerModel | :heavy_check_mark: | | | | |
| MBartModel | :heavy_check_mark: | | | | 50 languages (Arabic, Czech, German, English, Spanish, Estonian, Finnish, French, Gujarati, Hindi, Italian, Japanese, Kazakh, Korean, Lithuanian, Latvian, Burmese, Nepali, Dutch, Romanian, Russian, Sinhala, Turkish, Vietnamese, Chinese, Afrikaans, Azerbaijani, Bengali, Persian, Hebrew, Croatian, Indonesian, Georgian, Khmer, Macedonian, Malayalam, Mongolian, Marathi, Polish, Pashto, Portuguese, Swedish, Tamil, Telugu, Thai, Tagalog, Ukrainian, Urdu, Xhosa, Slovenian) |
| MBartModel | :heavy_check_mark: | | | | 50 languages (full list [here](https://huggingface.co/facebook/mbart-large-50)) |
| MT5Model | :heavy_check_mark: | | | | 101 languages (full list [here](https://github.com/google-research/multilingual-t5#readme)) |
| TranslationPipelineModel | :heavy_check_mark: | | | | ~70 languages |
| MultiDocJointModel | | :heavy_check_mark: | | |
| MultiDocSeparateModel | | :heavy_check_mark: | | |
| PegasusModel | :heavy_check_mark: | | | |
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Expand Up @@ -24,6 +24,7 @@ mpi4py==3.0.3
tqdm==4.49.0
tensorboard==2.4.1
fasttext==0.9.2
easynmt==2.0.1
black
flake8
progressbar
1 change: 1 addition & 0 deletions setup.py
Expand Up @@ -43,6 +43,7 @@
"tqdm~=4.49.0",
"tensorboard~=2.4.1",
"fasttext~=0.9.2",
"easynmt~=2.0.1",
"black",
"flake8",
"progressbar",
Expand Down
2 changes: 2 additions & 0 deletions summertime/model/__init__.py
Expand Up @@ -6,6 +6,7 @@
PegasusModel,
TextRankModel,
MT5Model,
TranslationPipelineModel,
)
from .multi_doc import MultiDocJointModel, MultiDocSeparateModel
from .dialogue import HMNetModel, FlattenDialogueModel
Expand All @@ -16,6 +17,7 @@
BartModel,
MBartModel,
MT5Model,
TranslationPipelineModel,
LexRankModel,
LongformerModel,
PegasusModel,
Expand Down
1 change: 1 addition & 0 deletions summertime/model/single_doc/__init__.py
Expand Up @@ -4,5 +4,6 @@
from .longformer_model import LongformerModel
from .textrank_model import TextRankModel

from .multilingual import TranslationPipelineModel
from .multilingual import MBartModel
from .multilingual import MT5Model
1 change: 1 addition & 0 deletions summertime/model/single_doc/multilingual/__init__.py
@@ -1,2 +1,3 @@
from .mbart_model import MBartModel
from .mt5_model import MT5Model
from .translation_pipeline_model import TranslationPipelineModel
58 changes: 34 additions & 24 deletions summertime/model/single_doc/multilingual/base_multilingual_model.py
Expand Up @@ -3,7 +3,39 @@
get_cached_file_path,
)
import fasttext
from typing import Dict, List, Tuple
from typing import List, Union, Dict, Tuple


def fasttext_predict(corpus: Union[List[str], List[List[str]]]) -> str:
    """
    Predict the language of the input text using the fasttext
    lid.176 language-identification classifier.

    :param corpus: either a flat list of document strings, or a list of
        documents each given as a list of strings. For the nested case,
        only the first document is used for detection (the corpus is
        assumed to be monolingual).
    :return: the predicted language code (e.g. ``"en"``), i.e. the most
        likely fasttext label with its ``__label__`` prefix stripped.
    """
    # Memoize the loaded classifier on the function object: the download is
    # cached on disk by get_cached_file_path, but fasttext.load_model itself
    # is expensive and was previously re-run on every call.
    classifier = getattr(fasttext_predict, "_classifier", None)
    if classifier is None:
        url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz"
        filepath = get_cached_file_path("fasttext", "lid.176.ftz", url)

        # silence fasttext's noisy load-time warning output
        fasttext.FastText.eprint = lambda x: None
        classifier = fasttext.load_model(str(filepath))
        fasttext_predict._classifier = classifier

    # fasttext returns a tuple of 2 lists:
    # the first list contains a list of predicted language labels
    # of the form {__label__<lang_code>}
    # and the second list contains the corresponding probabilities
    if corpus and all(isinstance(ins, list) for ins in corpus):
        # nested corpus: detect on the first document only
        prediction = classifier.predict(corpus[0])
    else:
        prediction = classifier.predict(corpus)

    # access the first (most likely) predicted language label
    label = prediction[0][0][0]

    # remove prefix from label string to get language code
    return label.replace("__label__", "")


class MultilingualSummModel(SingleDocSummModel):
Expand All @@ -30,29 +62,7 @@ def assert_summ_input_type(cls, corpus, query):

super().assert_summ_input_type(corpus, query)

url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz"

filepath = get_cached_file_path("fasttext", "lid.176.ftz", url)

fasttext.FastText.eprint = lambda x: None
classifier = fasttext.load_model(str(filepath))

# fasttext returns a tuple of 2 lists:
# the first list contains a list of predicted language labels
# of the form {__label__<lang_code>}
# and the second list contains the corresponding probabilities
prediction: Tuple[List[List[str]], List] = None
if all([isinstance(ins, list) for ins in corpus]):
prediction = classifier.predict(corpus[0])

elif isinstance(corpus, list):
prediction = classifier.predict(corpus)

# access the first (most likely) predicted language label
label = prediction[0][0][0]

# remove prefix from label string to get language code
label = label.replace("__label__", "")
label = fasttext_predict(corpus)

# check if language code is in the supported language dictionary
if label in cls.lang_tag_dict:
Expand Down
148 changes: 148 additions & 0 deletions summertime/model/single_doc/multilingual/translation_pipeline_model.py
@@ -0,0 +1,148 @@
from .base_multilingual_model import MultilingualSummModel, fasttext_predict
from summertime.model.base_model import SummModel
from summertime.model.single_doc import BartModel

from easynmt import EasyNMT


class TranslationPipelineModel(MultilingualSummModel):
    """
    A class for multilingual summarization performed by first
    translating into English then performing summarization in English.
    """

    model_name = "Translation Pipeline"
    is_multilingual = True
    # TODO: change to Pegasus as default?
    # language codes from https://github.com/UKPLab/EasyNMT#Opus-MT documentation
    # language codes not supported by https://fasttext.cc/docs/en/language-identification.html
    # are removed
    supported_langs = [
        "aed",
        "af",
        "am",
        "ar",
        "az",
        "bat",
        "bcl",
        "be",
        "bg",
        "bn",
        "ca",
        "ceb",
        "cs",
        "cy",
        "da",
        "de",
        "el",
        "en",
        "eo",
        "es",
        "et",
        "eu",
        "fi",
        "fr",
        "ga",
        "gl",
        "gv",
        "he",
        "hi",
        "hr",
        "ht",
        "hu",
        "hy",
        "id",
        "ilo",
        "is",
        "it",
        "ja",
        "ka",
        "ko",
        "lt",
        "lv",
        "mg",
        "mk",
        "ml",
        "mr",
        "ms",
        "mt",
        "nl",
        "no",
        "pa",
        "pl",
        "pt",
        "ro",
        "ru",
        "run",
        "sh",
        "sk",
        "sl",
        "sq",
        "sv",
        "sw",
        "th",
        "tl",
        "tr",
        "uk",
        "ur",
        "vi",
        "wa",
        "war",
        "yo",
        "zh",
    ]

    # Translation uses the same code on input and output, so the
    # tag dict maps each language code to itself.
    lang_tag_dict = {lang: lang for lang in supported_langs}

    def __init__(self, model_backend: SummModel = BartModel, **kwargs):
        """
        :param model_backend: the monolingual summarization model class used
            after translation to English. Defaults to BartModel.
        :param kwargs: forwarded to the backend model's constructor.
        """
        model: SummModel = model_backend(**kwargs)
        self.model = model

        super().__init__(
            trained_domain=self.model.trained_domain,
            max_input_length=self.model.max_input_length,
            max_output_length=self.model.max_output_length,
        )

        # translation module
        self.translator = EasyNMT("opus-mt")

    def summarize(self, corpus, queries=None):
        """
        Translate the corpus (and queries, if any) into English, summarize
        with the English backend model, then translate the summaries back
        into the detected source language.

        :param corpus: input document(s) in any supported language.
        :param queries: optional queries, assumed to be in the same
            language as the corpus.
        :return: summaries translated back into the source language.
        """
        self.assert_summ_input_type(corpus, queries)

        src_lang = fasttext_predict(corpus)
        # translate to English
        corpus = self.translator.translate(
            corpus, source_lang=src_lang, target_lang="en", beam_size=4
        )
        # TODO: translate each doc separately if provided multiple docs in corpus?
        if queries:
            # NOTE(review): source_lang is omitted here, so EasyNMT
            # auto-detects it per query rather than reusing src_lang.
            queries = self.translator.translate(queries, target_lang="en", beam_size=4)

        # summarize in English
        english_summaries = self.model.summarize(corpus, queries)

        # translate summaries back into the original (detected) language
        summaries = self.translator.translate(
            english_summaries, source_lang="en", target_lang=src_lang, beam_size=4
        )

        return summaries

    @classmethod
    def show_capability(cls) -> None:
        """Print a human-readable description of this model's capabilities."""
        basic_description = cls.generate_basic_description()
        # Fixed: previously claimed "~150 languages" although supported_langs
        # lists ~70 (consistent with the README); also added the missing
        # newlines so the bullet list prints on separate lines.
        more_details = (
            "A simple pipeline model for multilingual translation. "
            "Uses machine translation to translate input into English, "
            "then performs summarization in English before translating results "
            "back to the original language.\n"
            "Strengths: \n - Massively multilingual: supports ~70 languages\n"
            "Weaknesses: \n - Information loss from translation to and from English\n"
            "Initialization arguments: \n "
            " - model_backend: the monolingual model to use for summarization. Defaults to BART\n"
            # TODO: if change to Pegasus, change this to reflect that!!
            " - `device = 'cpu'` specifies the device the model is stored on and uses for computation. "
            "Use `device='cuda'` to run on an Nvidia GPU."
        )
        print(f"{basic_description} \n {'#'*20} \n {more_details}")