Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add translation pipeline model #110

Merged
merged 12 commits into from Nov 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Expand Up @@ -85,8 +85,9 @@ SummerTime supports different models (e.g., TextRank, BART, Longformer) as well
| HMNetModel | | | :heavy_check_mark: | | |
| LexRankModel | :heavy_check_mark: | | | | |
| LongformerModel | :heavy_check_mark: | | | | |
| MBartModel | :heavy_check_mark: | | | | 50 languages (Arabic, Czech, German, English, Spanish, Estonian, Finnish, French, Gujarati, Hindi, Italian, Japanese, Kazakh, Korean, Lithuanian, Latvian, Burmese, Nepali, Dutch, Romanian, Russian, Sinhala, Turkish, Vietnamese, Chinese, Afrikaans, Azerbaijani, Bengali, Persian, Hebrew, Croatian, Indonesian, Georgian, Khmer, Macedonian, Malayalam, Mongolian, Marathi, Polish, Pashto, Portuguese, Swedish, Tamil, Telugu, Thai, Tagalog, Ukrainian, Urdu, Xhosa, Slovenian) |
| MBartModel | :heavy_check_mark: | | | | 50 languages (full list [here](https://huggingface.co/facebook/mbart-large-50)) |
| MT5Model | :heavy_check_mark: | | | | 101 languages (full list [here](https://github.com/google-research/multilingual-t5#readme)) |
| TranslationPipelineModel | :heavy_check_mark: | | | | ~70 languages |
| MultiDocJointModel | | :heavy_check_mark: | | |
| MultiDocSeparateModel | | :heavy_check_mark: | | |
| PegasusModel | :heavy_check_mark: | | | |
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Expand Up @@ -24,6 +24,7 @@ mpi4py==3.0.3
tqdm==4.49.0
tensorboard==2.4.1
fasttext==0.9.2
easynmt==2.0.1
black
flake8
progressbar
1 change: 1 addition & 0 deletions setup.py
Expand Up @@ -43,6 +43,7 @@
"tqdm~=4.49.0",
"tensorboard~=2.4.1",
"fasttext~=0.9.2",
"easynmt~=2.0.1",
"black",
"flake8",
"progressbar",
Expand Down
2 changes: 2 additions & 0 deletions summertime/model/__init__.py
Expand Up @@ -6,6 +6,7 @@
PegasusModel,
TextRankModel,
MT5Model,
TranslationPipelineModel,
)
from .multi_doc import MultiDocJointModel, MultiDocSeparateModel
from .dialogue import HMNetModel, FlattenDialogueModel
Expand All @@ -16,6 +17,7 @@
BartModel,
MBartModel,
MT5Model,
TranslationPipelineModel,
LexRankModel,
LongformerModel,
PegasusModel,
Expand Down
1 change: 1 addition & 0 deletions summertime/model/single_doc/__init__.py
Expand Up @@ -4,5 +4,6 @@
from .longformer_model import LongformerModel
from .textrank_model import TextRankModel

from .multilingual import TranslationPipelineModel
from .multilingual import MBartModel
from .multilingual import MT5Model
1 change: 1 addition & 0 deletions summertime/model/single_doc/multilingual/__init__.py
@@ -1,2 +1,3 @@
from .mbart_model import MBartModel
from .mt5_model import MT5Model
from .translation_pipeline_model import TranslationPipelineModel
58 changes: 34 additions & 24 deletions summertime/model/single_doc/multilingual/base_multilingual_model.py
Expand Up @@ -3,7 +3,39 @@
get_cached_file_path,
)
import fasttext
from typing import Dict, List, Tuple
from typing import List, Union, Dict, Tuple


def fasttext_predict(corpus: Union[List[str], List[List[str]]]) -> str:
    """
    Predict the language of the input text using the fasttext
    lid.176 language-identification classifier.

    :param corpus: either a flat list of document strings, or a list of
        documents each given as a list of strings. For the nested case,
        only the first document is used for detection (the corpus is
        assumed to be monolingual).
    :return: the predicted language code (e.g. ``"en"``), i.e. the most
        likely fasttext label with its ``__label__`` prefix stripped.
    """
    # Memoize the loaded classifier on the function object: the download is
    # cached on disk by get_cached_file_path, but fasttext.load_model itself
    # is expensive and was previously re-run on every call.
    classifier = getattr(fasttext_predict, "_classifier", None)
    if classifier is None:
        url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz"
        filepath = get_cached_file_path("fasttext", "lid.176.ftz", url)

        # silence fasttext's noisy load-time warning output
        fasttext.FastText.eprint = lambda x: None
        classifier = fasttext.load_model(str(filepath))
        fasttext_predict._classifier = classifier

    # fasttext returns a tuple of 2 lists:
    # the first list contains a list of predicted language labels
    # of the form {__label__<lang_code>}
    # and the second list contains the corresponding probabilities
    if corpus and all(isinstance(ins, list) for ins in corpus):
        # nested corpus: detect on the first document only
        prediction = classifier.predict(corpus[0])
    else:
        prediction = classifier.predict(corpus)

    # access the first (most likely) predicted language label
    label = prediction[0][0][0]

    # remove prefix from label string to get language code
    return label.replace("__label__", "")


class MultilingualSummModel(SingleDocSummModel):
Expand All @@ -30,29 +62,7 @@ def assert_summ_input_type(cls, corpus, query):

super().assert_summ_input_type(corpus, query)

url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz"

filepath = get_cached_file_path("fasttext", "lid.176.ftz", url)

fasttext.FastText.eprint = lambda x: None
classifier = fasttext.load_model(str(filepath))

# fasttext returns a tuple of 2 lists:
# the first list contains a list of predicted language labels
# of the form {__label__<lang_code>}
# and the second list contains the corresponding probabilities
prediction: Tuple[List[List[str]], List] = None
if all([isinstance(ins, list) for ins in corpus]):
prediction = classifier.predict(corpus[0])

elif isinstance(corpus, list):
prediction = classifier.predict(corpus)

# access the first (most likely) predicted language label
label = prediction[0][0][0]

# remove prefix from label string to get language code
label = label.replace("__label__", "")
label = fasttext_predict(corpus)

# check if language code is in the supported language dictionary
if label in cls.lang_tag_dict:
Expand Down
148 changes: 148 additions & 0 deletions summertime/model/single_doc/multilingual/translation_pipeline_model.py
@@ -0,0 +1,148 @@
from .base_multilingual_model import MultilingualSummModel, fasttext_predict
from summertime.model.base_model import SummModel
from summertime.model.single_doc import BartModel

from easynmt import EasyNMT


class TranslationPipelineModel(MultilingualSummModel):
    """
    A class for multilingual summarization performed by first
    translating into English then performing summarization in English.
    """

    model_name = "Translation Pipeline"
    is_multilingual = True
    # TODO: change to Pegasus as default?
    # language codes from https://github.com/UKPLab/EasyNMT#Opus-MT documentation
    # language codes not supported by https://fasttext.cc/docs/en/language-identification.html
    # are removed
    supported_langs = [
        "aed",
        "af",
        "am",
        "ar",
        "az",
        "bat",
        "bcl",
        "be",
        "bg",
        "bn",
        "ca",
        "ceb",
        "cs",
        "cy",
        "da",
        "de",
        "el",
        "en",
        "eo",
        "es",
        "et",
        "eu",
        "fi",
        "fr",
        "ga",
        "gl",
        "gv",
        "he",
        "hi",
        "hr",
        "ht",
        "hu",
        "hy",
        "id",
        "ilo",
        "is",
        "it",
        "ja",
        "ka",
        "ko",
        "lt",
        "lv",
        "mg",
        "mk",
        "ml",
        "mr",
        "ms",
        "mt",
        "nl",
        "no",
        "pa",
        "pl",
        "pt",
        "ro",
        "ru",
        "run",
        "sh",
        "sk",
        "sl",
        "sq",
        "sv",
        "sw",
        "th",
        "tl",
        "tr",
        "uk",
        "ur",
        "vi",
        "wa",
        "war",
        "yo",
        "zh",
    ]

    # Translation uses the same code on input and output, so the
    # tag dict maps each language code to itself.
    lang_tag_dict = {lang: lang for lang in supported_langs}

    def __init__(self, model_backend: SummModel = BartModel, **kwargs):
        """
        :param model_backend: the monolingual summarization model class used
            after translation to English. Defaults to BartModel.
        :param kwargs: forwarded to the backend model's constructor.
        """
        model: SummModel = model_backend(**kwargs)
        self.model = model

        super().__init__(
            trained_domain=self.model.trained_domain,
            max_input_length=self.model.max_input_length,
            max_output_length=self.model.max_output_length,
        )

        # translation module
        self.translator = EasyNMT("opus-mt")

    def summarize(self, corpus, queries=None):
        """
        Translate the corpus (and queries, if any) into English, summarize
        with the English backend model, then translate the summaries back
        into the detected source language.

        :param corpus: input document(s) in any supported language.
        :param queries: optional queries, assumed to be in the same
            language as the corpus.
        :return: summaries translated back into the source language.
        """
        self.assert_summ_input_type(corpus, queries)

        src_lang = fasttext_predict(corpus)
        # translate to English
        corpus = self.translator.translate(
            corpus, source_lang=src_lang, target_lang="en", beam_size=4
        )
        # TODO: translate each doc separately if provided multiple docs in corpus?
        if queries:
            # NOTE(review): source_lang is omitted here, so EasyNMT
            # auto-detects it per query rather than reusing src_lang.
            queries = self.translator.translate(queries, target_lang="en", beam_size=4)

        # summarize in English
        english_summaries = self.model.summarize(corpus, queries)

        # translate summaries back into the original (detected) language
        summaries = self.translator.translate(
            english_summaries, source_lang="en", target_lang=src_lang, beam_size=4
        )

        return summaries

    @classmethod
    def show_capability(cls) -> None:
        """Print a human-readable description of this model's capabilities."""
        basic_description = cls.generate_basic_description()
        # Fixed: previously claimed "~150 languages" although supported_langs
        # lists ~70 (consistent with the README); also added the missing
        # newlines so the bullet list prints on separate lines.
        more_details = (
            "A simple pipeline model for multilingual translation. "
            "Uses machine translation to translate input into English, "
            "then performs summarization in English before translating results "
            "back to the original language.\n"
            "Strengths: \n - Massively multilingual: supports ~70 languages\n"
            "Weaknesses: \n - Information loss from translation to and from English\n"
            "Initialization arguments: \n "
            " - model_backend: the monolingual model to use for summarization. Defaults to BART\n"
            # TODO: if change to Pegasus, change this to reflect that!!
            " - `device = 'cpu'` specifies the device the model is stored on and uses for computation. "
            "Use `device='cuda'` to run on an Nvidia GPU."
        )
        print(f"{basic_description} \n {'#'*20} \n {more_details}")