From bf4877222da98d749833e36d5470e051b7c4d361 Mon Sep 17 00:00:00 2001
From: "ALEX5739\\amalien"
Date: Fri, 4 Dec 2020 13:57:14 +0100
Subject: [PATCH] Update requirements for transformers version

Update requirements for the transformers version due to an installation
issue. Update the requirements for the benchmarks script with pinned
versions and missing packages, limit the need for loading spaCy when
using the transformer models, and fix an incomplete code snippet in the
docs.
---
 danlp/models/bert_models.py                     | 4 ++--
 danlp/models/spacy_models.py                    | 7 ++++---
 docs/docs/frameworks/transformers.md            | 1 +
 examples/benchmarks/requirements_benchmarks.txt | 4 +++-
 requirements.txt                                | 2 +-
 5 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/danlp/models/bert_models.py b/danlp/models/bert_models.py
index 412efad..2929f80 100644
--- a/danlp/models/bert_models.py
+++ b/danlp/models/bert_models.py
@@ -175,7 +175,7 @@ def _classes(self):
 
     def _get_pred(self, tokenizer, model, max_lenght, sentence):
         input1 = tokenizer.encode_plus(sentence, add_special_tokens=True, return_tensors='pt',
-                                       max_length=max_lenght, return_overflowing_tokens=True)
+                                       max_length=max_lenght, truncation=True, return_overflowing_tokens=True)
         if 'overflowing_tokens' in input1:
             warnings.warn('Maximum length for sequence exceeded, truncation may result in unexpected results. Consider running the model on a shorter sequenze then {} tokens'.format(max_lenght))
         pred = model(input1['input_ids'], token_type_ids=input1['token_type_ids'])[0]
@@ -298,7 +298,7 @@ def _classes(self):
 
     def _get_pred(self, tokenizer, model, max_lenght, sentence):
         input1 = tokenizer.encode_plus(sentence, add_special_tokens=True, return_tensors='pt',
-                                       max_length=max_lenght, return_overflowing_tokens=True)
+                                       max_length=max_lenght, truncation=True, return_overflowing_tokens=True)
         if 'overflowing_tokens' in input1:
             warnings.warn('Maximum length for sequence exceeded, truncation may result in unexpected results. Consider running the model on a shorter sequenze then {} tokens'.format(max_lenght))
         pred = model(input1['input_ids'], token_type_ids=input1['token_type_ids'])[0]
diff --git a/danlp/models/spacy_models.py b/danlp/models/spacy_models.py
index b1e6c8c..4e19df3 100644
--- a/danlp/models/spacy_models.py
+++ b/danlp/models/spacy_models.py
@@ -3,7 +3,7 @@
 from danlp.download import DEFAULT_CACHE_DIR, download_model, \
     _unzip_process_func
-from spacy.tokens import Doc
+
 
 
 def load_spacy_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False, textcat=None, vectorError=False):
     """
@@ -71,8 +71,9 @@ class SpacyChunking:
     :param str cache_dir: the directory for storing cached models
     :param bool verbose: `True` to increase verbosity
     """
-
+
     def __init__(self, model=None, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
+
         if model == None:
             self.model = load_spacy_model(cache_dir=cache_dir, verbose=verbose)
         else:
@@ -105,7 +106,7 @@ def predict(self, text: Union[str, List[str]], bio=True):
             return get_noun_chunks(doc, bio=bio)
 
         if isinstance(text, list):
-
+            from spacy.tokens import Doc
             parser = self.model.parser
             tagger = self.model.tagger
diff --git a/docs/docs/frameworks/transformers.md b/docs/docs/frameworks/transformers.md
index cedb717..49eadd0 100644
--- a/docs/docs/frameworks/transformers.md
+++ b/docs/docs/frameworks/transformers.md
@@ -28,6 +28,7 @@ A pytorch version of the [Danish BERT](https://github.com/botxo/nordic_bert) tr
 For **predicting a masked word** in a sentence, you can after downloading the model through DaNLP, use the transformer library directly as described in the following snippet:
 
 ```python
+from transformers import pipeline
 from danlp.models import load_bert_base_model
 # load the BERT model
 model = load_bert_base_model()
diff --git a/examples/benchmarks/requirements_benchmarks.txt b/examples/benchmarks/requirements_benchmarks.txt
index 8cf6501..384d13d 100644
--- a/examples/benchmarks/requirements_benchmarks.txt
+++ b/examples/benchmarks/requirements_benchmarks.txt
@@ -4,13 +4,15 @@ gensim==3.8.1
 flair==0.4.5
 pyconll==2.2.1
 pandas==1.0.1
-transformers==2.3.0
+transformers==3.1.0
+torch==1.6.0
 srsly==1.0.2
 sentida==0.5.0
 Morfessor==2.0.6
 PyICU==2.4.2
 pycld2==0.41
 sentida==0.5.0
+nltk
 afinn
 polyglot
 seqeval
diff --git a/requirements.txt b/requirements.txt
index 28cc440..2b10395 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,6 +6,6 @@ flair==0.4.5
 pyconll==2.2.1
 conllu==0.11
 pandas==1.0.1
-transformers==3.5.1
+transformers==3.1.0
 srsly==1.0.2
 tweepy
\ No newline at end of file
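Note on the `truncation=True` change: in `transformers` 3.x, calling `encode_plus` with a `max_length` but no explicit truncation strategy emits a `FutureWarning`, which appears to be why both patched `_get_pred` methods now pass the flag. The sketch below mirrors the patched call outside DaNLP under the pinned `transformers==3.1.0`; the `bert-base-multilingual-cased` checkpoint and the tiny `max_length` are illustrative assumptions, not part of this patch.

```python
# Minimal sketch of the patched encode_plus call, assuming transformers==3.1.0.
# The checkpoint and max_length below are illustrative choices only.
import warnings

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
max_length = 8  # deliberately small so the sentence overflows

encoded = tokenizer.encode_plus(
    'En meget lang dansk sætning der ikke kan stå i otte tokens.',
    add_special_tokens=True,
    return_tensors='pt',
    max_length=max_length,
    truncation=True,                 # explicit strategy; omitting it warns in 3.x
    return_overflowing_tokens=True,  # exposes the tokens cut off by truncation
)

# Same check the patched _get_pred uses: the key is only present on overflow
if 'overflowing_tokens' in encoded:
    warnings.warn('Input longer than {} tokens was truncated.'.format(max_length))
```

Pinning `transformers==3.1.0` in both requirements files keeps the package and the benchmark scripts on the same API, so the call above behaves identically in both contexts.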