diff --git a/Dockerfile b/Dockerfile index 36c3e9a..396daba 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,7 +18,7 @@ WORKDIR /work COPY requirements.in . RUN pip install -r requirements.in -RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz +RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz RUN python -m spacy download en_core_web_sm RUN python -m spacy download en_core_web_md diff --git a/README.md b/README.md index 8501b41..210b36c 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ pip install scispacy to install a model (see our full selection of available models below), run a command like the following: ```bash -pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz +pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz ``` Note: We strongly recommend that you use an isolated Python environment (such as virtualenv or conda) to install scispacy. @@ -76,14 +76,14 @@ pip install CMD-V(to paste the copied URL) | Model | Description | Install URL |:---------------|:------------------|:----------| -| en_core_sci_sm | A full spaCy pipeline for biomedical data with a ~100k vocabulary. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz)| -| en_core_sci_md | A full spaCy pipeline for biomedical data with a ~360k vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz)| -| en_core_sci_lg | A full spaCy pipeline for biomedical data with a ~785k vocabulary and 600k word vectors. 
|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz)| -| en_core_sci_scibert | A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. You may want to [use a GPU](https://spacy.io/usage#gpu) with this model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_scibert-0.4.0.tar.gz)| -| en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_craft_md-0.4.0.tar.gz)| -| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_jnlpba_md-0.4.0.tar.gz)| -| en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bc5cdr_md-0.4.0.tar.gz)| -| en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bionlp13cg_md-0.4.0.tar.gz)| +| en_core_sci_sm | A full spaCy pipeline for biomedical data with a ~100k vocabulary. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz)| +| en_core_sci_md | A full spaCy pipeline for biomedical data with a ~360k vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_md-0.5.0.tar.gz)| +| en_core_sci_lg | A full spaCy pipeline for biomedical data with a ~785k vocabulary and 600k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_lg-0.5.0.tar.gz)| +| en_core_sci_scibert | A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. 
You may want to [use a GPU](https://spacy.io/usage#gpu) with this model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_scibert-0.5.0.tar.gz)| +| en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_craft_md-0.5.0.tar.gz)| +| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_jnlpba_md-0.5.0.tar.gz)| +| en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz)| +| en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bionlp13cg_md-0.5.0.tar.gz)| ## Additional Pipeline Components diff --git a/configs/base_ner.cfg b/configs/base_ner.cfg index 84b2ca5..00b0506 100644 --- a/configs/base_ner.cfg +++ b/configs/base_ner.cfg @@ -1,3 +1,6 @@ +[vars] +include_static_vectors = null + [paths] vectors = null init_tok2vec = null @@ -31,26 +34,26 @@ moves = null update_with_oracle_cut_size = 100 [components.ner.model] -@architectures = "spacy.TransitionBasedParser.v1" +@architectures = "spacy.TransitionBasedParser.v2" state_type = "ner" extra_state_tokens = false -hidden_width = 64 -maxout_pieces = 2 +hidden_width = 128 +maxout_pieces = 3 use_upper = true nO = null [components.ner.model.tok2vec] -@architectures = "spacy.Tok2Vec.v1" +@architectures = "spacy.Tok2Vec.v2" [components.ner.model.tok2vec.embed] -@architectures = "spacy.MultiHashEmbed.v1" +@architectures = "spacy.MultiHashEmbed.v2" width = 96 -attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] -rows = [5000, 2500, 2500, 2500] -include_static_vectors = true +attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "SPACY"] +rows = [5000, 2500, 2500, 2500, 100] 
+include_static_vectors = ${vars.include_static_vectors} [components.ner.model.tok2vec.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" +@architectures = "spacy.MaxoutWindowEncoder.v2" width = 96 depth = 4 window_size = 1 @@ -82,7 +85,7 @@ dev_corpus = "corpora.dev" train_corpus = "corpora.train" seed = ${system.seed} gpu_allocator = ${system.gpu_allocator} -dropout = 0.2 +dropout = 0.1 accumulate_gradient = 1 patience = 0 max_epochs = 7 diff --git a/configs/base_ner_scibert.cfg b/configs/base_ner_scibert.cfg index 9e80b53..c8b7371 100644 --- a/configs/base_ner_scibert.cfg +++ b/configs/base_ner_scibert.cfg @@ -5,7 +5,7 @@ parser_tagger_path = null vocab_path = null [system] -gpu_allocator = null +gpu_allocator = "pytorch" seed = 0 [nlp] @@ -31,7 +31,7 @@ moves = null update_with_oracle_cut_size = 100 [components.ner.model] -@architectures = "spacy.TransitionBasedParser.v1" +@architectures = "spacy.TransitionBasedParser.v2" state_type = "ner" extra_state_tokens = false hidden_width = 64 @@ -40,17 +40,17 @@ use_upper = true nO = null [components.ner.model.tok2vec] -@architectures = "spacy.Tok2Vec.v1" +@architectures = "spacy.Tok2Vec.v2" [components.ner.model.tok2vec.embed] -@architectures = "spacy.MultiHashEmbed.v1" +@architectures = "spacy.MultiHashEmbed.v2" width = 96 -attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] -rows = [5000, 2500, 2500, 2500] +attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "SPACY"] +rows = [5000, 2500, 2500, 2500, 100] include_static_vectors = false [components.ner.model.tok2vec.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" +@architectures = "spacy.MaxoutWindowEncoder.v2" width = 96 depth = 4 window_size = 1 @@ -83,7 +83,7 @@ dev_corpus = "corpora.dev" train_corpus = "corpora.train" seed = ${system.seed} gpu_allocator = ${system.gpu_allocator} -dropout = 0.2 +dropout = 0.1 accumulate_gradient = 1 patience = 0 max_epochs = 7 diff --git a/configs/base_parser_tagger.cfg b/configs/base_parser_tagger.cfg index 437170c..cec9e47 
100644 --- a/configs/base_parser_tagger.cfg +++ b/configs/base_parser_tagger.cfg @@ -1,3 +1,6 @@ +[vars] +include_static_vectors = null + [paths] genia_train = "project_data/genia_train.spacy" genia_dev = "project_data/genia_dev.spacy" @@ -35,7 +38,7 @@ moves = null update_with_oracle_cut_size = 100 [components.parser.model] -@architectures = "spacy.TransitionBasedParser.v1" +@architectures = "spacy.TransitionBasedParser.v2" state_type = "parser" extra_state_tokens = false hidden_width = 128 @@ -64,17 +67,17 @@ upstream = "*" factory = "tok2vec" [components.tok2vec.model] -@architectures = "spacy.Tok2Vec.v1" +@architectures = "spacy.Tok2Vec.v2" [components.tok2vec.model.embed] -@architectures = "spacy.MultiHashEmbed.v1" +@architectures = "spacy.MultiHashEmbed.v2" width = ${components.tok2vec.model.encode.width} -attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] -rows = [5000, 2500, 2500, 2500] -include_static_vectors = true +attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "SPACY"] +rows = [5000, 2500, 2500, 2500, 100] +include_static_vectors = ${vars.include_static_vectors} [components.tok2vec.model.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" +@architectures = "spacy.MaxoutWindowEncoder.v2" width = 96 depth = 4 window_size = 1 @@ -106,7 +109,7 @@ dev_corpus = "corpora.dev" train_corpus = "corpora.train" seed = ${system.seed} gpu_allocator = ${system.gpu_allocator} -dropout = 0.2 +dropout = 0.1 accumulate_gradient = 1 patience = 0 max_epochs = 20 diff --git a/configs/base_parser_tagger_scibert.cfg b/configs/base_parser_tagger_scibert.cfg index 9e59d4e..016a395 100644 --- a/configs/base_parser_tagger_scibert.cfg +++ b/configs/base_parser_tagger_scibert.cfg @@ -7,7 +7,7 @@ init_tok2vec = null vocab_path = null [system] -gpu_allocator = "pytorch" +gpu_allocator = null seed = 0 [nlp] @@ -36,12 +36,12 @@ moves = null update_with_oracle_cut_size = 100 [components.parser.model] -@architectures = "spacy.TransitionBasedParser.v1" +@architectures = 
"spacy.TransitionBasedParser.v2" state_type = "parser" extra_state_tokens = false hidden_width = 128 maxout_pieces = 3 -use_upper = true +use_upper = false nO = null [components.parser.model.tok2vec] @@ -69,9 +69,10 @@ max_batch_items = 4096 set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"} [components.transformer.model] -@architectures = "spacy-transformers.TransformerModel.v1" +@architectures = "spacy-transformers.TransformerModel.v3" name = "allenai/scibert_scivocab_uncased" tokenizer_config = {"use_fast": true} +mixed_precision = true [components.transformer.model.get_spans] @span_getters = "spacy-transformers.strided_spans.v1" @@ -105,7 +106,7 @@ dev_corpus = "corpora.dev" train_corpus = "corpora.train" seed = ${system.seed} gpu_allocator = ${system.gpu_allocator} -dropout = 0.2 +dropout = 0.1 accumulate_gradient = 1 patience = 0 max_epochs = 8 @@ -120,8 +121,8 @@ get_length = null [training.batcher.size] @schedules = "compounding.v1" -start = 16 -stop = 64 +start = 4 +stop = 12 compound = 1.001 t = 0.0 @@ -157,14 +158,6 @@ ents_r = 0.0 [pretraining] [initialize] -vectors = ${paths.vectors} -init_tok2vec = ${paths.init_tok2vec} -vocab_data = ${paths.vocab_path} -lookups = null - -[initialize.components] - -[initialize.tokenizer] [initialize.before_init] @callbacks = "replace_tokenizer" diff --git a/configs/base_specialized_ner.cfg b/configs/base_specialized_ner.cfg index c6274b7..f083838 100644 --- a/configs/base_specialized_ner.cfg +++ b/configs/base_specialized_ner.cfg @@ -1,3 +1,6 @@ +[vars] +include_static_vectors = null + [paths] vectors = null init_tok2vec = null @@ -33,26 +36,26 @@ moves = null update_with_oracle_cut_size = 100 [components.ner.model] -@architectures = "spacy.TransitionBasedParser.v1" +@architectures = "spacy.TransitionBasedParser.v2" state_type = "ner" extra_state_tokens = false -hidden_width = 64 -maxout_pieces = 2 +hidden_width = 128 +maxout_pieces = 3 use_upper = true nO = null 
[components.ner.model.tok2vec] -@architectures = "spacy.Tok2Vec.v1" +@architectures = "spacy.Tok2Vec.v2" [components.ner.model.tok2vec.embed] -@architectures = "spacy.MultiHashEmbed.v1" +@architectures = "spacy.MultiHashEmbed.v2" width = 96 -attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] -rows = [5000, 2500, 2500, 2500] -include_static_vectors = true +attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "SPACY"] +rows = [5000, 2500, 2500, 2500, 100] +include_static_vectors = ${vars.include_static_vectors} [components.ner.model.tok2vec.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" +@architectures = "spacy.MaxoutWindowEncoder.v2" width = 96 depth = 4 window_size = 1 @@ -82,7 +85,7 @@ dev_corpus = "corpora.dev" train_corpus = "corpora.train" seed = ${system.seed} gpu_allocator = ${system.gpu_allocator} -dropout = 0.2 +dropout = 0.1 accumulate_gradient = 1 patience = 0 max_epochs = 7 diff --git a/docs/index.md b/docs/index.md index 63ab93d..8579f9e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -17,15 +17,14 @@ pip install | Model | Description | Install URL |:---------------|:------------------|:----------| -| en_core_sci_sm | A full spaCy pipeline for biomedical data. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz)| -| en_core_sci_md | A full spaCy pipeline for biomedical data with a larger vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz)| -| en_core_sci_scibert | A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_scibert-0.4.0.tar.gz)| -| en_core_sci_lg | A full spaCy pipeline for biomedical data with a larger vocabulary and 600k word vectors. 
|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz)| -| en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_craft_md-0.4.0.tar.gz)| -| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_jnlpba_md-0.4.0.tar.gz)| -| en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bc5cdr_md-0.4.0.tar.gz)| -| en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bionlp13cg_md-0.4.0.tar.gz)| - +| en_core_sci_sm | A full spaCy pipeline for biomedical data. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz)| +| en_core_sci_md | A full spaCy pipeline for biomedical data with a larger vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_md-0.5.0.tar.gz)| +| en_core_sci_scibert | A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_scibert-0.5.0.tar.gz)| +| en_core_sci_lg | A full spaCy pipeline for biomedical data with a larger vocabulary and 600k word vectors. 
|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_lg-0.5.0.tar.gz)| +| en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_craft_md-0.5.0.tar.gz)| +| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_jnlpba_md-0.5.0.tar.gz)| +| en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz)| +| en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bionlp13cg_md-0.5.0.tar.gz)| @@ -35,18 +34,18 @@ Our models achieve performance within 3% of published state of the art dependenc | model | UAS | LAS | POS | Mentions (F1) | Web UAS | |:---------------|:----|:------|:------|:---|:---| -| en_core_sci_sm | 89.54| 87.62 | 98.32 | 68.15 | 87.62 | -| en_core_sci_md | 89.61| 87.77 | 98.56 | 69.64 | 88.05 | -| en_core_sci_lg | 89.63| 87.81 | 98.56 | 69.61 | 88.08 | -| en_core_sci_scibert | 92.03| 90.25 | 98.91 | 67.91 | 92.21 | +| en_core_sci_sm | 89.27| 87.33 | 98.29 | 68.05 | 87.61 | +| en_core_sci_md | 89.86| 87.92 | 98.43 | 69.32 | 88.05 | +| en_core_sci_lg | 89.54| 87.66 | 98.29 | 69.52 | 87.68 | +| en_core_sci_scibert | 92.28| 90.83 | 98.93 | 67.84 | 92.63 | | model | F1 | Entity Types| |:---------------|:-----|:--------| -| en_ner_craft_md | 76.11|GGP, SO, TAXON, CHEBI, GO, CL| -| en_ner_jnlpba_md | 71.62| DNA, CELL_TYPE, CELL_LINE, RNA, PROTEIN | -| en_ner_bc5cdr_md | 84.49| DISEASE, CHEMICAL| -| en_ner_bionlp13cg_md | 77.75| AMINO_ACID, ANATOMICAL_SYSTEM, CANCER, CELL, CELLULAR_COMPONENT, DEVELOPING_ANATOMICAL_STRUCTURE, GENE_OR_GENE_PRODUCT, IMMATERIAL_ANATOMICAL_ENTITY, MULTI-TISSUE_STRUCTURE, ORGAN, ORGANISM, 
ORGANISM_SUBDIVISION, ORGANISM_SUBSTANCE, PATHOLOGICAL_FORMATION, SIMPLE_CHEMICAL, TISSUE | +| en_ner_craft_md | 78.35|GGP, SO, TAXON, CHEBI, GO, CL| +| en_ner_jnlpba_md | 70.89| DNA, CELL_TYPE, CELL_LINE, RNA, PROTEIN | +| en_ner_bc5cdr_md | 84.70| DISEASE, CHEMICAL| +| en_ner_bionlp13cg_md | 76.79| AMINO_ACID, ANATOMICAL_SYSTEM, CANCER, CELL, CELLULAR_COMPONENT, DEVELOPING_ANATOMICAL_STRUCTURE, GENE_OR_GENE_PRODUCT, IMMATERIAL_ANATOMICAL_ENTITY, MULTI-TISSUE_STRUCTURE, ORGAN, ORGANISM, ORGANISM_SUBDIVISION, ORGANISM_SUBSTANCE, PATHOLOGICAL_FORMATION, SIMPLE_CHEMICAL, TISSUE | ### Example Usage diff --git a/project.yml b/project.yml index b2983a1..2f11d1a 100644 --- a/project.yml +++ b/project.yml @@ -2,7 +2,8 @@ title: "scispaCy pipeline" description: "All the steps needed in the scispaCy pipeline" vars: - version_string: "0.4.0" + version_string: "0.5.0" + gpu_id: "0" freqs_loc_s3: "s3://ai2-s2-scispacy/data/gorc_subset.freqs" freqs_loc_local: "assets/gorc_subset.freqs" vectors_loc_s3: "s3://ai2-s2-scispacy/data/pubmed_with_header.txt.gz" @@ -259,7 +260,7 @@ commands: - name: parser-tagger-train-sm help: "Train the base models" script: - - "spacy train ${vars.parser_tagger_config_loc} --output ${vars.parser_tagger_sm_loc} --code ${vars.code_loc} --paths.vocab_path ${vars.vocab_sm_loc}" + - "spacy train ${vars.parser_tagger_config_loc} --output ${vars.parser_tagger_sm_loc} --code ${vars.code_loc} --paths.vocab_path ${vars.vocab_sm_loc} --vars.include_static_vectors False" deps: - "${vars.parser_tagger_config_loc}" - "${vars.genia_train_spacy_loc}" @@ -272,7 +273,7 @@ commands: - name: parser-tagger-train-md help: "Train the base models" script: - - "spacy train ${vars.parser_tagger_config_loc} --output ${vars.parser_tagger_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.vocab_path ${vars.vocab_md_loc}" + - "spacy train ${vars.parser_tagger_config_loc} --output ${vars.parser_tagger_md_loc} --code ${vars.code_loc} --paths.vectors 
${vars.vectors_md_loc} --paths.vocab_path ${vars.vocab_md_loc} --vars.include_static_vectors True" deps: - "${vars.parser_tagger_config_loc}" - "${vars.genia_train_spacy_loc}" @@ -286,7 +287,7 @@ commands: - name: parser-tagger-train-lg help: "Train the base models" script: - - "spacy train ${vars.parser_tagger_config_loc} --output ${vars.parser_tagger_lg_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_lg_loc} --paths.vocab_path ${vars.vocab_lg_loc}" + - "spacy train ${vars.parser_tagger_config_loc} --output ${vars.parser_tagger_lg_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_lg_loc} --paths.vocab_path ${vars.vocab_lg_loc} --vars.include_static_vectors True" deps: - "${vars.parser_tagger_config_loc}" - "${vars.genia_train_spacy_loc}" @@ -300,7 +301,7 @@ commands: - name: parser-tagger-train-scibert help: "Train the scibert transformer model" script: - - "spacy train ${vars.parser_tagger_scibert_config_loc} --output ${vars.parser_tagger_scibert_loc} --code ${vars.code_loc} --paths.vocab_path ${vars.vocab_lg_loc} --gpu-id 0" + - "spacy train ${vars.parser_tagger_scibert_config_loc} --output ${vars.parser_tagger_scibert_loc} --code ${vars.code_loc} --paths.vocab_path ${vars.vocab_lg_loc} --gpu-id ${vars.gpu_id}" deps: - "${vars.parser_tagger_config_loc}" - "${vars.genia_train_spacy_loc}" @@ -313,7 +314,7 @@ commands: - name: ner-train-sm help: "Train the main ner" script: - - "spacy train ${vars.ner_config_loc} --output ${vars.ner_sm_loc} --code ${vars.code_loc} --paths.parser_tagger_path ${vars.parser_tagger_sm_loc}/model-best --paths.vocab_path ${vars.vocab_sm_loc}" + - "spacy train ${vars.ner_config_loc} --output ${vars.ner_sm_loc} --code ${vars.code_loc} --paths.parser_tagger_path ${vars.parser_tagger_sm_loc}/model-best --paths.vocab_path ${vars.vocab_sm_loc} --vars.include_static_vectors False" deps: - "${vars.ner_config_loc}" - "${vars.parser_tagger_sm_loc}/model-best" @@ -324,7 +325,7 @@ commands: - name: ner-train-md help: "Train the 
main ner" script: - - "spacy train ${vars.ner_config_loc} --output ${vars.ner_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.vocab_path ${vars.vocab_md_loc}" + - "spacy train ${vars.ner_config_loc} --output ${vars.ner_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.vocab_path ${vars.vocab_md_loc} --vars.include_static_vectors True" deps: - "${vars.ner_config_loc}" - "${vars.parser_tagger_md_loc}/model-best" @@ -336,7 +337,7 @@ commands: - name: ner-train-lg help: "Train the main ner" script: - - "spacy train ${vars.ner_config_loc} --output ${vars.ner_lg_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_lg_loc} --paths.parser_tagger_path ${vars.parser_tagger_lg_loc}/model-best --paths.vocab_path ${vars.vocab_lg_loc}" + - "spacy train ${vars.ner_config_loc} --output ${vars.ner_lg_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_lg_loc} --paths.parser_tagger_path ${vars.parser_tagger_lg_loc}/model-best --paths.vocab_path ${vars.vocab_lg_loc} --vars.include_static_vectors True" deps: - "${vars.ner_config_loc}" - "${vars.parser_tagger_lg_loc}/model-best" @@ -348,7 +349,7 @@ commands: - name: ner-train-scibert help: "Train the scibert ner model." 
script: - - "spacy train ${vars.ner_scibert_config_loc} --output ${vars.ner_scibert_loc} --code ${vars.code_loc} --paths.parser_tagger_path ${vars.parser_tagger_scibert_loc}/model-best --gpu-id 0" + - "spacy train ${vars.ner_scibert_config_loc} --output ${vars.ner_scibert_loc} --code ${vars.code_loc} --paths.parser_tagger_path ${vars.parser_tagger_scibert_loc}/model-best --gpu-id ${vars.gpu_id}" deps: - "${vars.ner_config_loc}" - "${vars.parser_tagger_scibert_loc}/model-best" @@ -359,10 +360,10 @@ commands: - name: ner-train-specialized help: "Train the specialized NER models" script: - - "spacy train ${vars.specialized_ner_config_loc} --output ${vars.bc5cdr_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.train_path ${vars.bc5cdr_loc_local}/train.tsv --paths.dev_path ${vars.bc5cdr_loc_local}/devel.tsv --paths.vocab_path ${vars.vocab_md_loc}" - - "spacy train ${vars.specialized_ner_config_loc} --output ${vars.bionlp13cg_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.train_path ${vars.bionlp13cg_loc_local}/train.tsv --paths.dev_path ${vars.bionlp13cg_loc_local}/devel.tsv --paths.vocab_path ${vars.vocab_md_loc}" - - "spacy train ${vars.specialized_ner_config_loc} --output ${vars.craft_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.train_path ${vars.craft_loc_local}/train.tsv --paths.dev_path ${vars.craft_loc_local}/devel.tsv --paths.vocab_path ${vars.vocab_md_loc}" - - "spacy train ${vars.specialized_ner_config_loc} --output ${vars.jnlpba_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.train_path ${vars.jnlpba_loc_local}/train.tsv --paths.dev_path ${vars.jnlpba_loc_local}/devel.tsv 
--paths.vocab_path ${vars.vocab_md_loc}" + - "spacy train ${vars.specialized_ner_config_loc} --output ${vars.bc5cdr_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.train_path ${vars.bc5cdr_loc_local}/train.tsv --paths.dev_path ${vars.bc5cdr_loc_local}/devel.tsv --paths.vocab_path ${vars.vocab_md_loc} --vars.include_static_vectors True" + - "spacy train ${vars.specialized_ner_config_loc} --output ${vars.bionlp13cg_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.train_path ${vars.bionlp13cg_loc_local}/train.tsv --paths.dev_path ${vars.bionlp13cg_loc_local}/devel.tsv --paths.vocab_path ${vars.vocab_md_loc} --vars.include_static_vectors True" + - "spacy train ${vars.specialized_ner_config_loc} --output ${vars.craft_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.train_path ${vars.craft_loc_local}/train.tsv --paths.dev_path ${vars.craft_loc_local}/devel.tsv --paths.vocab_path ${vars.vocab_md_loc} --vars.include_static_vectors True" + - "spacy train ${vars.specialized_ner_config_loc} --output ${vars.jnlpba_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.train_path ${vars.jnlpba_loc_local}/train.tsv --paths.dev_path ${vars.jnlpba_loc_local}/devel.tsv --paths.vocab_path ${vars.vocab_md_loc} --vars.include_static_vectors True" deps: - "${vars.corpus_pubtator_loc_local}" - "${vars.bc5cdr_loc_local}/train.tsv" @@ -427,8 +428,8 @@ commands: - name: evaluate-parser-tagger-scibert help: "Evaluate the parser and tagger scibert model" script: - - "spacy evaluate ${vars.parser_tagger_scibert_loc}/model-best ${vars.genia_test_spacy_loc} --output 
${vars.parser_tagger_scibert_loc}/model_best_results.json" - - "spacy evaluate ${vars.parser_tagger_scibert_loc}/model-best ${vars.ontonotes_test_spacy_loc} --output ${vars.parser_tagger_scibert_loc}/model_best_results_onto.json" + - "spacy evaluate ${vars.parser_tagger_scibert_loc}/model-best ${vars.genia_test_spacy_loc} --output ${vars.parser_tagger_scibert_loc}/model_best_results.json --gpu-id ${vars.gpu_id}" + - "spacy evaluate ${vars.parser_tagger_scibert_loc}/model-best ${vars.ontonotes_test_spacy_loc} --output ${vars.parser_tagger_scibert_loc}/model_best_results_onto.json --gpu-id ${vars.gpu_id}" deps: - "${vars.parser_tagger_scibert_loc}/model-best" - "${vars.genia_test_spacy_loc}" @@ -470,7 +471,7 @@ commands: - name: evaluate-ner-scibert help: "Evaluate NER scibert" script: - - "python scripts/evaluate_ner.py --model_path ${vars.ner_scibert_loc}/model-best --dataset medmentions-test --output ${vars.ner_scibert_loc}/model_best_results.json --med_mentions_folder_path assets/" + - "python scripts/evaluate_ner.py --model_path ${vars.ner_scibert_loc}/model-best --dataset medmentions-test --output ${vars.ner_scibert_loc}/model_best_results.json --med_mentions_folder_path assets/ --gpu_id ${vars.gpu_id}" deps: - "${vars.ner_scibert_loc}" - "${vars.corpus_pubtator_loc_local}" @@ -578,9 +579,9 @@ commands: - name: evaluate-package-scibert help: "Evaluate the packaged scibert model" script: - - "spacy evaluate ${vars.package_scibert_loc} ${vars.genia_test_spacy_loc} --output packages/scibert_genia_results.json" - - "spacy evaluate ${vars.package_scibert_loc} ${vars.ontonotes_test_spacy_loc} --output packages/scibert_onto_results.json" - - "python scripts/evaluate_ner.py --model_path ${vars.package_scibert_loc} --dataset medmentions-test --output packages/scibert_mm_results.json --med_mentions_folder_path assets/" + - "spacy evaluate ${vars.package_scibert_loc} ${vars.genia_test_spacy_loc} --output packages/scibert_genia_results.json --gpu-id ${vars.gpu_id}" + - 
"spacy evaluate ${vars.package_scibert_loc} ${vars.ontonotes_test_spacy_loc} --output packages/scibert_onto_results.json --gpu-id ${vars.gpu_id}" - "python scripts/evaluate_ner.py --model_path ${vars.package_scibert_loc} --dataset medmentions-test --output packages/scibert_mm_results.json --med_mentions_folder_path assets/ --gpu_id ${vars.gpu_id}" deps: - "${vars.package_scibert_loc}" outputs: diff --git a/requirements.in b/requirements.in index b5970a4..8351a3a 100644 --- a/requirements.in +++ b/requirements.in @@ -1,5 +1,5 @@ numpy -spacy>=3.0.0,<3.1.0 +spacy>=3.2.0,<3.3.0 spacy-lookups-data pandas requests>=2.0.0,<3.0.0 @@ -15,7 +15,7 @@ pytest pytest-cov flake8 # black currently pinned because of a dependency issue with spacy, typer, and click -black<=21.12b0 +black mypy types-requests diff --git a/scispacy/abbreviation.py b/scispacy/abbreviation.py index cf5e4aa..e2ac595 100644 --- a/scispacy/abbreviation.py +++ b/scispacy/abbreviation.py @@ -229,7 +229,7 @@ def find_matches_for( to_remove = set() global_matches = self.global_matcher(doc) for match, start, end in global_matches: - string_key = self.global_matcher.vocab.strings[match] + string_key = self.global_matcher.vocab.strings[match] # type: ignore to_remove.add(string_key) all_occurences[rules[string_key]].add(doc[start:end]) for key in to_remove: diff --git a/scispacy/hearst_patterns.py b/scispacy/hearst_patterns.py index c13446f..992b7a4 100644 --- a/scispacy/hearst_patterns.py +++ b/scispacy/hearst_patterns.py @@ -1,3 +1,5 @@ +from typing import List, Dict, Any + """ BSD 3-Clause License @@ -35,7 +37,7 @@ punct = {"IS_PUNCT": True, "OP": "?"} det = {"ORTH": "*", "OP": "*"} -BASE_PATTERNS = [ +BASE_PATTERNS: List[Dict[str, Any]] = [ # '(NP_\\w+ (, )?such as (NP_\\w+ ?(, )?(and |or )?)+)', 'first' { "label": "such_as", diff --git a/scispacy/version.py b/scispacy/version.py index 65ece88..c8b11da 100644 --- a/scispacy/version.py +++ b/scispacy/version.py @@ -1,5 +1,5 @@ _MAJOR = "0" -_MINOR = "4" 
+_MINOR = "5" _REVISION = "0" VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR) diff --git a/scripts/evaluate_ner.py b/scripts/evaluate_ner.py index 707b50d..cbe6add 100644 --- a/scripts/evaluate_ner.py +++ b/scripts/evaluate_ner.py @@ -4,14 +4,19 @@ import spacy import importlib +from thinc.api import require_gpu + from scispacy.data_util import read_full_med_mentions, read_ner_from_tsv from scispacy.train_utils import evaluate_ner -def main(model_path: str, dataset: str, output_path: str, code: Optional[str], med_mentions_folder_path: Optional[str]): +def main(model_path: str, dataset: str, output_path: str, code: Optional[str], med_mentions_folder_path: Optional[str], gpu_id: Optional[int]): + if gpu_id is not None and gpu_id >= 0: + require_gpu(gpu_id) + if code is not None: # need to import code before loading a spacy model - spec = importlib.util.spec_from_file_location(name, str(loc)) + spec = importlib.util.spec_from_file_location("python_code", str(code)) module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) @@ -40,6 +45,7 @@ def main(model_path: str, dataset: str, output_path: str, code: Optional[str], m parser.add_argument("--output_path", type=str, help="Path to write results to") parser.add_argument("--code", type=str, default=None, help="Path to code to import before loading spacy model") parser.add_argument("--med_mentions_folder_path", type=str, default=None, help="Path to the med mentions folder") + parser.add_argument("--gpu_id", type=int, default=-1, help="GPU id to use") args = parser.parse_args() - main(args.model_path, args.dataset, args.output_path, args.code, args.med_mentions_folder_path) \ No newline at end of file + main(args.model_path, args.dataset, args.output_path, args.code, args.med_mentions_folder_path, args.gpu_id) \ No newline at end of file diff --git a/setup.py b/setup.py index b9c0d5d..c972d5c 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ packages=find_packages(exclude=["*.tests", "*.tests.*", 
"tests.*", "tests"]), license="Apache", install_requires=[ - "spacy>=3.0.0,<3.1.0", + "spacy>=3.2.0,<3.3.0", "requests>=2.0.0,<3.0.0", "conllu", "numpy", diff --git a/tests/custom_tests/test_custom_segmentation.py b/tests/custom_tests/test_custom_segmentation.py index 83e9e02..4d4838d 100644 --- a/tests/custom_tests/test_custom_segmentation.py +++ b/tests/custom_tests/test_custom_segmentation.py @@ -1,51 +1,260 @@ import pytest TEST_CASES = [ - ("LSTM networks, which we preview in Sec. 2, have been successfully", ["LSTM networks, which we preview in Sec. 2, have been successfully"]), - ("When the tree is simply a chain, both Eqs. 2–8 and Eqs. 9–14 reduce to the standard LSTM transitions, Eqs. 1.", ["When the tree is simply a chain, both Eqs. 2–8 and Eqs. 9–14 reduce to the standard LSTM transitions, Eqs. 1."]), - ("We used fluorescence time-lapse microscopy (Fig. 1D; fig. S1 and movies S1 and S2) and computational", ["We used fluorescence time-lapse microscopy (Fig. 1D; fig. S1 and movies S1 and S2) and computational"]), - ("Hill functions indeed fit the data well (Fig. 3A and Table 1).", ["Hill functions indeed fit the data well (Fig. 3A and Table 1)."]), - ('In order to produce sentence representations that fully capture the semantics of natural language, order-insensitive models are insufficient due to their inability to account for differences in meaning as a result of differences in word order or syntactic structure (e.g., “cats climb trees” vs. “trees climb cats”).', ['In order to produce sentence representations that fully capture the semantics of natural language, order-insensitive models are insufficient due to their inability to account for differences in meaning as a result of differences in word order or syntactic structure (e.g., “cats climb trees” vs. “trees climb cats”).']), - ("There is an average exact sparsity (fraction of zeros) of the hidden layers of 83.40% on MNIST and 72.00% on CIFAR10. 
Figure 3 (left) provides a better understanding of the influence of sparsity.", ["There is an average exact sparsity (fraction of zeros) of the hidden layers of 83.40% on MNIST and 72.00% on CIFAR10.", "Figure 3 (left) provides a better understanding of the influence of sparsity."]), - ("Sparsity has become a concept of interest, not only in computational neuroscience and machine learning but also in statistics and signal processing (Candes and Tao, 2005). It was first introduced in computational neuroscience in the context of sparse coding in the visual system (Olshausen and Field, 1997).", ["Sparsity has become a concept of interest, not only in computational neuroscience and machine learning but also in statistics and signal processing (Candes and Tao, 2005).", "It was first introduced in computational neuroscience in the context of sparse coding in the visual system (Olshausen and Field, 1997)."]), - ("1) The first item. 2) The second item.", ["1) The first item.", "2) The second item."]), - ("two of these stages (in areas V1 and V2 of visual cortex) (Lee et al., 2008), and that they", ["two of these stages (in areas V1 and V2 of visual cortex) (Lee et al., 2008), and that they"]), - pytest.param("all neu-\nrons fire at", ["all neu-\nrons fire at"], marks=pytest.mark.xfail), - ("the support of the Defense Advanced Resarch Projects Agency (DARPA) Deep Exploration and Filtering of Text (DEFT) Program under Air Force Research Laboratory (AFRL) contract", ["the support of the Defense Advanced Resarch Projects Agency (DARPA) Deep Exploration and Filtering of Text (DEFT) Program under Air Force Research Laboratory (AFRL) contract"]), - ("While proprietary environments such as Microsoft Robotics Studio [9] and Webots [10] have many commendable attributes, we feel there is no substitute for a fully open platform.", ["While proprietary environments such as Microsoft Robotics Studio [9] and Webots [10] have many commendable attributes, we feel there is no substitute for 
a fully open platform."]), - ("We first produce sentence representations hL and hR for each sentence in the pair using a Tree-LSTM model over each sentence’s parse tree.", ["We first produce sentence representations hL and hR for each sentence in the pair using a Tree-LSTM model over each sentence’s parse tree."]), - ("LSTM networks, which we review in Sec. 2, have been successfully applied to a variety of sequence modeling and prediction tasks, notably machine translation (Bahdanau et al., 2014; Sutskever et al., 2014), speech recognition (Graves et al., 2013), image caption generation (Vinyals et al., 2014), and program execution (Zaremba and Sutskever, 2014).", ["LSTM networks, which we review in Sec. 2, have been successfully applied to a variety of sequence modeling and prediction tasks, notably machine translation (Bahdanau et al., 2014; Sutskever et al., 2014), speech recognition (Graves et al., 2013), image caption generation (Vinyals et al., 2014), and program execution (Zaremba and Sutskever, 2014)."]), - ("1 Introduction\n\nMost models for distributed representations of phrases and sentences—that is, models where realvalued vectors are used to represent meaning—fall into one of three classes: bag-of-words models, sequence models, and tree-structured models.", ["1 Introduction\n\n", "Most models for distributed representations of phrases and sentences—that is, models where realvalued vectors are used to represent meaning—fall into one of three classes: bag-of-words models, sequence models, and tree-structured models."]), - ("In this section, we will elaborate these philosophies and shows how they influenced the design and implementation of ROS.\n\nA. Peer-to-Peer\n\nA system built using ROS consists of a number of processes, potentially on a number of different", ["In this section, we will elaborate these philosophies and shows how they influenced the design and implementation of ROS.\n\n", "A. 
Peer-to-Peer\n\n", "A system built using ROS consists of a number of processes, potentially on a number of different"]), - ("\n\n2 Long Short-Term Memory Networks\n\n\n\n2.1 Overview\n\nRecurrent neural networks (RNNs) are able to process input sequences of arbitrary length via the recursive application of a transition function on a hidden state vector ht.", ["\n\n2 Long Short-Term Memory Networks\n\n\n\n", "2.1 Overview\n\n", "Recurrent neural networks (RNNs) are able to process input sequences of arbitrary length via the recursive application of a transition function on a hidden state vector ht."]), - ("In order to address all three aspects, it is necessary to observe gene regulation in individual cells over time. Therefore, we built Bl-cascade[ strains of Escherichia coli, containing the l repressor and a downstream gene, such that both the amount of the repressor protein and the rate of expression of its target gene could be monitored simultaneously in individual cells (Fig. 1B). These strains incorporate a yellow fluorescent repressor fusion protein (cI-yfp) and a chromosomally integrated target promoter (P R ) controlling cyan fluorescent protein (cfp).", ["In order to address all three aspects, it is necessary to observe gene regulation in individual cells over time.", "Therefore, we built Bl-cascade[ strains of Escherichia coli, containing the l repressor and a downstream gene, such that both the amount of the repressor protein and the rate of expression of its target gene could be monitored simultaneously in individual cells (Fig. 1B).", "These strains incorporate a yellow fluorescent repressor fusion protein (cI-yfp) and a chromosomally integrated target promoter (P R ) controlling cyan fluorescent protein (cfp)."]), - ("This is a sentence. (This is an interjected sentence.) 
This is also a sentence.", ["This is a sentence.", "(This is an interjected sentence.)", "This is also a sentence."]), - ("Thus, we first compute EMC 3 's response time-i.e., the duration from the initial of a call (from/to a participant in the target region) to the time when the decision of task assignment is made; and then, based on the computed response time, we estimate EMC 3 maximum throughput [28]-i.e., the maximum number of mobile users allowed in the MCS system. EMC 3 algorithm is implemented with the Java SE platform and is running on a Java HotSpot(TM) 64-Bit Server VM; and the implementation details are given in Appendix, available in the online supplemental material.", ["Thus, we first compute EMC 3 's response time-i.e., the duration from the initial of a call (from/to a participant in the target region) to the time when the decision of task assignment is made; and then, based on the computed response time, we estimate EMC 3 maximum throughput [28]-i.e., the maximum number of mobile users allowed in the MCS system.", "EMC 3 algorithm is implemented with the Java SE platform and is running on a Java HotSpot(TM) 64-Bit Server VM; and the implementation details are given in Appendix, available in the online supplemental material."]), - ("Random walk models (Skellam, 1951;Turchin, 1998) received a lot of attention and were then extended to several more mathematically and statistically sophisticated approaches to interpret movement data such as State-Space Models (SSM) (Jonsen et al., 2003(Jonsen et al., , 2005 and Brownian Bridge Movement Model (BBMM) (Horne et al., 2007). Nevertheless, these models require heavy computational resources (Patterson et al., 2008) and unrealistic structural a priori hypotheses about movement, such as homogeneous movement behavior. 
A fundamental property of animal movements is behavioral heterogeneity (Gurarie et al., 2009) and these models poorly performed in highlighting behavioral changes in animal movements through space and time (Kranstauber et al., 2012).", ["Random walk models (Skellam, 1951;Turchin, 1998) received a lot of attention and were then extended to several more mathematically and statistically sophisticated approaches to interpret movement data such as State-Space Models (SSM) (Jonsen et al., 2003(Jonsen et al., , 2005 and Brownian Bridge Movement Model (BBMM) (Horne et al., 2007).", "Nevertheless, these models require heavy computational resources (Patterson et al., 2008) and unrealistic structural a priori hypotheses about movement, such as homogeneous movement behavior.", "A fundamental property of animal movements is behavioral heterogeneity (Gurarie et al., 2009) and these models poorly performed in highlighting behavioral changes in animal movements through space and time (Kranstauber et al., 2012)."]), - ('. . .', ['. . .']), - ("IF condition and goalCondition THEN action condition relates to the current state and goalCondition to the goal state. If variable bindings exist such that predicates in condition match with the current state, and predicates in goalCondition match with the goal state then the action may be performed. Note that the action's precondition as specified in the domain model must also be satisfied. Figure 5 presents an outline of the system. Each iteration starts with a population of policies (line(2)). 
Current L2Plan settings are such that the individuals comprising the (1) Create initial population (2) WHILE termination criterion false (3) Determine n% fittest polices (4) Perform local search on policies (5) Insert improved policies in new generation (6) WHILE new generation not full (7) SET Pol to empty policy (8) Select two parents (9) IF crossover (10) Perform crossover (11) Pol := fittest of parents & offspring (12) ELSE (13) Pol := fittest of parents (14) ENDIF (15) IF mutation (16) Perform mutation on Pol (17) ENDIF (18) Perform local search on Pol (19) Insert Pol in new generation (20) ENDWHILE (21) (5)). Note that the evaluation of policies is implied when the fittest policy or policies is/are required.", ["IF condition and goalCondition THEN action condition relates to the current state and goalCondition to the goal state.", "If variable bindings exist such that predicates in condition match with the current state, and predicates in goalCondition match with the goal state then the action may be performed.", "Note that the action's precondition as specified in the domain model must also be satisfied.", "Figure 5 presents an outline of the system.", "Each iteration starts with a population of policies (line(2)).", "Current L2Plan settings are such that the individuals comprising the (1) Create initial population (2) WHILE termination criterion false (3) Determine n% fittest polices (4) Perform local search on policies (5) Insert improved policies in new generation (6) WHILE new generation not full (7) SET Pol to empty policy (8) Select two parents (9) IF crossover (10) Perform crossover (11) Pol := fittest of parents & offspring (12) ELSE (13) Pol := fittest of parents (14) ENDIF (15) IF mutation (16) Perform mutation on Pol (17) ENDIF (18) Perform local search on Pol (19) Insert Pol in new generation (20) ENDWHILE (21) (5)).", "Note that the evaluation of policies is implied when the fittest policy or policies is/are required."]), - ("MCC summarizes 
these four quantities into one score and is regarded as a balanced measure; it takes values between -1 and 1, with higher values indicating better performance (see e.g. Baldi et al. (2000) for further details). Since the convergence threshold in the glasso algorithm is 10 −4 , we take entriesω ij in estimated precision matrices to be non-zero if |ω ij | > 10 −3 . Since cluster assignments can only be identified up to permutation, in all cases labels were permuted to maximize agreement with true cluster assignments before calculating these quantities. Figure 2 shows MCC plotted against per-cluster sample size n k and Supplementary Figure S1 shows corresponding plots for TPR and FPR. Due to selection of smaller tuning parameter values, BIC discovers fewer non-zeroes in the precision matrices than train/test, resulting in both fewer true positives and false positives. Under MCC, BIC, with either the γ = 1 mixture model (B1) or the non-mixture approach (Bh), leads to the best network reconstruction (except at small sample sizes with p = 25) and outperforms all other regimes at larger sample sizes.", ["MCC summarizes these four quantities into one score and is regarded as a balanced measure; it takes values between -1 and 1, with higher values indicating better performance (see e.g. Baldi et al. 
(2000) for further details).", "Since the convergence threshold in the glasso algorithm is 10 −4 , we take entriesω ij in estimated precision matrices to be non-zero if |ω ij | > 10 −3 .", "Since cluster assignments can only be identified up to permutation, in all cases labels were permuted to maximize agreement with true cluster assignments before calculating these quantities.", "Figure 2 shows MCC plotted against per-cluster sample size n k and Supplementary Figure S1 shows corresponding plots for TPR and FPR.", "Due to selection of smaller tuning parameter values, BIC discovers fewer non-zeroes in the precision matrices than train/test, resulting in both fewer true positives and false positives.", "Under MCC, BIC, with either the γ = 1 mixture model (B1) or the non-mixture approach (Bh), leads to the best network reconstruction (except at small sample sizes with p = 25) and outperforms all other regimes at larger sample sizes."]), - ("Societal impact measurements are mostly commissioned by governments which argue that measuring the impact on science little says about real-world benefits of research (Cohen et al., 2015). Nightingale and Scott (2007) summarize this argumentation in the following pointedly sentence: \"Research that is highly cited or published in top journals may be good for the academic discipline but not for society\" (p. 547). Governments are interested to know the importance of public-funded research (1) for the private and public sectors (e.g. health care), (2) to tackle societal challenges (e.g. climate change), and (3) for education and training of the next generations (ERiC, 2010;Grimson, 2014). The impact model of Cleary, Siegfried, Jackson, and Hunt (2013) additionally highlights the policy enactment of research, in which the impact on policies, laws, and regulations is of special interest. 
The current study seizes upon this additional issue by investigating a possible source for measuring policy enactment of research.", ["Societal impact measurements are mostly commissioned by governments which argue that measuring the impact on science little says about real-world benefits of research (Cohen et al., 2015).", "Nightingale and Scott (2007) summarize this argumentation in the following pointedly sentence: \"Research that is highly cited or published in top journals may be good for the academic discipline but not for society\" (p. 547).", "Governments are interested to know the importance of public-funded research (1) for the private and public sectors (e.g. health care), (2) to tackle societal challenges (e.g. climate change), and (3) for education and training of the next generations (ERiC, 2010;Grimson, 2014).", "The impact model of Cleary, Siegfried, Jackson, and Hunt (2013) additionally highlights the policy enactment of research, in which the impact on policies, laws, and regulations is of special interest.", "The current study seizes upon this additional issue by investigating a possible source for measuring policy enactment of research."]), - ("CONCLUSIONS: This study demonstrates that TF activation, occurring in mononuclear cells of cardiac transplant recipients, is inhibited by treatment with CsA. 
Inhibition of monocyte TF induction by CsA may contribute to its successful use in cardiac transplant medicine and might be useful in managing further settings of vascular pathology also known to involve TF expression and NF-kappaB activation.", ["CONCLUSIONS: This study demonstrates that TF activation, occurring in mononuclear cells of cardiac transplant recipients, is inhibited by treatment with CsA.", "Inhibition of monocyte TF induction by CsA may contribute to its successful use in cardiac transplant medicine and might be useful in managing further settings of vascular pathology also known to involve TF expression and NF-kappaB activation."]), - ("In contrast, anti-AIM mAb did not induce any change in the binding activity of NF-kappa B, a transcription factor whose activity is also regulated by protein kinase C. The increase in AP-1-binding activity was accompanied by the marked stimulation of the transcription of c-fos but not that of c-jun.", ["In contrast, anti-AIM mAb did not induce any change in the binding activity of NF-kappa B, a transcription factor whose activity is also regulated by protein kinase C. The increase in AP-1-binding activity was accompanied by the marked stimulation of the transcription of c-fos but not that of c-jun."]), - ("A mutant Tax protein deficient in transactivation of genes by the nuclear factor (NF)-kappaB pathway was unable to induce transcriptional activity of IL-1alpha promoter-CAT constructs, but was rescued by exogenous provision of p65/p50 NF-kappaB. 
We found that two IL-1alpha kappaB-like sites (positions -1,065 to -1,056 and +646 to +655) specifically formed a complex with NF-kappaB-containing nuclear extract from MT-2 cells and that NF-kappaB bound with higher affinity to the 3' NF-kappaB binding site than to the 5' NF-kappaB site.", ["A mutant Tax protein deficient in transactivation of genes by the nuclear factor (NF)-kappaB pathway was unable to induce transcriptional activity of IL-1alpha promoter-CAT constructs, but was rescued by exogenous provision of p65/p50 NF-kappaB.", "We found that two IL-1alpha kappaB-like sites (positions -1,065 to -1,056 and +646 to +655) specifically formed a complex with NF-kappaB-containing nuclear extract from MT-2 cells and that NF-kappaB bound with higher affinity to the 3' NF-kappaB binding site than to the 5' NF-kappaB site."]), - pytest.param("Protein kinase C inhibitor staurosporine, but not cyclic nucleotide-dependent protein kinase inhibitor HA-1004, also dramatically reduced constitutive levels of nuclear NF kappa B. 
Finally, TPA addition to monocytes infected with HIV-1 inhibited HIV-1 replication, as determined by reverse transcriptase assays, in a concentration-dependent manner.", ["Protein kinase C inhibitor staurosporine, but not cyclic nucleotide-dependent protein kinase inhibitor HA-1004, also dramatically reduced constitutive levels of nuclear NF kappa B.", "Finally, TPA addition to monocytes infected with HIV-1 inhibited HIV-1 replication, as determined by reverse transcriptase assays, in a concentration-dependent manner."], marks=pytest.mark.xfail), - pytest.param("There are p50.c-rel heterodimers were also detected bound to this sequence at early time points (7-16 h; early), and both remained active at later time points (40 h; late) after activation.", ["There are p50.c-rel heterodimers were also detected bound to this sequence at early time points (7-16 h; early), and both remained active at later time points (40 h; late) after activation."], marks=pytest.mark.xfail), - ("This sentence mentions Eqs. 1-4 and should not be split.", ["This sentence mentions Eqs. 1-4 and should not be split."]), - ("This sentence ends with part an abbreviation that is part of a word material. It also has another sentence after it.", ["This sentence ends with part an abbreviation that is part of a word material.", "It also has another sentence after it."]), - ("It also has a sentence before it. This sentence mentions Eqs. 1-4 and should not be split. It also has another sentence after it.", ["It also has a sentence before it.", "This sentence mentions Eqs. 
1-4 and should not be split.", "It also has another sentence after it."]), - ("This sentence is the last segment and ends with an abbreviation that is part of a word material.", ["This sentence is the last segment and ends with an abbreviation that is part of a word material."]), - ("PDBu + iono induced equally high IL-2 levels in both groups and, when stimulated with plate-bound anti-CD3 monoclonal antibody (mAb), the IL-2 secretion by neonatal cells was undetectable and adult cells produced low amounts of IL-2 (mean 331 +/- 86 pg/ml).", ["PDBu + iono induced equally high IL-2 levels in both groups and, when stimulated with plate-bound anti-CD3 monoclonal antibody (mAb), the IL-2 secretion by neonatal cells was undetectable and adult cells produced low amounts of IL-2 (mean 331 +/- 86 pg/ml)."]), - (" This document starts with whitespaces. Next sentence.", [" ", "This document starts with whitespaces.", "Next sentence."]), - pytest.param("How about tomorrow?We can meet at eden garden.", ["How about tomorrow?", "We can meet at eden garden."], marks=pytest.mark.xfail) - ] + ( + "LSTM networks, which we preview in Sec. 2, have been successfully", + ["LSTM networks, which we preview in Sec. 2, have been successfully"], + ), + ( + "When the tree is simply a chain, both Eqs. 2–8 and Eqs. 9–14 reduce to the standard LSTM transitions, Eqs. 1.", + [ + "When the tree is simply a chain, both Eqs. 2–8 and Eqs. 9–14 reduce to the standard LSTM transitions, Eqs. 1." + ], + ), + ( + "We used fluorescence time-lapse microscopy (Fig. 1D; fig. S1 and movies S1 and S2) and computational", + [ + "We used fluorescence time-lapse microscopy (Fig. 1D; fig. S1 and movies S1 and S2) and computational" + ], + ), + ( + "Hill functions indeed fit the data well (Fig. 3A and Table 1).", + ["Hill functions indeed fit the data well (Fig. 
3A and Table 1)."], + ), + ( + "In order to produce sentence representations that fully capture the semantics of natural language, order-insensitive models are insufficient due to their inability to account for differences in meaning as a result of differences in word order or syntactic structure (e.g., “cats climb trees” vs. “trees climb cats”).", + [ + "In order to produce sentence representations that fully capture the semantics of natural language, order-insensitive models are insufficient due to their inability to account for differences in meaning as a result of differences in word order or syntactic structure (e.g., “cats climb trees” vs. “trees climb cats”)." + ], + ), + ( + "There is an average exact sparsity (fraction of zeros) of the hidden layers of 83.40% on MNIST and 72.00% on CIFAR10. Figure 3 (left) provides a better understanding of the influence of sparsity.", + [ + "There is an average exact sparsity (fraction of zeros) of the hidden layers of 83.40% on MNIST and 72.00% on CIFAR10.", + "Figure 3 (left) provides a better understanding of the influence of sparsity.", + ], + ), + ( + "Sparsity has become a concept of interest, not only in computational neuroscience and machine learning but also in statistics and signal processing (Candes and Tao, 2005). It was first introduced in computational neuroscience in the context of sparse coding in the visual system (Olshausen and Field, 1997).", + [ + "Sparsity has become a concept of interest, not only in computational neuroscience and machine learning but also in statistics and signal processing (Candes and Tao, 2005).", + "It was first introduced in computational neuroscience in the context of sparse coding in the visual system (Olshausen and Field, 1997).", + ], + ), + ( + "1) The first item. 
2) The second item.", + ["1) The first item.", "2) The second item."], + ), + ( + "two of these stages (in areas V1 and V2 of visual cortex) (Lee et al., 2008), and that they", + [ + "two of these stages (in areas V1 and V2 of visual cortex) (Lee et al., 2008), and that they" + ], + ), + pytest.param( + "all neu-\nrons fire at", ["all neu-\nrons fire at"], marks=pytest.mark.xfail + ), + ( + "the support of the Defense Advanced Resarch Projects Agency (DARPA) Deep Exploration and Filtering of Text (DEFT) Program under Air Force Research Laboratory (AFRL) contract", + [ + "the support of the Defense Advanced Resarch Projects Agency (DARPA) Deep Exploration and Filtering of Text (DEFT) Program under Air Force Research Laboratory (AFRL) contract" + ], + ), + ( + "While proprietary environments such as Microsoft Robotics Studio [9] and Webots [10] have many commendable attributes, we feel there is no substitute for a fully open platform.", + [ + "While proprietary environments such as Microsoft Robotics Studio [9] and Webots [10] have many commendable attributes, we feel there is no substitute for a fully open platform." + ], + ), + ( + "We first produce sentence representations hL and hR for each sentence in the pair using a Tree-LSTM model over each sentence’s parse tree.", + [ + "We first produce sentence representations hL and hR for each sentence in the pair using a Tree-LSTM model over each sentence’s parse tree." + ], + ), + ( + "LSTM networks, which we review in Sec. 2, have been successfully applied to a variety of sequence modeling and prediction tasks, notably machine translation (Bahdanau et al., 2014; Sutskever et al., 2014), speech recognition (Graves et al., 2013), image caption generation (Vinyals et al., 2014), and program execution (Zaremba and Sutskever, 2014).", + [ + "LSTM networks, which we review in Sec. 
2, have been successfully applied to a variety of sequence modeling and prediction tasks, notably machine translation (Bahdanau et al., 2014; Sutskever et al., 2014), speech recognition (Graves et al., 2013), image caption generation (Vinyals et al., 2014), and program execution (Zaremba and Sutskever, 2014)." + ], + ), + ( + "1 Introduction\n\nMost models for distributed representations of phrases and sentences—that is, models where realvalued vectors are used to represent meaning—fall into one of three classes: bag-of-words models, sequence models, and tree-structured models.", + [ + "1 Introduction\n\n", + "Most models for distributed representations of phrases and sentences—that is, models where realvalued vectors are used to represent meaning—fall into one of three classes: bag-of-words models, sequence models, and tree-structured models.", + ], + ), + ( + "In this section, we will elaborate these philosophies and shows how they influenced the design and implementation of ROS.\n\nA. Peer-to-Peer\n\nA system built using ROS consists of a number of processes, potentially on a number of different", + [ + "In this section, we will elaborate these philosophies and shows how they influenced the design and implementation of ROS.\n\n", + "A. 
Peer-to-Peer\n\n", + "A system built using ROS consists of a number of processes, potentially on a number of different", + ], + ), + ( + "\n\n2 Long Short-Term Memory Networks\n\n\n\n2.1 Overview\n\nRecurrent neural networks (RNNs) are able to process input sequences of arbitrary length via the recursive application of a transition function on a hidden state vector ht.", + [ + "\n\n2 Long Short-Term Memory Networks\n\n\n\n", + "2.1 Overview\n\n", + "Recurrent neural networks (RNNs) are able to process input sequences of arbitrary length via the recursive application of a transition function on a hidden state vector ht.", + ], + ), + ( + "In order to address all three aspects, it is necessary to observe gene regulation in individual cells over time. Therefore, we built Bl-cascade[ strains of Escherichia coli, containing the l repressor and a downstream gene, such that both the amount of the repressor protein and the rate of expression of its target gene could be monitored simultaneously in individual cells (Fig. 1B). These strains incorporate a yellow fluorescent repressor fusion protein (cI-yfp) and a chromosomally integrated target promoter (P R ) controlling cyan fluorescent protein (cfp).", + [ + "In order to address all three aspects, it is necessary to observe gene regulation in individual cells over time.", + "Therefore, we built Bl-cascade[ strains of Escherichia coli, containing the l repressor and a downstream gene, such that both the amount of the repressor protein and the rate of expression of its target gene could be monitored simultaneously in individual cells (Fig. 1B).", + "These strains incorporate a yellow fluorescent repressor fusion protein (cI-yfp) and a chromosomally integrated target promoter (P R ) controlling cyan fluorescent protein (cfp).", + ], + ), + ( + "This is a sentence. (This is an interjected sentence.) 
This is also a sentence.", + [ + "This is a sentence.", + "(This is an interjected sentence.)", + "This is also a sentence.", + ], + ), + ( + "Thus, we first compute EMC 3 's response time-i.e., the duration from the initial of a call (from/to a participant in the target region) to the time when the decision of task assignment is made; and then, based on the computed response time, we estimate EMC 3 maximum throughput [28]-i.e., the maximum number of mobile users allowed in the MCS system. EMC 3 algorithm is implemented with the Java SE platform and is running on a Java HotSpot(TM) 64-Bit Server VM; and the implementation details are given in Appendix, available in the online supplemental material.", + [ + "Thus, we first compute EMC 3 's response time-i.e., the duration from the initial of a call (from/to a participant in the target region) to the time when the decision of task assignment is made; and then, based on the computed response time, we estimate EMC 3 maximum throughput [28]-i.e., the maximum number of mobile users allowed in the MCS system.", + "EMC 3 algorithm is implemented with the Java SE platform and is running on a Java HotSpot(TM) 64-Bit Server VM; and the implementation details are given in Appendix, available in the online supplemental material.", + ], + ), + ( + "Random walk models (Skellam, 1951;Turchin, 1998) received a lot of attention and were then extended to several more mathematically and statistically sophisticated approaches to interpret movement data such as State-Space Models (SSM) (Jonsen et al., 2003(Jonsen et al., , 2005 and Brownian Bridge Movement Model (BBMM) (Horne et al., 2007). Nevertheless, these models require heavy computational resources (Patterson et al., 2008) and unrealistic structural a priori hypotheses about movement, such as homogeneous movement behavior. 
A fundamental property of animal movements is behavioral heterogeneity (Gurarie et al., 2009) and these models poorly performed in highlighting behavioral changes in animal movements through space and time (Kranstauber et al., 2012).", + [ + "Random walk models (Skellam, 1951;Turchin, 1998) received a lot of attention and were then extended to several more mathematically and statistically sophisticated approaches to interpret movement data such as State-Space Models (SSM) (Jonsen et al., 2003(Jonsen et al., , 2005 and Brownian Bridge Movement Model (BBMM) (Horne et al., 2007).", + "Nevertheless, these models require heavy computational resources (Patterson et al., 2008) and unrealistic structural a priori hypotheses about movement, such as homogeneous movement behavior.", + "A fundamental property of animal movements is behavioral heterogeneity (Gurarie et al., 2009) and these models poorly performed in highlighting behavioral changes in animal movements through space and time (Kranstauber et al., 2012).", + ], + ), + (". . .", [". . ."]), + ( + "IF condition and goalCondition THEN action condition relates to the current state and goalCondition to the goal state. If variable bindings exist such that predicates in condition match with the current state, and predicates in goalCondition match with the goal state then the action may be performed. Note that the action's precondition as specified in the domain model must also be satisfied. Figure 5 presents an outline of the system. Each iteration starts with a population of policies (line(2)). 
Current L2Plan settings are such that the individuals comprising the (1) Create initial population (2) WHILE termination criterion false (3) Determine n% fittest polices (4) Perform local search on policies (5) Insert improved policies in new generation (6) WHILE new generation not full (7) SET Pol to empty policy (8) Select two parents (9) IF crossover (10) Perform crossover (11) Pol := fittest of parents & offspring (12) ELSE (13) Pol := fittest of parents (14) ENDIF (15) IF mutation (16) Perform mutation on Pol (17) ENDIF (18) Perform local search on Pol (19) Insert Pol in new generation (20) ENDWHILE (21) (5)). Note that the evaluation of policies is implied when the fittest policy or policies is/are required.", + [ + "IF condition and goalCondition THEN action condition relates to the current state and goalCondition to the goal state.", + "If variable bindings exist such that predicates in condition match with the current state, and predicates in goalCondition match with the goal state then the action may be performed.", + "Note that the action's precondition as specified in the domain model must also be satisfied.", + "Figure 5 presents an outline of the system.", + "Each iteration starts with a population of policies (line(2)).", + "Current L2Plan settings are such that the individuals comprising the (1) Create initial population (2) WHILE termination criterion false (3) Determine n% fittest polices (4) Perform local search on policies (5) Insert improved policies in new generation (6) WHILE new generation not full (7) SET Pol to empty policy (8) Select two parents (9) IF crossover (10) Perform crossover (11) Pol := fittest of parents & offspring (12) ELSE (13) Pol := fittest of parents (14) ENDIF (15) IF mutation (16) Perform mutation on Pol (17) ENDIF (18) Perform local search on Pol (19) Insert Pol in new generation (20) ENDWHILE (21) (5)).", + "Note that the evaluation of policies is implied when the fittest policy or policies is/are required.", + ], + 
), + ( + "MCC summarizes these four quantities into one score and is regarded as a balanced measure; it takes values between -1 and 1, with higher values indicating better performance (see e.g. Baldi et al. (2000) for further details). Since the convergence threshold in the glasso algorithm is 10 −4 , we take entriesω ij in estimated precision matrices to be non-zero if |ω ij | > 10 −3 . Since cluster assignments can only be identified up to permutation, in all cases labels were permuted to maximize agreement with true cluster assignments before calculating these quantities. Figure 2 shows MCC plotted against per-cluster sample size n k and Supplementary Figure S1 shows corresponding plots for TPR and FPR. Due to selection of smaller tuning parameter values, BIC discovers fewer non-zeroes in the precision matrices than train/test, resulting in both fewer true positives and false positives. Under MCC, BIC, with either the γ = 1 mixture model (B1) or the non-mixture approach (Bh), leads to the best network reconstruction (except at small sample sizes with p = 25) and outperforms all other regimes at larger sample sizes.", + [ + "MCC summarizes these four quantities into one score and is regarded as a balanced measure; it takes values between -1 and 1, with higher values indicating better performance (see e.g. Baldi et al. 
(2000) for further details).", + "Since the convergence threshold in the glasso algorithm is 10 −4 , we take entriesω ij in estimated precision matrices to be non-zero if |ω ij | > 10 −3 .", + "Since cluster assignments can only be identified up to permutation, in all cases labels were permuted to maximize agreement with true cluster assignments before calculating these quantities.", + "Figure 2 shows MCC plotted against per-cluster sample size n k and Supplementary Figure S1 shows corresponding plots for TPR and FPR.", + "Due to selection of smaller tuning parameter values, BIC discovers fewer non-zeroes in the precision matrices than train/test, resulting in both fewer true positives and false positives.", + "Under MCC, BIC, with either the γ = 1 mixture model (B1) or the non-mixture approach (Bh), leads to the best network reconstruction (except at small sample sizes with p = 25) and outperforms all other regimes at larger sample sizes.", + ], + ), + ( + 'Societal impact measurements are mostly commissioned by governments which argue that measuring the impact on science little says about real-world benefits of research (Cohen et al., 2015). Nightingale and Scott (2007) summarize this argumentation in the following pointedly sentence: "Research that is highly cited or published in top journals may be good for the academic discipline but not for society" (p. 547). Governments are interested to know the importance of public-funded research (1) for the private and public sectors (e.g. health care), (2) to tackle societal challenges (e.g. climate change), and (3) for education and training of the next generations (ERiC, 2010;Grimson, 2014). The impact model of Cleary, Siegfried, Jackson, and Hunt (2013) additionally highlights the policy enactment of research, in which the impact on policies, laws, and regulations is of special interest. 
The current study seizes upon this additional issue by investigating a possible source for measuring policy enactment of research.', + [ + "Societal impact measurements are mostly commissioned by governments which argue that measuring the impact on science little says about real-world benefits of research (Cohen et al., 2015).", + 'Nightingale and Scott (2007) summarize this argumentation in the following pointedly sentence: "Research that is highly cited or published in top journals may be good for the academic discipline but not for society" (p. 547).', + "Governments are interested to know the importance of public-funded research (1) for the private and public sectors (e.g. health care), (2) to tackle societal challenges (e.g. climate change), and (3) for education and training of the next generations (ERiC, 2010;Grimson, 2014).", + "The impact model of Cleary, Siegfried, Jackson, and Hunt (2013) additionally highlights the policy enactment of research, in which the impact on policies, laws, and regulations is of special interest.", + "The current study seizes upon this additional issue by investigating a possible source for measuring policy enactment of research.", + ], + ), + ( + "CONCLUSIONS: This study demonstrates that TF activation, occurring in mononuclear cells of cardiac transplant recipients, is inhibited by treatment with CsA. 
Inhibition of monocyte TF induction by CsA may contribute to its successful use in cardiac transplant medicine and might be useful in managing further settings of vascular pathology also known to involve TF expression and NF-kappaB activation.", + [ + "CONCLUSIONS: This study demonstrates that TF activation, occurring in mononuclear cells of cardiac transplant recipients, is inhibited by treatment with CsA.", + "Inhibition of monocyte TF induction by CsA may contribute to its successful use in cardiac transplant medicine and might be useful in managing further settings of vascular pathology also known to involve TF expression and NF-kappaB activation.", + ], + ), + ( + "In contrast, anti-AIM mAb did not induce any change in the binding activity of NF-kappa B, a transcription factor whose activity is also regulated by protein kinase C. The increase in AP-1-binding activity was accompanied by the marked stimulation of the transcription of c-fos but not that of c-jun.", + [ + "In contrast, anti-AIM mAb did not induce any change in the binding activity of NF-kappa B, a transcription factor whose activity is also regulated by protein kinase C. The increase in AP-1-binding activity was accompanied by the marked stimulation of the transcription of c-fos but not that of c-jun." + ], + ), + ( + "A mutant Tax protein deficient in transactivation of genes by the nuclear factor (NF)-kappaB pathway was unable to induce transcriptional activity of IL-1alpha promoter-CAT constructs, but was rescued by exogenous provision of p65/p50 NF-kappaB. 
We found that two IL-1alpha kappaB-like sites (positions -1,065 to -1,056 and +646 to +655) specifically formed a complex with NF-kappaB-containing nuclear extract from MT-2 cells and that NF-kappaB bound with higher affinity to the 3' NF-kappaB binding site than to the 5' NF-kappaB site.", + [ + "A mutant Tax protein deficient in transactivation of genes by the nuclear factor (NF)-kappaB pathway was unable to induce transcriptional activity of IL-1alpha promoter-CAT constructs, but was rescued by exogenous provision of p65/p50 NF-kappaB.", + "We found that two IL-1alpha kappaB-like sites (positions -1,065 to -1,056 and +646 to +655) specifically formed a complex with NF-kappaB-containing nuclear extract from MT-2 cells and that NF-kappaB bound with higher affinity to the 3' NF-kappaB binding site than to the 5' NF-kappaB site.", + ], + ), + pytest.param( + "Protein kinase C inhibitor staurosporine, but not cyclic nucleotide-dependent protein kinase inhibitor HA-1004, also dramatically reduced constitutive levels of nuclear NF kappa B. 
Finally, TPA addition to monocytes infected with HIV-1 inhibited HIV-1 replication, as determined by reverse transcriptase assays, in a concentration-dependent manner.", + [ + "Protein kinase C inhibitor staurosporine, but not cyclic nucleotide-dependent protein kinase inhibitor HA-1004, also dramatically reduced constitutive levels of nuclear NF kappa B.", + "Finally, TPA addition to monocytes infected with HIV-1 inhibited HIV-1 replication, as determined by reverse transcriptase assays, in a concentration-dependent manner.", + ], + marks=pytest.mark.xfail, + ), + ( + "There are p50.c-rel heterodimers were also detected bound to this sequence at early time points (7-16 h; early), and both remained active at later time points (40 h; late) after activation.", + [ + "There are p50.c-rel heterodimers were also detected bound to this sequence at early time points (7-16 h; early), and both remained active at later time points (40 h; late) after activation." + ], + ), + ( + "This sentence mentions Eqs. 1-4 and should not be split.", + ["This sentence mentions Eqs. 1-4 and should not be split."], + ), + ( + "This sentence ends with part an abbreviation that is part of a word material. It also has another sentence after it.", + [ + "This sentence ends with part an abbreviation that is part of a word material.", + "It also has another sentence after it.", + ], + ), + ( + "It also has a sentence before it. This sentence mentions Eqs. 1-4 and should not be split. It also has another sentence after it.", + [ + "It also has a sentence before it.", + "This sentence mentions Eqs. 1-4 and should not be split.", + "It also has another sentence after it.", + ], + ), + ( + "This sentence is the last segment and ends with an abbreviation that is part of a word material.", + [ + "This sentence is the last segment and ends with an abbreviation that is part of a word material." 
+ ], + ), + ( + "PDBu + iono induced equally high IL-2 levels in both groups and, when stimulated with plate-bound anti-CD3 monoclonal antibody (mAb), the IL-2 secretion by neonatal cells was undetectable and adult cells produced low amounts of IL-2 (mean 331 +/- 86 pg/ml).", + [ + "PDBu + iono induced equally high IL-2 levels in both groups and, when stimulated with plate-bound anti-CD3 monoclonal antibody (mAb), the IL-2 secretion by neonatal cells was undetectable and adult cells produced low amounts of IL-2 (mean 331 +/- 86 pg/ml)." + ], + ), + ( + " This document starts with whitespaces. Next sentence.", + [" ", "This document starts with whitespaces.", "Next sentence."], + ), + pytest.param( + "How about tomorrow?We can meet at eden garden.", + ["How about tomorrow?", "We can meet at eden garden."], + marks=pytest.mark.xfail, + ), +] -@pytest.mark.parametrize('text,expected_sents', TEST_CASES) -def test_custom_segmentation(en_with_combined_rule_tokenizer_and_segmenter_fixture, remove_new_lines_fixture, text, expected_sents): + +@pytest.mark.parametrize("text,expected_sents", TEST_CASES) +def test_custom_segmentation( + en_with_combined_rule_tokenizer_and_segmenter_fixture, + remove_new_lines_fixture, + text, + expected_sents, +): doc = en_with_combined_rule_tokenizer_and_segmenter_fixture(text) sents = [s.text for s in doc.sents] assert sents == expected_sents + def test_segmenter(en_with_combined_rule_tokenizer_and_segmenter_fixture): # this text used to crash pysbd text = r"Then, (S\{ℓ 1 , ℓ 2 }) ∪ {v} is a smaller power dominating set than S, which is a contradiction. Now consider the case in which v ∈ V is incident to exactly two leaves, ℓ 1 and ℓ 2 , and suppose there is a minimum power dominating set S of G such that {v, ℓ 1 , ℓ 2 } ∩ S = ∅." 
diff --git a/tests/test_hyponym_detector.py b/tests/test_hyponym_detector.py index d1f14dd..e8ab8f6 100644 --- a/tests/test_hyponym_detector.py +++ b/tests/test_hyponym_detector.py @@ -20,8 +20,8 @@ def test_sentences(self): ) doc = self.nlp(text) fig_trees = doc[21:23] - keystone_plant_species = doc[16:19] - assert doc._.hearst_patterns == [("such_as", keystone_plant_species, fig_trees)] + plant_species = doc[17:19] + assert doc._.hearst_patterns == [("such_as", plant_species, fig_trees)] doc = self.nlp("SARS, or other coronaviruses, are bad.") assert doc._.hearst_patterns == [("other", doc[4:5], doc[0:1])]