V1.0 candidate; new deduper options, new taggers (#100)

* added more runs * new plots * tokenizer fix * squatted * new lang id * all fasttext lang id * plots * further plots * wip * progress! * style * fixed format * added configs * dts * configs * more * refine * fix * fix * adding new features to deduper * accidentally removed tests * added cli options * big commit * improvement to tokenizer * bumping version * fix error in empty * new dedupe docs * names * configs * fixed paths * stack * switched to v2 * fixed dedupe config * updated * middle dedupe * mix text length * Reddit processing code (#74) * initial commit of reddit processing scripts * Minor cleanup and added Readme * removed programming subreddits from one stray dir * Update sources/reddit/README.md Co-authored-by: Niklas Muennighoff <n.muennighoff@gmail.com> * Apply suggestions from code review --------- Co-authored-by: Luca Soldaini <luca@soldaini.net> Co-authored-by: Niklas Muennighoff <n.muennighoff@gmail.com> * more plots * fixed version * names * different path * added support for retries * wip test * fixed tests * fixed * removing repetitions * dedupe docs * reddit stats * paths * bugfix * base * version of pycld2 that compiles on M macs * new config middle * 3 parts * further s3 tests * decode * still write empty docs to attributes when skip_empty is True * wiki adjusted * wiki config * simple counts * changed path * added new features * plots * added new digits vocab * added config to sample * small * added tokenizer script * code abl * cargo * version bump * made it stable * topics * sampling * rename * new config for 1.6 * llama config * llama config (fix) * figures * adding docs dedupe * added more dedup configs * style * added counts * more cli * style * style * removed autopep8 * resorted * testing change * corner cases * figures * added current paper * reverted cli * documentation --------- Co-authored-by: Dustin Schwenk <drschwenk@users.noreply.github.com> Co-authored-by: Niklas Muennighoff <n.muennighoff@gmail.com>
allenai · Feb 1, 2024 · 2daae12 · 2daae12
1 parent 43b0314
commit 2daae12
Show file tree

Hide file tree

Showing 124 changed files with 8,745 additions and 738 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "dolma"
-version = "0.9.4"
+version = "1.0.0"
 edition = "2021"
 license = "Apache-2.0"
 
@@ -11,26 +11,37 @@ crate-type = ["cdylib"]
 
 [dependencies]
 ahash = { version = "0.8.1", features = ["runtime-rng"] }
+anyhow = "1.0"
+atomic-traits = "0.3"
 aws-config = { version = "0.55.0"}
 aws-sdk-s3 = "0.25.0"
 byteorder = "1"
 clap = { version = "4.1.11", features = ["derive"] }
+console = "0.15"
 env_logger = "0.10.0"
 flate2 = { version = "1.0.28", features = ["zlib-ng"], default-features = false }
+glob = "0.3.1"
+humantime = "2.1"
+indicatif = "0.17"
 jsonpath-rust = "0.3.0"
 log = "0.4.17"
-regex = "1.8.4"
+num_cpus = "1.0"
+num-traits = "0.2"
+parse-size = "1.0"
 pyo3 = { version = "0.19.0", features = ["extension-module"] }
 rand = "0.8.4"
 rayon = "1.7.0"
-serde = {version = "1.0.160", features = ["derive"]}
-serde_json = "1.0"
+regex = "1.8.4"
+serde = { version = "1.0.160", features = ["derive", "rc"] }
+serde_json = "1.0.108"
+simple_logger = { version = "3.0", features = ["stderr", "colors"], default-features = false, optional = true }
+structopt = { version = "0.3", optional = true }
+thousands = "0.2"
 threadpool = "1.8.1"
+tokenizers = {version = "0.15.0", features = ["http"]}
 tokio = {version = "1.27.0", features = ["full"]}
 tokio-util = "0.7.7"
 unicode-segmentation = "1.7"
-glob = "0.3.1"
-
 
 # [target.'cfg(target_arch = "aarch64")'.dependencies]
 # flate2 = "1.0.28"
diff --git a/configs/dolma-v1_5/README.md b/configs/dolma-v1_5/README.md
@@ -0,0 +1,3 @@
+# Dolma 1.5
+
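+This directory contains the configuration files for Dolma 1.5, including the decontamination runbook under `decontamination/`.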
diff --git a/configs/dolma-v1_5/decontamination/README.md b/configs/dolma-v1_5/decontamination/README.md
@@ -0,0 +1,96 @@
+# Decontamination Runbook
+
+## Step 1: Create decontamination bloom filter
+
+> Okay, I think everything is ready for decon testing now. The finalized ppl suite v3 is in `s3://ai2-llm/eval-data/perplexity/v3/`. Here is my proposed plan for decon testing, if you agree and it's not too much compute. The following is the sequence of things to try. At each step, if the document removal rate is >0.1% or so, we back off to the next step and hope the removal rate is lower:
+>
+> - **Option 1** Decon against PPL Suite v3 (`s3://ai2-llm/eval-data/perplexity/v3/`) + PPL Suite v2 (`s3://ai2-llm/eval-data/perplexity/v2/`) for full backwards compatibility.
+> - **Option 2** Decon against PPL Suite v3 (`s3://ai2-llm/eval-data/perplexity/v3/`) + PPL Suite v2-small (`s3://ai2-llm/eval-data/perplexity/v2_small/`) for at least full backwards compatibility with the in-loop metrics the model team was using.
+> - **Option 3** Decon against PPL Suite v3 (`s3://ai2-llm/eval-data/perplexity/v3/`) + a subset of PPL Suite v2-small requested by Dirk and Iz (`s3://ai2-llm/eval-data/perplexity/v2_small/c4_en/`, `s3://ai2-llm/eval-data/perplexity/v2_small/pile/`, `s3://ai2-llm/eval-data/perplexity/v2_small/m2d2_s2orc/`, `s3://ai2-llm/eval-data/perplexity/v2_small/ice/`)
+>
+> Let me know if you disagree with any of this or if there's anything I can do to help run the decon trials!
+
+
+### Step 1.1: copy data locally
+
+We copy data locally since the directory structure of the eval data in S3 is slightly different from the one we need.
+In particular, we need all documents to be under a `documents/` directory.
+
+```bash
+aws s3 sync s3://ai2-llm/eval-data/perplexity/v2 $HOME/perplexity/v2/documents
+aws s3 sync s3://ai2-llm/eval-data/perplexity/v2_small $HOME/perplexity/v2_small/documents
+aws s3 sync s3://ai2-llm/eval-data/perplexity/v3 $HOME/perplexity/v3/documents
+
+aws s3 sync s3://ai2-llm/eval-data/perplexity/v2_small/c4_en $HOME/perplexity/v2_small_subset/documents/c4_en
+aws s3 sync s3://ai2-llm/eval-data/perplexity/v2_small/pile $HOME/perplexity/v2_small_subset/documents/pile
+aws s3 sync s3://ai2-llm/eval-data/perplexity/v2_small/m2d2_s2orc $HOME/perplexity/v2_small_subset/documents/m2d2_s2orc
+aws s3 sync s3://ai2-llm/eval-data/perplexity/v2_small/ice $HOME/perplexity/v2_small_subset/documents/ice
+```
+
+### Step 1.1b: change type of IDs in v3 subset (TEMPORARY FIX)
+
+v3 accidentally contains ids that are integers instead of strings. Until that's fixed, run:
+
+```bash
+python configs/dolma-v1_5/decontamination/fix_ids_type.py "${HOME}/perplexity/v3/documents/*/*/*.gz"
+```
+
+### Step 1.2: tag out paragraphs by uniseg length
+
+For dolma, we want to decontaminate against paragraphs that are at least 13 uniseg words long,
+so we need to compute their length first.
+
+```bash
+dolma tag --documents "${HOME}/perplexity/v2/documents/*/*/*.gz" --taggers uniseg_length_paragraphs_with_empty_v1 not_alphanum_paragraph_v1 --processes 188
+dolma tag --documents "${HOME}/perplexity/v2_small/documents/*/*/*.gz" --taggers uniseg_length_paragraphs_with_empty_v1 not_alphanum_paragraph_v1 --processes 188
+dolma tag --documents "${HOME}/perplexity/v3/documents/*/*/*.gz" --taggers uniseg_length_paragraphs_with_empty_v1 not_alphanum_paragraph_v1 --processes 188
+dolma tag --documents "${HOME}/perplexity/v2_small_subset/documents/*/*/*.gz" --taggers uniseg_length_paragraphs_with_empty_v1 not_alphanum_paragraph_v1 --processes 188
+```
+
+### Step 1.3: filter out paragraphs that are too short
+
+After tagging, we filter out paragraphs that are too short to create the evaluation sets for options 1, 2, and 3.
+
+```bash
+
+dolma -c configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option1.yaml mix
+dolma -c configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option2.yaml mix
+dolma -c configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option3.yaml mix
+
+```
+
+### Step 1.4: create bloom filter
+
+First, we cat the contents of each dataset to get the number of documents:
+
+```bash
+zcat $HOME/perplexity/option1/documents/* | jq '.text' -cr | wc -l
+>>> 3681169
+zcat $HOME/perplexity/option2/documents/* | jq '.text' -cr | wc -l
+>>> 2336120
+zcat $HOME/perplexity/option3/documents/* | jq '.text' -cr | wc -l
+>>> 2020471
+```
+
+We use these numbers in the config files as `bloom_filter.estimated_doc_count`. For all three options, we set a `bloom_filter.desired_false_positive_rate` of 0.00001.
+
+```bash
+dolma -c configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option1.yaml dedupe
+dolma -c configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option2.yaml dedupe
+dolma -c configs/dolma-v1_5/decontamination/step1_4-create-bloom-filter/option3.yaml dedupe
+```
+
+## Step 2: Run decontamination
+
+Tag content for Dolma V1.5 for decontamination:
+
+
+```bash
+dolma -c configs/dolma-v1_5/decontamination/step2-run-decontamination/cc.yaml dedupe
+dolma -c configs/dolma-v1_5/decontamination/step2-run-decontamination/c4.yaml dedupe
+dolma -c configs/dolma-v1_5/decontamination/step2-run-decontamination/stack.yaml dedupe
+dolma -c configs/dolma-v1_5/decontamination/step2-run-decontamination/reddit.yaml dedupe
+dolma -c configs/dolma-v1_5/decontamination/step2-run-decontamination/peS2o.yaml dedupe
+dolma -c configs/dolma-v1_5/decontamination/step2-run-decontamination/books.yaml dedupe
+dolma -c configs/dolma-v1_5/decontamination/step2-run-decontamination/wiki.yaml dedupe
+```
diff --git a/configs/dolma-v1_5/decontamination/fix_ids_type.py b/configs/dolma-v1_5/decontamination/fix_ids_type.py
@@ -0,0 +1,33 @@
+import argparse
+import json
+from dolma.core.paths import glob_path
+import tqdm
+
+import smart_open
+
+
+def fix_path(p: str):
+    with smart_open.open(p, 'rt') as f:
+        data = [json.loads(line) for line in f]
+
+    with smart_open.open(p, 'wt') as f:
+        for d in data:
+            if 'id' in d:
+                d['id'] = str(d['id'])
+            f.write(json.dumps(d) + '\n')
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument('path', nargs='+')
+    args = ap.parse_args()
+
+    with tqdm.tqdm(desc='Files') as pbar:
+        for p in args.path:
+            for sp in glob_path(p):
+                fix_path(sp)
+                pbar.update()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option1.yaml b/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option1.yaml
@@ -0,0 +1,86 @@
+streams:
+  - name: "v2"
+    documents:
+      - ${oc.env:HOME}/perplexity/v2/documents/c4_100_domains/val/*.gz
+      - ${oc.env:HOME}/perplexity/v2/documents/c4_100_domains/test/*.gz
+      - ${oc.env:HOME}/perplexity/v2/documents/c4_en/val/*.gz
+      - ${oc.env:HOME}/perplexity/v2/documents/c4_en/test/*.gz
+      - ${oc.env:HOME}/perplexity/v2/documents/gab/val/*.gz
+      - ${oc.env:HOME}/perplexity/v2/documents/gab/test/*.gz
+      - ${oc.env:HOME}/perplexity/v2/documents/ice/val/*.gz
+      - ${oc.env:HOME}/perplexity/v2/documents/ice/test/*.gz
+      - ${oc.env:HOME}/perplexity/v2/documents/m2d2_s2orc/val/*.gz
+      - ${oc.env:HOME}/perplexity/v2/documents/m2d2_s2orc/test/*.gz
+      - ${oc.env:HOME}/perplexity/v2/documents/m2d2_wiki/val/*.gz
+      - ${oc.env:HOME}/perplexity/v2/documents/m2d2_wiki/test/*.gz
+      - ${oc.env:HOME}/perplexity/v2/documents/manosphere/val/*.gz
+      - ${oc.env:HOME}/perplexity/v2/documents/manosphere/test/*.gz
+      - ${oc.env:HOME}/perplexity/v2/documents/mc4_en/val/*.gz
+      - ${oc.env:HOME}/perplexity/v2/documents/mc4_en/test/*.gz
+      - ${oc.env:HOME}/perplexity/v2/documents/pile/val/*.gz
+      - ${oc.env:HOME}/perplexity/v2/documents/pile/test/*.gz
+      - ${oc.env:HOME}/perplexity/v2/documents/ptb/val/*.gz
+      - ${oc.env:HOME}/perplexity/v2/documents/ptb/test/*.gz
+      - ${oc.env:HOME}/perplexity/v2/documents/twitterAEE/val/*.gz
+      - ${oc.env:HOME}/perplexity/v2/documents/twitterAEE/test/*.gz
+      - ${oc.env:HOME}/perplexity/v2/documents/wikitext_103/val/*.gz
+      - ${oc.env:HOME}/perplexity/v2/documents/wikitext_103/test/*.gz
+
+    output: &output
+      path: ${oc.env:HOME}/perplexity/option1/documents
+      max_size_in_bytes: 500000000
+      discard_fields:
+        - attributes
+
+    attributes: &attributes
+      - uniseg_length_paragraphs_with_empty_v1
+      - not_alphanum_paragraph_v1
+
+    span_replacement: &span_replacement
+      - span: $.attributes.uniseg_length_paragraphs_with_empty_v1__uniseg_length_paragraphs_with_empty_v1__negative_paragraph
+        min_score: -12
+        replacement: ""
+      - span: $.attributes.not_alphanum_paragraph_v1__not_alphanum_paragraph_v1__all_punct
+        min_score: 0.5
+        replacement: ""
+
+  - name: "v3"
+    documents:
+      - ${oc.env:HOME}/perplexity/v3/documents/4chan_meta_sep/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/4chan_meta_sep/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/c4_100_domains/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/c4_100_domains/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/c4_en/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/c4_en/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/dolma_100_subreddits/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/dolma_100_subreddits/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/dolma-v1_5/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/dolma-v1_5/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/falcon-refinedweb/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/falcon-refinedweb/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/gab/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/gab/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/ice_fixed/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/ice_fixed/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/m2d2_s2orc_unsplit/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/m2d2_s2orc_unsplit/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/m2d2_wikipedia_unsplit/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/m2d2_wikipedia_unsplit/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/manosphere_meta_sep/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/manosphere_meta_sep/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/mc4/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/mc4/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/pile/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/pile/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/ptb/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/ptb/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/redpajama/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/redpajama/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/twitterAAE_HELM_fixed/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/twitterAAE_HELM_fixed/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/wikitext_103/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/wikitext_103/test/*.gz
+
+    output: *output
+    attributes: *attributes
+    span_replacement: *span_replacement
diff --git a/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option2.yaml b/configs/dolma-v1_5/decontamination/step1_3-make-eval-set/option2.yaml
@@ -0,0 +1,86 @@
+streams:
+  - name: "v2_small"
+    documents:
+      - ${oc.env:HOME}/perplexity/v2_small/documents/c4_100_domains/val/*.gz
+      - ${oc.env:HOME}/perplexity/v2_small/documents/c4_100_domains/test/*.gz
+      - ${oc.env:HOME}/perplexity/v2_small/documents/c4_en/val/*.gz
+      - ${oc.env:HOME}/perplexity/v2_small/documents/c4_en/test/*.gz
+      - ${oc.env:HOME}/perplexity/v2_small/documents/gab/val/*.gz
+      - ${oc.env:HOME}/perplexity/v2_small/documents/gab/test/*.gz
+      - ${oc.env:HOME}/perplexity/v2_small/documents/ice/val/*.gz
+      - ${oc.env:HOME}/perplexity/v2_small/documents/ice/test/*.gz
+      - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_s2orc/val/*.gz
+      - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_s2orc/test/*.gz
+      - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_wiki/val/*.gz
+      - ${oc.env:HOME}/perplexity/v2_small/documents/m2d2_wiki/test/*.gz
+      - ${oc.env:HOME}/perplexity/v2_small/documents/manosphere/val/*.gz
+      - ${oc.env:HOME}/perplexity/v2_small/documents/manosphere/test/*.gz
+      - ${oc.env:HOME}/perplexity/v2_small/documents/mc4_en/val/*.gz
+      - ${oc.env:HOME}/perplexity/v2_small/documents/mc4_en/test/*.gz
+      - ${oc.env:HOME}/perplexity/v2_small/documents/pile/val/*.gz
+      - ${oc.env:HOME}/perplexity/v2_small/documents/pile/test/*.gz
+      - ${oc.env:HOME}/perplexity/v2_small/documents/ptb/val/*.gz
+      - ${oc.env:HOME}/perplexity/v2_small/documents/ptb/test/*.gz
+      - ${oc.env:HOME}/perplexity/v2_small/documents/twitterAEE/val/*.gz
+      - ${oc.env:HOME}/perplexity/v2_small/documents/twitterAEE/test/*.gz
+      - ${oc.env:HOME}/perplexity/v2_small/documents/wikitext_103/val/*.gz
+      - ${oc.env:HOME}/perplexity/v2_small/documents/wikitext_103/test/*.gz
+
+    output: &output
+      path: ${oc.env:HOME}/perplexity/option2/documents
+      max_size_in_bytes: 500000000
+      discard_fields:
+        - attributes
+
+    attributes: &attributes
+      - uniseg_length_paragraphs_with_empty_v1
+      - not_alphanum_paragraph_v1
+
+    span_replacement: &span_replacement
+      - span: $.attributes.uniseg_length_paragraphs_with_empty_v1__uniseg_length_paragraphs_with_empty_v1__negative_paragraph
+        min_score: -12
+        replacement: ""
+      - span: $.attributes.not_alphanum_paragraph_v1__not_alphanum_paragraph_v1__all_punct
+        min_score: 0.5
+        replacement: ""
+
+  - name: "v3"
+    documents:
+      - ${oc.env:HOME}/perplexity/v3/documents/4chan_meta_sep/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/4chan_meta_sep/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/c4_100_domains/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/c4_100_domains/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/c4_en/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/c4_en/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/dolma_100_subreddits/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/dolma_100_subreddits/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/dolma-v1_5/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/dolma-v1_5/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/falcon-refinedweb/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/falcon-refinedweb/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/gab/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/gab/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/ice_fixed/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/ice_fixed/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/m2d2_s2orc_unsplit/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/m2d2_s2orc_unsplit/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/m2d2_wikipedia_unsplit/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/m2d2_wikipedia_unsplit/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/manosphere_meta_sep/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/manosphere_meta_sep/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/mc4/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/mc4/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/pile/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/pile/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/ptb/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/ptb/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/redpajama/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/redpajama/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/twitterAAE_HELM_fixed/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/twitterAAE_HELM_fixed/test/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/wikitext_103/val/*.gz
+      - ${oc.env:HOME}/perplexity/v3/documents/wikitext_103/test/*.gz
+
+    output: *output
+    attributes: *attributes
+    span_replacement: *span_replacement