From 7782867f5239fb61fe41fda95a3dbb51d8dbcd24 Mon Sep 17 00:00:00 2001
From: Luca Soldaini <luca@soldaini.net>
Date: Thu, 1 Jun 2023 15:40:40 -0700
Subject: [PATCH 01/14] add --local-read-cache option, cc-v1 dedup ablation configs, and tokenize error logging

---
 .../src/ai2_llm_filters/core_tools/runtime.py | 20 ++++++++++++--
 .../dedupers/cc-v1-small-head-dedup.json      | 24 +++++++++++++++++
 .../cc-v1-small-head-middle-dedup.json        | 24 +++++++++++++++++
 .../mixers/cc-v1-small-head-dedup.json        | 26 +++++++++++++++++++
 .../mixers/cc-v1-small-head-middle-dedup.json | 26 +++++++++++++++++++
 scripts/prepare_memmap_dataset.py             | 12 ++++++---
 6 files changed, 127 insertions(+), 5 deletions(-)
 create mode 100644 pretrain_data/mixer/config/ablations/dedupers/cc-v1-small-head-dedup.json
 create mode 100644 pretrain_data/mixer/config/ablations/dedupers/cc-v1-small-head-middle-dedup.json
 create mode 100644 pretrain_data/mixer/config/ablations/mixers/cc-v1-small-head-dedup.json
 create mode 100644 pretrain_data/mixer/config/ablations/mixers/cc-v1-small-head-middle-dedup.json

diff --git a/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py b/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py
index 7e4596ffb..c5f2843f9 100644
--- a/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py
+++ b/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py
@@ -3,13 +3,15 @@
 import multiprocessing
 import tempfile
 from contextlib import ExitStack
+from os import read
 from queue import Queue
-from typing import Dict
+from typing import Dict, Optional
 
 import msgspec
 from smashed.utils.io_utils import (
     compress_stream,
     decompress_stream,
+    open_file_for_read,
     open_file_for_write,
     stream_file_for_read,
 )
@@ -70,6 +72,9 @@ def process_single(
         # skip on failure
         skip_on_failure = kwargs.get("skip_on_failure", False)
 
+        # local read cache
+        local_read_cache = kwargs.get("local_read_cache", None)
+
         # interval at which to update the progress bar; will double if it gets
         # too full
         update_interval = 1
@@ -87,7 +92,11 @@ def process_single(
                 # open each file for reading and writing. We use open_file_for_read to handle s3 paths and
                 # download the file locally if needed, while gzip.open is used to
                 # read and write gzipped files.
-                in_file = stack.enter_context(stream_file_for_read(source_path, "rb"))
+                in_file = stack.enter_context(
+                    stream_file_for_read(source_path, "rb")
+                    if local_read_cache is None
+                    else open_file_for_read(local_read_cache, "rb", temp_dir=local_read_cache)
+                )
                 in_stream = stack.enter_context(decompress_stream(in_file, "rt"))
                 out_file = stack.enter_context(open_file_for_write(destination_path, "wb"))
                 out_stream = stack.enter_context(compress_stream(out_file, "wt"))
@@ -173,6 +182,12 @@ def main(cls):
             type=str,
             help="If provided, keeps track of which files have already been processed and skips them. ",
         )
+        ap.add_argument(
+            "--local-read-cache",
+            default=None,
+            type=str,
+            help="If provided, will cache the files locally before processing them.",
+        )
         ap.add_argument(
             "--manually-included-paths",
             default=None,
@@ -231,4 +246,5 @@ def main(cls):
                 taggers_names=opts.taggers,
                 experiment_name=opts.experiment_name,
                 skip_on_failure=opts.skip_on_failure,
+                local_read_cache=opts.local_read_cache,
             )
diff --git a/pretrain_data/mixer/config/ablations/dedupers/cc-v1-small-head-dedup.json b/pretrain_data/mixer/config/ablations/dedupers/cc-v1-small-head-dedup.json
new file mode 100644
index 000000000..d61f3c0f9
--- /dev/null
+++ b/pretrain_data/mixer/config/ablations/dedupers/cc-v1-small-head-dedup.json
@@ -0,0 +1,24 @@
+{
+  "documents": [
+    "pretraining-data/sources/common-crawl/v1-small-head/documents/v1_small-head-0101.json.gz"
+  ],
+  "work_dir": {
+    "input": "/data2/v1-small-head/deduper/input",
+    "output": "/data2/v1-small-head/deduper/output"
+  },
+  "dedupe": {
+    "name": "decontamination",
+    "paragraphs": {
+      "attribute_name": "bff_duplicate_paragraph_spans"
+    },
+    "skip_empty": true
+  },
+  "bloom_filter": {
+    "file": "/tmp/decontamination/deduper_decontamination_lucas_20230525.bin",
+    "size_in_bytes": 8388608,
+    "read_only": true,
+    "estimated_doc_count": 3898706,
+    "desired_false_positive_rate": 0.001
+  },
+  "processes": 120
+}
diff --git a/pretrain_data/mixer/config/ablations/dedupers/cc-v1-small-head-middle-dedup.json b/pretrain_data/mixer/config/ablations/dedupers/cc-v1-small-head-middle-dedup.json
new file mode 100644
index 000000000..0c24c59ae
--- /dev/null
+++ b/pretrain_data/mixer/config/ablations/dedupers/cc-v1-small-head-middle-dedup.json
@@ -0,0 +1,24 @@
+{
+  "documents": [
+    "pretraining-data/sources/common-crawl/v1-small-head-middle/documents/v1_small-head-middle-0126.json.gz"
+  ],
+  "work_dir": {
+    "input": "/tmp/v1-small-head-middle/deduper/input",
+    "output": "/tmp/v1-small-head-middle/deduper/output"
+  },
+  "dedupe": {
+    "name": "decontamination",
+    "paragraphs": {
+      "attribute_name": "bff_duplicate_paragraph_spans"
+    },
+    "skip_empty": true
+  },
+  "bloom_filter": {
+    "file": "/tmp/decontamination/deduper_decontamination_lucas_20230525.bin",
+    "size_in_bytes": 8388608,
+    "read_only": true,
+    "estimated_doc_count": 3898706,
+    "desired_false_positive_rate": 0.001
+  },
+  "processes": 120
+}
diff --git a/pretrain_data/mixer/config/ablations/mixers/cc-v1-small-head-dedup.json b/pretrain_data/mixer/config/ablations/mixers/cc-v1-small-head-dedup.json
new file mode 100644
index 000000000..ef8750538
--- /dev/null
+++ b/pretrain_data/mixer/config/ablations/mixers/cc-v1-small-head-dedup.json
@@ -0,0 +1,26 @@
+{
+  "streams": [
+    {
+      "name": "cc-v1-small-head-dedup",
+      "documents": [
+        "pretraining-data/sources/common-crawl/v1-small-head/documents/*"
+      ],
+      "output": {
+        "path": "pretraining-data/sources/common-crawl/abl-cc-v1-small-head-dedup/documents",
+        "max_size_in_bytes": 3894967296
+      },
+      "attributes": ["decontamination"],
+      "filter": {
+        "include": [],
+        "exclude": [
+          "$@.attributes[?(@.bff_duplicate_paragraph_spans && @.bff_duplicate_paragraph_spans[0] && @.bff_duplicate_paragraph_spans[0][2] >= 1.0)]"
+        ]
+      }
+    }
+  ],
+  "work_dir": {
+    "input": "/data2/cc-v1-small-head/mixer/input",
+    "output": "/data2/cc-v1-small-head/mixer/output"
+  },
+  "processes": 120
+}
diff --git a/pretrain_data/mixer/config/ablations/mixers/cc-v1-small-head-middle-dedup.json b/pretrain_data/mixer/config/ablations/mixers/cc-v1-small-head-middle-dedup.json
new file mode 100644
index 000000000..743cbfa13
--- /dev/null
+++ b/pretrain_data/mixer/config/ablations/mixers/cc-v1-small-head-middle-dedup.json
@@ -0,0 +1,26 @@
+{
+  "streams": [
+    {
+      "name": "cc-v1-small-head-middle-dedup",
+      "documents": [
+        "pretraining-data/sources/common-crawl/v1-small-head-middle/documents/*"
+      ],
+      "output": {
+        "path": "pretraining-data/sources/common-crawl/abl-cc-v1-small-head-middle-dedup/documents",
+        "max_size_in_bytes": 3894967296
+      },
+      "attributes": ["decontamination"],
+      "filter": {
+        "include": [],
+        "exclude": [
+          "$@.attributes[?(@.bff_duplicate_paragraph_spans && @.bff_duplicate_paragraph_spans[0] && @.bff_duplicate_paragraph_spans[0][2] >= 1.0)]"
+        ]
+      }
+    }
+  ],
+  "work_dir": {
+    "input": "/tmp/cc-v1-small-head-middle/mixer/input",
+    "output": "/tmp/cc-v1-small-head-middle/mixer/output"
+  },
+  "processes": 120
+}
diff --git a/scripts/prepare_memmap_dataset.py b/scripts/prepare_memmap_dataset.py
index cc2814d9c..5152983e6 100644
--- a/scripts/prepare_memmap_dataset.py
+++ b/scripts/prepare_memmap_dataset.py
@@ -77,9 +77,15 @@ def tokenize_file(tokenizer: Tokenizer, path: str) -> Generator[List[int], None,
         input_file = stack.enter_context(stream_file_for_read(path, mode="rb"))
         input_stream = stack.enter_context(decompress_stream(input_file, mode="rt"))
 
-        for line in input_stream:
-            row = decoder.decode(line)
-            yield tokenizer.encode(row.text, add_special_tokens=True)
+        i = 1
+        try:
+            for line in input_stream:
+                row = decoder.decode(line)
+                yield tokenizer.encode(row.text, add_special_tokens=True)
+                i += 1
+        except Exception as e:
+            log.error(f"Error processing {path}:{i:,} -> {e}")
+            pass
 
 
 class MemmapFile:

From 459d133e5d48a19592713b86aa52104223d85bb9 Mon Sep 17 00:00:00 2001
From: Luca Soldaini <luca@soldaini.net>
Date: Thu, 1 Jun 2023 15:45:57 -0700
Subject: [PATCH 02/14] imports

---
 .../filters/src/ai2_llm_filters/core_tools/runtime.py          | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py b/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py
index c5f2843f9..b0a3a524d 100644
--- a/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py
+++ b/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py
@@ -3,9 +3,8 @@
 import multiprocessing
 import tempfile
 from contextlib import ExitStack
-from os import read
 from queue import Queue
-from typing import Dict, Optional
+from typing import Dict
 
 import msgspec
 from smashed.utils.io_utils import (

From 28cf491580ef64abf927ca78f691e71bdaafac2d Mon Sep 17 00:00:00 2001
From: Luca Soldaini <luca@soldaini.net>
Date: Thu, 1 Jun 2023 22:28:19 -0700
Subject: [PATCH 03/14] fixed source path

---
 .../filters/src/ai2_llm_filters/core_tools/runtime.py          | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py b/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py
index b0a3a524d..d08e7926d 100644
--- a/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py
+++ b/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py
@@ -91,10 +91,11 @@ def process_single(
                 # open each file for reading and writing. We use open_file_for_read to handle s3 paths and
                 # download the file locally if needed, while gzip.open is used to
                 # read and write gzipped files.
+
                 in_file = stack.enter_context(
                     stream_file_for_read(source_path, "rb")
                     if local_read_cache is None
-                    else open_file_for_read(local_read_cache, "rb", temp_dir=local_read_cache)
+                    else open_file_for_read(source_path, "rb", temp_dir=local_read_cache)
                 )
                 in_stream = stack.enter_context(decompress_stream(in_file, "rt"))
                 out_file = stack.enter_context(open_file_for_write(destination_path, "wb"))

From 189d99fc8346aee70a7796cc350e517db3731417 Mon Sep 17 00:00:00 2001
From: Luca Soldaini <luca@soldaini.net>
Date: Thu, 1 Jun 2023 22:30:07 -0700
Subject: [PATCH 04/14] s2 and falcon dedup configs

---
 .../ablations/dedupers/falcon-dedup.json      | 24 +++++++
 .../config/ablations/dedupers/s2-dedup.json   | 63 +++++++++++++++++++
 .../config/ablations/mixers/s2-dedup.json     | 45 +++++++++++++
 3 files changed, 132 insertions(+)
 create mode 100644 pretrain_data/mixer/config/ablations/dedupers/falcon-dedup.json
 create mode 100644 pretrain_data/mixer/config/ablations/dedupers/s2-dedup.json
 create mode 100644 pretrain_data/mixer/config/ablations/mixers/s2-dedup.json

diff --git a/pretrain_data/mixer/config/ablations/dedupers/falcon-dedup.json b/pretrain_data/mixer/config/ablations/dedupers/falcon-dedup.json
new file mode 100644
index 000000000..bb4a4052d
--- /dev/null
+++ b/pretrain_data/mixer/config/ablations/dedupers/falcon-dedup.json
@@ -0,0 +1,24 @@
+{
+  "documents": [
+    "pretraining-data/sources/falcon-refinedweb/v0/documents/*.json.gz"
+  ],
+  "work_dir": {
+    "input": "/data2/falcon/deduper/input",
+    "output": "/data2/falcon/deduper/output"
+  },
+  "dedupe": {
+    "name": "decontamination",
+    "paragraphs": {
+      "attribute_name": "bff_duplicate_paragraph_spans"
+    },
+    "skip_empty": true
+  },
+  "bloom_filter": {
+    "file": "/tmp/decontamination/deduper_decontamination_lucas_20230525.bin",
+    "size_in_bytes": 8388608,
+    "read_only": true,
+    "estimated_doc_count": 3898706,
+    "desired_false_positive_rate": 0.001
+  },
+  "processes": 120
+}
diff --git a/pretrain_data/mixer/config/ablations/dedupers/s2-dedup.json b/pretrain_data/mixer/config/ablations/dedupers/s2-dedup.json
new file mode 100644
index 000000000..fcd572ac4
--- /dev/null
+++ b/pretrain_data/mixer/config/ablations/dedupers/s2-dedup.json
@@ -0,0 +1,63 @@
+{
+  "documents": [
+    "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=0/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=1/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=2/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=3/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=4/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=5/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=6/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=7/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=8/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=9/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=0/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=1/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=2/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=3/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=4/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=5/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=6/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=7/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=8/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=9/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=valid/part_id=0/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=valid/part_id=1/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=valid/part_id=2/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=valid/part_id=3/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=valid/part_id=4/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=valid/part_id=5/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=valid/part_id=6/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=valid/part_id=7/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=valid/part_id=8/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=valid/part_id=9/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=valid/part_id=0/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=valid/part_id=1/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=valid/part_id=2/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=valid/part_id=3/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=valid/part_id=4/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=valid/part_id=5/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=valid/part_id=6/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=valid/part_id=7/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=valid/part_id=8/*.gz",
+    "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=valid/part_id=9/*.gz"
+  ],
+  "work_dir": {
+    "input": "/tmp/s2/deduper/input",
+    "output": "/tmp/s2/deduper/output"
+  },
+  "dedupe": {
+    "name": "decontamination",
+    "paragraphs": {
+      "attribute_name": "bff_duplicate_paragraph_spans"
+    },
+    "skip_empty": true
+  },
+  "bloom_filter": {
+    "file": "/tmp/decontamination/deduper_decontamination_lucas_20230525.bin",
+    "size_in_bytes": 8388608,
+    "read_only": true,
+    "estimated_doc_count": 3898706,
+    "desired_false_positive_rate": 0.001
+  },
+  "processes": 120
+}
diff --git a/pretrain_data/mixer/config/ablations/mixers/s2-dedup.json b/pretrain_data/mixer/config/ablations/mixers/s2-dedup.json
new file mode 100644
index 000000000..e33e14aaf
--- /dev/null
+++ b/pretrain_data/mixer/config/ablations/mixers/s2-dedup.json
@@ -0,0 +1,45 @@
+{
+  "streams": [
+    {
+      "name": "abl-s2-v3",
+      "documents": [
+        "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=0/*.gz",
+        "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=1/*.gz",
+        "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=2/*.gz",
+        "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=3/*.gz",
+        "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=4/*.gz",
+        "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=5/*.gz",
+        "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=6/*.gz",
+        "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=7/*.gz",
+        "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=8/*.gz",
+        "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=9/*.gz",
+        "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=0/*.gz",
+        "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=1/*.gz",
+        "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=2/*.gz",
+        "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=3/*.gz",
+        "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=4/*.gz",
+        "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=5/*.gz",
+        "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=6/*.gz",
+        "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=7/*.gz",
+        "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=8/*.gz",
+        "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=9/*.gz"
+      ],
+      "output": {
+        "path": "pretraining-data/sources/s2/abl-s2-v3/documents",
+        "max_size_in_bytes": 3894967296
+      },
+      "attributes": ["decontamination"],
+      "filter": {
+        "include": [],
+        "exclude": [
+          "$@.attributes[?(@.bff_duplicate_paragraph_spans && @.bff_duplicate_paragraph_spans[0] && @.bff_duplicate_paragraph_spans[0][2] >= 1.0)]"
+        ]
+      }
+    }
+  ],
+  "work_dir": {
+    "input": "/tmp/abl-s2-v3/mixer/input",
+    "output": "/tmp/abl-s2-v3/mixer/output"
+  },
+  "processes": 120
+}

From 79da8196afbe94b7e2bddf72e68ebb40e887d7b2 Mon Sep 17 00:00:00 2001
From: Luca Soldaini <luca@soldaini.net>
Date: Thu, 1 Jun 2023 22:56:40 -0700
Subject: [PATCH 05/14] more configs

---
 .../config/ablations/mixers/falcon-dedup.json | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 pretrain_data/mixer/config/ablations/mixers/falcon-dedup.json

diff --git a/pretrain_data/mixer/config/ablations/mixers/falcon-dedup.json b/pretrain_data/mixer/config/ablations/mixers/falcon-dedup.json
new file mode 100644
index 000000000..257098314
--- /dev/null
+++ b/pretrain_data/mixer/config/ablations/mixers/falcon-dedup.json
@@ -0,0 +1,26 @@
+{
+  "streams": [
+    {
+      "name": "falcon-v0",
+      "documents": [
+        "pretraining-data/sources/falcon-refinedweb/v0/documents/*.json.gz"
+      ],
+      "output": {
+        "path": "pretraining-data/sources/falcon-refinedweb/abl-v0/documents",
+        "max_size_in_bytes": 3894967296
+      },
+      "attributes": ["decontamination"],
+      "filter": {
+        "include": [],
+        "exclude": [
+          "$@.attributes[?(@.bff_duplicate_paragraph_spans && @.bff_duplicate_paragraph_spans[0] && @.bff_duplicate_paragraph_spans[0][2] >= 1.0)]"
+        ]
+      }
+    }
+  ],
+  "work_dir": {
+    "input": "/data2/falcon-abl-v0/mixer/input",
+    "output": "/data2/falcon-abl-v0/mixer/output"
+  },
+  "processes": 120
+}

From 4aae45b01f3a8903553e1dcc159ae1ce0ebe61b1 Mon Sep 17 00:00:00 2001
From: Luca Soldaini <luca@soldaini.net>
Date: Fri, 2 Jun 2023 13:22:26 -0700
Subject: [PATCH 06/14] safe mode

---
 .../ablations/dedupers/s2-v2-dedup.json       | 63 +++++++++++++++++++
 .../{s2-dedup.json => s2-v3-dedup.json}       |  0
 .../config/ablations/mixers/s2-v2-dedup.json  | 45 +++++++++++++
 .../{s2-dedup.json => s2-v3-dedup.json}       |  0
 requirements.txt                              |  1 +
 scripts/prepare_memmap_dataset.py             | 26 ++++++--
 6 files changed, 130 insertions(+), 5 deletions(-)
 create mode 100644 pretrain_data/mixer/config/ablations/dedupers/s2-v2-dedup.json
 rename pretrain_data/mixer/config/ablations/dedupers/{s2-dedup.json => s2-v3-dedup.json} (100%)
 create mode 100644 pretrain_data/mixer/config/ablations/mixers/s2-v2-dedup.json
 rename pretrain_data/mixer/config/ablations/mixers/{s2-dedup.json => s2-v3-dedup.json} (100%)

diff --git a/pretrain_data/mixer/config/ablations/dedupers/s2-v2-dedup.json b/pretrain_data/mixer/config/ablations/dedupers/s2-v2-dedup.json
new file mode 100644
index 000000000..61dbd634f
--- /dev/null
+++ b/pretrain_data/mixer/config/ablations/dedupers/s2-v2-dedup.json
@@ -0,0 +1,63 @@
+{
+  "documents": [
+    "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=0/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=1/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=2/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=3/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=4/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=5/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=6/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=7/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=8/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=9/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=0/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=1/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=2/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=3/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=4/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=5/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=6/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=7/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=8/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=9/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=valid/part_id=0/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=valid/part_id=1/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=valid/part_id=2/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=valid/part_id=3/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=valid/part_id=4/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=valid/part_id=5/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=valid/part_id=6/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=valid/part_id=7/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=valid/part_id=8/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=valid/part_id=9/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=valid/part_id=0/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=valid/part_id=1/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=valid/part_id=2/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=valid/part_id=3/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=valid/part_id=4/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=valid/part_id=5/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=valid/part_id=6/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=valid/part_id=7/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=valid/part_id=8/*.gz",
+    "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=valid/part_id=9/*.gz"
+  ],
+  "work_dir": {
+    "input": "/tmp/abl-s2-v2/deduper/input",
+    "output": "/tmp/abl-s2-v2/deduper/output"
+  },
+  "dedupe": {
+    "name": "decontamination",
+    "paragraphs": {
+      "attribute_name": "bff_duplicate_paragraph_spans"
+    },
+    "skip_empty": true
+  },
+  "bloom_filter": {
+    "file": "/tmp/decontamination/deduper_decontamination_lucas_20230525.bin",
+    "size_in_bytes": 8388608,
+    "read_only": true,
+    "estimated_doc_count": 3898706,
+    "desired_false_positive_rate": 0.001
+  },
+  "processes": 120
+}
diff --git a/pretrain_data/mixer/config/ablations/dedupers/s2-dedup.json b/pretrain_data/mixer/config/ablations/dedupers/s2-v3-dedup.json
similarity index 100%
rename from pretrain_data/mixer/config/ablations/dedupers/s2-dedup.json
rename to pretrain_data/mixer/config/ablations/dedupers/s2-v3-dedup.json
diff --git a/pretrain_data/mixer/config/ablations/mixers/s2-v2-dedup.json b/pretrain_data/mixer/config/ablations/mixers/s2-v2-dedup.json
new file mode 100644
index 000000000..b1d2dc15e
--- /dev/null
+++ b/pretrain_data/mixer/config/ablations/mixers/s2-v2-dedup.json
@@ -0,0 +1,45 @@
+{
+  "streams": [
+    {
+      "name": "abl-s2-v2",
+      "documents": [
+        "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=0/*.gz",
+        "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=1/*.gz",
+        "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=2/*.gz",
+        "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=3/*.gz",
+        "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=4/*.gz",
+        "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=5/*.gz",
+        "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=6/*.gz",
+        "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=7/*.gz",
+        "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=8/*.gz",
+        "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=9/*.gz",
+        "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=0/*.gz",
+        "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=1/*.gz",
+        "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=2/*.gz",
+        "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=3/*.gz",
+        "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=4/*.gz",
+        "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=5/*.gz",
+        "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=6/*.gz",
+        "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=7/*.gz",
+        "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=8/*.gz",
+        "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=9/*.gz"
+      ],
+      "output": {
+        "path": "pretraining-data/sources/s2/abl-s2-v2/documents",
+        "max_size_in_bytes": 3894967296
+      },
+      "attributes": ["decontamination"],
+      "filter": {
+        "include": [],
+        "exclude": [
+          "$@.attributes[?(@.bff_duplicate_paragraph_spans && @.bff_duplicate_paragraph_spans[0] && @.bff_duplicate_paragraph_spans[0][2] >= 1.0)]"
+        ]
+      }
+    }
+  ],
+  "work_dir": {
+    "input": "/tmp/abl-s2-v2/mixer/input",
+    "output": "/tmp/abl-s2-v2/mixer/output"
+  },
+  "processes": 120
+}
diff --git a/pretrain_data/mixer/config/ablations/mixers/s2-dedup.json b/pretrain_data/mixer/config/ablations/mixers/s2-v3-dedup.json
similarity index 100%
rename from pretrain_data/mixer/config/ablations/mixers/s2-dedup.json
rename to pretrain_data/mixer/config/ablations/mixers/s2-v3-dedup.json
diff --git a/requirements.txt b/requirements.txt
index e3d225e7e..a93b3ff9d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,3 +18,4 @@ datasets
 scikit-learn
 smashed[remote]>=0.21.1
 msgspec>=0.14.0
+cached_path
diff --git a/scripts/prepare_memmap_dataset.py b/scripts/prepare_memmap_dataset.py
index 5152983e6..6f250cc88 100644
--- a/scripts/prepare_memmap_dataset.py
+++ b/scripts/prepare_memmap_dataset.py
@@ -13,6 +13,7 @@
 
 import concurrent.futures
 import functools
+import gzip
 import json
 import logging
 import multiprocessing as mp
@@ -40,6 +41,7 @@
     recursively_list_files,
     stream_file_for_read,
 )
+from cached_path import cached_path
 
 from olmo import Tokenizer
 from olmo.util import prepare_cli_environment
@@ -66,7 +68,7 @@ class InputDocumentSpec(msgspec.Struct):
     text: str
 
 
-def tokenize_file(tokenizer: Tokenizer, path: str) -> Generator[List[int], None, None]:
+def tokenize_file(tokenizer: Tokenizer, path: str, safe_mode: bool = False) -> Generator[List[int], None, None]:
     """Tokenize a file of documents using the provided tokenizer; file is expected to be a gzipped JSON lines
     file, each containing a field named `text`.
     """
@@ -74,8 +76,12 @@ def tokenize_file(tokenizer: Tokenizer, path: str) -> Generator[List[int], None,
     decoder = msgspec.json.Decoder(InputDocumentSpec)
 
     with ExitStack() as stack:
-        input_file = stack.enter_context(stream_file_for_read(path, mode="rb"))
-        input_stream = stack.enter_context(decompress_stream(input_file, mode="rt"))
+        if safe_mode:
+            local_path = cached_path(path)
+            input_stream = stack.enter_context(gzip.open(local_path, mode="rt"))
+        else:
+            input_file = stack.enter_context(stream_file_for_read(path, mode="rb"))
+            input_stream = stack.enter_context(decompress_stream(input_file, mode="rt"))
 
         i = 1
         try:
@@ -209,6 +215,7 @@ def fill_memmap(
     path: str,
     memmap_path: str,
     dtype: np.dtype,
+    safe_mode: bool = False,
     max_tokens: int = 512 * 1024 * 1024,  # 512M tokens * 2 bytes per token (uint16) = 1GB
 ):
     """Write a memmap file from a file of documents."""
@@ -222,7 +229,8 @@ def fill_memmap(
     file_index = 0
 
     with ExitStack() as stack:
-        for line_no, token_ids in enumerate(tokenize_file(tokenizer=tokenizer, path=path), start=1):
+        it = tokenize_file(tokenizer=tokenizer, path=path, safe_mode=safe_mode)
+        for line_no, token_ids in enumerate(it, start=1):
             # flush any 10k lines or so; improves stability
             flush = line_no % 10_000 == 0
 
@@ -287,6 +295,7 @@ def make_source_and_target(src: Tuple[str, ...], output: str) -> Tuple[Tuple[str
     help="Maximum number of tokens to store in a single memmap file (default: 512M tokens or 1GB)",
 )
 @click.option("--debug/--no-debug", default=False, help="Enable debug (single process mode)")
+@click.option("--safe-mode/--fast-mode", default=False, help="Safe mode caches locally and decompresses using gzip.open")
 @click.option("-j", "--workers", "max_workers", type=int, default=None, help="Defaults to number of CPUs")
 def main(
     src: Tuple[str, ...],
@@ -295,6 +304,7 @@ def main(
     dtype_str: str,
     validate: bool,
     max_tokens: int,
+    safe_mode: bool,
     debug: bool,
     max_workers: Optional[int] = None,
 ):
@@ -303,7 +313,13 @@ def main(
 
     # creating a partial here with all the arguments we need to pass to fill_memmap except for the paths
     # so that we don't make mistakes between debug and non-debug mode
-    fill_memmap_fn = functools.partial(fill_memmap, tokenizer_id=tokenizer_id, dtype=dtype, max_tokens=max_tokens)
+    fill_memmap_fn = functools.partial(
+        fill_memmap,
+        tokenizer_id=tokenizer_id,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        safe_mode=safe_mode
+    )
 
     if debug:
         log.info("Running in debug mode. Only one process will be used.")

From c8c64620b2738f619be6376e2e042a20d9448f22 Mon Sep 17 00:00:00 2001
From: Luca Soldaini <luca@soldaini.net>
Date: Fri, 2 Jun 2023 13:27:00 -0700
Subject: [PATCH 07/14] fix

---
 scripts/prepare_memmap_dataset.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/scripts/prepare_memmap_dataset.py b/scripts/prepare_memmap_dataset.py
index 6f250cc88..944350215 100644
--- a/scripts/prepare_memmap_dataset.py
+++ b/scripts/prepare_memmap_dataset.py
@@ -74,11 +74,12 @@ def tokenize_file(tokenizer: Tokenizer, path: str, safe_mode: bool = False) -> G
     """
 
     decoder = msgspec.json.Decoder(InputDocumentSpec)
+    caching_path = path
 
     with ExitStack() as stack:
         if safe_mode:
-            local_path = cached_path(path)
-            input_stream = stack.enter_context(gzip.open(local_path, mode="rt"))
+            caching_path = cached_path(path)
+            input_stream = stack.enter_context(gzip.open(caching_path, mode="rt"))
         else:
             input_file = stack.enter_context(stream_file_for_read(path, mode="rb"))
             input_stream = stack.enter_context(decompress_stream(input_file, mode="rt"))
@@ -93,6 +94,9 @@ def tokenize_file(tokenizer: Tokenizer, path: str, safe_mode: bool = False) -> G
             log.error(f"Error processing {path}:{i:,} -> {e}")
             pass
 
+    if os.path.abspath(caching_path) != os.path.abspath(path) and os.path.exists(caching_path):
+        os.remove(caching_path)  # cached_path() gives a Path (the input itself if local); only drop a real copy
+
 
 class MemmapFile:
     """Context manager responsible for writing, resizing, and closing / uploading a memmap file."""

From 4c2f5fe43616f4397f46fb365f12a8c6178bc489 Mon Sep 17 00:00:00 2001
From: Luca Soldaini <luca@soldaini.net>
Date: Fri, 2 Jun 2023 13:27:37 -0700
Subject: [PATCH 08/14] deleting caching path

---
 scripts/prepare_memmap_dataset.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/scripts/prepare_memmap_dataset.py b/scripts/prepare_memmap_dataset.py
index 944350215..5c767f1bd 100644
--- a/scripts/prepare_memmap_dataset.py
+++ b/scripts/prepare_memmap_dataset.py
@@ -27,6 +27,7 @@
 import click
 import msgspec
 import numpy as np
+from cached_path import cached_path
 from rich.progress import (
     BarColumn,
     MofNCompleteColumn,
@@ -41,7 +42,6 @@
     recursively_list_files,
     stream_file_for_read,
 )
-from cached_path import cached_path
 
 from olmo import Tokenizer
 from olmo.util import prepare_cli_environment
@@ -299,7 +299,9 @@ def make_source_and_target(src: Tuple[str, ...], output: str) -> Tuple[Tuple[str
     help="Maximum number of tokens to store in a single memmap file (default: 512M tokens or 1GB)",
 )
 @click.option("--debug/--no-debug", default=False, help="Enable debug (single process mode)")
-@click.option("--safe-mode/--fast-mode", default=False, help="Safe mode caches locally and decompresses using gzip.open")
+@click.option(
+    "--safe-mode/--fast-mode", default=False, help="Safe mode caches locally and decompresses using gzip.open"
+)
 @click.option("-j", "--workers", "max_workers", type=int, default=None, help="Defaults to number of CPUs")
 def main(
     src: Tuple[str, ...],
@@ -318,11 +320,7 @@ def main(
     # creating a partial here with all the arguments we need to pass to fill_memmap except for the paths
     # so that we don't make mistakes between debug and non-debug mode
     fill_memmap_fn = functools.partial(
-        fill_memmap,
-        tokenizer_id=tokenizer_id,
-        dtype=dtype,
-        max_tokens=max_tokens,
-        safe_mode=safe_mode
+        fill_memmap, tokenizer_id=tokenizer_id, dtype=dtype, max_tokens=max_tokens, safe_mode=safe_mode
     )
 
     if debug:

From d57213d0b019a7e1143dea763236fa17f19f3397 Mon Sep 17 00:00:00 2001
From: Luca Soldaini <luca@soldaini.net>
Date: Fri, 2 Jun 2023 17:22:40 -0700
Subject: [PATCH 09/14] --safe mode for filters

---
 .../src/ai2_llm_filters/core_tools/runtime.py | 33 ++++++++++++++-----
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py b/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py
index d08e7926d..205f2abc8 100644
--- a/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py
+++ b/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py
@@ -1,16 +1,18 @@
 import argparse
+import gzip
 import logging
 import multiprocessing
+import os
 import tempfile
 from contextlib import ExitStack
 from queue import Queue
 from typing import Dict
 
 import msgspec
+from cached_path import cached_path
 from smashed.utils.io_utils import (
     compress_stream,
     decompress_stream,
-    open_file_for_read,
     open_file_for_write,
     stream_file_for_read,
 )
@@ -86,18 +88,22 @@ def process_single(
         encoder = msgspec.json.Encoder()
         decoder = msgspec.json.Decoder(InputSpec)
 
+        # this will be used to cache the file locally if needed
+        caching_path = source_path
+
         with ExitStack() as stack:
             try:
                 # open each file for reading and writing. We use open_file_for_read to handle s3 paths and
                 # download the file locally if needed, while gzip.open is used to
                 # read and write gzipped files.
 
-                in_file = stack.enter_context(
-                    stream_file_for_read(source_path, "rb")
-                    if local_read_cache is None
-                    else open_file_for_read(source_path, "rb", temp_dir=local_read_cache)
-                )
-                in_stream = stack.enter_context(decompress_stream(in_file, "rt"))
+                if local_read_cache is not None:
+                    caching_path = cached_path(source_path, cache_dir=local_read_cache)
+                    in_stream = stack.enter_context(gzip.open(caching_path, mode="rt"))
+                else:
+                    input_file = stack.enter_context(stream_file_for_read(source_path, mode="rb"))
+                    in_stream = stack.enter_context(decompress_stream(input_file, mode="rt"))
+
                 out_file = stack.enter_context(open_file_for_write(destination_path, "wb"))
                 out_stream = stack.enter_context(compress_stream(out_file, "wt"))
 
@@ -137,6 +143,9 @@ def process_single(
                 logger.warning("\n" + msg)
                 if not skip_on_failure:
                     raise Ai2LlmFilterError(msg) from e
+            finally:
+                if os.path.abspath(caching_path) != os.path.abspath(source_path) and os.path.exists(caching_path):
+                    os.remove(caching_path)  # Path-vs-str safe: never delete a local source, only the copy
 
         # increment the files progress bar
         cls.increment_progressbar(queue, files=1, documents=docs_cnt)
@@ -197,6 +206,9 @@ def main(cls):
         ap.add_argument(
             "--manually-excluded-paths", default=None, nargs="+", help="If provided, these paths will be skipped."
         )
+        ap.add_argument(
+            "--safe-mode", action="store_true", help="Run in safe mode; will download locally before processing."
+        )
         opts = ap.parse_args()
 
         if opts.list_taggers:
@@ -228,10 +240,15 @@ def main(cls):
                 f"skip on fail: {opts.skip_on_failure}\n"
                 f"reuse prev:   {not ignore_existing}\n"
                 f"workdir:      {metadata_workdir}\n"
+                f"safe mode:    {opts.safe_mode}\n"
+                f"local cache:  {opts.local_read_cache}\n"
                 "---------------------------\n"
             )
             print(msg)
 
+            # use a local read cache if we are in safe mode or if a local read cache is provided
+            local_read_cache = opts.local_read_cache or (tempfile.gettempdir() if opts.safe_mode else None)
+
             parallel_compute = cls(
                 source_prefix=source_prefix,
                 destination_prefix=destination_prefix,
@@ -246,5 +263,5 @@ def main(cls):
                 taggers_names=opts.taggers,
                 experiment_name=opts.experiment_name,
                 skip_on_failure=opts.skip_on_failure,
-                local_read_cache=opts.local_read_cache,
+                local_read_cache=local_read_cache,
             )

From 9452e4f99ad5f039f869f6bdafd533192c7782f7 Mon Sep 17 00:00:00 2001
From: Luca Soldaini <luca@soldaini.net>
Date: Fri, 2 Jun 2023 17:23:44 -0700
Subject: [PATCH 10/14] printing

---
 .../filters/src/ai2_llm_filters/core_tools/runtime.py     | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py b/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py
index 205f2abc8..fc49d99bb 100644
--- a/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py
+++ b/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py
@@ -225,6 +225,9 @@ def main(cls):
         source_prefix = f"{cls.BASE_S3_PREFIX}/{opts.dataset}/documents"
         destination_prefix = f"{cls.BASE_S3_PREFIX}/{opts.dataset}/attributes/{opts.experiment_name}"
 
+        # use a local read cache if we are in safe mode or if a local read cache is provided
+        local_read_cache = opts.local_read_cache or (tempfile.gettempdir() if opts.safe_mode else None)
+
         with tempfile.TemporaryDirectory() as tempdir:
             metadata_workdir = opts.reuse_existing or tempdir
             ignore_existing = opts.reuse_existing is None
@@ -241,14 +244,11 @@ def main(cls):
                 f"reuse prev:   {not ignore_existing}\n"
                 f"workdir:      {metadata_workdir}\n"
                 f"safe mode:    {opts.safe_mode}\n"
-                f"local cache:  {opts.local_read_cache}\n"
+                f"local cache:  {local_read_cache}\n"
                 "---------------------------\n"
             )
             print(msg)
 
-            # use a local read cache if we are in safe mode or if a local read cache is provided
-            local_read_cache = opts.local_read_cache or (tempfile.gettempdir() if opts.safe_mode else None)
-
             parallel_compute = cls(
                 source_prefix=source_prefix,
                 destination_prefix=destination_prefix,

From fe6227bee9209f362870eae0e759f81c099746b2 Mon Sep 17 00:00:00 2001
From: Luca Soldaini <luca@soldaini.net>
Date: Fri, 2 Jun 2023 17:26:23 -0700
Subject: [PATCH 11/14] ignoring empty lines

---
 scripts/prepare_memmap_dataset.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/prepare_memmap_dataset.py b/scripts/prepare_memmap_dataset.py
index 5c767f1bd..2586aad44 100644
--- a/scripts/prepare_memmap_dataset.py
+++ b/scripts/prepare_memmap_dataset.py
@@ -88,7 +88,8 @@ def tokenize_file(tokenizer: Tokenizer, path: str, safe_mode: bool = False) -> G
         try:
             for line in input_stream:
                 row = decoder.decode(line)
-                yield tokenizer.encode(row.text, add_special_tokens=True)
+                if row.text.strip():
+                    yield tokenizer.encode(row.text, add_special_tokens=True)
                 i += 1
         except Exception as e:
             log.error(f"Error processing {path}:{i:,} -> {e}")

From 4d8c599dce4261cea27083172488c0be9ceca485 Mon Sep 17 00:00:00 2001
From: Luca Soldaini <luca@soldaini.net>
Date: Sun, 4 Jun 2023 14:18:35 -0700
Subject: [PATCH 12/14] added flags

---
 scripts/olmo-small-ablation-on-lumi.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/olmo-small-ablation-on-lumi.sh b/scripts/olmo-small-ablation-on-lumi.sh
index 620e074f2..375384bfd 100644
--- a/scripts/olmo-small-ablation-on-lumi.sh
+++ b/scripts/olmo-small-ablation-on-lumi.sh
@@ -40,6 +40,7 @@ export CONFIG_PATH=configs/olmo-small-ablation.yaml
 # get run name, we will postpend it with the job id of this slurm run
 export RUN_NAME=$(cat $CONFIG_PATH | grep -ohP "^run_name\:\w*(.+)$" | sed 's/run_name:\s*//')
 
+# actually run the training script
 srun \
   --cpus-per-task=$SLURM_CPUS_PER_TASK \
   --distribution=block:block \
@@ -55,4 +56,5 @@ srun \
     $PROJECT_DIR/containers/$OLMO_CONTAINER \
     python scripts/train.py $CONFIG_PATH \
       --run_name="${RUN_NAME}_${SLURM_JOB_ID}" \
-      --wandb.project=$WANDB_PROJECT
+      --wandb.project=$WANDB_PROJECT \
+      "$@"  # forward extra CLI overrides verbatim (quoted so args with spaces/globs stay intact)

From e056e9cc83c3e4dcaed57715d5dbd2939c1ee1a4 Mon Sep 17 00:00:00 2001
From: Luca Soldaini <luca@soldaini.net>
Date: Sun, 4 Jun 2023 16:47:49 -0700
Subject: [PATCH 13/14] new decontamination

---
 .../config/ablations/dedupers/rp-dedup.json   | 44 ++++++++++++++++++
 .../config/ablations/mixers/rp-dedup.json     | 46 +++++++++++++++++++
 2 files changed, 90 insertions(+)
 create mode 100644 pretrain_data/mixer/config/ablations/dedupers/rp-dedup.json
 create mode 100644 pretrain_data/mixer/config/ablations/mixers/rp-dedup.json

diff --git a/pretrain_data/mixer/config/ablations/dedupers/rp-dedup.json b/pretrain_data/mixer/config/ablations/dedupers/rp-dedup.json
new file mode 100644
index 000000000..a3769587d
--- /dev/null
+++ b/pretrain_data/mixer/config/ablations/dedupers/rp-dedup.json
@@ -0,0 +1,44 @@
+{
+  "documents": [
+      "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=arxiv/*.gz",
+      "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=book/*.gz",
+      "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=c4/*.gz",
+      "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=common_crawl/*.gz",
+      "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=github/*.gz",
+      "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=stackexchange/*.gz",
+      "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=wikipedia/*.gz",
+      "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=arxiv/*.gz",
+      "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=book/*.gz",
+      "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=c4/*.gz",
+      "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=common_crawl/*.gz",
+      "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=github/*.gz",
+      "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=stackexchange/*.gz",
+      "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=wikipedia/*.gz",
+      "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=arxiv/*.gz",
+      "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=book/*.gz",
+      "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=c4/*.gz",
+      "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=common_crawl/*.gz",
+      "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=github/*.gz",
+      "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=stackexchange/*.gz",
+      "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=wikipedia/*.gz"
+  ],
+  "work_dir": {
+    "input": "/data2/redpajama-v1/deduper/input",
+    "output": "/data2/redpajama-v1/deduper/output"
+  },
+  "dedupe": {
+    "name": "decontamination",
+    "paragraphs": {
+      "attribute_name": "bff_duplicate_paragraph_spans"
+    },
+    "skip_empty": true
+  },
+  "bloom_filter": {
+    "file": "/tmp/decontamination/deduper_decontamination_lucas_20230525.bin",
+    "size_in_bytes": 8388608,
+    "read_only": true,
+    "estimated_doc_count": 3898706,
+    "desired_false_positive_rate": 0.001
+  },
+  "processes": 120
+}
diff --git a/pretrain_data/mixer/config/ablations/mixers/rp-dedup.json b/pretrain_data/mixer/config/ablations/mixers/rp-dedup.json
new file mode 100644
index 000000000..2c87b5910
--- /dev/null
+++ b/pretrain_data/mixer/config/ablations/mixers/rp-dedup.json
@@ -0,0 +1,46 @@
+{
+  "streams": [
+    {
+      "name": "redpajama-v1",
+      "documents": [
+        "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=arxiv/*.gz",
+        "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=book/*.gz",
+        "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=c4/*.gz",
+        "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=common_crawl/*.gz",
+        "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=github/*.gz",
+        "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=stackexchange/*.gz",
+        "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=wikipedia/*.gz",
+        "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=arxiv/*.gz",
+        "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=book/*.gz",
+        "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=c4/*.gz",
+        "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=common_crawl/*.gz",
+        "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=github/*.gz",
+        "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=stackexchange/*.gz",
+        "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=wikipedia/*.gz",
+        "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=arxiv/*.gz",
+        "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=book/*.gz",
+        "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=c4/*.gz",
+        "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=common_crawl/*.gz",
+        "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=github/*.gz",
+        "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=stackexchange/*.gz",
+        "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=wikipedia/*.gz"
+      ],
+      "output": {
+        "path": "pretraining-data/sources/redpajama/abl-v1/documents",
+        "max_size_in_bytes": 3894967296
+      },
+      "attributes": ["decontamination"],
+      "filter": {
+        "include": [],
+        "exclude": [
+          "$@.attributes[?(@.bff_duplicate_paragraph_spans && @.bff_duplicate_paragraph_spans[0] && @.bff_duplicate_paragraph_spans[0][2] >= 1.0)]"
+        ]
+      }
+    }
+  ],
+  "work_dir": {
+    "input": "/data2/redpajama-v1/mixer/input",
+    "output": "/data2/redpajama-v1/mixer/output"
+  },
+  "processes": 120
+}

From 45d74edcdbd8be69252fa886cb87cdaf9a8ceeea Mon Sep 17 00:00:00 2001
From: Luca Soldaini <luca@soldaini.net>
Date: Sun, 4 Jun 2023 20:55:22 -0700
Subject: [PATCH 14/14] tagger path

---
 pretrain_data/filters/src/ai2_llm_filters/taggers/jigsaw.py | 2 +-
 tokenizer/src/olmo_tokenizer/hf/train.py                    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pretrain_data/filters/src/ai2_llm_filters/taggers/jigsaw.py b/pretrain_data/filters/src/ai2_llm_filters/taggers/jigsaw.py
index 40ea1a43a..e7f3951fa 100644
--- a/pretrain_data/filters/src/ai2_llm_filters/taggers/jigsaw.py
+++ b/pretrain_data/filters/src/ai2_llm_filters/taggers/jigsaw.py
@@ -42,4 +42,4 @@ class FastTextJigsawNsfwDocumentTagger(FastTextJigsawHatespeechDocumentTagger):
 
 @TaggerRegistry.add("jigsaw_nsfw_sencence_v2")
 class FastTextJigsawNsfwSentenceTagger(FastTextJigsawHatespeechSentenceTagger):
-    ...
+    MODEL_PATH = "https://ai2-s2-research-public.s3.us-west-2.amazonaws.com/aakankshan/olmo-data-filters/jigsaw_fasttext_bigrams_nsfw_final.bin"  # noqa: E501
diff --git a/tokenizer/src/olmo_tokenizer/hf/train.py b/tokenizer/src/olmo_tokenizer/hf/train.py
index bcf9f8858..83b6314ad 100644
--- a/tokenizer/src/olmo_tokenizer/hf/train.py
+++ b/tokenizer/src/olmo_tokenizer/hf/train.py
@@ -187,7 +187,7 @@ class TrainConfig:
     input_dir: Optional[str] = None
     input_dirs: Optional[List[str]] = None
     save_path: str = sp.MISSING
-    normalization: Union[str, None] = sp.field(default="NFD", help="Choose between NFD, NFKD, NFC, or NFKC")
+    normalization: Union[str, None] = sp.field(default="NFC", help="Choose between NFD, NFKD, NFC, or NFKC")
     vocab_size: int = 64_000
     model: str = sp.field(default="BPE", help="Choose between BPE (default) or Unigram.")