From 7782867f5239fb61fe41fda95a3dbb51d8dbcd24 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Thu, 1 Jun 2023 15:40:40 -0700 Subject: [PATCH 01/14] formatting --- .../src/ai2_llm_filters/core_tools/runtime.py | 20 ++++++++++++-- .../dedupers/cc-v1-small-head-dedup.json | 24 +++++++++++++++++ .../cc-v1-small-head-middle-dedup.json | 24 +++++++++++++++++ .../mixers/cc-v1-small-head-dedup.json | 26 +++++++++++++++++++ .../mixers/cc-v1-small-head-middle-dedup.json | 26 +++++++++++++++++++ scripts/prepare_memmap_dataset.py | 12 ++++++--- 6 files changed, 127 insertions(+), 5 deletions(-) create mode 100644 pretrain_data/mixer/config/ablations/dedupers/cc-v1-small-head-dedup.json create mode 100644 pretrain_data/mixer/config/ablations/dedupers/cc-v1-small-head-middle-dedup.json create mode 100644 pretrain_data/mixer/config/ablations/mixers/cc-v1-small-head-dedup.json create mode 100644 pretrain_data/mixer/config/ablations/mixers/cc-v1-small-head-middle-dedup.json diff --git a/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py b/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py index 7e4596ffb..c5f2843f9 100644 --- a/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py +++ b/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py @@ -3,13 +3,15 @@ import multiprocessing import tempfile from contextlib import ExitStack +from os import read from queue import Queue -from typing import Dict +from typing import Dict, Optional import msgspec from smashed.utils.io_utils import ( compress_stream, decompress_stream, + open_file_for_read, open_file_for_write, stream_file_for_read, ) @@ -70,6 +72,9 @@ def process_single( # skip on failure skip_on_failure = kwargs.get("skip_on_failure", False) + # local read cache + local_read_cache = kwargs.get("local_read_cache", None) + # interval at which to update the progress bar; will double if it gets # too full update_interval = 1 @@ -87,7 +92,11 @@ def process_single( # open each file for reading and writing. We use open_file_for_read to handle s3 paths and # download the file locally if needed, while gzip.open is used to # read and write gzipped files. - in_file = stack.enter_context(stream_file_for_read(source_path, "rb")) + in_file = stack.enter_context( + stream_file_for_read(source_path, "rb") + if local_read_cache is None + else open_file_for_read(local_read_cache, "rb", temp_dir=local_read_cache) + ) in_stream = stack.enter_context(decompress_stream(in_file, "rt")) out_file = stack.enter_context(open_file_for_write(destination_path, "wb")) out_stream = stack.enter_context(compress_stream(out_file, "wt")) @@ -173,6 +182,12 @@ def main(cls): type=str, help="If provided, keeps track of which files have already been processed and skips them. ", ) + ap.add_argument( + "--local-read-cache", + default=None, + type=str, + help="If provided, will cache the files locally before processing them.", + ) ap.add_argument( "--manually-included-paths", default=None, @@ -231,4 +246,5 @@ def main(cls): taggers_names=opts.taggers, experiment_name=opts.experiment_name, skip_on_failure=opts.skip_on_failure, + local_read_cache=opts.local_read_cache, ) diff --git a/pretrain_data/mixer/config/ablations/dedupers/cc-v1-small-head-dedup.json b/pretrain_data/mixer/config/ablations/dedupers/cc-v1-small-head-dedup.json new file mode 100644 index 000000000..d61f3c0f9 --- /dev/null +++ b/pretrain_data/mixer/config/ablations/dedupers/cc-v1-small-head-dedup.json @@ -0,0 +1,24 @@ +{ + "documents": [ + "pretraining-data/sources/common-crawl/v1-small-head/documents/v1_small-head-0101.json.gz" + ], + "work_dir": { + "input": "/data2/v1-small-head/deduper/input", + "output": "/data2/v1-small-head/deduper/output" + }, + "dedupe": { + "name": "decontamination", + "paragraphs": { + "attribute_name": "bff_duplicate_paragraph_spans" + }, + "skip_empty": true + }, + "bloom_filter": { + "file": "/tmp/decontamination/deduper_decontamination_lucas_20230525.bin", + "size_in_bytes": 8388608, + "read_only": true, + "estimated_doc_count": 3898706, + "desired_false_positive_rate": 0.001 + }, + "processes": 120 +} diff --git a/pretrain_data/mixer/config/ablations/dedupers/cc-v1-small-head-middle-dedup.json b/pretrain_data/mixer/config/ablations/dedupers/cc-v1-small-head-middle-dedup.json new file mode 100644 index 000000000..0c24c59ae --- /dev/null +++ b/pretrain_data/mixer/config/ablations/dedupers/cc-v1-small-head-middle-dedup.json @@ -0,0 +1,24 @@ +{ + "documents": [ + "pretraining-data/sources/common-crawl/v1-small-head-middle/documents/v1_small-head-middle-0126.json.gz" + ], + "work_dir": { + "input": "/tmp/v1-small-head-middle/deduper/input", + "output": "/tmp/v1-small-head-middle/deduper/output" + }, + "dedupe": { + "name": "decontamination", + "paragraphs": { + "attribute_name": "bff_duplicate_paragraph_spans" + }, + "skip_empty": true + }, + "bloom_filter": { + "file": "/tmp/decontamination/deduper_decontamination_lucas_20230525.bin", + "size_in_bytes": 8388608, + "read_only": true, + "estimated_doc_count": 3898706, + "desired_false_positive_rate": 0.001 + }, + "processes": 120 +} diff --git a/pretrain_data/mixer/config/ablations/mixers/cc-v1-small-head-dedup.json b/pretrain_data/mixer/config/ablations/mixers/cc-v1-small-head-dedup.json new file mode 100644 index 000000000..ef8750538 --- /dev/null +++ b/pretrain_data/mixer/config/ablations/mixers/cc-v1-small-head-dedup.json @@ -0,0 +1,26 @@ +{ + "streams": [ + { + "name": "cc-v1-small-head-dedup", + "documents": [ + "pretraining-data/sources/common-crawl/v1-small-head/documents/*" + ], + "output": { + "path": "pretraining-data/sources/common-crawl/abl-cc-v1-small-head-dedup/documents", + "max_size_in_bytes": 3894967296 + }, + "attributes": ["decontamination"], + "filter": { + "include": [], + "exclude": [ + "$@.attributes[?(@.bff_duplicate_paragraph_spans && @.bff_duplicate_paragraph_spans[0] && @.bff_duplicate_paragraph_spans[0][2] >= 1.0)]" + ] + } + } + ], + "work_dir": { + "input": "/data2/cc-v1-small-head/mixer/input", + "output": "/data2/cc-v1-small-head/mixer/output" + }, + "processes": 120 +} diff --git a/pretrain_data/mixer/config/ablations/mixers/cc-v1-small-head-middle-dedup.json b/pretrain_data/mixer/config/ablations/mixers/cc-v1-small-head-middle-dedup.json new file mode 100644 index 000000000..743cbfa13 --- /dev/null +++ b/pretrain_data/mixer/config/ablations/mixers/cc-v1-small-head-middle-dedup.json @@ -0,0 +1,26 @@ +{ + "streams": [ + { + "name": "cc-v1-small-head-middle-dedup", + "documents": [ + "pretraining-data/sources/common-crawl/v1-small-head-middle/documents/*" + ], + "output": { + "path": "pretraining-data/sources/common-crawl/abl-cc-v1-small-head-middle-dedup/documents", + "max_size_in_bytes": 3894967296 + }, + "attributes": ["decontamination"], + "filter": { + "include": [], + "exclude": [ + "$@.attributes[?(@.bff_duplicate_paragraph_spans && @.bff_duplicate_paragraph_spans[0] && @.bff_duplicate_paragraph_spans[0][2] >= 1.0)]" + ] + } + } + ], + "work_dir": { + "input": "/tmp/cc-v1-small-head-middle/mixer/input", + "output": "/tmp/cc-v1-small-head-middle/mixer/output" + }, + "processes": 120 +} diff --git a/scripts/prepare_memmap_dataset.py b/scripts/prepare_memmap_dataset.py index cc2814d9c..5152983e6 100644 --- a/scripts/prepare_memmap_dataset.py +++ b/scripts/prepare_memmap_dataset.py @@ -77,9 +77,15 @@ def tokenize_file(tokenizer: Tokenizer, path: str) -> Generator[List[int], None, input_file = stack.enter_context(stream_file_for_read(path, mode="rb")) input_stream = stack.enter_context(decompress_stream(input_file, mode="rt")) - for line in input_stream: - row = decoder.decode(line) - yield tokenizer.encode(row.text, add_special_tokens=True) + i = 1 + try: + for line in input_stream: + row = decoder.decode(line) + yield tokenizer.encode(row.text, add_special_tokens=True) + i += 1 + except Exception as e: + log.error(f"Error processing {path}:{i:,} -> {e}") + pass class MemmapFile: From 459d133e5d48a19592713b86aa52104223d85bb9 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Thu, 1 Jun 2023 15:45:57 -0700 Subject: [PATCH 02/14] imports --- .../filters/src/ai2_llm_filters/core_tools/runtime.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py b/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py index c5f2843f9..b0a3a524d 100644 --- a/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py +++ b/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py @@ -3,9 +3,8 @@ import multiprocessing import tempfile from contextlib import ExitStack -from os import read from queue import Queue -from typing import Dict, Optional +from typing import Dict import msgspec from smashed.utils.io_utils import ( From 28cf491580ef64abf927ca78f691e71bdaafac2d Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Thu, 1 Jun 2023 22:28:19 -0700 Subject: [PATCH 03/14] fixed source path --- .../filters/src/ai2_llm_filters/core_tools/runtime.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py b/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py index b0a3a524d..d08e7926d 100644 --- a/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py +++ b/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py @@ -91,10 +91,11 @@ def process_single( # open each file for reading and writing. We use open_file_for_read to handle s3 paths and # download the file locally if needed, while gzip.open is used to # read and write gzipped files. + in_file = stack.enter_context( stream_file_for_read(source_path, "rb") if local_read_cache is None - else open_file_for_read(local_read_cache, "rb", temp_dir=local_read_cache) + else open_file_for_read(source_path, "rb", temp_dir=local_read_cache) ) in_stream = stack.enter_context(decompress_stream(in_file, "rt")) out_file = stack.enter_context(open_file_for_write(destination_path, "wb")) From 189d99fc8346aee70a7796cc350e517db3731417 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Thu, 1 Jun 2023 22:30:07 -0700 Subject: [PATCH 04/14] s2 configs --- .../ablations/dedupers/falcon-dedup.json | 24 +++++++ .../config/ablations/dedupers/s2-dedup.json | 63 +++++++++++++++++++ .../config/ablations/mixers/s2-dedup.json | 45 +++++++++++++ 3 files changed, 132 insertions(+) create mode 100644 pretrain_data/mixer/config/ablations/dedupers/falcon-dedup.json create mode 100644 pretrain_data/mixer/config/ablations/dedupers/s2-dedup.json create mode 100644 pretrain_data/mixer/config/ablations/mixers/s2-dedup.json diff --git a/pretrain_data/mixer/config/ablations/dedupers/falcon-dedup.json b/pretrain_data/mixer/config/ablations/dedupers/falcon-dedup.json new file mode 100644 index 000000000..bb4a4052d --- /dev/null +++ b/pretrain_data/mixer/config/ablations/dedupers/falcon-dedup.json @@ -0,0 +1,24 @@ +{ + "documents": [ + "pretraining-data/sources/falcon-refinedweb/v0/documents/*.json.gz" + ], + "work_dir": { + "input": "/data2/falcon/deduper/input", + "output": "/data2/falcon/deduper/output" + }, + "dedupe": { + "name": "decontamination", + "paragraphs": { + "attribute_name": "bff_duplicate_paragraph_spans" + }, + "skip_empty": true + }, + "bloom_filter": { + "file": "/tmp/decontamination/deduper_decontamination_lucas_20230525.bin", + "size_in_bytes": 8388608, + "read_only": true, + "estimated_doc_count": 3898706, + "desired_false_positive_rate": 0.001 + }, + "processes": 120 +} diff --git a/pretrain_data/mixer/config/ablations/dedupers/s2-dedup.json b/pretrain_data/mixer/config/ablations/dedupers/s2-dedup.json new file mode 100644 index 000000000..fcd572ac4 --- /dev/null +++ b/pretrain_data/mixer/config/ablations/dedupers/s2-dedup.json @@ -0,0 +1,63 @@ +{ + "documents": [ + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=0/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=1/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=2/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=3/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=4/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=5/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=6/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=7/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=8/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=9/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=0/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=1/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=2/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=3/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=4/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=5/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=6/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=7/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=8/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=9/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=valid/part_id=0/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=valid/part_id=1/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=valid/part_id=2/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=valid/part_id=3/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=valid/part_id=4/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=valid/part_id=5/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=valid/part_id=6/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=valid/part_id=7/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=valid/part_id=8/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=valid/part_id=9/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=valid/part_id=0/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=valid/part_id=1/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=valid/part_id=2/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=valid/part_id=3/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=valid/part_id=4/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=valid/part_id=5/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=valid/part_id=6/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=valid/part_id=7/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=valid/part_id=8/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=valid/part_id=9/*.gz" + ], + "work_dir": { + "input": "/tmp/s2/deduper/input", + "output": "/tmp/s2/deduper/output" + }, + "dedupe": { + "name": "decontamination", + "paragraphs": { + "attribute_name": "bff_duplicate_paragraph_spans" + }, + "skip_empty": true + }, + "bloom_filter": { + "file": "/tmp/decontamination/deduper_decontamination_lucas_20230525.bin", + "size_in_bytes": 8388608, + "read_only": true, + "estimated_doc_count": 3898706, + "desired_false_positive_rate": 0.001 + }, + "processes": 120 +} diff --git a/pretrain_data/mixer/config/ablations/mixers/s2-dedup.json b/pretrain_data/mixer/config/ablations/mixers/s2-dedup.json new file mode 100644 index 000000000..e33e14aaf --- /dev/null +++ b/pretrain_data/mixer/config/ablations/mixers/s2-dedup.json @@ -0,0 +1,45 @@ +{ + "streams": [ + { + "name": "abl-s2-v3", + "documents": [ + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=0/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=1/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=2/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=3/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=4/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=5/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=6/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=7/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=8/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2ag/split=train/part_id=9/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=0/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=1/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=2/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=3/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=4/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=5/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=6/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=7/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=8/*.gz", + "pretraining-data/sources/s2/v3/documents/dataset=s2orc/split=train/part_id=9/*.gz" + ], + "output": { + "path": "pretraining-data/sources/s2/abl-s2-v3/documents", + "max_size_in_bytes": 3894967296 + }, + "attributes": ["decontamination"], + "filter": { + "include": [], + "exclude": [ + "$@.attributes[?(@.bff_duplicate_paragraph_spans && @.bff_duplicate_paragraph_spans[0] && @.bff_duplicate_paragraph_spans[0][2] >= 1.0)]" + ] + } + } + ], + "work_dir": { + "input": "/tmp/abl-s2-v3/mixer/input", + "output": "/tmp/abl-s2-v3/mixer/output" + }, + "processes": 120 +} From 79da8196afbe94b7e2bddf72e68ebb40e887d7b2 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Thu, 1 Jun 2023 22:56:40 -0700 Subject: [PATCH 05/14] more configs --- .../config/ablations/mixers/falcon-dedup.json | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 pretrain_data/mixer/config/ablations/mixers/falcon-dedup.json diff --git a/pretrain_data/mixer/config/ablations/mixers/falcon-dedup.json b/pretrain_data/mixer/config/ablations/mixers/falcon-dedup.json new file mode 100644 index 000000000..257098314 --- /dev/null +++ b/pretrain_data/mixer/config/ablations/mixers/falcon-dedup.json @@ -0,0 +1,26 @@ +{ + "streams": [ + { + "name": "falcon-v0", + "documents": [ + "pretraining-data/sources/falcon-refinedweb/v0/documents/*.json.gz" + ], + "output": { + "path": "pretraining-data/sources/falcon-refinedweb/abl-v0/documents", + "max_size_in_bytes": 3894967296 + }, + "attributes": ["decontamination"], + "filter": { + "include": [], + "exclude": [ + "$@.attributes[?(@.bff_duplicate_paragraph_spans && @.bff_duplicate_paragraph_spans[0] && @.bff_duplicate_paragraph_spans[0][2] >= 1.0)]" + ] + } + } + ], + "work_dir": { + "input": "/data2/falcon-abl-v0/mixer/input", + "output": "/data2/falcon-abl-v0/mixer/output" + }, + "processes": 120 +} From 4aae45b01f3a8903553e1dcc159ae1ce0ebe61b1 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Fri, 2 Jun 2023 13:22:26 -0700 Subject: [PATCH 06/14] safe mode --- .../ablations/dedupers/s2-v2-dedup.json | 63 +++++++++++++++++++ .../{s2-dedup.json => s2-v3-dedup.json} | 0 .../config/ablations/mixers/s2-v2-dedup.json | 45 +++++++++++++ .../{s2-dedup.json => s2-v3-dedup.json} | 0 requirements.txt | 1 + scripts/prepare_memmap_dataset.py | 26 ++++++-- 6 files changed, 130 insertions(+), 5 deletions(-) create mode 100644 pretrain_data/mixer/config/ablations/dedupers/s2-v2-dedup.json rename pretrain_data/mixer/config/ablations/dedupers/{s2-dedup.json => s2-v3-dedup.json} (100%) create mode 100644 pretrain_data/mixer/config/ablations/mixers/s2-v2-dedup.json rename pretrain_data/mixer/config/ablations/mixers/{s2-dedup.json => s2-v3-dedup.json} (100%) diff --git a/pretrain_data/mixer/config/ablations/dedupers/s2-v2-dedup.json b/pretrain_data/mixer/config/ablations/dedupers/s2-v2-dedup.json new file mode 100644 index 000000000..61dbd634f --- /dev/null +++ b/pretrain_data/mixer/config/ablations/dedupers/s2-v2-dedup.json @@ -0,0 +1,63 @@ +{ + "documents": [ + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=0/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=1/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=2/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=3/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=4/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=5/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=6/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=7/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=8/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=9/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=0/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=1/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=2/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=3/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=4/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=5/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=6/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=7/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=8/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=9/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=valid/part_id=0/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=valid/part_id=1/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=valid/part_id=2/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=valid/part_id=3/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=valid/part_id=4/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=valid/part_id=5/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=valid/part_id=6/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=valid/part_id=7/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=valid/part_id=8/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=valid/part_id=9/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=valid/part_id=0/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=valid/part_id=1/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=valid/part_id=2/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=valid/part_id=3/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=valid/part_id=4/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=valid/part_id=5/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=valid/part_id=6/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=valid/part_id=7/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=valid/part_id=8/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=valid/part_id=9/*.gz" + ], + "work_dir": { + "input": "/tmp/abl-s2-v2/deduper/input", + "output": "/tmp/abl-s2-v2/deduper/output" + }, + "dedupe": { + "name": "decontamination", + "paragraphs": { + "attribute_name": "bff_duplicate_paragraph_spans" + }, + "skip_empty": true + }, + "bloom_filter": { + "file": "/tmp/decontamination/deduper_decontamination_lucas_20230525.bin", + "size_in_bytes": 8388608, + "read_only": true, + "estimated_doc_count": 3898706, + "desired_false_positive_rate": 0.001 + }, + "processes": 120 +} diff --git a/pretrain_data/mixer/config/ablations/dedupers/s2-dedup.json b/pretrain_data/mixer/config/ablations/dedupers/s2-v3-dedup.json similarity index 100% rename from pretrain_data/mixer/config/ablations/dedupers/s2-dedup.json rename to pretrain_data/mixer/config/ablations/dedupers/s2-v3-dedup.json diff --git a/pretrain_data/mixer/config/ablations/mixers/s2-v2-dedup.json b/pretrain_data/mixer/config/ablations/mixers/s2-v2-dedup.json new file mode 100644 index 000000000..b1d2dc15e --- /dev/null +++ b/pretrain_data/mixer/config/ablations/mixers/s2-v2-dedup.json @@ -0,0 +1,45 @@ +{ + "streams": [ + { + "name": "abl-s2-v2", + "documents": [ + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=0/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=1/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=2/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=3/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=4/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=5/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=6/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=7/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=8/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2ag/split=train/part_id=9/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=0/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=1/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=2/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=3/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=4/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=5/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=6/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=7/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=8/*.gz", + "pretraining-data/sources/s2/v2/documents/dataset=s2orc/split=train/part_id=9/*.gz" + ], + "output": { + "path": "pretraining-data/sources/s2/abl-s2-v2/documents", + "max_size_in_bytes": 3894967296 + }, + "attributes": ["decontamination"], + "filter": { + "include": [], + "exclude": [ + "$@.attributes[?(@.bff_duplicate_paragraph_spans && @.bff_duplicate_paragraph_spans[0] && @.bff_duplicate_paragraph_spans[0][2] >= 1.0)]" + ] + } + } + ], + "work_dir": { + "input": "/tmp/abl-s2-v2/mixer/input", + "output": "/tmp/abl-s2-v2/mixer/output" + }, + "processes": 120 +} diff --git a/pretrain_data/mixer/config/ablations/mixers/s2-dedup.json b/pretrain_data/mixer/config/ablations/mixers/s2-v3-dedup.json similarity index 100% rename from pretrain_data/mixer/config/ablations/mixers/s2-dedup.json rename to pretrain_data/mixer/config/ablations/mixers/s2-v3-dedup.json diff --git a/requirements.txt b/requirements.txt index e3d225e7e..a93b3ff9d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,4 @@ datasets scikit-learn smashed[remote]>=0.21.1 msgspec>=0.14.0 +cached_path diff --git a/scripts/prepare_memmap_dataset.py b/scripts/prepare_memmap_dataset.py index 5152983e6..6f250cc88 100644 --- a/scripts/prepare_memmap_dataset.py +++ b/scripts/prepare_memmap_dataset.py @@ -13,6 +13,7 @@ import concurrent.futures import functools +import gzip import json import logging import multiprocessing as mp @@ -40,6 +41,7 @@ recursively_list_files, stream_file_for_read, ) +from cached_path import cached_path from olmo import Tokenizer from olmo.util import prepare_cli_environment @@ -66,7 +68,7 @@ class InputDocumentSpec(msgspec.Struct): text: str -def tokenize_file(tokenizer: Tokenizer, path: str) -> Generator[List[int], None, None]: +def tokenize_file(tokenizer: Tokenizer, path: str, safe_mode: bool = False) -> Generator[List[int], None, None]: """Tokenize a file of documents using the provided tokenizer; file is expected to be a gzipped JSON lines file, each containing a field named `text`. """ @@ -74,8 +76,12 @@ def tokenize_file(tokenizer: Tokenizer, path: str) -> Generator[List[int], None, decoder = msgspec.json.Decoder(InputDocumentSpec) with ExitStack() as stack: - input_file = stack.enter_context(stream_file_for_read(path, mode="rb")) - input_stream = stack.enter_context(decompress_stream(input_file, mode="rt")) + if safe_mode: + local_path = cached_path(path) + input_stream = stack.enter_context(gzip.open(local_path, mode="rt")) + else: + input_file = stack.enter_context(stream_file_for_read(path, mode="rb")) + input_stream = stack.enter_context(decompress_stream(input_file, mode="rt")) i = 1 try: @@ -209,6 +215,7 @@ def fill_memmap( path: str, memmap_path: str, dtype: np.dtype, + safe_mode: bool = False, max_tokens: int = 512 * 1024 * 1024, # 512M tokens * 2 bytes per token (uint16) = 1GB ): """Write a memmap file from a file of documents.""" @@ -222,7 +229,8 @@ def fill_memmap( file_index = 0 with ExitStack() as stack: - for line_no, token_ids in enumerate(tokenize_file(tokenizer=tokenizer, path=path), start=1): + it = tokenize_file(tokenizer=tokenizer, path=path, safe_mode=safe_mode) + for line_no, token_ids in enumerate(it, start=1): # flush any 10k lines or so; improves stability flush = line_no % 10_000 == 0 @@ -287,6 +295,7 @@ def make_source_and_target(src: Tuple[str, ...], output: str) -> Tuple[Tuple[str help="Maximum number of tokens to store in a single memmap file (default: 512M tokens or 1GB)", ) @click.option("--debug/--no-debug", default=False, help="Enable debug (single process mode)") +@click.option("--safe-mode/--fast-mode", default=False, help="Safe mode caches locally and decompresses using gzip.open") @click.option("-j", "--workers", "max_workers", type=int, default=None, help="Defaults to number of CPUs") def main( src: Tuple[str, ...], @@ -295,6 +304,7 @@ def main( dtype_str: str, validate: bool, max_tokens: int, + safe_mode: bool, debug: bool, max_workers: Optional[int] = None, ): @@ -303,7 +313,13 @@ def main( # creating a partial here with all the arguments we need to pass to fill_memmap except for the paths # so that we don't make mistakes between debug and non-debug mode - fill_memmap_fn = functools.partial(fill_memmap, tokenizer_id=tokenizer_id, dtype=dtype, max_tokens=max_tokens) + fill_memmap_fn = functools.partial( + fill_memmap, + tokenizer_id=tokenizer_id, + dtype=dtype, + max_tokens=max_tokens, + safe_mode=safe_mode + ) if debug: log.info("Running in debug mode. Only one process will be used.") From c8c64620b2738f619be6376e2e042a20d9448f22 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Fri, 2 Jun 2023 13:27:00 -0700 Subject: [PATCH 07/14] fix --- scripts/prepare_memmap_dataset.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/prepare_memmap_dataset.py b/scripts/prepare_memmap_dataset.py index 6f250cc88..944350215 100644 --- a/scripts/prepare_memmap_dataset.py +++ b/scripts/prepare_memmap_dataset.py @@ -74,11 +74,12 @@ def tokenize_file(tokenizer: Tokenizer, path: str, safe_mode: bool = False) -> G """ decoder = msgspec.json.Decoder(InputDocumentSpec) + caching_path = path with ExitStack() as stack: if safe_mode: - local_path = cached_path(path) - input_stream = stack.enter_context(gzip.open(local_path, mode="rt")) + caching_path = cached_path(path) + input_stream = stack.enter_context(gzip.open(caching_path, mode="rt")) else: input_file = stack.enter_context(stream_file_for_read(path, mode="rb")) input_stream = stack.enter_context(decompress_stream(input_file, mode="rt")) @@ -93,6 +94,9 @@ def tokenize_file(tokenizer: Tokenizer, path: str, safe_mode: bool = False) -> G log.error(f"Error processing {path}:{i:,} -> {e}") pass + if caching_path != path and os.path.exists(caching_path): + os.remove(caching_path) + class MemmapFile: """Context manager responsible for writing, resizing, and closing / uploading a memmap file.""" From 4c2f5fe43616f4397f46fb365f12a8c6178bc489 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Fri, 2 Jun 2023 13:27:37 -0700 Subject: [PATCH 08/14] deleting caching path --- scripts/prepare_memmap_dataset.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/scripts/prepare_memmap_dataset.py b/scripts/prepare_memmap_dataset.py index 944350215..5c767f1bd 100644 --- a/scripts/prepare_memmap_dataset.py +++ b/scripts/prepare_memmap_dataset.py @@ -27,6 +27,7 @@ import click import msgspec import numpy as np +from cached_path import cached_path from rich.progress import ( BarColumn, MofNCompleteColumn, @@ -41,7 +42,6 @@ recursively_list_files, stream_file_for_read, ) -from cached_path import cached_path from olmo import Tokenizer from olmo.util import prepare_cli_environment @@ -299,7 +299,9 @@ def make_source_and_target(src: Tuple[str, ...], output: str) -> Tuple[Tuple[str help="Maximum number of tokens to store in a single memmap file (default: 512M tokens or 1GB)", ) @click.option("--debug/--no-debug", default=False, help="Enable debug (single process mode)") -@click.option("--safe-mode/--fast-mode", default=False, help="Safe mode caches locally and decompresses using gzip.open") +@click.option( + "--safe-mode/--fast-mode", default=False, help="Safe mode caches locally and decompresses using gzip.open" +) @click.option("-j", "--workers", "max_workers", type=int, default=None, help="Defaults to number of CPUs") def main( src: Tuple[str, ...], @@ -318,11 +320,7 @@ def main( # creating a partial here with all the arguments we need to pass to fill_memmap except for the paths # so that we don't make mistakes between debug and non-debug mode fill_memmap_fn = functools.partial( - fill_memmap, - tokenizer_id=tokenizer_id, - dtype=dtype, - max_tokens=max_tokens, - safe_mode=safe_mode + fill_memmap, tokenizer_id=tokenizer_id, dtype=dtype, max_tokens=max_tokens, safe_mode=safe_mode ) if debug: From d57213d0b019a7e1143dea763236fa17f19f3397 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Fri, 2 Jun 2023 17:22:40 -0700 Subject: [PATCH 09/14] --safe mode for filters --- .../src/ai2_llm_filters/core_tools/runtime.py | 33 ++++++++++++++----- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py b/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py index d08e7926d..205f2abc8 100644 --- a/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py +++ b/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py @@ -1,16 +1,18 @@ import argparse +import gzip import logging import multiprocessing +import os import tempfile from contextlib import ExitStack from queue import Queue from typing import Dict import msgspec +from cached_path import cached_path from smashed.utils.io_utils import ( compress_stream, decompress_stream, - open_file_for_read, open_file_for_write, stream_file_for_read, ) @@ -86,18 +88,22 @@ def process_single( encoder = msgspec.json.Encoder() decoder = msgspec.json.Decoder(InputSpec) + # this will be used to cache the file locally if needed + caching_path = source_path + with ExitStack() as stack: try: # open each file for reading and writing. We use open_file_for_read to handle s3 paths and # download the file locally if needed, while gzip.open is used to # read and write gzipped files. - in_file = stack.enter_context( - stream_file_for_read(source_path, "rb") - if local_read_cache is None - else open_file_for_read(source_path, "rb", temp_dir=local_read_cache) - ) - in_stream = stack.enter_context(decompress_stream(in_file, "rt")) + if local_read_cache is not None: + caching_path = cached_path(source_path, cache_dir=local_read_cache) + in_stream = stack.enter_context(gzip.open(caching_path, mode="rt")) + else: + input_file = stack.enter_context(stream_file_for_read(source_path, mode="rb")) + in_stream = stack.enter_context(decompress_stream(input_file, mode="rt")) + out_file = stack.enter_context(open_file_for_write(destination_path, "wb")) out_stream = stack.enter_context(compress_stream(out_file, "wt")) @@ -137,6 +143,9 @@ def process_single( logger.warning("\n" + msg) if not skip_on_failure: raise Ai2LlmFilterError(msg) from e + finally: + if caching_path != source_path and os.path.exists(caching_path): + os.remove(caching_path) # increment the files progress bar cls.increment_progressbar(queue, files=1, documents=docs_cnt) @@ -197,6 +206,9 @@ def main(cls): ap.add_argument( "--manually-excluded-paths", default=None, nargs="+", help="If provided, these paths will be skipped." ) + ap.add_argument( + "--safe-mode", action="store_true", help="Run in safe mode; will download locally before processing." + ) opts = ap.parse_args() if opts.list_taggers: @@ -228,10 +240,15 @@ def main(cls): f"skip on fail: {opts.skip_on_failure}\n" f"reuse prev: {not ignore_existing}\n" f"workdir: {metadata_workdir}\n" + f"safe mode: {opts.safe_mode}\n" + f"local cache: {opts.local_read_cache}\n" "---------------------------\n" ) print(msg) + # use a local read cache if we are in safe mode or if a local read cache is provided + local_read_cache = opts.local_read_cache or (tempfile.gettempdir() if opts.safe_mode else None) + parallel_compute = cls( source_prefix=source_prefix, destination_prefix=destination_prefix, @@ -246,5 +263,5 @@ def main(cls): taggers_names=opts.taggers, experiment_name=opts.experiment_name, skip_on_failure=opts.skip_on_failure, - local_read_cache=opts.local_read_cache, + local_read_cache=local_read_cache, ) From 9452e4f99ad5f039f869f6bdafd533192c7782f7 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Fri, 2 Jun 2023 17:23:44 -0700 Subject: [PATCH 10/14] printing --- .../filters/src/ai2_llm_filters/core_tools/runtime.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py b/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py index 205f2abc8..fc49d99bb 100644 --- a/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py +++ b/pretrain_data/filters/src/ai2_llm_filters/core_tools/runtime.py @@ -225,6 +225,9 @@ def main(cls): source_prefix = f"{cls.BASE_S3_PREFIX}/{opts.dataset}/documents" destination_prefix = f"{cls.BASE_S3_PREFIX}/{opts.dataset}/attributes/{opts.experiment_name}" + # use a local read cache if we are in safe mode or if a local read cache is provided + local_read_cache = opts.local_read_cache or (tempfile.gettempdir() if opts.safe_mode else None) + with tempfile.TemporaryDirectory() as tempdir: metadata_workdir = opts.reuse_existing or tempdir ignore_existing = opts.reuse_existing is None @@ -241,14 +244,11 @@ def main(cls): f"reuse prev: {not ignore_existing}\n" f"workdir: {metadata_workdir}\n" f"safe mode: {opts.safe_mode}\n" - f"local cache: {opts.local_read_cache}\n" + f"local cache: {local_read_cache}\n" "---------------------------\n" ) print(msg) - # use a local read cache if we are in safe mode or if a local read cache is provided - local_read_cache = opts.local_read_cache or (tempfile.gettempdir() if opts.safe_mode else None) - parallel_compute = cls( source_prefix=source_prefix, destination_prefix=destination_prefix, From fe6227bee9209f362870eae0e759f81c099746b2 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Fri, 2 Jun 2023 17:26:23 -0700 Subject: [PATCH 11/14] ignoring empty lines --- scripts/prepare_memmap_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/prepare_memmap_dataset.py b/scripts/prepare_memmap_dataset.py index 5c767f1bd..2586aad44 100644 --- a/scripts/prepare_memmap_dataset.py +++ b/scripts/prepare_memmap_dataset.py @@ -88,7 +88,8 @@ def tokenize_file(tokenizer: Tokenizer, path: str, safe_mode: bool = False) -> G try: for line in input_stream: row = decoder.decode(line) - yield tokenizer.encode(row.text, add_special_tokens=True) + if row.text.strip(): + yield tokenizer.encode(row.text, add_special_tokens=True) i += 1 except Exception as e: log.error(f"Error processing {path}:{i:,} -> {e}") From 4d8c599dce4261cea27083172488c0be9ceca485 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Sun, 4 Jun 2023 14:18:35 -0700 Subject: [PATCH 12/14] added flags --- scripts/olmo-small-ablation-on-lumi.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/olmo-small-ablation-on-lumi.sh b/scripts/olmo-small-ablation-on-lumi.sh index 620e074f2..375384bfd 100644 --- a/scripts/olmo-small-ablation-on-lumi.sh +++ b/scripts/olmo-small-ablation-on-lumi.sh @@ -40,6 +40,7 @@ export CONFIG_PATH=configs/olmo-small-ablation.yaml # get run name, we will postpend it with the job id of this slurm run export RUN_NAME=$(cat $CONFIG_PATH | grep -ohP "^run_name\:\w*(.+)$" | sed 's/run_name:\s*//') +# actually run the training script srun \ --cpus-per-task=$SLURM_CPUS_PER_TASK \ --distribution=block:block \ @@ -55,4 +56,5 @@ srun \ $PROJECT_DIR/containers/$OLMO_CONTAINER \ python scripts/train.py $CONFIG_PATH \ --run_name="${RUN_NAME}_${SLURM_JOB_ID}" \ - --wandb.project=$WANDB_PROJECT + --wandb.project=$WANDB_PROJECT \ + ${@} From e056e9cc83c3e4dcaed57715d5dbd2939c1ee1a4 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Sun, 4 Jun 2023 16:47:49 -0700 Subject: [PATCH 13/14] new decontamination --- .../config/ablations/dedupers/rp-dedup.json | 44 ++++++++++++++++++ .../config/ablations/mixers/rp-dedup.json | 46 +++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 pretrain_data/mixer/config/ablations/dedupers/rp-dedup.json create mode 100644 pretrain_data/mixer/config/ablations/mixers/rp-dedup.json diff --git a/pretrain_data/mixer/config/ablations/dedupers/rp-dedup.json b/pretrain_data/mixer/config/ablations/dedupers/rp-dedup.json new file mode 100644 index 000000000..a3769587d --- /dev/null +++ b/pretrain_data/mixer/config/ablations/dedupers/rp-dedup.json @@ -0,0 +1,44 @@ +{ + "documents": [ + "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=arxiv/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=book/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=c4/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=common_crawl/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=github/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=stackexchange/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=wikipedia/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=arxiv/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=book/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=c4/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=common_crawl/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=github/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=stackexchange/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=wikipedia/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=arxiv/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=book/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=c4/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=common_crawl/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=github/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=stackexchange/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=wikipedia/*.gz" + ], + "work_dir": { + "input": "/data2/redpajama-v1/deduper/input", + "output": "/data2/redpajama-v1/deduper/output" + }, + "dedupe": { + "name": "decontamination", + "paragraphs": { + "attribute_name": "bff_duplicate_paragraph_spans" + }, + "skip_empty": true + }, + "bloom_filter": { + "file": "/tmp/decontamination/deduper_decontamination_lucas_20230525.bin", + "size_in_bytes": 8388608, + "read_only": true, + "estimated_doc_count": 3898706, + "desired_false_positive_rate": 0.001 + }, + "processes": 120 +} diff --git a/pretrain_data/mixer/config/ablations/mixers/rp-dedup.json b/pretrain_data/mixer/config/ablations/mixers/rp-dedup.json new file mode 100644 index 000000000..2c87b5910 --- /dev/null +++ b/pretrain_data/mixer/config/ablations/mixers/rp-dedup.json @@ -0,0 +1,46 @@ +{ + "streams": [ + { + "name": "redpajama-v1", + "documents": [ + "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=arxiv/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=book/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=c4/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=common_crawl/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=github/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=stackexchange/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=train/dataset=wikipedia/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=arxiv/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=book/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=c4/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=common_crawl/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=github/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=stackexchange/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=valid/dataset=wikipedia/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=arxiv/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=book/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=c4/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=common_crawl/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=github/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=stackexchange/*.gz", + "pretraining-data/sources/redpajama/v1/documents/split=test/dataset=wikipedia/*.gz" + ], + "output": { + "path": "pretraining-data/sources/redpajama/abl-v1/documents", + "max_size_in_bytes": 3894967296 + }, + "attributes": ["decontamination"], + "filter": { + "include": [], + "exclude": [ + "$@.attributes[?(@.bff_duplicate_paragraph_spans && @.bff_duplicate_paragraph_spans[0] && @.bff_duplicate_paragraph_spans[0][2] >= 1.0)]" + ] + } + } + ], + "work_dir": { + "input": "/data2/redpajama-v1/mixer/input", + "output": "/data2/redpajama-v1/mixer/output" + }, + "processes": 120 +} From 45d74edcdbd8be69252fa886cb87cdaf9a8ceeea Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Sun, 4 Jun 2023 20:55:22 -0700 Subject: [PATCH 14/14] tagger path --- pretrain_data/filters/src/ai2_llm_filters/taggers/jigsaw.py | 2 +- tokenizer/src/olmo_tokenizer/hf/train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pretrain_data/filters/src/ai2_llm_filters/taggers/jigsaw.py b/pretrain_data/filters/src/ai2_llm_filters/taggers/jigsaw.py index 40ea1a43a..e7f3951fa 100644 --- a/pretrain_data/filters/src/ai2_llm_filters/taggers/jigsaw.py +++ b/pretrain_data/filters/src/ai2_llm_filters/taggers/jigsaw.py @@ -42,4 +42,4 @@ class FastTextJigsawNsfwDocumentTagger(FastTextJigsawHatespeechDocumentTagger): @TaggerRegistry.add("jigsaw_nsfw_sencence_v2") class FastTextJigsawNsfwSentenceTagger(FastTextJigsawHatespeechSentenceTagger): - ... + MODEL_PATH = "https://ai2-s2-research-public.s3.us-west-2.amazonaws.com/aakankshan/olmo-data-filters/jigsaw_fasttext_bigrams_nsfw_final.bin" # noqa: E501 diff --git a/tokenizer/src/olmo_tokenizer/hf/train.py b/tokenizer/src/olmo_tokenizer/hf/train.py index bcf9f8858..83b6314ad 100644 --- a/tokenizer/src/olmo_tokenizer/hf/train.py +++ b/tokenizer/src/olmo_tokenizer/hf/train.py @@ -187,7 +187,7 @@ class TrainConfig: input_dir: Optional[str] = None input_dirs: Optional[List[str]] = None save_path: str = sp.MISSING - normalization: Union[str, None] = sp.field(default="NFD", help="Choose between NFD, NFKD, NFC, or NFKC") + normalization: Union[str, None] = sp.field(default="NFC", help="Choose between NFD, NFKD, NFC, or NFKC") vocab_size: int = 64_000 model: str = sp.field(default="BPE", help="Choose between BPE (default) or Unigram.")