From 22b0582705bf55735bef3622fb04244fbc8e6c74 Mon Sep 17 00:00:00 2001 From: Rodney Kinney Date: Wed, 24 May 2023 14:37:08 -0700 Subject: [PATCH] configs --- pretrain_data/common_crawl/NOTES.md | 9 ++++ pretrain_data/mixer/config/v0-small.json | 2 +- pretrain_data/mixer/config/v0/en-head.json | 43 +++++++++++++++++++ pretrain_data/mixer/config/v0/en-middle.json | 43 +++++++++++++++++++ pretrain_data/mixer/config/v0/en-tail.json | 43 +++++++++++++++++++ .../mixer/config/v1-small/head-middle.json | 25 +++++++++++ pretrain_data/mixer/config/v1-small/head.json | 24 +++++++++++ pretrain_data/mixer/src/shard.rs | 2 +- 8 files changed, 189 insertions(+), 2 deletions(-) create mode 100644 pretrain_data/mixer/config/v0/en-head.json create mode 100644 pretrain_data/mixer/config/v0/en-middle.json create mode 100644 pretrain_data/mixer/config/v0/en-tail.json create mode 100644 pretrain_data/mixer/config/v1-small/head-middle.json create mode 100644 pretrain_data/mixer/config/v1-small/head.json diff --git a/pretrain_data/common_crawl/NOTES.md b/pretrain_data/common_crawl/NOTES.md index 49c8495cc..f11ac8d08 100644 --- a/pretrain_data/common_crawl/NOTES.md +++ b/pretrain_data/common_crawl/NOTES.md @@ -6,6 +6,8 @@ We ran the CCNet pipeline over 25 dumps from 2020-05 to 2023-06. Different vers Sharded output of CCNet pipline. Duplicate paragraphs removed (exact match, but only comparing against a ~2% sample of paragraphs in the corpus). Bucketed by language (fasttext), and English perplexity on wikipedia-trained 5-gram language model. +**v0-en** is the re-sharded English content of `v0` + ### v1 Post-process of v0. Drop non-English documents. Deduplicate whole documents by URL. Coalesce shards. @@ -14,10 +16,17 @@ Post-process of v0. Drop non-English documents. Deduplicate whole documents by U **v1-small** is an 8.5% sample of `v1`, about 300B tokens. 
+**v1-small-head** is a sample of the `cc_en_head` (low-perplexity) subset of `v1` + +**v1-small-head-middle** is a sample of the `cc_en_head` and `cc_en_middle` (low- and mid-perplexity) subset of `v1` + ### v2 Post-process of v1. Remove duplicate paragraphs across the entire corpus +**v2-small** is a post-process of `v1-small` to remove duplicate paragraphs. + + ## CCNet Overview We run a fork of CCNet at https://github.com/allenai/cc_net.git diff --git a/pretrain_data/mixer/config/v0-small.json b/pretrain_data/mixer/config/v0-small.json index c9d2812fd..d7f261f8a 100644 --- a/pretrain_data/mixer/config/v0-small.json +++ b/pretrain_data/mixer/config/v0-small.json @@ -1,7 +1,7 @@ { "streams": [ { - "name": "v_small", + "name": "v0_small", "documents": [ "pretraining-data/sources/common-crawl/v0/documents/mined_split/2023-06/*/en_head.json.gz", "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-49/*/en_head.json.gz", diff --git a/pretrain_data/mixer/config/v0/en-head.json b/pretrain_data/mixer/config/v0/en-head.json new file mode 100644 index 000000000..d97bcbdaf --- /dev/null +++ b/pretrain_data/mixer/config/v0/en-head.json @@ -0,0 +1,43 @@ +{ + "streams": [ + { + "name": "cc_en_head", + "documents": [ + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2023-06/*/en_head.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-49/*/en_head.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-40/*/en_head.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-33/*/en_head.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-27/*/en_head.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-21/*/en_head.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-05/*/en_head.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-43/*/en_head.json.gz", + 
"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-39/*/en_head.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-31/*/en_head.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-25/*/en_head.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-21/*/en_head.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-17/*/en_head.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-10/*/en_head.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-04/*/en_head.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-50/*/en_head.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-45/*/en_head.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-40/*/en_head.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-34/*/en_head.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-29/*/en_head.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-24/*/en_head.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-16/*/en_head.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-10/*/en_head.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-05/*/en_head.json.gz" + ], + "attributes": [], + "output": { + "path": "pretraining-data/sources/common-crawl/v0-en/documents/cc_en_head", + "max_size_in_bytes": 4294967296 + } + } + ], + "work_dir": { + "input": "/data1/work/input", + "output": "/data2/work/output" + }, + "processes": 128 +} diff --git a/pretrain_data/mixer/config/v0/en-middle.json b/pretrain_data/mixer/config/v0/en-middle.json new file mode 100644 index 000000000..6575fd6a9 --- /dev/null +++ b/pretrain_data/mixer/config/v0/en-middle.json @@ -0,0 
+1,43 @@ +{ + "streams": [ + { + "name": "cc_en_middle", + "documents": [ + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2023-06/*/en_middle.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-49/*/en_middle.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-40/*/en_middle.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-33/*/en_middle.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-27/*/en_middle.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-21/*/en_middle.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-05/*/en_middle.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-43/*/en_middle.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-39/*/en_middle.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-31/*/en_middle.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-25/*/en_middle.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-21/*/en_middle.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-17/*/en_middle.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-10/*/en_middle.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-04/*/en_middle.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-50/*/en_middle.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-45/*/en_middle.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-40/*/en_middle.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-34/*/en_middle.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-29/*/en_middle.json.gz", + 
"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-24/*/en_middle.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-16/*/en_middle.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-10/*/en_middle.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-05/*/en_middle.json.gz" + ], + "attributes": [], + "output": { + "path": "pretraining-data/sources/common-crawl/v0-en/documents/cc_en_middle", + "max_size_in_bytes": 4294967296 + } + } + ], + "work_dir": { + "input": "/data1/work/input", + "output": "/data2/work/output" + }, + "processes": 128 +} diff --git a/pretrain_data/mixer/config/v0/en-tail.json b/pretrain_data/mixer/config/v0/en-tail.json new file mode 100644 index 000000000..03897593f --- /dev/null +++ b/pretrain_data/mixer/config/v0/en-tail.json @@ -0,0 +1,43 @@ +{ + "streams": [ + { + "name": "cc_en_tail", + "documents": [ + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2023-06/*/en_tail.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-49/*/en_tail.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-40/*/en_tail.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-33/*/en_tail.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-27/*/en_tail.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-21/*/en_tail.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-05/*/en_tail.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-43/*/en_tail.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-39/*/en_tail.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-31/*/en_tail.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-25/*/en_tail.json.gz", + 
"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-21/*/en_tail.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-17/*/en_tail.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-10/*/en_tail.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-04/*/en_tail.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-50/*/en_tail.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-45/*/en_tail.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-40/*/en_tail.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-34/*/en_tail.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-29/*/en_tail.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-24/*/en_tail.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-16/*/en_tail.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-10/*/en_tail.json.gz", + "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-05/*/en_tail.json.gz" + ], + "attributes": [], + "output": { + "path": "pretraining-data/sources/common-crawl/v0-en/documents/cc_en_tail", + "max_size_in_bytes": 4294967296 + } + } + ], + "work_dir": { + "input": "/data1/work/input", + "output": "/data2/work/output" + }, + "processes": 128 +} diff --git a/pretrain_data/mixer/config/v1-small/head-middle.json b/pretrain_data/mixer/config/v1-small/head-middle.json new file mode 100644 index 000000000..a11669d65 --- /dev/null +++ b/pretrain_data/mixer/config/v1-small/head-middle.json @@ -0,0 +1,25 @@ +{ + "streams": [ + { + "name": "v1_small-head-middle", + "documents": [ + "pretraining-data/sources/common-crawl/v1/documents/cc_en_head/*", + "pretraining-data/sources/common-crawl/v1/documents/cc_en_middle/*" + ], + "output": { + "path": 
"pretraining-data/sources/common-crawl/v1-small-head-middle/documents", + "max_size_in_bytes": 21474836480 + }, + "attributes": ["sample"], + "filter": { + "include": ["$.attributes[?(@.sample__random_number_v1__random[0][2] < 0.17)]"], + "exclude": [] + } + } + ], + "work_dir": { + "input": "/data1/work/input", + "output": "/data2/work/output" + }, + "processes": 128 +} diff --git a/pretrain_data/mixer/config/v1-small/head.json b/pretrain_data/mixer/config/v1-small/head.json new file mode 100644 index 000000000..da2d2dba2 --- /dev/null +++ b/pretrain_data/mixer/config/v1-small/head.json @@ -0,0 +1,24 @@ +{ + "streams": [ + { + "name": "v1_small-head", + "documents": [ + "pretraining-data/sources/common-crawl/v1/documents/cc_en_head/*" + ], + "output": { + "path": "pretraining-data/sources/common-crawl/v1-small-head/documents", + "max_size_in_bytes": 8589934592 + }, + "attributes": ["sample"], + "filter": { + "include": ["$.attributes[?(@.sample__random_number_v1__random[0][2] < 0.34)]"], + "exclude": [] + } + } + ], + "work_dir": { + "input": "/data1/work/input", + "output": "/data2/work/output" + }, + "processes": 128 +} diff --git a/pretrain_data/mixer/src/shard.rs b/pretrain_data/mixer/src/shard.rs index 39b354d92..ab7d35b07 100644 --- a/pretrain_data/mixer/src/shard.rs +++ b/pretrain_data/mixer/src/shard.rs @@ -89,7 +89,7 @@ impl Shard { }; shards.push(shard); stream_shard_count += 1; - shard_size = 0; + shard_size = *size; shard_inputs = Vec::new(); } shard_inputs.push(input.clone());