From 4dddcc309e5ae655b44969ff9c583cf0a6fa27c0 Mon Sep 17 00:00:00 2001 From: Rodney Kinney Date: Fri, 26 May 2023 09:06:23 -0700 Subject: [PATCH 1/3] v0-small config --- pretrain_data/mixer/config/v0-small.json | 81 ++---------------------- 1 file changed, 4 insertions(+), 77 deletions(-) diff --git a/pretrain_data/mixer/config/v0-small.json b/pretrain_data/mixer/config/v0-small.json index d7f261f8a..2406e1a11 100644 --- a/pretrain_data/mixer/config/v0-small.json +++ b/pretrain_data/mixer/config/v0-small.json @@ -3,89 +3,16 @@ { "name": "v0_small", "documents": [ - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2023-06/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-49/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-40/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-33/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-27/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-21/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-05/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-49/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-43/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-39/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-31/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-25/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-21/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-17/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-10/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-04/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-50/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-45/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-40/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-34/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-29/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-24/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-16/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-10/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-05/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2023-06/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-49/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-40/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-33/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-27/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-21/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-05/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-49/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-43/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-39/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-31/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-25/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-21/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-17/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-10/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-04/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-50/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-45/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-40/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-34/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-29/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-24/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-16/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-10/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-05/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2023-06/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-49/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-40/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-33/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-27/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-21/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-05/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-49/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-43/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-39/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-31/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-25/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-21/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-17/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-10/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-04/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-50/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-45/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-40/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-34/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-29/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-24/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-16/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-10/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-05/*/en_tail.json.gz" + "pretraining-data/sources/common-crawl/v0-en/cc_en_head/*", + "pretraining-data/sources/common-crawl/v0-en/cc_en_head/*", ], "output": { "path": "pretraining-data/sources/common-crawl/v0-small/documents", - "max_size_in_bytes": 42949672960 + "max_size_in_bytes": 85899345920 }, "attributes": ["sample"], "filter": { - "include": ["$.attributes[?(@.sample__random_number_v1__random[0][2] < 0.085)]"], + "include": ["$.attributes[?(@.sample__random_number_v1__random[0][2] < 0.05)]"], "exclude": [] } } From 265ee4add6c3548bf3e8115c6a5e1eeb458b01e5 Mon Sep 17 00:00:00 2001 From: Rodney Kinney Date: Fri, 26 May 2023 09:08:46 -0700 Subject: [PATCH 2/3] fixes --- pretrain_data/mixer/config/v0-small.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pretrain_data/mixer/config/v0-small.json b/pretrain_data/mixer/config/v0-small.json index 2406e1a11..6772e945a 100644 --- a/pretrain_data/mixer/config/v0-small.json +++ b/pretrain_data/mixer/config/v0-small.json @@ -4,7 +4,8 @@ "name": "v0_small", "documents": [ "pretraining-data/sources/common-crawl/v0-en/cc_en_head/*", - "pretraining-data/sources/common-crawl/v0-en/cc_en_head/*", + "pretraining-data/sources/common-crawl/v0-en/cc_en_middle/*", + "pretraining-data/sources/common-crawl/v0-en/cc_en_tail/*" ], "output": { "path": "pretraining-data/sources/common-crawl/v0-small/documents", From acbaf08dccefe247168e6235621af45f693e29b0 Mon Sep 17 00:00:00 2001 From: Rodney Kinney Date: Fri, 26 May 2023 09:09:25 -0700 Subject: [PATCH 3/3] fixes --- pretrain_data/mixer/config/v0-small.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pretrain_data/mixer/config/v0-small.json b/pretrain_data/mixer/config/v0-small.json index 6772e945a..7cc7775c7 100644 --- a/pretrain_data/mixer/config/v0-small.json +++ b/pretrain_data/mixer/config/v0-small.json @@ -3,9 +3,9 @@ { "name": "v0_small", "documents": [ - "pretraining-data/sources/common-crawl/v0-en/cc_en_head/*", - "pretraining-data/sources/common-crawl/v0-en/cc_en_middle/*", - "pretraining-data/sources/common-crawl/v0-en/cc_en_tail/*" + "pretraining-data/sources/common-crawl/v0-en/documents/cc_en_head/*", + "pretraining-data/sources/common-crawl/v0-en/documents/cc_en_middle/*", + "pretraining-data/sources/common-crawl/v0-en/documents/cc_en_tail/*" ], "output": { "path": "pretraining-data/sources/common-crawl/v0-small/documents",