diff --git a/pretrain_data/mixer/config/v0-small.json b/pretrain_data/mixer/config/v0-small.json index d7f261f8a..7cc7775c7 100644 --- a/pretrain_data/mixer/config/v0-small.json +++ b/pretrain_data/mixer/config/v0-small.json @@ -3,89 +3,17 @@ { "name": "v0_small", "documents": [ - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2023-06/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-49/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-40/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-33/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-27/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-21/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-05/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-49/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-43/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-39/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-31/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-25/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-21/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-17/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-10/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-04/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-50/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-45/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-40/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-34/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-29/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-24/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-16/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-10/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-05/*/en_head.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2023-06/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-49/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-40/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-33/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-27/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-21/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-05/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-49/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-43/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-39/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-31/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-25/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-21/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-17/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-10/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-04/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-50/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-45/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-40/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-34/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-29/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-24/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-16/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-10/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-05/*/en_middle.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2023-06/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-49/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-40/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-33/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-27/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-21/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-05/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-49/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-43/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-39/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-31/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-25/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-21/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-17/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-10/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-04/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-50/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-45/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-40/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-34/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-29/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-24/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-16/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-10/*/en_tail.json.gz", - "pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-05/*/en_tail.json.gz" + "pretraining-data/sources/common-crawl/v0-en/documents/cc_en_head/*", + "pretraining-data/sources/common-crawl/v0-en/documents/cc_en_middle/*", + "pretraining-data/sources/common-crawl/v0-en/documents/cc_en_tail/*" ], "output": { "path": "pretraining-data/sources/common-crawl/v0-small/documents", - "max_size_in_bytes": 42949672960 + "max_size_in_bytes": 85899345920 }, "attributes": ["sample"], "filter": { - "include": ["$.attributes[?(@.sample__random_number_v1__random[0][2] < 0.085)]"], + "include": ["$.attributes[?(@.sample__random_number_v1__random[0][2] < 0.05)]"], "exclude": [] } }