Skip to content

Commit

Permalink
Merge pull request #181 from allenai/v0-small
Browse files (browse the repository at this point in the history)
v0-small config
  • Loading branch information
rodneykinney committed May 30, 2023
2 parents 3478cb0 + a987277 commit 2437cdf
Showing 1 changed file with 5 additions and 77 deletions.
82 changes: 5 additions & 77 deletions pretrain_data/mixer/config/v0-small.json
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,89 +3,17 @@
 {
 "name": "v0_small",
 "documents": [
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2023-06/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-49/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-40/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-33/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-27/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-21/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-05/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-49/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-43/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-39/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-31/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-25/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-21/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-17/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-10/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-04/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-50/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-45/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-40/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-34/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-29/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-24/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-16/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-10/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-05/*/en_head.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2023-06/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-49/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-40/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-33/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-27/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-21/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-05/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-49/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-43/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-39/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-31/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-25/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-21/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-17/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-10/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-04/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-50/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-45/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-40/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-34/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-29/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-24/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-16/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-10/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-05/*/en_middle.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2023-06/*/en_tail.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-49/*/en_tail.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-40/*/en_tail.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-33/*/en_tail.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-27/*/en_tail.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-21/*/en_tail.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2022-05/*/en_tail.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-49/*/en_tail.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-43/*/en_tail.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-39/*/en_tail.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-31/*/en_tail.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-25/*/en_tail.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-21/*/en_tail.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-17/*/en_tail.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-10/*/en_tail.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2021-04/*/en_tail.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-50/*/en_tail.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-45/*/en_tail.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-40/*/en_tail.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-34/*/en_tail.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-29/*/en_tail.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-24/*/en_tail.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-16/*/en_tail.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-10/*/en_tail.json.gz",
-"pretraining-data/sources/common-crawl/v0/documents/mined_split/2020-05/*/en_tail.json.gz"
+"pretraining-data/sources/common-crawl/v0-en/documents/cc_en_head/*",
+"pretraining-data/sources/common-crawl/v0-en/documents/cc_en_middle/*",
+"pretraining-data/sources/common-crawl/v0-en/documents/cc_en_tail/*"
 ],
 "output": {
 "path": "pretraining-data/sources/common-crawl/v0-small/documents",
-"max_size_in_bytes": 42949672960
+"max_size_in_bytes": 85899345920
 },
 "attributes": ["sample"],
 "filter": {
-"include": ["$.attributes[?(@.sample__random_number_v1__random[0][2] < 0.085)]"],
+"include": ["$.attributes[?(@.sample__random_number_v1__random[0][2] < 0.05)]"],
 "exclude": []
 }
 }
Expand Down

0 comments on commit 2437cdf

Please sign in to comment.