diff --git a/configs/dolma-v1_7/v1_5-baseline/300g_sample.yaml b/configs/dolma-v1_7/v1_5-baseline/300g_sample.yaml new file mode 100644 index 00000000..7aef47d5 --- /dev/null +++ b/configs/dolma-v1_7/v1_5-baseline/300g_sample.yaml @@ -0,0 +1,97 @@ + +streams: + - name: books + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/books/*.gz + attributes: &attributes + - paloma_paragraphs + - paloma_documents + - random_number_v1 + output: &output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/books + max_size_in_bytes: 4294967296 + discard_fields: + - attributes + filter: &filter + include: + - "$.attributes[?(@.random_number_v1__random_number_v1__random[0][2] < 0.10)]" + exclude: + - "$@.attributes[?(@.paloma_documents_bff_duplicates && @.paloma_documents_bff_duplicates[0] && @.paloma_documents_bff_duplicates[0][2] >= 1.0)]" + - "$@.attributes[?(@.paloma_paragraphs_bff_duplicates && @.paloma_paragraphs_bff_duplicates[0] && @.paloma_paragraphs_bff_duplicates[0][2] >= 1.0)]" + + - name: c4 + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/c4/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/c4 + filter: *filter + + - name: cc_en_head + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_head/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/cc_en_head + filter: *filter + + - name: cc_en_middle + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_middle/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/cc_en_middle + filter: *filter + + - name: cc_en_tail + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_tail/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/cc_en_tail + filter: *filter + + - name: pes2o + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/pes2o/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/pes2o + filter: *filter + + - name: reddit + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/reddit/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/reddit + filter: *filter + + - name: stack + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/stack/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/stack + filter: *filter + + - name: wiki + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/wiki/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/wiki + filter: *filter + +work_dir: + input: "/tmp/olmo-mix-v1_5/input" + output: "/tmp/olmo-mix-v1_5/output" +processes: 188 diff --git a/configs/dolma-v1_7/v1_5-baseline/300g_tok.yml b/configs/dolma-v1_7/v1_5-baseline/300g_tok.yml new file mode 100644 index 00000000..47d18062 --- /dev/null 
+++ b/configs/dolma-v1_7/v1_5-baseline/300g_tok.yml @@ -0,0 +1,23 @@ +destination: ${oc.env:HOME}/ai2-llm/preprocessed/olmo-mix/v1_5-300G-decon/gpt-neox-olmo-dolma-v1_6 +documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/books + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/c4 + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/cc_en_head + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/cc_en_middle + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/cc_en_tail + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/pes2o + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/reddit + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/stack + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5-300G-decon/documents/wiki + # - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/starcoder/v0/documents/*/*.json.gz + +processes: 40 +seed: 3920 +max_size: 21_474_836_480 + +tokenizer: + name_or_path: allenai/gpt-neox-olmo-dolma-v1_5 + bos_token_id: null + eos_token_id: 50279 + pad_token_id: 1 + segment_before_tokenization: false diff --git a/configs/dolma-v1_7/v1_5-baseline/step2-run-decontamination/dolma-v1_5_docs.yaml b/configs/dolma-v1_7/v1_5-baseline/step2-run-decontamination/dolma-v1_5_docs.yaml new file mode 100644 index 00000000..ca16cc5b --- /dev/null +++ b/configs/dolma-v1_7/v1_5-baseline/step2-run-decontamination/dolma-v1_5_docs.yaml @@ -0,0 +1,26 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/books/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/c4/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_head/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_middle/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_tail/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/pes2o/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/reddit/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/stack/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/wiki/*.gz + + +dedupe: + name: paloma_documents + documents: + attribute_name: paloma_documents_bff_duplicates + key: $.text + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 188815 + desired_false_positive_rate: 1e-15 + file: ${oc.env:HOME}/perplexity/filters/paloma_documents.bin + +processes: 94 diff --git a/configs/dolma-v1_7/v1_5-baseline/step2-run-decontamination/dolma-v1_5_para.yaml b/configs/dolma-v1_7/v1_5-baseline/step2-run-decontamination/dolma-v1_5_para.yaml new file mode 100644 index 00000000..7bdfdedc --- /dev/null +++ b/configs/dolma-v1_7/v1_5-baseline/step2-run-decontamination/dolma-v1_5_para.yaml @@ -0,0 +1,29 @@ +documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/books/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/c4/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_head/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_middle/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/cc_en_tail/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/pes2o/*.gz + - 
s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/reddit/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/stack/*.gz + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/wiki/*.gz + # - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/books/*.gz + # - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/wiki/*.gz + + +dedupe: + name: paloma_paragraphs + paragraphs: + attribute_name: paloma_paragraphs_bff_duplicates + skip_empty: true + +bloom_filter: + read_only: true + estimated_doc_count: 2336120 + # size_in_bytes: 104857600 # 100 MB; smaller causes too many FPs + desired_false_positive_rate: 1e-15 + # file: s3://ai2-llm/bloom-filters/perplexity-suite-v3_option2.bin + file: ${oc.env:HOME}/perplexity/filters/paloma_paragraphs.bin + +processes: 94 diff --git a/configs/dolma-v1_7/v1_6-baseline/300g_sample.yml b/configs/dolma-v1_7/v1_6-baseline/300g_sample.yml new file mode 100644 index 00000000..9fe1f883 --- /dev/null +++ b/configs/dolma-v1_7/v1_6-baseline/300g_sample.yml @@ -0,0 +1,96 @@ +streams: + - name: books + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/books/*.gz + attributes: &attributes + - paloma_paragraphs + - paloma_documents + - random_number_v1 + output: &output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/books + max_size_in_bytes: 38949672960 + discard_fields: + - attributes + filter: &filter + include: + - "$.attributes[?(@.random_number_v1__random_number_v1__random[0][2] < 0.10)]" + exclude: + - "$@.attributes[?(@.paloma_documents_bff_duplicates && @.paloma_documents_bff_duplicates[0] && @.paloma_documents_bff_duplicates[0][2] >= 1.0)]" + - "$@.attributes[?(@.paloma_paragraphs_bff_duplicates && @.paloma_paragraphs_bff_duplicates[0] && @.paloma_paragraphs_bff_duplicates[0][2] >= 1.0)]" + + - name: c4 + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/c4/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/c4 + filter: *filter + + - name: cc_en_head + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_head/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/cc_en_head + filter: *filter + + - name: cc_en_middle + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_middle/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/cc_en_middle + filter: *filter + + - name: cc_en_tail + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/cc_en_tail/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/cc_en_tail + filter: *filter + + - name: pes2o + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/pes2o/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/pes2o + filter: *filter + + - name: reddit + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/reddit/*.gz + attributes: *attributes + output: + <<: *output + path: 
${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/reddit + filter: *filter + + - name: stack + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/stack/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/stack + filter: *filter + + - name: wiki + documents: + - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5r2/documents/wiki/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/wiki + filter: *filter + +work_dir: + input: "/tmp/olmo-mix-v1_6/input" + output: "/tmp/olmo-mix-v1_6/output" +processes: 188 diff --git a/configs/dolma-v1_7/v1_6-baseline/300g_tok.yml b/configs/dolma-v1_7/v1_6-baseline/300g_tok.yml new file mode 100644 index 00000000..298d902c --- /dev/null +++ b/configs/dolma-v1_7/v1_6-baseline/300g_tok.yml @@ -0,0 +1,23 @@ +destination: ${oc.env:HOME}/ai2-llm/preprocessed/olmo-mix/v1_6-300G-decon/gpt-neox-olmo-dolma-v1_6 +documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/books + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/c4 + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/cc_en_head + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/cc_en_middle + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/cc_en_tail + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/pes2o + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/reddit + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/stack + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/wiki + # - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/starcoder/v0/documents/*/*.json.gz + +processes: 40 +seed: 3920 +max_size: 21_474_836_480 + +tokenizer: + name_or_path: allenai/gpt-neox-olmo-dolma-v1_5 + bos_token_id: null + eos_token_id: 50279 + pad_token_id: 1 + segment_before_tokenization: false diff --git a/configs/dolma-v1_7/v1_6-baseline/tok_per_source.sh b/configs/dolma-v1_7/v1_6-baseline/tok_per_source.sh new file mode 100644 index 00000000..3f5eb42e --- /dev/null +++ b/configs/dolma-v1_7/v1_6-baseline/tok_per_source.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +sources=( + "books,1" + "c4,5" + "cc_en_head,10" + "cc_en_middle,10" + "cc_en_tail,10" + "pes2o,3" + "reddit,2" + "stack,5" + "wiki,1" +) + +set -x + +for i in "${!sources[@]}"; do + # split each entry into source name and number of processes + source=$(echo "${sources[$i]}" | cut -d',' -f1) + processes=$(echo "${sources[$i]}" | cut -d',' -f2) + + dolma tokens \ + --destination "s3://ai2-llm/preprocessed/olmo-mix/v1_6-300G-decon/gpt-neox-olmo-dolma-v1_6_persource/${source}" \ + --documents "${HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/${source}" \ + --tokenizer.name_or_path "allenai/gpt-neox-olmo-dolma-v1_5" \ + --tokenizer.eos_token_id 50279 \ + --tokenizer.pad_token_id 1 \ + --processes "${processes}" \ + --seed 3920 \ + --max_size "21_474_836_480" +done diff --git a/configs/dolma-v1_7/v1_7-blocklist/300g_sample.yml b/configs/dolma-v1_7/v1_7-blocklist/300g_sample.yml new file mode 100644 index 00000000..93e284ab --- /dev/null +++ b/configs/dolma-v1_7/v1_7-blocklist/300g_sample.yml @@ -0,0 +1,101
@@ +streams: + - name: books + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/books/*.gz + attributes: &attributes + - url_filtering + output: &output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon-urlfilter/documents/books + max_size_in_bytes: 38949672960 + # discard_fields: + # - attributes + + filter: &no_filter + include: + - "$.id" + + - name: pes2o + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/pes2o/*.gz + attributes: *attributes + filter: *no_filter + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon-urlfilter/documents/pes2o + + - name: reddit + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/reddit/*.gz + attributes: *attributes + filter: *no_filter + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon-urlfilter/documents/reddit + + - name: stack + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/stack/*.gz + attributes: *attributes + filter: *no_filter + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon-urlfilter/documents/stack + + - name: wiki + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/wiki/*.gz + attributes: *attributes + filter: *no_filter + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon-urlfilter/documents/wiki + + - name: c4 + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/c4/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon-urlfilter/documents/c4 + filter: &filter + exclude: + # only filter if not in allowlist + - "$.attributes[?(@['url_filtering__blocklist_hosts_gambling_v1__url'] && !@['url_filtering__allowlist_wikidata_cleaned_v1__url'])]" + - "$.attributes[?(@['url_filtering__blocklist_hosts_adware_malware_v1__url'] && !@['url_filtering__allowlist_wikidata_cleaned_v1__url'])]" + - "$.attributes[?(@['url_filtering__blocklist_hosts_porn_v1__url'] && !@['url_filtering__allowlist_wikidata_cleaned_v1__url'])]" + + # no intersection with wikidata allowlist for these + - "$.attributes[?(@['url_filtering__blocklist_hosts_fakenews_v1__url'])]" + - "$.attributes[?(@['url_filtering__blocklist_hosts_social_v1__url'])]" + + - name: cc_en_head + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/cc_en_head/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon-urlfilter/documents/cc_en_head + filter: *filter + + - name: cc_en_middle + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/cc_en_middle/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon-urlfilter/documents/cc_en_middle + filter: *filter + + - name: cc_en_tail + documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon/documents/cc_en_tail/*.gz + attributes: *attributes + output: + <<: *output + path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon-urlfilter/documents/cc_en_tail + 
filter: *filter + +work_dir: + input: "/tmp/olmo-mix-v1_6_url/input" + output: "/tmp/olmo-mix-v1_6_url/output" +processes: 188 diff --git a/configs/dolma-v1_7/v1_7-blocklist/300g_tok.yml b/configs/dolma-v1_7/v1_7-blocklist/300g_tok.yml new file mode 100644 index 00000000..3790e7a8 --- /dev/null +++ b/configs/dolma-v1_7/v1_7-blocklist/300g_tok.yml @@ -0,0 +1,23 @@ +destination: ${oc.env:HOME}/ai2-llm/preprocessed/olmo-mix/v1_6-300G-decon-urlfilter/gpt-neox-olmo-dolma-v1_6 +documents: + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon-urlfilter/documents/books + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon-urlfilter/documents/c4 + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon-urlfilter/documents/cc_en_head + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon-urlfilter/documents/cc_en_middle + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon-urlfilter/documents/cc_en_tail + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon-urlfilter/documents/pes2o + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon-urlfilter/documents/reddit + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon-urlfilter/documents/stack + - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/olmo-mix/v1_6-300G-decon-urlfilter/documents/wiki + # - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/starcoder/v0/documents/*/*.json.gz + +processes: 40 +seed: 3920 +max_size: 21_474_836_480 + +tokenizer: + name_or_path: allenai/gpt-neox-olmo-dolma-v1_5 + bos_token_id: null + eos_token_id: 50279 + pad_token_id: 1 + segment_before_tokenization: false diff --git a/python/dolma/models/__init__.py b/python/dolma/models/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/python/dolma/tokenizer/executor.py b/python/dolma/tokenizer/executor.py index c5673c47..117ea895 100644 --- a/python/dolma/tokenizer/executor.py +++ b/python/dolma/tokenizer/executor.py @@ -5,7 +5,7 @@ import tempfile from contextlib import ExitStack from math import ceil, log10 -from queue import Queue +from queue import Queue # pylint: disable=unused-import from typing import Any, Dict, List, Optional import numpy as np @@ -14,7 +14,7 @@ from ..core.loggers import get_logger from ..core.parallel import BaseParallelProcessor, QueueType from ..core.paths import glob_path, join_path, mkdir_p -from .data_types import TokenizerOutput +from .data_types import TokenizerOutput # pylint: disable=unused-import from .memmap_writer import MemmapWriter from .tokenizer import Tokenizer, tokenize_file @@ -24,7 +24,7 @@ class MemMapParallelWriter(BaseParallelProcessor): @classmethod - def increment_progressbar( # type: ignore[override] + def increment_progressbar( # type: ignore[override] # pylint: disable=arguments-differ cls, queue: QueueType, /, @@ -130,7 +130,7 @@ def process_single(cls, source_path: str, destination_path: str, queue: QueueTyp memwriter = stack.enter_context( MemmapWriter( path=destination_path + f"-{mm_cnt:05d}", - dtype=np.dtype("uint16"), + dtype=np.dtype("uint16"), # pyright: ignore max_tokens=max_size, ) ) @@ -163,17 +163,36 @@ def __call__(self, num_readers: Optional[int] = None, **process_single_kwargs: A all_source_paths = [p for p in set([p for source in self.src_prefixes for p in glob_path(source)])] random.shuffle(all_source_paths) - # group source paths into a number determined by num_readers or the (# sources / number of 
processes) - num_readers_per_writer = float(num_readers or len(all_source_paths) / self.num_processes) - grouped_source_prefixes = [] - i = 0.0 - while i < len(all_source_paths): - grouped_source_prefixes.append(all_source_paths[int(i) : int(i + num_readers_per_writer)]) - i += num_readers_per_writer - - # redefine num_processes to be the number of groups + # TRICKY BIT: Group source paths into buckets. + # First, work out the step size: the minimum of the number of readers requested + # and the number of source paths per process. + # The float("inf") bit handles the case where num_readers is None. + step_size = min(num_readers or float("inf"), len(all_source_paths) / self.num_processes) + + # Now, we step over all files in increments of step_size and group them into buckets, + # taking care not to add any empty buckets. + grouped_source_prefixes: List[List[str]] = [] + current_step = 0.0 + while current_step < len(all_source_paths): # can't use range here because of the float + prefix_slice = all_source_paths[int(current_step) : int(current_step + step_size)] + if prefix_slice: + grouped_source_prefixes.append(prefix_slice) + current_step += step_size + + # Finally, we cap num_processes at the number of groups; otherwise, some + # processes would have no work to do. self.num_processes = min(len(grouped_source_prefixes), self.num_processes) + # A few sanity checks to make sure the grouping was done correctly. + if any(len(bucket) == 0 for bucket in grouped_source_prefixes): + raise ValueError("Some buckets are empty. This should not happen.") + if len(grouped_source_prefixes) < self.num_processes: + raise ValueError("The number of groups is less than the number of processes. This should not happen.") + if len(all_source_paths) < len(grouped_source_prefixes): + raise ValueError( + "The number of groups is greater than the number of source paths. This should not happen." + ) + # this is a bit of a hack but: we pass indices to grouped_source_prefixes to the processors # so that they can load the correct source paths source_indices = [str(i) for i in range(len(grouped_source_prefixes))] @@ -188,21 +207,21 @@ def __call__(self, num_readers: Optional[int] = None, **process_single_kwargs: A # each parallel processor will write a file name like part-dddddd-dddd.npy and part-dddddd-dddd.csv.gz digits = int(ceil(log10(len(grouped_source_prefixes) + 1))) - destinations = [ + all_destination_paths = [ join_path(None, destination, f"part-{i:0{digits}d}") for i in range(len(grouped_source_prefixes)) ] # same for metadata metadata = self.meta_prefixes[0] mkdir_p(metadata) - metadatas = [join_path(None, metadata, f"{i}.done") for i in range(len(destinations))] + all_metadata_paths = [join_path(None, metadata, f"{i}.done") for i in range(len(all_destination_paths))] # finally run the processors fn = self._debug_run_all if self.debug else self._multiprocessing_run_all fn( all_source_paths=source_indices, - all_destination_paths=destinations, - all_metadata_paths=metadatas, + all_destination_paths=all_destination_paths, + all_metadata_paths=all_metadata_paths, grouped_source_prefixes=grouped_source_prefixes, **process_single_kwargs, ) @@ -226,8 +245,31 @@ def tokenize_in_parallel( dtype: str = "uint16", debug: bool = False, ): - # variables for the nice debugging and tokenizers - os.environ["PYTHONBREAKPOINT"] = "ipdb.set_trace" + """ + Tokenizes the input sources in parallel using multiple writers and readers.
+ + Args: + sources (List[str]): List of source file paths to tokenize. + destination (str): Destination directory to store the tokenized files. + num_writers (int, optional): Number of writer processes to use. Defaults to 1. + num_readers (int, optional): Number of reader processes to use. Defaults to None. + local_shuffle (int, optional): Number of lines to shuffle locally before writing. Defaults to 10_000. + ring_size (int, optional): Size of the ring buffer for inter-process communication. Defaults to 8. + tokenizer_name_or_path (str, optional): Name or path of the tokenizer to use. + Defaults to "allenai/gpt-neox-olmo-dolma-v1_5". Note that, if the tokenizer is changed, + you may need to adjust `bos_token_id`, `eos_token_id`, and `pad_token_id` accordingly. + bos_token_id (int, optional): ID of the beginning-of-sentence token. Defaults to None. + eos_token_id (int, optional): ID of the end-of-sentence token. Defaults to 50279. + pad_token_id (int, optional): ID of the padding token. Defaults to 1. + segment_before_tokenization (bool, optional): Whether to segment the input before tokenization. + Defaults to False. + seed (int, optional): Seed value for randomization. Defaults to 3920. + metadata_dir (str, optional): Directory to store metadata files. Defaults to None. + max_size (int, optional): Maximum size of each tokenized file. Defaults to 1024 * 1024 * 1024. + dtype (str, optional): Data type for tokenized files. Defaults to "uint16". + debug (bool, optional): Whether to enable debug mode. Defaults to False. + """ + # variables to avoid issues with parallelism os.environ["TOKENIZERS_PARALLELISM"] = "false" # do it once so it gets cached (unless it's local path, so no need)
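For context: the YAML files in this patch are consumed by the dolma CLI. Following the conventions in the dolma docs and in tok_per_source.sh above, a run presumably looks like "dolma -c configs/dolma-v1_7/v1_6-baseline/300g_sample.yml mix --processes 188", with the dedupe and tokens subcommands driving the step2 and *_tok configs respectively; exact invocations are an assumption, not part of this patch.

The float-stepped bucketing added to python/dolma/tokenizer/executor.py is the subtlest change here. Below is a minimal, self-contained sketch of that strategy for illustration only: the helper name group_paths is hypothetical, and the real logic lives inline in MemMapParallelWriter.__call__ with self.num_processes in place of the num_processes argument.

from typing import List, Optional


def group_paths(
    paths: List[str],
    num_processes: int,
    num_readers: Optional[int] = None,
) -> List[List[str]]:
    # Hypothetical helper mirroring the bucketing in MemMapParallelWriter.__call__.
    # The step is the minimum of the requested readers per bucket and the even
    # per-process share of paths; float("inf") stands in when num_readers is None.
    step_size = min(num_readers or float("inf"), len(paths) / num_processes)
    groups: List[List[str]] = []
    current = 0.0
    while current < len(paths):  # float step, so range() cannot be used
        chunk = paths[int(current) : int(current + step_size)]
        if chunk:  # never emit an empty bucket
            groups.append(chunk)
        current += step_size
    return groups


if __name__ == "__main__":
    # 10 paths over 4 processes -> step of 2.5, yielding buckets of sizes 2/3/2/3.
    paths = [f"part-{i:02d}.json.gz" for i in range(10)]
    assert [len(g) for g in group_paths(paths, num_processes=4)] == [2, 3, 2, 3]

Because the step can be fractional, bucket sizes may differ by one (2/3/2/3 in the example) while every path is still covered exactly once. When fewer groups than processes are produced, the patch then lowers self.num_processes to the group count so that no process sits idle.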