allenai · soldni · May 7, 2024 · May 7, 2024 · May 8, 2024 · May 8, 2024
diff --git a/.devcontainer/postInstall.sh b/.devcontainer/postInstall.sh
@@ -2,4 +2,4 @@
 
 PATH=/home/vscode/.cargo/bin:$PATH
 cd dolma
-source /home/vscode/miniforge3/bin/activate && pip install cmake "maturin[patchelf]>=1.1,<2.0"
+source /home/vscode/miniforge3/bin/activate && pip install cmake "maturin>=1.5,<2.0"
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -19,6 +19,7 @@ permissions:
 env:
   DOLMA_TESTS_SKIP_AWS: ${{ secrets.AWS_ACCESS_KEY_ID == '' && 'true' || 'false' }}
   DOLMA_TEST_S3_PREFIX: s3://dolma-tests
+  DOLMA_TEST_SKIP_LARGE_MODELS: "true"
   RUST_CHANNEL: stable
 
 jobs:

diff --git a/Makefile b/Makefile
@@ -23,7 +23,7 @@ setup:
 	$(shell "${PROTOBUF_SETUP}")
 	$(shell "${OPENSSL_SETUP}")
 	which cargo || curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
-	which maturin || pip install maturin[patchelf]
+	which maturin || pip install 'maturin>=1.5,<2.0'
 
 publish:
 	maturin publish

diff --git a/configs/cc-news/dedupe.sh b/configs/cc-news/dedupe.sh
@@ -0,0 +1,73 @@
+#! /usr/bin/env bash
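+# NOTE: the commented-out YAML below references c4 paths and differs from the flags passed to `dolma dedupe` further down (e.g. false positive rate 0.1 vs 0.01); only the CLI flags are executed.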
+# documents:
+#   - s3://ai2-llm/pretraining-data/sources/c4/v0/documents/train/*.gz
+
+# dedupe:
+#   name: dedupe_para_ngrams_13_1
+#   paragraphs:
+#     attribute_name: dedupe_para_ngrams_13_1
+#     by_ngram:
+#       ngram_length: 13
+#       stride: 1
+#       overlap_threshold: 0.5
+#   skip_empty: true
+
+# bloom_filter:
+#   file: ${oc.env:HOME}/c4_dedupe_para_ngrams_13_1.bin
+#   read_only: false
+#   # estimated doc count is obtained by counting number of words in paragraphs
+#   # then dividing by 13 (ngram_length) and multiplying by 2 (for each ngram)
+#   estimated_doc_count: 359_916_731_334
+#   desired_false_positive_rate: 0.1
+
+# processes: 188
+# work_dir:
+#   input: /tmp/c4_dedupe_para_ngrams_13_1/input
+#   output: /tmp/c4_dedupe_para_ngrams_13_1/output
+
+# run one `dolma dedupe` pass per CC-News month, from 2016-08 through 2024-07
+for year in {2016..2024}; do
+    # run months between 1 and 12
+    for month in {1..12}; do
+        # skip months after 7 if year is 2024
+        if [ "$year" -eq 2024 ] && [ "$month" -gt 7 ]; then
+            continue
+        fi
+
+        # skip months before 8 if year is 2016
+        if [ "$year" -eq 2016 ] && [ "$month" -lt 8 ]; then
+            continue
+        fi
+
+        # zero-pad the month to 2 digits, matching the YYYY-MM prefixes used below
+        month=$(printf "%02d" "$month")
+
+        documents="s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/${year}-${month}/*.zst"
+        # sum of the object sizes under this month's prefix; passed below as --bloom_filter.estimated_doc_count
+        size=$(aws s3api list-objects --bucket ai2-llm --prefix "pretraining-data/sources/cc-news/v0-resiliparse/documents/${year}-${month}/" --output json --query "[sum(Contents[].Size)]" | jq '.[0]' -rc)
+
+        # run deduplication
+        echo "Running fuzzy dedupe for ${year}-${month} with ${size} bytes Bloom filter"
+
+        set -ex
+        # ${documents} and ${size} are quoted: the glob must reach dolma unexpanded, and an empty size must not swallow the next flag
+        dolma dedupe \
+            --documents "${documents}" \
+            --dedupe.name dedupe_ngrams_13_1 \
+            --dedupe.paragraphs.attribute_name dedupe_ngrams_13_1 \
+            --dedupe.paragraphs.by_ngram.ngram_length 13 \
+            --dedupe.paragraphs.by_ngram.stride 1 \
+            --dedupe.paragraphs.by_ngram.overlap_threshold 0.5 \
+            --dedupe.skip_empty \
+            --bloom_filter.file "${HOME}/cc-news/dedupe_ngrams_13_1-${year}-${month}.bin" \
+            --no-bloom_filter.read_only \
+            --bloom_filter.estimated_doc_count "${size}" \
+            --bloom_filter.desired_false_positive_rate 0.01 \
+            --processes "$(( $(nproc) - 4 ))" \
+            --work_dir.input "/tmp/cc-news/dedupe_ngrams_13_1/${year}-${month}/input" \
+            --work_dir.output "/tmp/cc-news/dedupe_ngrams_13_1/${year}-${month}/output"
+
+        set +ex
+    done
+done
diff --git a/configs/cc-news/extract.sh b/configs/cc-news/extract.sh
@@ -0,0 +1,43 @@
+#! /usr/bin/env bash
+
+# run one `dolma warc` extraction per CC-News month, from 2016-08 through 2024-07
+for year in {2016..2024}; do
+    # run months between 1 and 12
+    for month in {1..12}; do
+        # skip months after 7 if year is 2024
+        if [ "$year" -eq 2024 ] && [ "$month" -gt 7 ]; then
+            continue
+        fi
+
+        # skip months before 8 if year is 2016
+        if [ "$year" -eq 2016 ] && [ "$month" -lt 8 ]; then
+            continue
+        fi
+
+        # zero-pad the month to 2 digits, matching the YYYY/MM source layout used below
+        month=$(printf "%02d" "$month")
+
+        documents="s3://ai2-russella/crawl-data/CC-NEWS/${year}/${month}/*.warc.gz"
+
+        # run the extraction
+        echo "Running extraction for ${year}-${month}"
+
+        set -ex
+        # ${documents} is quoted so the shell passes the glob to dolma unexpanded
+        dolma warc \
+            --documents "${documents}" \
+            --destination "s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/${year}-${month}" \
+            --processes "$(( $(nproc) - 4 ))" \
+            --source_name "cc-news_${year}-${month}" \
+            --linearizer resiliparse \
+            --pre.taggers cc_re \
+            --no-pre.skip \
+            --no-store.html \
+            --store.attr_spans 500 \
+            --skip_duplicate_urls \
+            --work_dir.input "/tmp/cc-news/${year}-${month}/input" \
+            --work_dir.output "/tmp/cc-news/${year}-${month}/output"
+
+        set +ex
+    done
+done
diff --git a/configs/cc-news/make_lang_partition.py b/configs/cc-news/make_lang_partition.py
@@ -0,0 +1,64 @@
+import json
+from typing import List
+import smart_open
+
+
+SRC_BASE = "s3://ai2-llm/pretraining-data/sources/cc-news"
+SRC_PRFX = "v1-resiliparse"
+LANG_THR = 100_000
+DST_BASE = "${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news"
+DST_PRFX = f"v2-resiliparse-l{LANG_THR // 1000}k"
+
+
+def base_stream_config(lang: str, year: int, months: List[int]):
+    return {
+        "name": f"cc-news_{year:04d}_{lang}",
+        "documents": [
+            f"{SRC_BASE}/{SRC_PRFX}/documents/{year:04d}-{month:02d}/*.zst"
+            for month in months
+        ],
+        "compression": {"input": "zst", "output": "zst"},
+        "output": {
+            "path": f"{DST_BASE}/{DST_PRFX}/documents/{lang}/{year:04d}",
+            "max_size_in_bytes": 10_000_000_000,
+        },
+        "attributes": ["ft_lang_id_1e2", "dolma_v2_tokenizer"],
+        "filter": {
+            "include": [
+                # keep only documents whose dolma_v2_tokenizer length value is at least 100
+                ".attributes.dolma_v2_tokenizer__dolma_v2_tokenizer__length[0][-1] >= 100",
+                # require that the {lang} attribute exists, its value is >= 0.5, and it is the max_by(.value) entry among all ft_lang_id_1e2 attributes
+                (
+                    f"(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__{lang} != null) and "
+                    + f"(.attributes.ft_lang_id_1e2__ft_lang_id_1e2__{lang}[0][-1] >= 0.5) and "
+                    + f'((.attributes | to_entries | map(select(.key | startswith("ft_lang_id_1e2__ft_lang_id_1e2__"))) | max_by(.value) | .key ) == "ft_lang_id_1e2__ft_lang_id_1e2__{lang}")'
+                ),
+            ],
+            "syntax": "jq",
+        },
+    }
+
+
+def main():
+    with smart_open.open("s3://ai2-llm/stats/cc-news/v1-resiliparse/attributes/ft_lang_id_1e2_summary.json") as f:
+        lang_counts = json.load(f)
+    # keep only the languages whose count in the summary file is at least LANG_THR
+    languages = {k: v for k, v in lang_counts.items() if v >= LANG_THR}
+    # build one stream per (language, year); months run from 2016-08 through 2024-07
+    streams = []
+    for year in range(2016, 2025):
+        if year == 2016:
+            months = list(range(8, 13))
+        elif year == 2024:
+            months = list(range(1, 8))
+        else:
+            months = list(range(1, 13))
+
+        streams.extend([base_stream_config(lang, year, months) for lang in languages])
+    # write the generated config; the path is relative to the current working directory
+    with smart_open.open("configs/cc-news/mix_v2.json", "wt") as f:
+        json.dump({"processes": 1, "streams": streams}, f, indent=2)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/configs/cc-news/mix_v1.sh b/configs/cc-news/mix_v1.sh
@@ -0,0 +1,45 @@
+#! /usr/bin/env bash
+
+
+# resolve the directory containing this script (following symlinks) into SCRIPT_DIR; it is used below to locate mix_v1.yaml
+SOURCE="${BASH_SOURCE[0]}"
+while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink
+  SCRIPT_DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )"
+  SOURCE="$(readlink "$SOURCE")"
+  # if $SOURCE was a relative symlink, we need to resolve it
+  # relative to the path where the symlink file was located
+  [[ $SOURCE != /* ]] && SOURCE="$SCRIPT_DIR/$SOURCE"
+done
+SCRIPT_DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )"
+
+
+# run one `dolma mix` pass per CC-News month, from 2016-08 through 2024-07
+for year in {2016..2024}; do
+    # run months between 1 and 12
+    for month in {1..12}; do
+        # skip months after 7 if year is 2024
+        if [ "$year" -eq 2024 ] && [ "$month" -gt 7 ]; then
+            continue
+        fi
+
+        # skip months before 8 if year is 2016
+        if [ "$year" -eq 2016 ] && [ "$month" -lt 8 ]; then
+            continue
+        fi
+
+        # zero-pad the month to 2 digits, matching the YYYY-MM paths in mix_v1.yaml
+        month=$(printf "%02d" "$month")
+
+        # run the mixer for this month
+        echo "Mixing ${year}-${month}"
+        # mix_v1.yaml reads these through ${oc.env:MIX_YEAR} and ${oc.env:MIX_MONTH}
+        export MIX_MONTH="${month}"
+        export MIX_YEAR="${year}"
+
+        set -ex
+        # the config path is quoted because the resolved script directory may contain spaces
+        dolma -c "${SCRIPT_DIR}/mix_v1.yaml" mix
+
+        set +ex
+    done
+done
diff --git a/configs/cc-news/mix_v1.yaml b/configs/cc-news/mix_v1.yaml
@@ -0,0 +1,25 @@
+streams:
+    - name: cc-news_${oc.env:MIX_YEAR}-${oc.env:MIX_MONTH}
+      documents:
+        - s3://ai2-llm/pretraining-data/sources/cc-news/v0-resiliparse/documents/${oc.env:MIX_YEAR}-${oc.env:MIX_MONTH}/*.zst
+
+      compression:
+        input: zst
+        output: zst
+
+      output:
+        path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/cc-news/v1-resiliparse/documents/${oc.env:MIX_YEAR}-${oc.env:MIX_MONTH}
+        max_size_in_bytes: 1_000_000_000
+
+      attributes:
+        - dedupe_ngrams_13_1
+      # exclude a document when sum(.[2] * (.[1] - .[0])) over its dedupe_ngrams_13_1 entries is >= 50% of the text length (presumably entries are [start, end, score] — confirm)
+      filter:
+        exclude:
+          - >-
+           (.attributes.dedupe_ngrams_13_1 | length > 0) and
+           ((.attributes.dedupe_ngrams_13_1 | map(.[2] * (.[1] - .[0])) | add) / (.text | length) >= 0.5)
+
+        syntax: jq
+
+processes: 1