In [42]:
import dolma

In [43]:
config_path = "../configs/cccc"
base_config = """
documents:
{paths}

dedupe:
  name: dedupe_url
  documents:
    attribute_name: dedupe_url
    key: $.metadata.url
  skip_empty: true


bloom_filter:
  file: /tmp/dedupe-url/cccc-{snapshot}/filter.bloom
  read_only: false
  estimated_doc_count: 100_000_000
  desired_false_positive_rate: 1e-06

processes: 188
work_dir:
  input: /tmp/dedupe-url/cccc-{snapshot}/input
  output: /tmp/dedupe-url/cccc-{snapshot}/output
"""

In [44]:
import boto3


def list_zst_files(bucket_name, prefix, extension: str = ".zst"):
    s3_client = boto3.client("s3")
    paginator = s3_client.get_paginator("list_objects_v2")

    # Create a paginator to iterate through the objects
    operation_parameters = {"Bucket": bucket_name, "Prefix": prefix}
    page_iterator = paginator.paginate(**operation_parameters)

    zst_files = []

    for page in page_iterator:
        if "Contents" in page:
            for obj in page["Contents"]:
                if obj["Key"].endswith(extension):
                    zst_files.append(obj["Key"])

    return zst_files

In [45]:
from dolma.core.paths import glob_path, mkdir_p, exists
import itertools

config_doc_path = config_path + "/dedupe-url"
mkdir_p(config_doc_path)

SKIP_SNAPSHOTS = [
    "CC-MAIN-2016-26",
    "CC-MAIN-2016-30",
    "CC-MAIN-2018-09",
    "CC-MAIN-2017-30",
    "CC-MAIN-2018-09",
    "CC-MAIN-2019-51",
    "CC-MAIN-2019-43",
    "CC-MAIN-2020-24",
    "CC-MAIN-2020-10",
    "CC-MAIN-2020-40",
    "CC-MAIN-2023-40",
]
commands = []

for snapshot_path in glob_path("s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/*"):
    snapshot = snapshot_path.split("/")[-1]

    if exists(f"s3://ai2-llm/pretraining-data/sources/cccc/v0/attributes/dedupe_url/{snapshot}"):
        print("Already deduped", snapshot)
        continue

    if snapshot in SKIP_SNAPSHOTS:
        print("Skipping", snapshot)
        continue

    bucket_name, prefix = snapshot_path.split("/", 3)[2:]
    paths = [f"    - s3://{bucket_name}/{path}" for path in list_zst_files(bucket_name, prefix)]
    print(snapshot_path, len(paths))
    config = base_config.format(paths="\n".join(paths), snapshot=snapshot)
    with open(config_doc_path + f"/{snapshot}.yaml", "w") as f:
        f.write(config)

    commands.append(f"dolma -c {config_doc_path.lstrip('../')}/{snapshot}.yaml dedupe")

with open(config_path + "/run-dedupe-url-v2.sh", "w") as f:
    f.write("#!/bin/bash\n\n")
    f.write("set -ex\n\n")
    f.write("\n".join(commands))


Already deduped CC-MAIN-2013-20
Already deduped CC-MAIN-2013-48
Already deduped CC-MAIN-2014-10
Already deduped CC-MAIN-2014-15
Already deduped CC-MAIN-2014-23
Already deduped CC-MAIN-2014-35
Already deduped CC-MAIN-2014-41
Already deduped CC-MAIN-2014-42
Already deduped CC-MAIN-2014-49
Already deduped CC-MAIN-2014-52
Already deduped CC-MAIN-2015-06
Already deduped CC-MAIN-2015-11
Already deduped CC-MAIN-2015-14
Already deduped CC-MAIN-2015-18
Already deduped CC-MAIN-2015-22
Already deduped CC-MAIN-2015-27
Already deduped CC-MAIN-2016-07
Already deduped CC-MAIN-2016-18
Already deduped CC-MAIN-2016-22
Skipping CC-MAIN-2016-26
Skipping CC-MAIN-2016-30
Already deduped CC-MAIN-2017-04
Already deduped CC-MAIN-2017-09
Already deduped CC-MAIN-2017-13
Already deduped CC-MAIN-2017-17
Already deduped CC-MAIN-2017-22
Already deduped CC-MAIN-2017-26
Skipping CC-MAIN-2017-30
Already deduped CC-MAIN-2017-51
Skipping CC-MAIN-2018-09
Already deduped CC-MAIN-2018-13
Already deduped CC-MAIN-2018-22
Alre

In [46]:
config_path = "../configs/cccc"
base_doc_config = """
documents:
{paths}

dedupe:
  name: dedupe_doc
  documents:
    attribute_name: dedupe_doc
    key: $.text
  skip_empty: true


bloom_filter:
  file: /tmp/dedupe-doc/cccc-{snapshot}/filter.bloom
  read_only: false
  estimated_doc_count: 100_000_000
  desired_false_positive_rate: 1e-05

processes: 188
work_dir:
  input: /tmp/dedupe-doc-v1/cccc-{snapshot}/input
  output: /tmp/dedupe-doc-v1/cccc-{snapshot}/output
"""

In [47]:
# to_skip = dedupe_doc_snapshot["step-1"] + dedupe_doc_snapshot["step-2"]
# print(
#     "\n".join(
#         f'"{s}",'
#         for p in glob_path("s3://ai2-llm/pretraining-data/sources/cccc/v0/documents")
#         if (s := p.rsplit("/", 1)[1]) not in to_skip
#     )
# )

In [48]:
dedupe_doc_snapshot = {
    "step-1": [
        "CC-MAIN-2014-42",
        "CC-MAIN-2014-49",
        "CC-MAIN-2016-07",
        "CC-MAIN-2016-18",
        "CC-MAIN-2016-22",
        "CC-MAIN-2017-09",
        "CC-MAIN-2017-22",
        "CC-MAIN-2017-26",
        "CC-MAIN-2018-13",
        "CC-MAIN-2018-22",
        "CC-MAIN-2018-26",
        "CC-MAIN-2018-30",
        "CC-MAIN-2019-04",
        "CC-MAIN-2019-09",
        "CC-MAIN-2019-13",
        "CC-MAIN-2019-35",
        "CC-MAIN-2019-39",
        "CC-MAIN-2020-29",
        "CC-MAIN-2020-34",
        "CC-MAIN-2021-17",
        "CC-MAIN-2021-43",
        "CC-MAIN-2021-49",
        "CC-MAIN-2022-05",
        "CC-MAIN-2023-06",
        "CC-MAIN-2023-14",
        "CC-MAIN-2023-23",
        "CC-MAIN-2023-50",
        "CC-MAIN-2024-10",
        "CC-MAIN-2024-18",
    ],
    "step-2": [
        "CC-MAIN-2013-20",
        "CC-MAIN-2013-48",
        "CC-MAIN-2014-10",
        "CC-MAIN-2014-15",
        "CC-MAIN-2014-23",
        "CC-MAIN-2014-35",
        "CC-MAIN-2014-41",
        "CC-MAIN-2014-52",
        "CC-MAIN-2015-06",
        "CC-MAIN-2015-11",
        "CC-MAIN-2015-14",
        "CC-MAIN-2015-18",
        "CC-MAIN-2015-22",
        "CC-MAIN-2015-27",
        "CC-MAIN-2017-04",
        "CC-MAIN-2017-13",
        "CC-MAIN-2017-17",
        "CC-MAIN-2017-51",
        "CC-MAIN-2018-34",
        "CC-MAIN-2018-47",
        "CC-MAIN-2018-51",
        "CC-MAIN-2019-30",
        "CC-MAIN-2021-39",
    ],
    "step-3": [
        "CC-MAIN-2016-26",
        "CC-MAIN-2016-30",
        "CC-MAIN-2017-30",
        "CC-MAIN-2018-09",
        "CC-MAIN-2019-43",
        "CC-MAIN-2019-51",
        "CC-MAIN-2020-10",
        "CC-MAIN-2020-24",
        "CC-MAIN-2020-40",
        "CC-MAIN-2023-40",
    ]
}

from dolma.core.paths import glob_path, mkdir_p
import itertools

config_doc_path = config_path + "/dedupe-doc"
base_prefix = "s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents"
mkdir_p(config_doc_path)

for part, snapshots in dedupe_doc_snapshot.items():
    commands = []
    for snapshot in snapshots:
        snapshot_path = f"{base_prefix}/{snapshot}"

        bucket_name, prefix = snapshot_path.split("/", 3)[2:]
        paths = [
            f"  - s3://{bucket_name}/{path}" for path in list_zst_files(bucket_name, prefix, extension=".gz")
        ]
        print(snapshot_path, len(paths))
        config = base_doc_config.format(paths="\n".join(paths), snapshot=snapshot)
        with open(config_doc_path + f"/{snapshot}.yaml", "w") as f:
            f.write(config)

        commands.append(f"dolma -c {config_doc_path.lstrip('../')}/{snapshot}.yaml dedupe")

    with open(config_path + f"/run-dedupe-doc-{part}.sh", "w") as f:
        f.write("#!/bin/bash\n\n")
        f.write("set -ex\n\n")
        f.write("\n".join(commands))

s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2014-42 65
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2014-49 33
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2016-07 35
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2016-18 26
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2016-22 25
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2017-09 44
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2017-22 49
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2017-26 54
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2018-13 43
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2018-22 36
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2018-26 41
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_f

In [49]:
mkdir_p(config_doc_path)

for part, snapshots in dedupe_doc_snapshot.items():
    commands = []
    for snapshot in snapshots:
        snapshot_path = f"{base_prefix}/{snapshot}"

        bucket_name, prefix = snapshot_path.split("/", 3)[2:]
        paths = [
            f"  - s3://{bucket_name}/{path}" for path in list_zst_files(bucket_name, prefix, extension=".gz")
        ]
        print(snapshot_path, len(paths))
        config = base_doc_config.format(paths="\n".join(paths), snapshot=snapshot)
        with open(config_doc_path + f"/{snapshot}.yaml", "w") as f:
            f.write(config)

        commands.append(f"dolma -c {config_doc_path.lstrip('../')}/{snapshot}.yaml dedupe")

    with open(config_path + f"/run-dedupe-doc-{part}.sh", "w") as f:
        f.write("#!/bin/bash\n\n")
        f.write("set -ex\n\n")
        f.write("\n".join(commands))

s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2014-42 65
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2014-49 33
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2016-07 35
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2016-18 26
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2016-22 25
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2017-09 44
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2017-22 49
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2017-26 54
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2018-13 43
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2018-22 36
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2018-26 41
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_f

In [50]:
config_path = "../configs/cccc"
base_para_config = """
documents:
{paths}

dedupe:
  name: dedupe_para
  paragraphs:
    attribute_name: dedupe_para
    by_ngram:
      ngram_length: 20
      stride: 1
      overlap_threshold: 0.5
      skip_short_paragraphs: true
  skip_empty: true


bloom_filter:
  file: /tmp/dedupe-para/cccc-{snapshot}/filter.bloom
  read_only: false
  estimated_doc_count: 50_000_000_000
  desired_false_positive_rate: 1e-02

processes: 188
work_dir:
  input: /tmp/dedupe-para-v1/cccc-{snapshot}/input
  output: /tmp/dedupe-para-v1/cccc-{snapshot}/output
"""

In [51]:
dedupe_para_snapshots = {
    'part-1': [
        "CC-MAIN-2016-22",
        "CC-MAIN-2017-26",
        "CC-MAIN-2018-26",
        "CC-MAIN-2018-30",
        "CC-MAIN-2019-09",
        "CC-MAIN-2019-13",
        "CC-MAIN-2019-35",
        "CC-MAIN-2020-29",
        "CC-MAIN-2020-34",
        "CC-MAIN-2021-17",
        "CC-MAIN-2021-49",
        "CC-MAIN-2022-05",
        "CC-MAIN-2023-14",
        "CC-MAIN-2024-10",
        "CC-MAIN-2024-18"
    ],
    'part-2': [
        "CC-MAIN-2018-26"
        "CC-MAIN-2021-43"
        "CC-MAIN-2017-51"
        "CC-MAIN-2019-39"
        "CC-MAIN-2023-50"
        "CC-MAIN-2018-13"
        "CC-MAIN-2016-18"
        "CC-MAIN-2016-07"
        "CC-MAIN-2023-23"
        "CC-MAIN-2018-22"
        "CC-MAIN-2019-04"
        "CC-MAIN-2017-22"
        "CC-MAIN-2023-06"
    ]
}

In [52]:
from dolma.core.paths import glob_path, mkdir_p
import itertools

config_doc_path = config_path + "/dedupe-para"
base_prefix = "s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents"
mkdir_p(config_doc_path)

for part, snapshots in dedupe_para_snapshots.items():
    commands = []
    for snapshot in snapshots:
        snapshot_path = f"{base_prefix}/{snapshot}"

        bucket_name, prefix = snapshot_path.split("/", 3)[2:]
        paths = [f"    - s3://{bucket_name}/{path}" for path in list_zst_files(bucket_name, prefix, extension=".gz")]
        print(snapshot_path, len(paths))
        config = base_para_config.format(paths="\n".join(paths), snapshot=snapshot)
        with open(config_doc_path + f"/{snapshot}.yaml", "w") as f:
            f.write(config)

        commands.append(f"dolma -c {config_doc_path.lstrip('../')}/{snapshot}.yaml dedupe")

    with open(config_path + f"/run-dedupe-para-{part}.sh", "w") as f:
        f.write("#!/bin/bash\n\n")
        f.write("set -ex\n\n")
        f.write("\n".join(commands))

s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2016-22 25


s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2017-26 54
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2018-26 41
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2018-30 45
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2019-09 35
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2019-13 31
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2019-35 51
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2020-29 53
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2020-34 47
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2021-17 17
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2021-49 37
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2022-05 42
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_f

In [53]:
from dolma.core.paths import make_relative

all_snapshots = set(glob_path("s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/*"))
counted_snapshots = set(
    f"s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/{snapshot_path.rsplit('/', 1)[-1]}"
    for snapshot_path in glob_path("s3://ai2-llm/pretraining-data/sources/cccc/v0/attributes/dedupe_url/*")
)
to_skip = set([f"s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/{snapshot}" for snapshot in SKIP_SNAPSHOTS])

# print(len(all_snapshots))
# print(len(counted_snapshots))
# print(len(to_skip))
# print(len(all_snapshots - counted_snapshots - to_skip))


to_glob = []
for snapshot_path in sorted(all_snapshots - counted_snapshots - to_skip):
    snapshot = snapshot_path.split("/")[-1]
    bucket_name, prefix = snapshot_path.split("/", 3)[2:]
    all_zst = list_zst_files(bucket_name, prefix)

    nested = sum(1 for path in all_zst if "/warc/" in path)

    if nested > 0:
        print(f"  - s3://{bucket_name}/{prefix}/*/warc/*.zst")
    if nested < len(all_zst):
        print(f"  - s3://{bucket_name}/{prefix}/*.zst")

In [54]:
mixer_config = """
streams:
  - name: cccc-{snapshot}
    documents:{paths}
    attributes:
      - c4_v2
      - dedupe_url
      - ft_lang_id_1e2
      - gopher_v2
      - tokenizer_repetitions_v2r2
      - whitespace_tokenizer_v1
    output:
      max_size_in_bytes: 2_000_000_000
      path: s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/{snapshot}
      min_text_length: 25   # matches wikipedia
    filter:
      syntax: jq
      include:
        # Only English
        - >-
          (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en != null) and
          (.attributes.ft_lang_id_1e2__ft_lang_id_1e2__en[0][2] > 0.5)
      exclude:
        # Duplicated URLs
        - (.attributes.dedupe_url | length > 1)

        # C4 Rules
        - >-
          (.attributes.c4_v2__c4_v2__has_curly_brace != null) and
          (.attributes.c4_v2__c4_v2__has_curly_brace[0][2] > 0.5)
        - >-
          (.attributes.c4_v2__c4_v2__has_lorem_ipsum != null) and
          (.attributes.c4_v2__c4_v2__has_lorem_ipsum[0][2] > 0.5)
        - >-
          (.attributes.c4_v2__c4_v2__has_javascript != null) and
          (.attributes.c4_v2__c4_v2__has_javascript[0][2] > 0.5)

        # Gopher Rules
        - >-
          (.attributes.gopher_v2__gopher_v2__word_count != null) and
          (.attributes.gopher_v2__gopher_v2__word_count[0][2] < 50)
        - >-
          (.attributes.gopher_v2__gopher_v2__word_count != null) and
          (.attributes.gopher_v2__gopher_v2__word_count[0][2] > 100000)
        - >-
          (.attributes.gopher_v2__gopher_v2__median_word_length != null) and
          (.attributes.gopher_v2__gopher_v2__median_word_length[0][2] < 3)
        - >-
          (.attributes.gopher_v2__gopher_v2__median_word_length != null) and
          (.attributes.gopher_v2__gopher_v2__median_word_length[0][2] > 10)
        - >-
          (.attributes.gopher_v2__gopher_v2__symbol_to_word_ratio != null) and
          (.attributes.gopher_v2__gopher_v2__symbol_to_word_ratio[0][2] > 0.1)
        - >-
          (.attributes.gopher_v2__gopher_v2__fraction_of_words_with_alpha_character != null) and
          (.attributes.gopher_v2__gopher_v2__fraction_of_words_with_alpha_character[0][2] < 0.8)
        - >-
          (.attributes.gopher_v2__gopher_v2__required_word_count != null) and
          (.attributes.gopher_v2__gopher_v2__required_word_count[0][2] < 2)
        - >-
          (.attributes.gopher_v2__gopher_v2__fraction_of_lines_starting_with_bullet_point != null) and
          (.attributes.gopher_v2__gopher_v2__fraction_of_lines_starting_with_bullet_point[0][2] > 0.9)
        - >-
          (.attributes.gopher_v2__gopher_v2__fraction_of_lines_ending_with_ellipsis != null) and
          (.attributes.gopher_v2__gopher_v2__fraction_of_lines_ending_with_ellipsis[0][2] > 0.3)
        - >-
          (.attributes.gopher_v2__gopher_v2__fraction_of_duplicate_lines != null) and
          (.attributes.gopher_v2__gopher_v2__fraction_of_duplicate_lines[0][2] > 0.3)
        - >-
          (.attributes.gopher_v2__gopher_v2__fraction_of_characters_in_duplicate_lines != null) and
          (.attributes.gopher_v2__gopher_v2__fraction_of_characters_in_duplicate_lines[0][2] > 0.3)
        - >-
          (.attributes.gopher_v2__gopher_v2__fraction_of_characters_in_most_common_2gram != null) and
          (.attributes.gopher_v2__gopher_v2__fraction_of_characters_in_most_common_2gram[0][2] > 0.2)
        - >-
          (.attributes.gopher_v2__gopher_v2__fraction_of_characters_in_most_common_3gram != null) and
          (.attributes.gopher_v2__gopher_v2__fraction_of_characters_in_most_common_3gram[0][2] > 0.18)
        - >-
          (.attributes.gopher_v2__gopher_v2__fraction_of_characters_in_most_common_4gram != null) and
          (.attributes.gopher_v2__gopher_v2__fraction_of_characters_in_most_common_4gram[0][2] > 0.16)
        - >-
          (.attributes.gopher_v2__gopher_v2__fraction_of_characters_in_duplicate_5grams != null) and
          (.attributes.gopher_v2__gopher_v2__fraction_of_characters_in_duplicate_5grams[0][2] > 0.15)
        - >-
          (.attributes.gopher_v2__gopher_v2__fraction_of_characters_in_duplicate_6grams != null) and
          (.attributes.gopher_v2__gopher_v2__fraction_of_characters_in_duplicate_6grams[0][2] > 0.14)
        - >-
          (.attributes.gopher_v2__gopher_v2__fraction_of_characters_in_duplicate_7grams != null) and
          (.attributes.gopher_v2__gopher_v2__fraction_of_characters_in_duplicate_7grams[0][2] > 0.13)
        - >-
          (.attributes.gopher_v2__gopher_v2__fraction_of_characters_in_duplicate_8grams != null) and
          (.attributes.gopher_v2__gopher_v2__fraction_of_characters_in_duplicate_8grams[0][2] > 0.12)
        - >-
          (.attributes.gopher_v2__gopher_v2__fraction_of_characters_in_duplicate_9grams != null) and
          (.attributes.gopher_v2__gopher_v2__fraction_of_characters_in_duplicate_9grams[0][2] > 0.11)
        - >-
          (.attributes.gopher_v2__gopher_v2__fraction_of_characters_in_duplicate_10grams != null) and
          (.attributes.gopher_v2__gopher_v2__fraction_of_characters_in_duplicate_10grams[0][2] > 0.10)

        # Remove repetitions
        - >-
          (.attributes.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition != null) and
          (.attributes.tokenizer_repetitions_v2r2__tokenizer_repetitions_v2r2__doc_max_score_repetition > 10)

        # Remove NC or ND licensed pages
        - >-
          (.metadata.attribute_spans != null) and
          (.metadata.attribute_spans | keys | map(select(test("_nc_"|"_nd_"))) | length > 0)

work_dir:
  input: "/tmp/cccc-{snapshot}/mix/input"
  output: "/tmp/cccc-{snapshot}/mix/output"

processes: 100
"""

In [55]:
part_1_snapshots = [
    "CC-MAIN-2024-18", "CC-MAIN-2024-10", "CC-MAIN-2023-50", "CC-MAIN-2023-40", "CC-MAIN-2023-23", "CC-MAIN-2023-14", "CC-MAIN-2023-06", "CC-MAIN-2023-06", "CC-MAIN-2022-05", "CC-MAIN-2022-05", "CC-MAIN-2021-49", "CC-MAIN-2021-49", "CC-MAIN-2021-43", "CC-MAIN-2021-43", "CC-MAIN-2021-17", "CC-MAIN-2021-17", "CC-MAIN-2020-40", "CC-MAIN-2020-34", "CC-MAIN-2020-29", "CC-MAIN-2020-24", "CC-MAIN-2020-10", "CC-MAIN-2020-10", "CC-MAIN-2019-51", "CC-MAIN-2019-51", "CC-MAIN-2019-43", "CC-MAIN-2019-43", "CC-MAIN-2019-39", "CC-MAIN-2019-39", "CC-MAIN-2019-35", "CC-MAIN-2019-35", "CC-MAIN-2019-13", "CC-MAIN-2019-13", "CC-MAIN-2019-09", "CC-MAIN-2019-09", "CC-MAIN-2019-04", "CC-MAIN-2019-04", "CC-MAIN-2018-30", "CC-MAIN-2018-30", "CC-MAIN-2018-26", "CC-MAIN-2018-26", "CC-MAIN-2018-22", "CC-MAIN-2018-22", "CC-MAIN-2018-13", "CC-MAIN-2018-09", "CC-MAIN-2017-51", "CC-MAIN-2017-30", "CC-MAIN-2017-30", "CC-MAIN-2017-26", "CC-MAIN-2017-26", "CC-MAIN-2017-22", "CC-MAIN-2017-22", "CC-MAIN-2016-30", "CC-MAIN-2016-30", "CC-MAIN-2016-26", "CC-MAIN-2016-26", "CC-MAIN-2016-22", "CC-MAIN-2016-22", "CC-MAIN-2016-18", "CC-MAIN-2016-18", "CC-MAIN-2016-07",
    "CC-MAIN-2016-07"
]
part_1_snapshots = set(part_1_snapshots)

In [56]:
part_2_snapshots = set(["CC-MAIN-2013-20", "CC-MAIN-2013-48", "CC-MAIN-2014-10", "CC-MAIN-2014-15", "CC-MAIN-2014-15", "CC-MAIN-2014-23", "CC-MAIN-2014-23", "CC-MAIN-2014-35", "CC-MAIN-2014-35", "CC-MAIN-2014-41", "CC-MAIN-2014-41", "CC-MAIN-2014-42", "CC-MAIN-2014-42", "CC-MAIN-2014-49", "CC-MAIN-2014-49", "CC-MAIN-2014-52", "CC-MAIN-2014-52", "CC-MAIN-2015-06", "CC-MAIN-2015-06", "CC-MAIN-2015-11", "CC-MAIN-2015-11", "CC-MAIN-2015-14", "CC-MAIN-2015-14", "CC-MAIN-2015-18", "CC-MAIN-2015-18", "CC-MAIN-2015-22", "CC-MAIN-2015-22", "CC-MAIN-2015-27", "CC-MAIN-2015-27", "CC-MAIN-2017-04", "CC-MAIN-2017-04", "CC-MAIN-2017-09", "CC-MAIN-2017-09", "CC-MAIN-2017-13", "CC-MAIN-2017-13", "CC-MAIN-2017-17", "CC-MAIN-2017-17", "CC-MAIN-2018-34", "CC-MAIN-2018-34", "CC-MAIN-2018-47", "CC-MAIN-2018-47", "CC-MAIN-2018-51", "CC-MAIN-2018-51", "CC-MAIN-2019-30", "CC-MAIN-2019-30", "CC-MAIN-2021-39", "CC-MAIN-2021-39"])

In [57]:
from dolma.core.paths import glob_path, mkdir_p
import itertools

mixer_path = config_path + "/mixer"
mkdir_p(mixer_path)

all_snapshots = {
    "part-1": set(
        [f"s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/{snapshot}" for snapshot in part_1_snapshots]
    ),
    "part-2": set(
        [f"s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/{snapshot}" for snapshot in part_2_snapshots]
    ),
}

commands = []
snapshot_part = 'part-2'

for snapshot_path in all_snapshots[snapshot_part]:
    snapshot = snapshot_path.split("/")[-1]

    if snapshot in SKIP_SNAPSHOTS:
        print("Skipping", snapshot)
        continue

    bucket_name, prefix = snapshot_path.split("/", 3)[2:]
    paths = [f"          - s3://{bucket_name}/{path}" for path in list_zst_files(bucket_name, prefix)]
    print(snapshot_path, len(paths))
    config = mixer_config.format(paths="\n" + "\n".join(paths), snapshot=snapshot)
    with open(mixer_path + f"/{snapshot}.yaml", "w") as f:
        f.write(config)

    commands.append(f"dolma -c {mixer_path.lstrip('../')}/{snapshot}.yaml mix")

with open(config_path + f"/run-mix-{snapshot_part}.sh", "w") as f:
    f.write("#!/bin/bash\n\n")
    f.write("set -ex\n\n")
    f.write("\n".join(commands))

s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-15 465
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2015-22 372
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2015-06 252
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2015-14 292
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2018-51 639
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2017-04 578
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-10 557
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2021-39 685
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-52 437
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2017-13 665
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2015-11 331
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2013-48 519
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2017-09 652
s3://ai2-llm

In [63]:
mixer_v2_config = """
streams:
  - name: cccc-{snapshot}
    documents:{paths}
    attributes:
      - dedupe_doc
      - dedupe_para
    output:
      max_size_in_bytes: 2_000_000_000
      path: s3://ai2-llm/pretraining-data/sources/cccc/v2/documents/{snapshot}
    filter:
      syntax: jq
      include:

      exclude:
        # Exact duplicates
        - .attributes.dedupe_doc != null

        # Fuzzy duplicates with above 0.5 of shared ngrams (TO CHANGE)
        - >-
          (.attributes.dedupe_para | length > 0) and
          ((.attributes.dedupe_para | map(.[2] * (.[1] - .[0])) | add) / (.text | length) >= 0.8)

work_dir:
  input: "/tmp/cccc-{snapshot}/mix-v2/input"
  output: "/tmp/cccc-{snapshot}/mix-v2/output"

processes: 100
"""

In [66]:
from dolma.core.paths import glob_path, mkdir_p
import itertools

mixer_v2_path = config_path + "/mix-v2"
base_prefix = "s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents"
mkdir_p(mixer_v2_path)

for part, snapshots in dedupe_doc_snapshot.items():
    commands = []
    for snapshot in snapshots:
        snapshot_path = f"{base_prefix}/{snapshot}"

        bucket_name, prefix = snapshot_path.split("/", 3)[2:]
        paths = [f"      - s3://{bucket_name}/{path}" for path in list_zst_files(bucket_name, prefix, extension=".gz")]
        print(snapshot_path, len(paths))
        config = mixer_v2_config.format(paths="\n" + "\n".join(paths), snapshot=snapshot)
        with open(mixer_v2_path + f"/{snapshot}.yaml", "w") as f:
            f.write(config)

        commands.append(f"dolma -c {mixer_v2_path.lstrip('../')}/{snapshot}.yaml mix")

    with open(config_path + f"/run-mix-v2-{part}.sh", "w") as f:
        f.write("#!/bin/bash\n\n")
        f.write("set -ex\n\n")
        f.write("\n".join(commands))

s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2014-42 65
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2014-49 33
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2016-07 35
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2016-18 26
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2016-22 25
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2017-09 44
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2017-22 49
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2017-26 54
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2018-13 43
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2018-22 36
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_fix/documents/CC-MAIN-2018-26 41
s3://ai2-llm/pretraining-data/sources/cccc/v1_gopher_f