In [1]:
import dolma

In [11]:
# Base directory (a relative path) under which the generated per-snapshot
# config files and the run script are written by the cells below.
config_path = "../configs/cccc"

# YAML template for a URL-level dedupe run, rendered once per snapshot with
# `str.format`:
#   {paths}    -> pre-indented YAML list items, one per input `.zst` file
#   {snapshot} -> snapshot name; keeps the bloom filter file and the work
#                 directories separate for every snapshot
# Because the template goes through `str.format`, any literal brace added to
# it must be doubled (`{{` / `}}`).
base_config = """
documents:
{paths}

dedupe:
  name: dedupe_url
  documents:
    attribute_name: dedupe_url
    key: $.metadata.url
  skip_empty: true


bloom_filter:
  file: /tmp/dedupe-url/cccc-{snapshot}/filter.bloom
  read_only: false
  estimated_doc_count: 100_000_000
  desired_false_positive_rate: 1e-06

processes: 188
work_dir:
  input: /tmp/dedupe-url/cccc-{snapshot}/input
  output: /tmp/dedupe-url/cccc-{snapshot}/output
"""

In [12]:
import boto3


def list_zst_files(bucket_name, prefix):
    """List the keys of every ``.zst`` object under an S3 prefix.

    Args:
        bucket_name: Name of the S3 bucket to list.
        prefix: Key prefix passed to ``list_objects_v2``.

    Returns:
        A list of object keys (without the ``s3://bucket/`` part) whose key
        ends with ``.zst``, in the order the paginator yields them.
    """
    pages = (
        boto3.client("s3")
        .get_paginator("list_objects_v2")
        .paginate(Bucket=bucket_name, Prefix=prefix)
    )

    # Pages without a "Contents" entry contribute nothing.
    return [
        entry["Key"]
        for page in pages
        for entry in page.get("Contents", [])
        if entry["Key"].endswith(".zst")
    ]

In [13]:
from dolma.core.paths import glob_path, mkdir_p
import itertools

# Directory that receives one generated URL-dedupe YAML config per snapshot.
config_url_path = config_path + "/dedupe-url"
mkdir_p(config_url_path)

# The commands written to the run script reference the configs without the
# leading "../". `str.lstrip("../")` would strip ANY leading run of "." and "/"
# characters (e.g. it would also eat the dot of a ".hidden" directory), so
# remove exactly one literal "../" prefix instead.
if config_url_path.startswith("../"):
    script_config_dir = config_url_path[len("../"):]
else:
    script_config_dir = config_url_path

# Snapshots for which no config is generated (the reason is not recorded in
# this notebook).
SKIP_SNAPSHOTS = [
    "CC-MAIN-2016-26",
    "CC-MAIN-2016-30",
    "CC-MAIN-2018-09",
    "CC-MAIN-2017-30",
    "CC-MAIN-2019-51",
    "CC-MAIN-2019-43",
    "CC-MAIN-2020-24",
    "CC-MAIN-2020-10",
    "CC-MAIN-2020-40",
    "CC-MAIN-2023-40",
]
commands = []

for snapshot_path in glob_path("s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/*"):
    # Last path component is the snapshot name, e.g. "CC-MAIN-2013-20".
    snapshot = snapshot_path.split("/")[-1]

    if snapshot in SKIP_SNAPSHOTS:
        print("Skipping", snapshot)
        continue

    # "s3://<bucket>/<key prefix>" -> ("<bucket>", "<key prefix>")
    bucket_name, prefix = snapshot_path.split("/", 3)[2:]
    # Pre-indented YAML list items for the `documents:` section of the template.
    paths = [f"    - s3://{bucket_name}/{path}" for path in list_zst_files(bucket_name, prefix)]
    print(snapshot_path, len(paths))
    config = base_config.format(paths="\n".join(paths), snapshot=snapshot)
    with open(config_url_path + f"/{snapshot}.yaml", "w") as f:
        f.write(config)

    commands.append(f"dolma -c {script_config_dir}/{snapshot}.yaml dedupe")

# Shell script that runs the dedupe step for every generated config in turn;
# `set -ex` makes it echo each command and stop at the first failure.
with open(config_path + "/run-dedupe-url.sh", "w") as f:
    f.write("#!/bin/bash\n\n")
    f.write("set -ex\n\n")
    f.write("\n".join(commands))


s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2013-20 316
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2013-48 519
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-10 557
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-15 465
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-23 636
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-35 529
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-41 483
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-42 597
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-49 320
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-52 437
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2015-06 252
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2015-11 331
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2015-14 292
s3://ai2-llm

In [18]:
# Base directory (a relative path) under which the generated per-snapshot
# config files and the run script are written; same value as assigned in the
# URL-dedupe section above.
config_path = "../configs/cccc"

# YAML template for a paragraph-level dedupe run, rendered once per snapshot
# with `str.format`:
#   {paths}    -> pre-indented YAML list items, one per input `.zst` file
#   {snapshot} -> snapshot name; keeps the bloom filter file and the work
#                 directories separate for every snapshot
# Because the template goes through `str.format`, any literal brace added to
# it must be doubled (`{{` / `}}`).
base_para_config = """
documents:
{paths}

dedupe:
  name: dedupe_para
  paragraphs:
    attribute_name: dedupe_para
    by_ngram:
      ngram_length: 20
      stride: 1
      overlap_threshold: 0.5
      skip_short_paragraphs: true
  skip_empty: true


bloom_filter:
  file: /tmp/dedupe-para/cccc-{snapshot}/filter.bloom
  read_only: false
  estimated_doc_count: 50_000_000_000
  desired_false_positive_rate: 1e-02

processes: 188
work_dir:
  input: /tmp/dedupe-para/cccc-{snapshot}/input
  output: /tmp/dedupe-para/cccc-{snapshot}/output
"""

In [19]:
from dolma.core.paths import glob_path, mkdir_p
import itertools

# Directory that receives one generated paragraph-dedupe YAML config per
# snapshot. NOTE: the variable keeps its historical name `config_url_path`
# even though it points at the "dedupe-para" directory here.
config_url_path = config_path + "/dedupe-para"
mkdir_p(config_url_path)

# The commands written to the run script reference the configs without the
# leading "../". `str.lstrip("../")` would strip ANY leading run of "." and "/"
# characters (e.g. it would also eat the dot of a ".hidden" directory), so
# remove exactly one literal "../" prefix instead.
if config_url_path.startswith("../"):
    script_config_dir = config_url_path[len("../"):]
else:
    script_config_dir = config_url_path

# Snapshots for which no config is generated (the reason is not recorded in
# this notebook).
SKIP_SNAPSHOTS = [
    "CC-MAIN-2016-26",
    "CC-MAIN-2016-30",
    "CC-MAIN-2018-09",
    "CC-MAIN-2017-30",
    "CC-MAIN-2019-51",
    "CC-MAIN-2019-43",
    "CC-MAIN-2020-24",
    "CC-MAIN-2020-10",
    "CC-MAIN-2020-40",
    "CC-MAIN-2023-40",
]
commands = []

for snapshot_path in glob_path("s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/*"):
    # Last path component is the snapshot name, e.g. "CC-MAIN-2013-20".
    snapshot = snapshot_path.split("/")[-1]

    if snapshot in SKIP_SNAPSHOTS:
        print("Skipping", snapshot)
        continue

    # "s3://<bucket>/<key prefix>" -> ("<bucket>", "<key prefix>")
    bucket_name, prefix = snapshot_path.split("/", 3)[2:]
    # Pre-indented YAML list items for the `documents:` section of the template.
    paths = [f"    - s3://{bucket_name}/{path}" for path in list_zst_files(bucket_name, prefix)]
    print(snapshot_path, len(paths))
    config = base_para_config.format(paths="\n".join(paths), snapshot=snapshot)
    with open(config_url_path + f"/{snapshot}.yaml", "w") as f:
        f.write(config)

    commands.append(f"dolma -c {script_config_dir}/{snapshot}.yaml dedupe")

# Shell script that runs the dedupe step for every generated config in turn;
# `set -ex` makes it echo each command and stop at the first failure.
with open(config_path + "/run-dedupe-para.sh", "w") as f:
    f.write("#!/bin/bash\n\n")
    f.write("set -ex\n\n")
    f.write("\n".join(commands))

s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2013-20 316
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2013-48 519
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-10 557
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-15 465
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-23 636
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-35 529
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-41 483
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-42 597
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-49 320
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-52 437
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2015-06 252
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2015-11 331
s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2015-14 292
s3://ai2-llm

In [34]:
from dolma.core.paths import make_relative

# Every snapshot directory present under documents/.
all_snapshots = set(glob_path("s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/*"))

# Snapshots that already have an entry under attributes/c4_v2, re-expressed as
# documents/ paths so they can be subtracted from `all_snapshots`.
counted_snapshots = {
    f"s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/{snapshot_path.rsplit('/', 1)[-1]}"
    for snapshot_path in glob_path("s3://ai2-llm/pretraining-data/sources/cccc/v0/attributes/c4_v2/*")
}

# Explicitly skipped snapshots, also as documents/ paths.
to_skip = {f"s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/{snapshot}" for snapshot in SKIP_SNAPSHOTS}

# Glob patterns covering the `.zst` files of every remaining snapshot. They
# are collected here as well as printed, so later cells can reuse them instead
# of copying them from the cell output.
to_glob = []
for snapshot_path in sorted(all_snapshots - counted_snapshots - to_skip):
    # "s3://<bucket>/<key prefix>" -> ("<bucket>", "<key prefix>")
    bucket_name, prefix = snapshot_path.split("/", 3)[2:]
    all_zst = list_zst_files(bucket_name, prefix)

    # Number of files whose key contains a "/warc/" path component.
    nested = sum(1 for path in all_zst if "/warc/" in path)

    patterns = []
    if nested > 0:
        patterns.append(f"s3://{bucket_name}/{prefix}/*/warc/*.zst")
    if nested < len(all_zst):
        # At least one file's key has no "/warc/" component.
        patterns.append(f"s3://{bucket_name}/{prefix}/*.zst")

    # Printed as YAML list items, ready to paste into a config.
    for pattern in patterns:
        print(f"  - {pattern}")
    to_glob.extend(patterns)

  - s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2013-20/*/warc/*.zst
  - s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2013-48/*/warc/*.zst
  - s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-10/*/warc/*.zst
  - s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-15/*/warc/*.zst
  - s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-15/*.zst
  - s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-23/*/warc/*.zst
  - s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-23/*.zst
  - s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-35/*/warc/*.zst
  - s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-35/*.zst
  - s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-41/*/warc/*.zst
  - s3://ai2-llm/pretraining-data/sources/cccc/v0/documents/CC-MAIN-2014-41/*.zst
  - s3://ai2-llm/pretraining-data/sources/cccc/v0