In [1]:
from datasets import load_dataset
import json

def _take_texts(sample_iter, count):
    """
    Pull up to `count` samples from `sample_iter` and return their "text" fields.
    Stops early (returning a shorter list) if the iterator is exhausted.
    """
    texts = []
    for _ in range(count):
        sample = next(sample_iter, None)
        if sample is None:
            # No more data in the stream
            break
        # Keep only the text so the other fields of the sample can be freed immediately
        texts.append(sample["text"])
    return texts


def _write_json_list(path, texts):
    """
    Write `texts` to `path` as a single UTF-8 JSON array (non-ASCII characters are kept as-is).
    """
    with open(path, "w", encoding="utf-8") as f:
        json.dump(texts, f, ensure_ascii=False)


def get_fineweb_edu_data_sharded(
    shard_size = 50000,
    max_samples = 20000000,
    out_prefix = "./train_shard",
    val_filename = "./val_shard.json",
    val_size = 1000
):
    """
    Stream the FineWeb-Edu dataset and write out training samples in shards. Also create a validation shard of 'val_size' samples at the beginning.
    These are stored as raw samples: each output file is a JSON array of the samples' "text" strings.

    The validation samples are the first `val_size` samples of the (language-filtered) stream; training shards are taken
    from the samples that follow, so the two sets do not overlap within a single run.

    Params:
        @shard_size: Number of samples per training shard. The last shard may be smaller.
        @max_samples: Total samples for training. If `None`, read until dataset ends. The final shard is truncated
            so that the total never exceeds this value, even when it is not a multiple of `shard_size`.
        @out_prefix: Filename prefix for train shards. Shards are written to `{out_prefix}_{i}.json`, with i starting at 0.
        @val_filename: Filename for the validation shard.
        @val_size: Number of samples in the validation set.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    ds = load_dataset("HuggingFaceFW/fineweb-edu", name="default", split="train", streaming=True)
    ds = ds.filter(lambda x: x.get("language") == "en") #and x.get("score") >= 4
    ds_iter = iter(ds)

    # ------------------------------------------------
    # Collect validation samples
    # ------------------------------------------------
    val_data = _take_texts(ds_iter, val_size)
    _write_json_list(val_filename, val_data)
    print(f"Saved {len(val_data)} validation samples to {val_filename}")

    # ------------------------------------------------
    # Collect training shards in a single pass
    # ------------------------------------------------
    total_written = 0
    shard_idx = 0

    while True:
        # Work out how many samples this shard may hold: a full shard, or whatever
        # is left of the max_samples budget (so the total never overshoots it)
        if max_samples is None:
            quota = shard_size
        else:
            quota = min(shard_size, max_samples - total_written)
        if quota <= 0:
            break

        texts = _take_texts(ds_iter, quota)
        if not texts:
            break  # We reached EOF on the stream

        # Write shard
        shard_path = f"{out_prefix}_{shard_idx}.json"
        _write_json_list(shard_path, texts)

        shard_idx += 1
        total_written += len(texts)
        print(f"Wrote shard {shard_path} with {len(texts)} samples (total so far: {total_written}).")

        if len(texts) < quota:
            # A short shard means the stream ran dry; don't poll it again
            break

    print("Done generating shards.")

# Run the export with the default arguments: 1,000 validation samples to ./val_shard.json,
# then up to 20,000,000 training samples in shards of 50,000 (./train_shard_<i>.json).
# The dataset is streamed, so this cell is long-running and depends on network access.
get_fineweb_edu_data_sharded()

README.md:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/2110 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/2110 [00:00<?, ?it/s]

Saved 1000 validation samples to ./val_shard.json
Wrote shard ./train_shard_0.json with 50000 samples (total so far: 50000).
Wrote shard ./train_shard_1.json with 50000 samples (total so far: 100000).
Wrote shard ./train_shard_2.json with 50000 samples (total so far: 150000).
Wrote shard ./train_shard_3.json with 50000 samples (total so far: 200000).
Wrote shard ./train_shard_4.json with 50000 samples (total so far: 250000).
Wrote shard ./train_shard_5.json with 50000 samples (total so far: 300000).
Wrote shard ./train_shard_6.json with 50000 samples (total so far: 350000).
Wrote shard ./train_shard_7.json with 50000 samples (total so far: 400000).
Wrote shard ./train_shard_8.json with 50000 samples (total so far: 450000).
Wrote shard ./train_shard_9.json with 50000 samples (total so far: 500000).
Wrote shard ./train_shard_10.json with 50000 samples (total so far: 550000).
Wrote shard ./train_shard_11.json with 50000 samples (total so far: 600000).
Wrote shard ./train_shard_12.json wit

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 1cdddff9-3ba3-4d6d-aa58-5cc19481dde3)')' thrown while requesting GET https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu/resolve/4863ab07d7520451e6f73e2912ad8bfee7d97c11/data/CC-MAIN-2013-20/train-00006-of-00014.parquet
Retrying in 1s [Retry 1/5].


Wrote shard ./train_shard_108.json with 50000 samples (total so far: 5450000).


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 62774ec6-d85c-41db-8789-f7b59f14f203)')' thrown while requesting GET https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu/resolve/4863ab07d7520451e6f73e2912ad8bfee7d97c11/data/CC-MAIN-2013-20/train-00006-of-00014.parquet
Retrying in 1s [Retry 1/5].


Wrote shard ./train_shard_109.json with 50000 samples (total so far: 5500000).
Wrote shard ./train_shard_110.json with 50000 samples (total so far: 5550000).
Wrote shard ./train_shard_111.json with 50000 samples (total so far: 5600000).
Wrote shard ./train_shard_112.json with 50000 samples (total so far: 5650000).
Wrote shard ./train_shard_113.json with 50000 samples (total so far: 5700000).
Wrote shard ./train_shard_114.json with 50000 samples (total so far: 5750000).
Wrote shard ./train_shard_115.json with 50000 samples (total so far: 5800000).
Wrote shard ./train_shard_116.json with 50000 samples (total so far: 5850000).
Wrote shard ./train_shard_117.json with 50000 samples (total so far: 5900000).
Wrote shard ./train_shard_118.json with 50000 samples (total so far: 5950000).
Wrote shard ./train_shard_119.json with 50000 samples (total so far: 6000000).
Wrote shard ./train_shard_120.json with 50000 samples (total so far: 6050000).
Wrote shard ./train_shard_121.json with 50000 sample

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 97224f4d-a96e-40bc-b210-41d5c05675bd)')' thrown while requesting GET https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu/resolve/4863ab07d7520451e6f73e2912ad8bfee7d97c11/data/CC-MAIN-2013-20/train-00009-of-00014.parquet
Retrying in 1s [Retry 1/5].


Wrote shard ./train_shard_141.json with 50000 samples (total so far: 7100000).
Wrote shard ./train_shard_142.json with 50000 samples (total so far: 7150000).
Wrote shard ./train_shard_143.json with 50000 samples (total so far: 7200000).
Wrote shard ./train_shard_144.json with 50000 samples (total so far: 7250000).
Wrote shard ./train_shard_145.json with 50000 samples (total so far: 7300000).
Wrote shard ./train_shard_146.json with 50000 samples (total so far: 7350000).
Wrote shard ./train_shard_147.json with 50000 samples (total so far: 7400000).
Wrote shard ./train_shard_148.json with 50000 samples (total so far: 7450000).
Wrote shard ./train_shard_149.json with 50000 samples (total so far: 7500000).
Wrote shard ./train_shard_150.json with 50000 samples (total so far: 7550000).
Wrote shard ./train_shard_151.json with 50000 samples (total so far: 7600000).
Wrote shard ./train_shard_152.json with 50000 samples (total so far: 7650000).
Wrote shard ./train_shard_153.json with 50000 sample

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: a43a0749-66a2-4186-a2a5-3611dedfac8a)')' thrown while requesting GET https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu/resolve/4863ab07d7520451e6f73e2912ad8bfee7d97c11/data/CC-MAIN-2013-20/train-00013-of-00014.parquet
Retrying in 1s [Retry 1/5].


Wrote shard ./train_shard_212.json with 50000 samples (total so far: 10650000).
Wrote shard ./train_shard_213.json with 50000 samples (total so far: 10700000).
Wrote shard ./train_shard_214.json with 50000 samples (total so far: 10750000).
Wrote shard ./train_shard_215.json with 50000 samples (total so far: 10800000).
Wrote shard ./train_shard_216.json with 50000 samples (total so far: 10850000).
Wrote shard ./train_shard_217.json with 50000 samples (total so far: 10900000).
Wrote shard ./train_shard_218.json with 50000 samples (total so far: 10950000).
Wrote shard ./train_shard_219.json with 50000 samples (total so far: 11000000).
Wrote shard ./train_shard_220.json with 50000 samples (total so far: 11050000).
Wrote shard ./train_shard_221.json with 50000 samples (total so far: 11100000).
Wrote shard ./train_shard_222.json with 50000 samples (total so far: 11150000).
Wrote shard ./train_shard_223.json with 50000 samples (total so far: 11200000).


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 2c91a873-dc81-4690-b7d7-d183fb17a512)')' thrown while requesting GET https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu/resolve/4863ab07d7520451e6f73e2912ad8bfee7d97c11/data/CC-MAIN-2013-48/train-00000-of-00014.parquet
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 24790192-bd73-4bde-ae46-14d60c2ce859)')' thrown while requesting GET https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu/resolve/4863ab07d7520451e6f73e2912ad8bfee7d97c11/data/CC-MAIN-2013-48/train-00000-of-00014.parquet
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 27929727-e6db-4213-972e-87c4a9fd8c2c)')' thrown while requesting GET https://huggingface.co/datasets/HuggingFaceFW/fi

Wrote shard ./train_shard_224.json with 50000 samples (total so far: 11250000).
Wrote shard ./train_shard_225.json with 50000 samples (total so far: 11300000).
Wrote shard ./train_shard_226.json with 50000 samples (total so far: 11350000).
Wrote shard ./train_shard_227.json with 50000 samples (total so far: 11400000).
Wrote shard ./train_shard_228.json with 50000 samples (total so far: 11450000).
Wrote shard ./train_shard_229.json with 50000 samples (total so far: 11500000).
Wrote shard ./train_shard_230.json with 50000 samples (total so far: 11550000).
Wrote shard ./train_shard_231.json with 50000 samples (total so far: 11600000).
Wrote shard ./train_shard_232.json with 50000 samples (total so far: 11650000).
Wrote shard ./train_shard_233.json with 50000 samples (total so far: 11700000).
Wrote shard ./train_shard_234.json with 50000 samples (total so far: 11750000).
Wrote shard ./train_shard_235.json with 50000 samples (total so far: 11800000).
Wrote shard ./train_shard_236.json with 

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 64b0746a-185d-4ba9-9bcb-eb78d8b7b2f5)')' thrown while requesting GET https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu/resolve/4863ab07d7520451e6f73e2912ad8bfee7d97c11/data/CC-MAIN-2013-48/train-00007-of-00014.parquet
Retrying in 1s [Retry 1/5].


Wrote shard ./train_shard_329.json with 50000 samples (total so far: 16500000).
Wrote shard ./train_shard_330.json with 50000 samples (total so far: 16550000).
Wrote shard ./train_shard_331.json with 50000 samples (total so far: 16600000).
Wrote shard ./train_shard_332.json with 50000 samples (total so far: 16650000).
Wrote shard ./train_shard_333.json with 50000 samples (total so far: 16700000).
Wrote shard ./train_shard_334.json with 50000 samples (total so far: 16750000).
Wrote shard ./train_shard_335.json with 50000 samples (total so far: 16800000).
Wrote shard ./train_shard_336.json with 50000 samples (total so far: 16850000).
Wrote shard ./train_shard_337.json with 50000 samples (total so far: 16900000).
Wrote shard ./train_shard_338.json with 50000 samples (total so far: 16950000).
Wrote shard ./train_shard_339.json with 50000 samples (total so far: 17000000).
Wrote shard ./train_shard_340.json with 50000 samples (total so far: 17050000).
Wrote shard ./train_shard_341.json with 

In [None]:
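# Old, non-sharded version, kept for reference only (superseded by get_fineweb_edu_data_sharded).
# If revived, it also needs `from itertools import islice`.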
# def get_fineweb_edu_data(n_samples: int = 1000):
#     dataset = load_dataset("HuggingFaceFW/fineweb-edu", name = "default", split = 'train', streaming = True)
#     dataset = dataset.filter(lambda x: x.get('language') == 'en' and x.get('score') >= 4)
#     dataset_pulled = list(islice(dataset, n_samples))  # Materialize the first n_samples filtered samples as a list
#     dataset_pulled = [x['text'] for x in dataset_pulled]    
#     return dataset_pulled