In [None]:
import glob
from datasets import load_dataset, IterableDataset
from itertools import islice, chain
from tqdm import tqdm
import json
from pathlib import Path

In [None]:
# Collect every gzipped JSON shard below the data directory.
# glob returns matches in arbitrary (filesystem-dependent) order, so the list
# is sorted: the seeded shuffles further down can only reproduce the same
# sample if the files are handed to the loader in a stable order.
all_files = sorted(glob.glob('../../data/**/*.json.gz', recursive=True))

# Separate StarCoder files from the rest.
# The test is a substring match on the whole path, not just the file name.
starcoder_files = [f for f in all_files if 'starcoder' in f]
other_files = [f for f in all_files if 'starcoder' not in f]

def filter_valid_text(example):
    """Tell whether a record carries usable text.

    Returns False when the "text" field is missing, None, an empty string,
    or the literal string "null"; returns True for any other value.
    """
    rejected_values = (None, "", "null")
    text_value = example.get("text")
    return text_value not in rejected_values

def keep_text_source(example):
    """Build a two-key record holding only "text" and "source".

    "text" is required (a missing key raises KeyError); "source" is optional
    and comes back as None when the record does not have it.
    """
    text = example["text"]
    source = example.get("source")
    return dict(text=text, source=source)

def load_and_sample(files, count, shuffle_buffer, seed):
    """Stream JSON records from `files` and return at most `count` of them.

    The streamed "train" split is filtered with `filter_valid_text`, mapped
    through `keep_text_source`, and buffer-shuffled before being truncated.

    Args:
        files: data files passed to `load_dataset` as `data_files`.
        count: maximum number of records the returned iterator yields.
        shuffle_buffer: `buffer_size` handed to the dataset's `shuffle`.
        seed: seed handed to the dataset's `shuffle`.

    Returns:
        An `itertools.islice` iterator over the shuffled stream.
    """
    stream = load_dataset("json", data_files=files, split="train", streaming=True)
    stream = stream.filter(filter_valid_text)
    # NOTE(review): `map` is called without `remove_columns`; confirm whether
    # columns other than "text"/"source" are really dropped from the stream.
    stream = stream.map(keep_text_source)
    stream = stream.shuffle(buffer_size=shuffle_buffer, seed=seed)
    return islice(stream, count)

def combined_generator():
    """Return one iterator over both samples, StarCoder records first.

    Each file group contributes up to 250,000 records drawn with seed 42;
    the StarCoder group uses a 100,000-record shuffle buffer, the remaining
    files a 1,000,000-record buffer. The two samples are concatenated, not
    mixed -- any mixing has to happen downstream.
    """
    # (file list, shuffle buffer size) for each group, in output order.
    sampling_plan = (
        (starcoder_files, 100_000),
        (other_files, 1_000_000),
    )
    samples = [
        load_and_sample(group_files, 250_000, shuffle_buffer=buffer_size, seed=42)
        for group_files, buffer_size in sampling_plan
    ]
    return chain(*samples)

In [None]:
# Wrap the chained samples in an IterableDataset so dataset-level operations
# (such as the shuffle applied in the next cell) can be used on the result.
combined_dataset = IterableDataset.from_generator(generator=combined_generator)

In [12]:
# combined_dataset yields all StarCoder records followed by all other records,
# so a second shuffle is needed to mix the two groups. The buffer (500,000)
# matches the 2 x 250,000 records requested in combined_generator.
print("Shuffling interleaved dataset...")
shuffled_combined = combined_dataset.shuffle(
    seed=42,
    buffer_size=500_000,
)

Shuffling interleaved dataset...


In [13]:
print("Writing to file...")
output_path = Path("../../data/dolma/tokenizer_corpus_500K.txt")
# Create the destination directory (and any missing parents) if needed.
output_path.parent.mkdir(parents=True, exist_ok=True)

# One record per line: json.dumps turns the text into a quoted JSON string,
# so newlines inside a document are escaped and cannot break the line layout.
# With json.dumps defaults, non-ASCII characters are written as \uXXXX escapes.
with open(output_path, mode="w", encoding="utf-8") as f:
    for record in tqdm(shuffled_combined):
        f.write(json.dumps(record['text']))
        f.write("\n")

Writing to file...


0it [00:00, ?it/s]

Resolving data files:   0%|          | 0/49 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/2370 [00:00<?, ?it/s]

500000it [05:19, 1564.94it/s] 


In [None]:
from pathlib import Path
import random

# Define input and output paths
file1 = Path("../../data/dolma/tokenizer_corpus_500K.txt")
file2 = Path("../../data/mafat/hebrew/tokenizer_corpus_500K.txt")
output_file = Path("../../data/tokenizer_corpus_1M.txt")

# Read and merge lines.
# Every line is forced to end with "\n": if the last line of an input file has
# no trailing newline, the shuffle below would otherwise move it in front of
# another record and the two would be written out fused into a single line.
with file1.open("r", encoding="utf-8") as f1, file2.open("r", encoding="utf-8") as f2:
    lines = [
        line if line.endswith("\n") else line + "\n"
        for source in (f1, f2)
        for line in source
    ]

# Shuffle
random.seed(42)  # for reproducibility
random.shuffle(lines)

# Write output (each element already carries its own line terminator)
with output_file.open("w", encoding="utf-8") as out:
    out.writelines(lines)
