In [4]:
import multiprocessing
from datasets import Dataset
from transformers import AutoTokenizer

# Configurable parameters
cleaned_data_path = "/ephemeral/cc100_mn_cleaned.txt"          # input corpus, one text sample per line
output_dir = "/ephemeral/tokenized_dataset_dir_remaining"      # where the tokenized dataset is saved
# Use all but 15 CPU cores, but never fewer than one worker: on a host with
# 15 or fewer cores the subtraction would be <= 0 and multiprocessing.Pool
# raises ValueError for a non-positive process count.
num_workers = max(1, multiprocessing.cpu_count() - 15)

# Function to read and preprocess one range of lines (called from pool workers)
def load_lines_in_batches(args, path=None):
    """Return the stripped, non-empty lines in the half-open range [start, end).

    Args:
        args: A ``(start, end)`` pair of 0-based line indices. It is a single
            tuple so the function can be passed directly to ``Pool.map``.
        path: Optional file to read instead of the module-level
            ``cleaned_data_path``.

    Returns:
        A list of lines with surrounding whitespace stripped; lines that are
        empty after stripping are dropped.
    """
    from itertools import islice

    start, end = args
    source = cleaned_data_path if path is None else path
    with open(source, "r", encoding="utf-8") as f:
        if (start is not None and start < 0) or (end is not None and end < 0):
            # islice rejects negative bounds; keep list-slice semantics for them.
            selected = f.readlines()[start:end]
        else:
            # Stream only up to `end` instead of loading the whole file into
            # memory in every worker just to keep a small slice of it.
            selected = islice(f, start, end)
        stripped = (line.strip() for line in selected)
        return [text for text in stripped if text]

# Determine the total number of lines in the file
with open(cleaned_data_path, "r", encoding="utf-8") as f:
    total_lines = sum(1 for _ in f)

# Define ranges for lines after the first 1M
print("Calculating ranges for remaining lines...")
batch_size = 100_000
start_line = 1_000_000
# Step through [start_line, total_lines) in batch_size strides and clip the
# last range to total_lines, so the partial final batch is handled by the pool
# as well. If the file has no lines past start_line the list is simply empty
# (floor division on a negative remainder previously produced a negative batch
# count, which could pull lines from before start_line into the tail).
batch_ranges = [
    (batch_start, min(batch_start + batch_size, total_lines))
    for batch_start in range(start_line, total_lines, batch_size)
]
num_batches = len(batch_ranges)  # includes the partial final batch, if any

# Process the remaining lines using multiprocessing
print("Loading cleaned data (remaining lines)...")
with multiprocessing.Pool(num_workers) as pool:
    # pool.map returns results in the same order as batch_ranges,
    # so the original line order is preserved.
    results = pool.map(load_lines_in_batches, batch_ranges)

# Flatten the results into a single list
lines = [line for batch in results for line in batch]

# Convert the data into a Dataset
dataset = Dataset.from_dict({"text": lines})
print("Cleaned data (remaining lines) loaded as a Dataset.")

# Load tokenizer
import os

from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login
from transformers import LlamaTokenizer, LlamaTokenizerFast

# Log in to Hugging Face.
# SECURITY: never hardcode an access token in a notebook. An access token was
# previously committed on this line; treat it as leaked and revoke it in the
# Hugging Face account settings. The token is now taken from the HF_TOKEN
# environment variable; when that is unset, `token=None` is passed and
# `login` asks for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

# Replace with your tokenizer/model
tokenizer = LlamaTokenizer.from_pretrained("Billyyy/mongolian-llama-untrained")

# Batch callback handed to `dataset.map`
def tokenize_function(examples):
    """Tokenize the "text" column of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 256 tokens, and
    `return_tensors` is left as None so the tokenizer's default container
    type is returned rather than framework tensors.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",  # fixed-length inputs
        "max_length": 256,
        "return_tensors": None,
    }
    return tokenizer(examples["text"], **encode_options)

# Run the tokenizer over the whole dataset in parallel batches
print("Tokenizing the dataset...")
map_options = dict(
    batched=True,
    batch_size=8000,
    num_proc=num_workers,
    remove_columns=["text"],      # drop the raw text column from the result
    load_from_cache_file=False,   # do not load a previously cached result
)
tokenized_dataset = dataset.map(tokenize_function, **map_options)

# Persist the result to disk
print("Saving tokenized dataset...")
tokenized_dataset.save_to_disk(output_dir)

print(f"Tokenized dataset saved to {output_dir}")


Calculating ranges for remaining lines...
Loading cleaned data (remaining lines)...
Cleaned data (remaining lines) loaded as a Dataset.
Tokenizing the dataset...


Map (num_proc=13):   0%|          | 0/7377722 [00:00<?, ? examples/s]

Saving tokenized dataset...


Saving the dataset (0/20 shards):   0%|          | 0/7377722 [00:00<?, ? examples/s]

Tokenized dataset saved to /ephemeral/tokenized_dataset_dir_remaining


In [2]:
import os

# Cache and temp locations, keyed by the environment variable that selects them.
# NOTE(review): these variables are presumably read when `datasets` /
# `transformers` are imported, so this cell should run before those imports — confirm.
cache_dirs = {
    "HF_DATASETS_CACHE": "/ephemeral/hf_cache",
    "TRANSFORMERS_CACHE": "/ephemeral/transformers_cache",
    "TMPDIR": "/ephemeral/tmp",
}

# Point the environment at the new locations
os.environ.update(cache_dirs)

# Create each directory if it is not there yet
for directory in cache_dirs.values():
    os.makedirs(directory, exist_ok=True)