In [15]:
import os
import torch

# Point the Hugging Face dataset/model caches and the temp dir at paths under /ephemeral
_CACHE_ENV = {
    "HF_DATASETS_CACHE": "/ephemeral/hf_cache",
    "TRANSFORMERS_CACHE": "/ephemeral/transformers_cache",
    "TMPDIR": "/ephemeral/tmp",
}
os.environ.update(_CACHE_ENV)

# Create every target directory up front; existing directories are left alone
for _cache_dir in _CACHE_ENV.values():
    os.makedirs(_cache_dir, exist_ok=True)

In [2]:
from datasets import load_dataset

# Load the "mn" (Mongolian) configuration of CC100; no split is requested here
dataset = load_dataset(
    path="cc100",
    name="mn",
    trust_remote_code=True,
)

In [3]:
# Display the loaded object to inspect its splits, features and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text'],
        num_rows: 15098167
    })
})

In [3]:
from datasets import load_dataset
import re

# Load only the "train" split of the Mongolian ("mn") CC100 configuration
dataset = load_dataset(
    path="cc100",
    name="mn",
    split="train",
    trust_remote_code=True,
)

# Define a function to clean Mongolian text
def clean_mongolian_text(text):
    """
    Normalize one raw text line for a Mongolian (Cyrillic) corpus.

    Steps, applied in this order:
    1. Lowercase the text.
    2. Remove URLs (tokens starting with ``http`` or ``www.``).
    3. Remove HTML tags.
    4. Replace every character outside the allow-list (Cyrillic letters
       including Ү/Ө/Ё, ASCII digits, whitespace, basic punctuation) with
       a space.
    5. Collapse whitespace runs to one space and trim both ends.
    6. Collapse runs of ``!``/``?`` to the first mark of the run and runs
       of periods to a single period.
    7. Reject results shorter than 20 characters.

    Args:
        text: Raw input string; may be empty or None.

    Returns:
        The cleaned string, or None when the input is empty/None or the
        cleaned result is shorter than 20 characters.
    """
    if not text:  # Handle empty or None input
        return None

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r"http\S+|www\.\S+", "", text)

    # Remove HTML tags
    text = re.sub(r"<.*?>", "", text)

    # Blank out anything outside the allow-list. A single substitution with
    # the negated character class gives the same result as testing each
    # character individually, without one regex call per character.
    text = re.sub(r"[^А-Яа-яҮүӨөЁёЭэ0-9\s.,!?;:\"'()-]", " ", text)

    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # Collapse repeated punctuation. Keep the first mark of a "!"/"?" run so
    # that a question ("??") is not rewritten into an exclamation ("!").
    text = re.sub(r"([!?])[!?]+", r"\1", text)
    text = re.sub(r"[.]{2,}", ".", text)

    # Filter out very short lines
    if len(text) < 20:  # Adjust this threshold as needed
        return None

    return text

# Apply cleaning safely
def clean_dataset(example):
    """
    Row-level wrapper around clean_mongolian_text for use with Dataset.map.

    Reads example["text"], cleans it, and returns a dict with a single
    "text" key: the cleaned string, or "" when the cleaner returns a falsy
    value (i.e. the row was rejected).
    """
    cleaned = clean_mongolian_text(example["text"])
    if not cleaned:
        # Rejected rows are emitted with an empty text, not the original one
        return {"text": ""}
    return {"text": cleaned}

def _survives_cleaning(example):
    """Return True when clean_mongolian_text yields a usable string for this row."""
    return clean_mongolian_text(example["text"]) is not None


# Drop rows the cleaner would reject, then rewrite the remaining rows' text
filtered_dataset = dataset.filter(_survives_cleaning, num_proc=27)
cleaned_dataset = filtered_dataset.map(clean_dataset, num_proc=27)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Drop the 'id' column from the cleaned dataset
dataset = cleaned_dataset.remove_columns(["id"])

In [17]:
# Display the dataset summary (features and row count).
# NOTE(review): the recorded output lists an 'input_ids' column, which is only
# added by the tokenization step further down; the output appears to come from
# an out-of-order run (execution count 17) — confirm with Restart & Run All.
dataset

Dataset({
    features: ['text', 'input_ids'],
    num_rows: 11585734
})

In [5]:
def merge_sequences_with_text(dataset, tokenizer, max_length=1024):
    """
    Greedily pack consecutive rows into merged sequences of bounded length.

    Rows are accumulated in order. When adding the next row would push the
    accumulated token count past ``max_length``, the accumulated rows are
    emitted as one merged row and a new accumulation starts with that row.

    Args:
        dataset: Mapping-like object exposing parallel "text" and
            "input_ids" sequences (e.g. a batch passed by Dataset.map).
        tokenizer: Unused by this function; kept for interface compatibility.
        max_length (int): Maximum number of token ids per merged sequence.

    Returns:
        dict: {"text": [...], "input_ids": [...]} where each text is the
        space-joined texts of the packed rows and each id list is the
        concatenation of their ids, cut to ``max_length``.

    Note:
        A single row longer than ``max_length`` has its ids cut to
        ``max_length`` while its text is kept in full.
    """
    merged_texts = []
    merged_input_ids = []
    current_text = []
    current_input_ids = []

    for text, input_ids in zip(dataset["text"], dataset["input_ids"]):
        # Flush only a non-empty buffer. Without this guard, a row that is
        # longer than max_length on its own would flush the empty buffer and
        # emit a spurious ("", []) merged row.
        if current_input_ids and len(current_input_ids) + len(input_ids) > max_length:
            merged_texts.append(" ".join(current_text))
            merged_input_ids.append(current_input_ids[:max_length])
            # Reset for the next sequence
            current_text = []
            current_input_ids = []

        # Extend the current sequence
        current_text.append(text)
        current_input_ids.extend(input_ids)

    # Emit whatever is left in the buffer
    if current_input_ids:
        merged_texts.append(" ".join(current_text))
        merged_input_ids.append(current_input_ids[:max_length])

    return {"text": merged_texts, "input_ids": merged_input_ids}


from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("Billyyy/bloomz_mn")


def _tokenize_batch(examples):
    """Tokenize a batch of texts (truncated to 1024 tokens, no padding) into 'input_ids'."""
    encoded = tokenizer(examples["text"], truncation=True, padding=False, max_length=1024)
    return {"input_ids": encoded["input_ids"]}


def _merge_batch(batch):
    """Pack the rows of one batch into merged sequences of at most 1024 tokens."""
    return merge_sequences_with_text(batch, tokenizer, max_length=1024)


# Tokenize only when the dataset does not already carry an 'input_ids' column
if "input_ids" not in dataset.column_names:
    dataset = dataset.map(_tokenize_batch, batched=True, num_proc=27)

# Merge rows batch by batch; the output has fewer rows than the input
merged_dataset = dataset.map(_merge_batch, batched=True, num_proc=27)

In [None]:
from datasets import DatasetDict
from huggingface_hub import login

# Persist the merged dataset locally in Arrow format
local_dataset_dir = "/ephemeral/cleaned_mongolian_dataset"
merged_dataset.save_to_disk(local_dataset_dir)

# Wrap the single dataset as the "train" split of a DatasetDict
dataset_dict = DatasetDict({"train": merged_dataset})

# Publish to the Hugging Face Hub as a public dataset repository
hub_repo_id = "Billyyy/cleaned-mongolian-dataset"
dataset_dict.push_to_hub(hub_repo_id, private=False)

Saving the dataset (14/14 shards): 100%|██████████| 997125/997125 [00:06<00:00, 145578.37 examples/s]
Creating parquet from Arrow format: 100%|██████████| 72/72 [00:02<00:00, 35.68ba/s]
Creating parquet from Arrow format: 100%|██████████| 72/72 [00:02<00:00, 34.24ba/s]
Creating parquet from Arrow format: 100%|██████████| 72/72 [00:02<00:00, 34.59ba/s]
Creating parquet from Arrow format: 100%|██████████| 72/72 [00:02<00:00, 32.57ba/s]
Creating parquet from Arrow format: 100%|██████████| 72/72 [00:02<00:00, 29.96ba/s]
Creating parquet from Arrow format: 100%|██████████| 72/72 [00:02<00:00, 31.90ba/s]
Creating parquet from Arrow format: 100%|██████████| 72/72 [00:02<00:00, 34.70ba/s]
Creating parquet from Arrow format: 100%|██████████| 72/72 [00:02<00:00, 35.62ba/s]
Creating parquet from Arrow format: 100%|██████████| 72/72 [00:02<00:00, 35.07ba/s]
Creating parquet from Arrow format: 100%|██████████| 72/72 [00:02<00:00, 34.81ba/s]
Creating parquet from Arrow format: 100%|██████████| 72/72

CommitInfo(commit_url='https://huggingface.co/datasets/Billyyy/cleaned-mongolian-dataset/commit/e350efd88be98bc4237f6554f59a790c31c38963', commit_message='Upload dataset', commit_description='', oid='e350efd88be98bc4237f6554f59a790c31c38963', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Billyyy/cleaned-mongolian-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Billyyy/cleaned-mongolian-dataset'), pr_revision=None, pr_num=None)