In [None]:
from datasets import load_dataset
from multiprocessing import cpu_count
from indusnlp import TextCleaner, HindiTextCleaner
from clean import clean_text


In [None]:
# Regular expressions for contact/link noise. They are only referenced by the
# (currently disabled) 'remove_patterns' step of the cleaning config.
# Matches http(s):// links and bare www. links up to the next whitespace.
url_pattern = r'https?://\S+|www\.\S+'
# Matches e-mail addresses. The TLD class is [A-Za-z]; a '|' inside a character
# class is a literal pipe, not alternation, so it must not appear there.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
# Matches several phone layouts: 10 plain digits, 3-3-4 groups with optional
# '-', '.' or whitespace separators, a parenthesised 3-digit prefix, and a
# '+NN' prefix followed by 10 digits (optionally separated by whitespace or '-').
phone_pattern = r'\b\d{10}\b|\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b|\(\d{3}\)\s?\d{3}[-.\s]?\d{4}\b|\+\d{2}\s?\d{10}\b|\+\d{2}-\d{10}\b'
# Substrings that mark a line as site boilerplate (bylines, subscription
# prompts, "read more" footers, cookie notices, ...). They are passed to the
# 'remove_line_with_keyword' step below.
# NOTE(review): exact matching rules (substring vs. token, case handling) are
# defined by indusnlp, not here -- confirm before adding near-duplicates.
line_drop_keywords = [
    "Updated",
    "Published by",
    "Link Copied",
    "Follow Us",
    "Next Article",
    "Followed",
    "अमर उजाला,",
    "News in Hindi",
    "Hindi news",
    "सब्सक्राइब करें",
    "डाउनलोड करें",
    "सब्सक्रिप्शन",
    "Disclaimer",
    "एड फ्री अनुभव",
    "Get all Sports",
    "ब्यूरो",
    "ब्यूरो,",
    "Get all India News",
    "Read the latest",
    "हम डाटा संग्रह टूल्स",
    "लेटेस्ट अपडेट्स",
    "सब्सक्राइब",
]

# Cleaning pipeline for TextCleaner, written as (step name, argument) pairs.
# Disabled steps are kept as comments so they can be switched back on easily.
config = [
    # ('remove_patterns', [url_pattern,email_pattern,phone_pattern]),
    ("remove_line_with_keyword", line_drop_keywords),
    # ("remove_lines_with_repeated_seqs",3),
    ("handle_whitespace", None),
    # ('remove_line_on_char_percentage',80),
    ("remove_redundant_lines", None),
    ("remove_blank_lines", None),
]

# Generic cleaner driven by the pipeline above; punctuation cleaning is off.
textcleaner = TextCleaner(config, clean_punctuation=False)
# Hindi-specific cleaner with transliteration enabled.
hicleaner = HindiTextCleaner(transliterate=True)

def clean_content(example):
    """Clean the ``text`` field of a single dataset record.

    The text is stripped of surrounding whitespace and, when anything is
    left, passed through ``clean_text``, then ``textcleaner``, then
    ``hicleaner`` (in that order).

    Args:
        example: Mapping with a ``"text"`` entry holding a string or ``None``.

    Returns:
        A dict ``{"text": cleaned}`` where ``cleaned`` is the cleaned string,
        or ``""`` when the input text is ``None``, empty, or whitespace-only.

    Raises:
        KeyError: If ``example`` has no ``"text"`` entry.
    """
    raw_text = example["text"]
    # A missing text (None) is treated like an empty one instead of crashing
    # on ``None.strip()``; such rows are dropped later by the empty-row filter.
    text = raw_text.strip() if raw_text is not None else ""
    if not text:
        # Nothing to clean: skip the (comparatively expensive) cleaner chain.
        return {"text": ""}
    return {"text": hicleaner(textcleaner(clean_text(text)))}


In [None]:
# Load the train split of the Hindi mC4 dataset and drop the columns that the
# cleaning steps below do not use.
unused_columns = ['timestamp', 'url']
dataset = load_dataset("zicsx/mC4-hindi", split='train').remove_columns(unused_columns)


In [None]:
# Run clean_content over every record, using one worker process per CPU
# reported by multiprocessing.cpu_count().
worker_count = cpu_count()
dataset = dataset.map(clean_content, num_proc=worker_count)


In [None]:
# Keep only the rows whose text is neither None nor an empty string
# (both are falsy, so a truthiness check covers the two cases).
dataset = dataset.filter(lambda example: bool(example["text"]))


In [None]:
# Write the cleaned dataset to a local directory, then publish it to the Hub.
output_dir = 'mC4-hindi-Cleaned'
hub_repo_id = 'zicsx/mC4-hindi-Cleaned'
dataset.save_to_disk(output_dir)
dataset.push_to_hub(hub_repo_id)
