## Aggregate

In [1]:
from datasets import load_dataset, concatenate_datasets

sources = ['84000', 'Hopkins', 'GNOME', 'LotsawaHouse', 'NLLB', 'Tatoeba', 'TED2020']
ds_list = []

for source in sources:
    ds = load_dataset(f'billingsmoore/{source}-bo-en', split='train')
    ds = ds.add_column('source', [f'{source}']*len(ds))
    ds_list.append(ds)

ds = concatenate_datasets(ds_list)
ds[0]

{'bo': 'འཕགས་པ་གསེར་གྱི་བྱེ་མ་ལྟ་བུ་ཞེས་བྱ་བ་ཐེག་པ་ཆེན་པོའི་མདོ།',
 'en': 'The Noble Mahāyāna Sūtra Like Gold Dust',
 'source_file': 'toh126',
 'source': '84000',
 'topic': None}

## Cleaning

In [2]:
df = ds.to_pandas()
df.head()

Unnamed: 0,bo,en,source_file,source,topic
0,འཕགས་པ་གསེར་གྱི་བྱེ་མ་ལྟ་བུ་ཞེས་བྱ་བ་ཐེག་པ་ཆེན...,The Noble Mahāyāna Sūtra Like Gold Dust,toh126,84000,
1,སངས་རྒྱས་དང་། བྱང་ཆུབ་སེམས་དཔའ་ཐམས་ཅད་ལ་ཕྱག་འཚ...,Homage to all buddhas and bodhisattvas.,toh126,84000,
2,འདི་སྐད་བདག་གིས་ཐོས་པ་དུས་གཅིག་ན།,Thus did I hear at one time.,toh126,84000,
3,བཅོམ་ལྡན་འདས་མཉན་དུ་ཡོད་པ་ན་རྒྱལ་བུ་རྒྱལ་བྱེད་...,The Blessed One was staying at Prince Jeta’s G...,toh126,84000,
4,དེ་ནས་ཚེ་དང་ལྡན་པ་ཀུན་དགའ་བོ་སྟན་ལས་ལངས་ཏེ་བླ་...,"Venerable Ānanda rose from his seat, draped hi...",toh126,84000,


In [3]:
len(df)

992332

### Deduplicate

In [4]:
df = df.drop_duplicates(subset='en')
len(df)

885177

### Demoji

In [5]:
import re

# Regular expression to match emojis
emoji_pattern = re.compile(
    "[\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
    "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
    "\U0001F1E0-\U0001F1FF"  # Flags (iOS)
    "]+", 
    flags=re.UNICODE
)

# Remove emojis from both 'bo' and 'en' columns
df['bo'] = df['bo'].str.replace(emoji_pattern, '', regex=True)
df['en'] = df['en'].str.replace(emoji_pattern, '', regex=True)

### Remove Rows Whose English is Just Numbers or Punctuation

In [6]:
# Regular expression to match rows with only numbers and punctuation
df = df[~df['en'].str.fullmatch(r'[0-9\W]+', na=False)]
len(df)

885139

### Remove Rows Whose English is Just Roman Numerals

In [7]:
roman_numeral_pattern = r'^(?=[MDCLXVI])M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\.?$'

# Remove rows where 'target' matches the pattern
df = df[~df['en'].str.fullmatch(roman_numeral_pattern, na=False)]
len(df)

885137

### Remove Rows With Empty Strings

In [15]:
df = df[(df['bo'] != '') & (df['en'] != '')]
len(df)

885100

### Remove Rows With non-Tibetan characters in Tibetan or non-Latin characters in English

In [22]:
# Match *obvious foreign scripts* inside Tibetan text
latin_chars = re.compile(r'[A-Za-z]')
hangul_chars = re.compile(r'[\uAC00-\uD7AF]')  # Korean Hangul
cyrillic_chars = re.compile(r'[\u0400-\u04FF]')  # Cyrillic block
# Add more scripts here if needed (e.g., Chinese: \u4E00–\u9FFF)

def clean_for_check(text):
    return str(text).replace('\u00A0', ' ').replace('\u200B', '').strip()

def is_clean_tibetan(text):
    cleaned = clean_for_check(text)
    return not (latin_chars.search(cleaned) or hangul_chars.search(cleaned) or cyrillic_chars.search(cleaned))

# For English: just exclude if it has **non-Latin letters**, keep punctuation, emoji, etc.
non_latin_text = re.compile(r'[^\x00-\x7F]')  # Anything outside ASCII

def is_clean_english(text):
    cleaned = clean_for_check(text)
    return not non_latin_text.search(cleaned)

# Filter
df_clean = df[df['bo'].apply(is_clean_tibetan)]# & df['en'].apply(is_clean_english)]
len(df_clean)

712321

## Push to Hub

In [25]:
from datasets import Dataset

ds = Dataset.from_pandas(df)
ds[0]

{'bo': 'འཕགས་པ་གསེར་གྱི་བྱེ་མ་ལྟ་བུ་ཞེས་བྱ་བ་ཐེག་པ་ཆེན་པོའི་མདོ།',
 'en': 'The Noble Mahāyāna Sūtra Like Gold Dust',
 'source_file': 'toh126',
 'source': '84000',
 'topic': None,
 '__index_level_0__': 0}

In [26]:
ds = ds.remove_columns(['__index_level_0__'])
ds[0]

{'bo': 'འཕགས་པ་གསེར་གྱི་བྱེ་མ་ལྟ་བུ་ཞེས་བྱ་བ་ཐེག་པ་ཆེན་པོའི་མདོ།',
 'en': 'The Noble Mahāyāna Sūtra Like Gold Dust',
 'source_file': 'toh126',
 'source': '84000',
 'topic': None}

In [31]:
ds.train_test_split(.1)

DatasetDict({
    train: Dataset({
        features: ['bo', 'en', 'source_file', 'source', 'topic'],
        num_rows: 796590
    })
    test: Dataset({
        features: ['bo', 'en', 'source_file', 'source', 'topic'],
        num_rows: 88510
    })
})

In [32]:
ds.push_to_hub('billingsmoore/Aggregated-bo-en')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/886 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/billingsmoore/Aggregated-bo-en/commit/7caadb33397b0413da72fd82d79a8ad1febb9489', commit_message='Upload dataset', commit_description='', oid='7caadb33397b0413da72fd82d79a8ad1febb9489', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/billingsmoore/Aggregated-bo-en', endpoint='https://huggingface.co', repo_type='dataset', repo_id='billingsmoore/Aggregated-bo-en'), pr_revision=None, pr_num=None)