## Aggregate

In [1]:
from datasets import load_dataset, concatenate_datasets

# Corpora to merge; each one is fetched from the Hub repository
# 'billingsmoore/<name>-bo-en'.
sources = ['84000', 'Hopkins', 'GNOME', 'LotsawaHouse', 'NLLB', 'Tatoeba', 'TED2020']

ds_list = []
for source in sources:
    part = load_dataset(f'billingsmoore/{source}-bo-en', split='train')
    # Tag every row with the corpus it came from so provenance survives the merge.
    provenance = [source] * len(part)
    ds_list.append(part.add_column('source', provenance))

# Stack the tagged corpora into a single dataset and peek at the first row.
ds = concatenate_datasets(ds_list)
ds[0]

README.md:   0%|          | 0.00/985 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/127921 [00:00<?, ? examples/s]

{'bo': 'འཕགས་པ་གསེར་གྱི་བྱེ་མ་ལྟ་བུ་ཞེས་བྱ་བ་ཐེག་པ་ཆེན་པོའི་མདོ།',
 'en': 'The Noble Mahāyāna Sūtra Like Gold Dust',
 'topic': 'Jewels, Water, Bodhisattvas',
 'source': '84000'}

## Cleaning

In [2]:
# Switch to a pandas DataFrame for the row-level cleaning below.
df = ds.to_pandas()
df.head(5)

Unnamed: 0,bo,en,topic,source
0,འཕགས་པ་གསེར་གྱི་བྱེ་མ་ལྟ་བུ་ཞེས་བྱ་བ་ཐེག་པ་ཆེན...,The Noble Mahāyāna Sūtra Like Gold Dust,"Jewels, Water, Bodhisattvas",84000
1,སངས་རྒྱས་དང་། བྱང་ཆུབ་སེམས་དཔའ་ཐམས་ཅད་ལ་ཕྱག་འཚ...,Homage to all buddhas and bodhisattvas.,"Dharma, Rite, Purity",84000
2,འདི་སྐད་བདག་གིས་ཐོས་པ་དུས་གཅིག་ན།,Thus did I hear at one time.,"Light, Reality, Assembly",84000
3,བཅོམ་ལྡན་འདས་མཉན་དུ་ཡོད་པ་ན་རྒྱལ་བུ་རྒྱལ་བྱེད་...,The Blessed One was staying at Prince Jeta’s G...,"Buddhism, Enlightenment, Ānanda",84000
4,དེ་ནས་ཚེ་དང་ལྡན་པ་ཀུན་དགའ་བོ་སྟན་ལས་ལངས་ཏེ་བླ་...,"Venerable Ānanda rose from his seat, draped hi...","Buddha, Qualities, Enlightenment",84000


In [3]:
# Row count before any cleaning is applied.
df.shape[0]

984633

### Deduplicate

In [4]:
# Rows are considered duplicates when their English text is identical;
# only the first occurrence of each English string is kept.
df = df.drop_duplicates(subset=['en'], keep='first')
df.shape[0]

878044

### Remove Emojis

In [5]:
import re

# One character class spanning four Unicode blocks; the trailing '+' lets a
# run of consecutive emoji be removed in a single substitution.
# (re.UNICODE is already the default for str patterns, so no flag is passed.)
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
    "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
    "\U0001F1E0-\U0001F1FF"  # Regional indicator symbols (flag pairs)
    "]+"
)

# Strip matches from both language columns.
for column in ('bo', 'en'):
    df[column] = df[column].str.replace(emoji_pattern, '', regex=True)

### Remove Rows Whose English is Just Numbers or Punctuation

In [6]:
# Flag rows whose entire English text is made of ASCII digits and/or non-word
# characters (\W also covers whitespace). na=False makes missing values
# evaluate to False, so they are not flagged and therefore stay in the frame.
digits_or_symbols_only = df['en'].str.fullmatch(r'[0-9\W]+', na=False)
df = df.loc[~digits_or_symbols_only]
df.shape[0]

878006

### Remove Rows Whose English is Just Roman Numerals

In [7]:
# Uppercase Roman numeral built from thousands / hundreds / tens / units
# groups, optionally followed by a period. The leading lookahead requires the
# first character to be a numeral letter, so the all-optional groups cannot
# match an empty string.
roman_numeral_pattern = r'^(?=[MDCLXVI])M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\.?$'

# Drop rows whose 'en' value is nothing but such a numeral.
is_roman_numeral = df['en'].str.fullmatch(roman_numeral_pattern, na=False)
df = df.loc[~is_roman_numeral]
df.shape[0]

878004

### Remove Rows With Empty Strings

In [8]:
# Keep a row only when neither side of the pair is the empty string.
bo_present = df['bo'].ne('')
en_present = df['en'].ne('')
df = df.loc[bo_present & en_present]
df.shape[0]

878003

## Push to Hub

In [9]:
from datasets import Dataset

# Convert the cleaned frame back into a datasets.Dataset. As the displayed
# first row shows, the frame's index is carried over as an extra
# '__index_level_0__' column.
ds = Dataset.from_pandas(df=df)
ds[0]

{'bo': 'འཕགས་པ་གསེར་གྱི་བྱེ་མ་ལྟ་བུ་ཞེས་བྱ་བ་ཐེག་པ་ཆེན་པོའི་མདོ།',
 'en': 'The Noble Mahāyāna Sūtra Like Gold Dust',
 'topic': 'Jewels, Water, Bodhisattvas',
 'source': '84000',
 '__index_level_0__': 0}

In [10]:
# Discard the leftover pandas index column so only the corpus fields remain.
ds = ds.remove_columns('__index_level_0__')
ds[0]

{'bo': 'འཕགས་པ་གསེར་གྱི་བྱེ་མ་ལྟ་བུ་ཞེས་བྱ་བ་ཐེག་པ་ཆེན་པོའི་མདོ།',
 'en': 'The Noble Mahāyāna Sūtra Like Gold Dust',
 'topic': 'Jewels, Water, Bodhisattvas',
 'source': '84000'}

In [11]:
# Publish the cleaned corpus to the Hugging Face Hub; the returned commit
# info is shown as the cell output.
ds.push_to_hub(repo_id='billingsmoore/Aggregated-bo-en')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/879 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/billingsmoore/Aggregated-bo-en/commit/79c06789d52b15d515693851360ded94654f373e', commit_message='Upload dataset', commit_description='', oid='79c06789d52b15d515693851360ded94654f373e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/billingsmoore/Aggregated-bo-en', endpoint='https://huggingface.co', repo_type='dataset', repo_id='billingsmoore/Aggregated-bo-en'), pr_revision=None, pr_num=None)