In [23]:
!pip install datasets gitpython tqdm
from tqdm.auto import tqdm
import os
import git
from pathlib import Path
from datasets import Dataset
import glob



In [25]:
# Check if repo exists before cloning
repo_path = 'data-tm-alignments'
if not os.path.exists(repo_path):
    print("Cloning repository...")
    !git clone https://github.com/84000/data-tm-alignments.git
else:
    print("Repository already exists, skipping clone...")

# Get the current working directory
base_path = os.getcwd()

def get_aligned_pairs(base_path):
    tibetan_lines = []
    english_lines = []

    # Get all toh directories
    toh_path = os.path.join(base_path, 'data-tm-alignments/aligned/toh*')
    toh_dirs = glob.glob(toh_path)

    skipped_pairs = 0
    total_pairs = 0

    for toh_dir in tqdm(toh_dirs, desc="Processing directories"):
        # Look for *-bo.txt and *-en.txt files
        bo_files = glob.glob(os.path.join(toh_dir, '*-bo.txt'))

        for bo_file in tqdm(bo_files, desc=f"Processing files in {os.path.basename(toh_dir)}", leave=False):
            en_file = bo_file.replace('-bo.txt', '-en.txt')

            if os.path.exists(en_file):
                try:
                    with open(bo_file, 'r', encoding='utf-8') as f:
                        bo_lines = [line.strip() for line in f if line.strip()]

                    with open(en_file, 'r', encoding='utf-8') as f:
                        en_lines = [line.strip() for line in f if line.strip()]

                    # Process each line pair
                    for bo_line, en_line in zip(bo_lines, en_lines):
                        total_pairs += 1
                        # Only add if both lines have content
                        if bo_line and en_line:
                            tibetan_lines.append(bo_line)
                            english_lines.append(en_line)
                        else:
                            skipped_pairs += 1

                    # Report if we had to skip any lines due to length mismatch
                    remainder = abs(len(bo_lines) - len(en_lines))
                    if remainder > 0:
                        skipped_pairs += remainder
                        print(f"Skipped {remainder} unpaired lines in {bo_file}")

                except Exception as e:
                    print(f"Error processing {bo_file}: {str(e)}")

    print(f"\nProcessing Summary:")
    print(f"Total pairs processed: {total_pairs}")
    print(f"Pairs skipped: {skipped_pairs}")
    print(f"Final pairs kept: {len(tibetan_lines)}")

    return tibetan_lines, english_lines

# Get aligned pairs
print("Reading aligned pairs...")
tibetan_lines, english_lines = get_aligned_pairs(base_path)

# Create dataset
print("Creating Hugging Face dataset...")
data_dict = {
    'tibetan': tibetan_lines,
    'english': english_lines
}

dataset = Dataset.from_dict(data_dict)

# Print statistics
print("\nDataset statistics:")
total_pairs = len(dataset)
print(f"Total number of translation pairs: {total_pairs}")

print("\nFirst few examples:")
for i in range(min(3, len(dataset))):
    print(f"\nExample {i+1}:")
    print(f"Tibetan: {dataset[i]['tibetan']}")
    print(f"English: {dataset[i]['english']}")

# Save dataset
output_path = 'tibetan_english_dataset'
print(f"\nSaving dataset to {output_path}...")
dataset.save_to_disk(output_path)
print("Dataset saved successfully")

Repository already exists, skipping clone...
Reading aligned pairs...


Processing directories:   0%|          | 0/86 [00:00<?, ?it/s]

Processing files in toh42:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh42/TMtoh42_84000-bo.txt


Processing files in toh54:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh54/TMtoh54_84000-bo.txt


Processing files in toh583:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh583/TMtoh583_84000-bo.txt


Processing files in toh539e:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh539e/TMtoh539e_84000-bo.txt


Processing files in toh647:   0%|          | 0/1 [00:00<?, ?it/s]

Processing files in toh732:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh732/TMtoh732_84000-bo.txt


Processing files in toh214:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh214/TMtoh214_84000-bo.txt


Processing files in toh331:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh331/TMtoh331_84000-bo.txt


Processing files in toh329:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh329/TMtoh329_84000-bo.txt


Processing files in toh579:   0%|          | 0/1 [00:00<?, ?it/s]

Processing files in toh238:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 2 unpaired lines in /content/data-tm-alignments/aligned/toh238/TMtoh238_84000-bo.txt


Processing files in toh670:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh670/TMtoh670_84000-bo.txt


Processing files in toh8d:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh8d/TMtoh8d_84000-bo.txt


Processing files in toh321:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh321/TMtoh321_84000-bo.txt


Processing files in toh156:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh156/TMtoh156_84000-bo.txt


Processing files in toh100:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 2 unpaired lines in /content/data-tm-alignments/aligned/toh100/TMtoh100_84000-bo.txt


Processing files in toh170:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh170/TMtoh170_84000-bo.txt


Processing files in toh666:   0%|          | 0/1 [00:00<?, ?it/s]

Processing files in toh668:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh668/TMtoh668_84000-bo.txt


Processing files in toh381:   0%|          | 0/1 [00:00<?, ?it/s]

Processing files in toh747:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh747/TMtoh747_84000-bo.txt


Processing files in toh521:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh521/TMtoh521_84000-bo.txt


Processing files in toh865:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh865/TMtoh865_84000-bo.txt


Processing files in toh566:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh566/TMtoh566_84000-bo.txt


Processing files in toh8b:   0%|          | 0/1 [00:00<?, ?it/s]

Processing files in toh565:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh565/TMtoh565_84000-bo.txt


Processing files in toh41:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh41/TMtoh41_84000-bo.txt


Processing files in toh8a:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 2 unpaired lines in /content/data-tm-alignments/aligned/toh8a/TMtoh8a_84000-bo.txt


Processing files in toh605:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh605/TMtoh605_84000-bo.txt


Processing files in toh65:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh65/TMtoh65_84000-bo.txt


Processing files in toh1-1:   0%|          | 0/1 [00:00<?, ?it/s]

Processing files in toh11:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh11/TMtoh11_84000-bo.txt


Processing files in toh205:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh205/TMtoh205_84000-bo.txt


Processing files in toh580:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh580/TMtoh580_84000-bo.txt


Processing files in toh113:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh113/TMtoh113_84000-bo.txt


Processing files in toh667:   0%|          | 0/1 [00:00<?, ?it/s]

Processing files in toh743:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh743/TMtoh743_84000-bo.txt


Processing files in toh509:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh509/TMtoh509_84000-bo.txt


Processing files in toh197:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh197/TMtoh197_84000-bo.txt


Processing files in toh639:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh639/TMtoh639_84000-bo.txt


Processing files in toh726:   0%|          | 0/1 [00:00<?, ?it/s]

Processing files in toh219:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh219/TMtoh219_84000-bo.txt


Processing files in toh76:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh76/TMtoh76_84000-bo.txt


Processing files in toh75:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh75/TMtoh75_84000-bo.txt


Processing files in toh665:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh665/TMtoh665_84000-bo.txt


Processing files in toh126:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh126/TMtoh126_84000-bo.txt


Processing files in toh263:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh263/TMtoh263_84000-bo.txt


Processing files in toh8f:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh8f/TMtoh8f_84000-bo.txt


Processing files in toh361:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh361/TMtoh361_84000-bo.txt


Processing files in toh293:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh293/TMtoh293_84000-bo.txt


Processing files in toh555:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh555/TMtoh555_84000-bo.txt


Processing files in toh127:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh127/TMtoh127_84000-bo.txt


Processing files in toh584:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh584/TMtoh584_84000-bo.txt


Processing files in toh642:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh642/TMtoh642_84000-bo.txt


Processing files in toh327:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh327/TMtoh327_84000-bo.txt


Processing files in toh292:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh292/TMtoh292_84000-bo.txt


Processing files in toh136:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh136/TMtoh136_84000-bo.txt


Processing files in toh322:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh322/TMtoh322_84000-bo.txt


Processing files in toh47:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh47/TMtoh47_84000-bo.txt


Processing files in toh56:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh56/TMtoh56_84000-bo.txt


Processing files in toh128:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh128/TMtoh128_84000-bo.txt


Processing files in toh215:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh215/TMtoh215_84000-bo.txt


Processing files in toh48:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh48/TMtoh48_84000-bo.txt


Processing files in toh891:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh891/TMtoh891_84000-bo.txt


Processing files in toh269:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh269/TMtoh269_84000-bo.txt


Processing files in toh564:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh564/TMtoh564_84000-bo.txt


Processing files in toh540:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh540/TMtoh540_84000-bo.txt


Processing files in toh8e:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 2 unpaired lines in /content/data-tm-alignments/aligned/toh8e/TMtoh8e_84000-bo.txt


Processing files in toh346:   0%|          | 0/1 [00:00<?, ?it/s]

Processing files in toh1091:   0%|          | 0/1 [00:00<?, ?it/s]

Processing files in toh330:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh330/TMtoh330_84000-bo.txt


Processing files in toh669:   0%|          | 0/1 [00:00<?, ?it/s]

Processing files in toh8c:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh8c/TMtoh8c_84000-bo.txt


Processing files in toh95:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh95/TMtoh95_84000-bo.txt


Processing files in toh556:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh556/TMtoh556_84000-bo.txt


Processing files in toh646:   0%|          | 0/1 [00:00<?, ?it/s]

Processing files in toh557:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh557/TMtoh557_84000-bo.txt


Processing files in toh73:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh73/TMtoh73_84000-bo.txt


Processing files in toh254:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh254/TMtoh254_84000-bo.txt


Processing files in toh248:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh248/TMtoh248_84000-bo.txt


Processing files in toh581:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh581/TMtoh581_84000-bo.txt


Processing files in toh953:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh953/TMtoh953_84000-bo.txt


Processing files in toh44-45:   0%|          | 0/1 [00:00<?, ?it/s]

Processing files in toh4568-3:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 2 unpaired lines in /content/data-tm-alignments/aligned/toh4568-3/TMtoh4568-3_84000-bo.txt


Processing files in toh8g:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh8g/TMtoh8g_84000-bo.txt


Processing files in toh505:   0%|          | 0/1 [00:00<?, ?it/s]

Skipped 1 unpaired lines in /content/data-tm-alignments/aligned/toh505/TMtoh505_84000-bo.txt

Processing Summary:
Total pairs processed: 124452
Pairs skipped: 78
Final pairs kept: 124452
Creating Hugging Face dataset...

Dataset statistics:
Total number of translation pairs: 124452

First few examples:

Example 1:
Tibetan: ༄༅༅། །རྒྱ་གར་སྐད་དུ། ཙན་དྲ་སཱུ་ཏྲ། བོད་སྐད་དུ།
English: The Sūtra of the Moon (1)

Example 2:
Tibetan: ཟླ་བའི་མདོ།
English: Respectful homage to the noble Three Jewels!

Example 3:
Tibetan: འཕགས་པ་དཀོན་མཆོག་གསུམ་ལ་གུས་པས་ཕྱག་འཚལ་ལོ། །
English: Thus did I hear at one time. The Blessed One was dwelling in Jeta Grove, Anāthapiṇḍada’s park in Śrāvastī. At that time the god Candramas was seized by Rāhu, lord of the asuras. Then the god Candramas, recollecting and taking the Blessed One to heart, recited this verse:

Saving dataset to tibetan_english_dataset...


Saving the dataset (0/1 shards):   0%|          | 0/124452 [00:00<?, ? examples/s]

Dataset saved successfully
