In [None]:
from tqdm.auto import tqdm
import os
import git
from pathlib import Path
from datasets import Dataset
import glob

In [None]:
# Clone the alignment repository unless a local copy is already present.
# GitPython (the `git` module imported at the top of the file) is used
# instead of the IPython-only `!git clone` shell escape, so this cell is
# also valid when the file is run as a plain Python script.
repo_path = 'data-tm-alignments'
repo_url = 'https://github.com/84000/data-tm-alignments.git'
if not os.path.exists(repo_path):
    print("Cloning repository...")
    # Clone explicitly into repo_path so the existence check above and the
    # clone target can never drift apart.
    git.Repo.clone_from(repo_url, repo_path)
else:
    print("Repository already exists, skipping clone...")

# The repository is cloned relative to the current working directory, so that
# directory is the base path used to locate the aligned files.
base_path = os.getcwd()

def _read_stripped_lines(path):
    """Return every line of *path* with surrounding whitespace stripped.

    Blank lines inside the file are kept (as empty strings) so that line
    positions remain comparable between the two language files. Only blank
    lines at the very end of the file are dropped, because they carry no
    positional information and would otherwise be reported as unpaired.
    """
    with open(path, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f]
    while lines and not lines[-1]:
        lines.pop()
    return lines


def get_aligned_pairs(base_path):
    """Collect Tibetan/English line pairs from the aligned text files.

    Searches ``<base_path>/data-tm-alignments/aligned/toh*`` for files ending
    in ``-bo.txt`` and pairs each with the file of the same name ending in
    ``-en.txt``. A ``-bo.txt`` file without such a counterpart is ignored.

    Lines are paired by position. A pair is kept only when both sides are
    non-empty after stripping; a blank line on either side skips just that
    pair instead of shifting every following line out of alignment.
    NOTE(review): this assumes line i of the ``-bo`` file corresponds to
    line i of the ``-en`` file -- confirm against the repository format.

    Directories and files are visited in sorted order so the resulting row
    order is reproducible (``glob`` itself returns paths in arbitrary order).

    Args:
        base_path: Directory that contains the ``data-tm-alignments`` clone.

    Returns:
        A tuple ``(tibetan_lines, english_lines)`` of two equally long lists,
        where element i of each list belongs to the same pair.

    Files that cannot be read or decoded as UTF-8 are reported with ``print``
    and skipped. A processing summary is printed before returning.
    """
    bo_suffix = '-bo.txt'
    en_suffix = '-en.txt'

    tibetan_lines = []
    english_lines = []

    # Get all toh directories
    toh_path = os.path.join(base_path, 'data-tm-alignments/aligned/toh*')
    toh_dirs = sorted(glob.glob(toh_path))

    skipped_pairs = 0
    total_pairs = 0

    for toh_dir in tqdm(toh_dirs, desc="Processing directories"):
        bo_files = sorted(glob.glob(os.path.join(toh_dir, '*' + bo_suffix)))

        for bo_file in tqdm(bo_files, desc=f"Processing files in {os.path.basename(toh_dir)}", leave=False):
            # Swap only the trailing suffix; str.replace would also rewrite
            # an earlier occurrence of '-bo.txt' elsewhere in the path.
            en_file = bo_file[:-len(bo_suffix)] + en_suffix

            if not os.path.exists(en_file):
                continue

            # Keep the try body limited to the file I/O that can actually fail.
            try:
                bo_lines = _read_stripped_lines(bo_file)
                en_lines = _read_stripped_lines(en_file)
            except (OSError, UnicodeError) as e:
                print(f"Error processing {bo_file}: {e}")
                continue

            # Process each line pair
            for bo_line, en_line in zip(bo_lines, en_lines):
                total_pairs += 1
                # Only add if both lines have content
                if bo_line and en_line:
                    tibetan_lines.append(bo_line)
                    english_lines.append(en_line)
                else:
                    skipped_pairs += 1

            # zip() stops at the shorter file; report the lines left over
            remainder = abs(len(bo_lines) - len(en_lines))
            if remainder > 0:
                skipped_pairs += remainder
                print(f"Skipped {remainder} unpaired lines in {bo_file}")

    print("\nProcessing Summary:")
    print(f"Total pairs processed: {total_pairs}")
    print(f"Pairs skipped: {skipped_pairs}")
    print(f"Final pairs kept: {len(tibetan_lines)}")

    return tibetan_lines, english_lines

# Gather the parallel lines from the aligned files under base_path
print("Reading aligned pairs...")
tibetan_lines, english_lines = get_aligned_pairs(base_path)

# Build a Hugging Face dataset with one column per language
print("Creating Hugging Face dataset...")
data_dict = dict(tibetan=tibetan_lines, english=english_lines)
dataset = Dataset.from_dict(data_dict)

# Report how many pairs ended up in the dataset
print("\nDataset statistics:")
total_pairs = len(dataset)
print(f"Total number of translation pairs: {total_pairs}")

# Preview at most the first three rows
print("\nFirst few examples:")
preview_count = min(3, total_pairs)
for i in range(preview_count):
    row = dataset[i]
    print(f"\nExample {i+1}:")
    print(f"Tibetan: {row['tibetan']}")
    print(f"English: {row['english']}")

# Write the dataset to a directory relative to the working directory
output_path = 'tibetan_english_dataset'
print(f"\nSaving dataset to {output_path}...")
dataset.save_to_disk(output_path)
print("Dataset saved successfully")