# Preprocessing the Data

In [1]:
import re

def clean_text_file(input_file, output_file):
    """Copy a UTF-8 text file, removing the leading ``SNT.<digits>.<digits>`` ID from each line.

    Every input line is stripped of surrounding whitespace, the sentence ID
    prefix (if present) is removed, and the remainder is written followed by
    a single newline. Lines without an ID prefix are copied as they are
    (stripped).

    Lines that end up empty are skipped, including lines that hold only an
    ID and no sentence. The output can therefore contain fewer lines than
    the input; a caller that pairs the result line-by-line with another file
    should compare the line counts of the two outputs.

    Args:
        input_file: path of the text file to read.
        output_file: path of the text file to create or overwrite.
    """
    # The ID is followed either by whitespace and the sentence, or by nothing
    # at all: the line is stripped first, so an ID with an empty sentence has
    # no trailing whitespace left. Accepting end-of-string keeps such a bare
    # ID from being written out as if it were a sentence.
    id_prefix = re.compile(r"^SNT\.\d+\.\d+(?:\s+|$)")

    with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
        for line in infile:
            cleaned_line = id_prefix.sub("", line.strip())  # remove ID but keep sentence
            if cleaned_line:  # ensure only non-empty lines are written
                outfile.write(cleaned_line + "\n")

In [2]:
# Strip the sentence IDs from the raw English and Burmese files.
clean_text_file(input_file="data/data_en.txt", output_file="data/cleaned_data_en.txt")
clean_text_file(input_file="data/data_my.txt", output_file="data/cleaned_data_my.txt")

In [3]:
import csv

def create_csv(english_file, burmese_file, output_file):
    """Pair two line-aligned UTF-8 text files into a two-column CSV.

    Line *i* of ``english_file`` and line *i* of ``burmese_file`` become one
    CSV row under the header ``en,my``. Each line is stripped of surrounding
    whitespace before it is written.

    Args:
        english_file: path of the file supplying the ``en`` column.
        burmese_file: path of the file supplying the ``my`` column.
        output_file: path of the CSV file to create or overwrite.

    Raises:
        ValueError: if the two input files do not have the same number of
            lines. Rows written before the mismatch was detected remain in
            ``output_file``, so the file should be discarded in that case.
    """
    from itertools import zip_longest

    exhausted = object()  # sentinel: one file ran out before the other

    with open(english_file, "r", encoding="utf-8") as en_f, \
            open(burmese_file, "r", encoding="utf-8") as my_f, \
            open(output_file, "w", encoding="utf-8", newline="") as out_f:
        writer = csv.writer(out_f)
        writer.writerow(["en", "my"])  # header

        # Plain zip() would stop silently at the shorter file and hide a
        # misaligned corpus; zip_longest lets the mismatch be detected.
        pairs = zip_longest(en_f, my_f, fillvalue=exhausted)
        for line_no, (en_line, my_line) in enumerate(pairs, start=1):
            if en_line is exhausted or my_line is exhausted:
                shorter = english_file if en_line is exhausted else burmese_file
                raise ValueError(
                    f"Input files differ in length: {shorter!r} has only "
                    f"{line_no - 1} line(s) while the other file has more."
                )
            writer.writerow([en_line.strip(), my_line.strip()])

In [4]:
# Merge the two cleaned files into a single parallel CSV.
create_csv(
    english_file="data/cleaned_data_en.txt",
    burmese_file="data/cleaned_data_my.txt",
    output_file="data/dataset.csv",
)

In [5]:
from datasets import load_dataset

# Load the parallel CSV; the rows end up under a single "train" split.
dataset = load_dataset("csv", data_files="data/dataset.csv")
# Sanity check: show the first en/my sentence pair.
print(dataset['train'][0])

Generating train split: 0 examples [00:00, ? examples/s]

{'en': 'Italy have defeated Portugal 31-5 in Pool C of the 2007 Rugby World Cup at Parc des Princes, Paris, France.', 'my': 'ပြင်သစ်နိုင်ငံ ပါရီမြို့ ပါ့ဒက်စ် ပရင့်စက် ၌ ၂၀၀၇ခုနှစ် ရပ်ဘီ ကမ္ဘာ့ ဖလား တွင် အီတလီ သည် ပေါ်တူဂီ ကို ၃၁-၅ ဂိုး ဖြင့် ရေကူးကန် စီ တွင် ရှုံးနိမ့်သွားပါသည် ။'}


In [6]:
# Show the available splits with their column names and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['en', 'my'],
        num_rows: 20106
    })
})


In [7]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook (renders a widget).
# NOTE(review): presumably required so the later push_to_hub call is authorised — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Token has not been saved to git credential helper.


In [10]:
from datasets import DatasetDict

# Seed passed to both train_test_split calls below so the partition can be reproduced.
SEED = 69
# Reload the CSV so this cell starts from the full, unsplit data.
dataset = load_dataset('csv', data_files='data/dataset.csv')

# Step 1: hold out 30% of the rows; the remaining 70% becomes the training set.
train_test_split = dataset["train"].train_test_split(test_size=0.3, seed=SEED)

# Step 2: split the held-out 30% in half -> 15% validation, 15% test.
# Here the 'train' side of this second split is the validation set.
validation_test_split = train_test_split['test'].train_test_split(test_size=0.5, seed=SEED)

# Assemble the three named splits into one DatasetDict.
final_dataset =  DatasetDict({
    'train': train_test_split['train'],
    'validation': validation_test_split['train'],
    'test': validation_test_split['test']
})

In [11]:
# Confirm the train / validation / test splits and their row counts.
print(final_dataset)

DatasetDict({
    train: Dataset({
        features: ['en', 'my'],
        num_rows: 14074
    })
    validation: Dataset({
        features: ['en', 'my'],
        num_rows: 3016
    })
    test: Dataset({
        features: ['en', 'my'],
        num_rows: 3016
    })
})


In [12]:
# NOTE(review): `Dataset` is imported here but not used in this cell.
from datasets import Dataset
# Target dataset repository on the Hugging Face Hub ("<user>/<dataset name>").
repo_name = 'archx64/english-burmese-parallel'
# Upload every split of final_dataset to that repository.
final_dataset.push_to_hub(repo_name)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/archx64/english-burmese-parallel/commit/8294a4ee834669954ac6d79a060aa175de8bde9b', commit_message='Upload dataset', commit_description='', oid='8294a4ee834669954ac6d79a060aa175de8bde9b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/archx64/english-burmese-parallel', endpoint='https://huggingface.co', repo_type='dataset', repo_id='archx64/english-burmese-parallel'), pr_revision=None, pr_num=None)