In [1]:
import os
import re

from datasets import load_dataset
from tokenizers import Tokenizer, decoders, models, pre_tokenizers, trainers


# --- Locations and file names shared by the cells below ---

# Directory the raw text files are read from (and the books dump is written to).
DIR_RAW_DATASET = "./raw_dataset"
# Directory the cleaned text files are written to.
DIR_PROCESSED_DATASET = "./processed_dataset"

# File name used for the uz-books text, in both the raw and processed directories.
FILE_UZ_BOOKS = "uz_books.txt"

# File names of the sentence corpora; each is read from DIR_RAW_DATASET and
# written under the same name to DIR_PROCESSED_DATASET.
FILES_UZ_CORPORA = [
    "uzb_community_2017-sentences.txt",
    "uzb_news_2020_30K-sentences.txt",
    "uzb_newscrawl_2011_100K-sentences.txt",
    "uzb_wikipedia_2021_100K-sentences.txt",
]

### Download the dataset of 40,000 Latin-script Uzbek books from Hugging Face

In [None]:
# open(..., "w") does not create missing parent directories, so make sure the
# raw-data directory exists before writing into it.
os.makedirs(DIR_RAW_DATASET, exist_ok=True)

# Load the "lat" split of the uz-books dataset.
books = load_dataset("tahrirchi/uz-books", split="lat")

# Dump every book's text into one file, each followed by a newline.
with open(os.path.join(DIR_RAW_DATASET, FILE_UZ_BOOKS), "w", encoding="utf-8") as f:
    for book in books:
        f.write(book["text"] + "\n")

# Rebind the name so the loaded dataset object is no longer referenced from here.
books = []

### Process the Uz-books dataset: join line breaks inside paragraphs (keeping blank-line paragraph breaks) and remove Cyrillic characters.

In [None]:
# Raw books dump to read and cleaned file to write.
input_path = os.path.join(DIR_RAW_DATASET, FILE_UZ_BOOKS)
output_path = os.path.join(DIR_PROCESSED_DATASET, FILE_UZ_BOOKS)

# Opening the output file for writing fails if its directory is missing, so
# create it up front.
os.makedirs(DIR_PROCESSED_DATASET, exist_ok=True)

# Size passed to read() on the text-mode input file. In text mode this counts
# characters, not bytes, so a chunk of non-ASCII text is larger than 100 MiB
# on disk. Adjust as needed.
chunk_size = 100 * 1024 * 1024

# Matches a pair of newlines first, otherwise a single newline.
_NEWLINE_TOKEN = re.compile(r'\n\n|\n')
# Matches a run of characters in the Unicode range U+0400-U+04FF.
_CYRILLIC_RUN = re.compile(r'[\u0400-\u04FF]+')


def process_text(text):
    """Unwrap single line breaks and strip Cyrillic characters from ``text``.

    Scanning left to right, every pair of consecutive newlines is kept as a
    paragraph break and every remaining single newline becomes a space. All
    characters in U+0400-U+04FF are then removed.

    The newline handling is done in one regex pass instead of via a temporary
    placeholder character, so a literal pilcrow sign in the input is left
    untouched rather than being turned into a paragraph break.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    def _unwrap(match):
        token = match.group()
        # Two newlines: paragraph break, keep as is. One newline: soft wrap.
        return token if len(token) == 2 else " "

    text = _NEWLINE_TOKEN.sub(_unwrap, text)
    return _CYRILLIC_RUN.sub('', text)

def _split_before_trailing_newlines(buffer):
    """Split ``buffer`` into ``(ready, pending)`` at its last run of newlines.

    ``ready`` is everything before the last run of consecutive newline
    characters and can be processed immediately. ``pending`` begins with that
    entire run, so a two-newline paragraph break is never cut in half between
    two separately processed pieces. If ``buffer`` contains no newline,
    ``ready`` is empty and the whole buffer stays pending.
    """
    cut = buffer.rfind("\n")
    if cut == -1:
        return "", buffer
    # Walk back to the first newline of the run the last newline belongs to.
    while cut > 0 and buffer[cut - 1] == "\n":
        cut -= 1
    return buffer[:cut], buffer[cut:]


# Stream the raw file through process_text() one chunk at a time so the whole
# file never has to be held in memory at once.
with open(input_path, 'r', encoding='utf-8') as infile, \
     open(output_path, 'w', encoding='utf-8') as outfile:

    remainder = ""
    while True:
        chunk = infile.read(chunk_size)
        if not chunk:
            break

        # Carry the unfinished tail (from the last newline run onwards) over
        # to the next iteration; only text before it is safe to process now.
        to_process, remainder = _split_before_trailing_newlines(remainder + chunk)
        if to_process:
            outfile.write(process_text(to_process))

    # Flush the buffered tail exactly once at end of file. Writing it both in
    # the end-of-file branch and again here would duplicate the last piece.
    if remainder:
        outfile.write(process_text(remainder))

### Process the Uzbek Corpora dataset: strip the leading index from each line and drop lines containing Cyrillic characters.

In [None]:
# First whitespace-delimited token of a line plus the whitespace after it
# (presumably the per-sentence index column of the corpus files).
index_pattern = re.compile(r'^\S+\s+')
# Any single character in the Unicode range U+0400-U+04FF.
cyrillic_pattern = re.compile(r'[\u0400-\u04FF]')

# The output files cannot be created if their directory is missing.
os.makedirs(DIR_PROCESSED_DATASET, exist_ok=True)

for file_name in FILES_UZ_CORPORA:
    input_path = os.path.join(DIR_RAW_DATASET, file_name)
    output_path = os.path.join(DIR_PROCESSED_DATASET, file_name)

    with open(input_path, 'r', encoding='utf-8') as infile:
        # Remove the leading token first, then skip any line that still
        # contains a Cyrillic character; kept lines are whitespace-trimmed.
        without_index = (index_pattern.sub('', line) for line in infile)
        processed_lines = [
            line.strip()
            for line in without_index
            if not cyrillic_pattern.search(line)
        ]

    # One kept line per output line, with no trailing newline at end of file.
    with open(output_path, 'w', encoding='utf-8') as outfile:
        outfile.write("\n".join(processed_lines))

## Tokenization

In [None]:
# Training inputs: the processed books file followed by each processed corpus file.
files = [os.path.join(DIR_PROCESSED_DATASET, FILE_UZ_BOOKS)] + \
        [os.path.join(DIR_PROCESSED_DATASET, f) for f in FILES_UZ_CORPORA]

tokenizer_uzbek = Tokenizer(models.BPE())

# Byte-level pre-tokenization, paired with the byte-level decoder so that
# decode() maps the byte-level symbols back to ordinary text.
tokenizer_uzbek.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer_uzbek.decoder = decoders.ByteLevel()

trainer = trainers.BpeTrainer(
    vocab_size=10000,  # Adjust as needed
    special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"],
    # Seed the vocabulary with every byte-level symbol so that bytes which
    # never occur in the training files can still be encoded.
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)

tokenizer_uzbek.train(files=files, trainer=trainer)

tokenizer_uzbek.save("uzbek_tokenizer.json")

# Dump the learned vocabulary as "id: token" lines, ordered by token id.
vocab = tokenizer_uzbek.get_vocab()

sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])

output_lines = [f"{idx}: {token}" for token, idx in sorted_vocab]

with open("uzbek_tokens.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(output_lines))

print("Final vocabulary size:", len(vocab))
print("Uzbek tokens saved to 'uzbek_tokens.txt'.")


Final vocabulary size: 10000
Uzbek tokens saved to 'uzbek_tokens.txt'.
