In [11]:
import os
import xml.etree.ElementTree as ET
from transformers import AutoTokenizer
import regex
import pickle

In [12]:
def load_all_files(directory):
    all_texts = []
    for file_name in os.listdir(directory):
        if file_name.endswith(".xml"):
            file_path = os.path.join(directory, file_name)
            all_texts.extend(load_single_file(file_path))
    return all_texts

In [13]:
def load_single_file(file_path):
    texts = []
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()
        
        # Navigate to the <body> and <p> tags
        body = root.find(".//body")
        if body:
            for paragraph in body.findall(".//p"):
                if paragraph.text:
                    texts.append(paragraph.text.strip())
    except ET.ParseError as e:
        print(f"Error parsing {file_path}: {e}")
    return texts

In [14]:
def clean_text(text):
    # Replacing multiple spaces with one
    text = regex.sub(r"\s+", " ", text)
    # Keep Devanagari and spaces
    text = regex.sub(r"[^\p{Devanagari}\s]", "", text)
    # Convert to lowercase
    text = text.lower().strip()
    return text

def clean_dataset(dataset):
    return [clean_text(text) for text in dataset]

In [19]:
def tokenize_dataset(cleaned_dataset):
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
    
    # Set the padding token to be the same as eos_token if it doesn't exist
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    # Tokenize the cleaned dataset
    tokenized_data = [tokenizer.encode(text, truncation=True, padding="max_length", max_length=2048) for text in cleaned_dataset]
    
    return tokenized_data

In [20]:
if __name__ == "__main__":
    directory = "Maithili"
    
    raw_texts = load_all_files(directory)

    cleaned_texts = clean_dataset(raw_texts)

    print("First cleaned sample:", cleaned_texts[0])

First cleaned sample: ताराशंकर बंधोपाध्यायक जन्म  जुलाई  ईमे पश्चिम बंगालक बीरभूम जिलाक लाभपुर नामक गाममे भेल छल हुनक पिताक नाम श्री हरिदास बंधोपाध्याय और माताक नाम श्रीमती प्रभावती देवी छल ओ परिवारमे सभसँ पैघ छलाह हुनका एकटा बहिन और दूटाभाय छलनि


In [21]:
tokenized_texts = tokenize_dataset(cleaned_texts)

print("First tokenized sample:", tokenized_texts[0])

Using pad_token, but it is not set yet.


First tokenized sample: [128000, 80338, 100273, 102871, 73414, 101185, 100276, 103286, 104715, 102744, 100305, 109953, 103669, 100597, 220, 100277, 101241, 101344, 220, 106512, 88344, 35470, 114474, 101483, 101877, 100276, 101238, 125550, 100276, 101815, 100600, 105542, 100277, 100482, 101315, 100293, 107469, 87262, 100759, 100282, 123121, 100343, 100497, 88344, 35470, 100348, 101385, 101946, 92911, 85410, 101391, 65804, 84736, 100428, 101315, 100282, 100497, 100446, 86133, 44747, 104579, 102557, 100625, 100276, 103286, 104715, 102744, 100305, 100537, 100358, 92317, 100444, 101315, 100282, 100497, 100446, 86133, 114959, 44747, 84736, 106817, 100780, 80338, 44747, 100291, 101993, 44747, 101946, 92911, 103796, 100406, 101153, 100273, 88344, 35470, 104124, 79468, 103694, 84736, 100266, 106277, 101946, 92911, 100731, 85410, 101391, 65804, 24810, 100549, 100366, 24810, 102875, 100556, 100358, 100291, 105135, 107469, 100537, 101946, 92911, 61196, 39951, 128001, 128001, 128001, 128001, 128001

In [22]:
with open("tokenized_data_llama3.pkl", "wb") as f:
    pickle.dump(tokenized_texts, f)

with open("tokenized_data_llama3.pkl", "rb") as f:
    loaded_tokenized_data = pickle.load(f)
    print("Loaded tokenized sample:", loaded_tokenized_data[0])

Loaded tokenized sample: [128000, 80338, 100273, 102871, 73414, 101185, 100276, 103286, 104715, 102744, 100305, 109953, 103669, 100597, 220, 100277, 101241, 101344, 220, 106512, 88344, 35470, 114474, 101483, 101877, 100276, 101238, 125550, 100276, 101815, 100600, 105542, 100277, 100482, 101315, 100293, 107469, 87262, 100759, 100282, 123121, 100343, 100497, 88344, 35470, 100348, 101385, 101946, 92911, 85410, 101391, 65804, 84736, 100428, 101315, 100282, 100497, 100446, 86133, 44747, 104579, 102557, 100625, 100276, 103286, 104715, 102744, 100305, 100537, 100358, 92317, 100444, 101315, 100282, 100497, 100446, 86133, 114959, 44747, 84736, 106817, 100780, 80338, 44747, 100291, 101993, 44747, 101946, 92911, 103796, 100406, 101153, 100273, 88344, 35470, 104124, 79468, 103694, 84736, 100266, 106277, 101946, 92911, 100731, 85410, 101391, 65804, 24810, 100549, 100366, 24810, 102875, 100556, 100358, 100291, 105135, 107469, 100537, 101946, 92911, 61196, 39951, 128001, 128001, 128001, 128001, 12800