In [1]:
import os
import xml.etree.ElementTree as ET
from transformers import GPT2Tokenizer
import regex
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_all_files(directory):
    all_texts = []
    for file_name in os.listdir(directory):
        if file_name.endswith(".xml"):
            file_path = os.path.join(directory, file_name)
            all_texts.extend(load_single_file(file_path))
        if file_name.endswith(".txt"):
            file_path = os.path.join(directory, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                text_list = text.split('।')
                text_list = [segment.strip() for segment in text_list]
                all_texts.extend(text_list)
    return all_texts

In [3]:
def load_single_file(file_path):
    texts = []
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()
        
        # Navigate to the <body> and <p> tags
        body = root.find(".//body")
        if body:
            for paragraph in body.findall(".//p"):
                if paragraph.text:
                    texts.append(paragraph.text.strip())
    except ET.ParseError as e:
        print(f"Error parsing {file_path}: {e}")
    return texts

In [4]:
def clean_text(text):
    # Replacing multiple spaces with one
    text = regex.sub(r"\s+", " ", text)
    # Keep Devanagari and spaces
    text = regex.sub(r"[^\p{Devanagari}\s]", "", text)
    # Convert to lowercase
    text = text.lower().strip()
    return text

def clean_dataset(dataset):
    return [clean_text(text) for text in dataset]

In [5]:
from datasets import Dataset

In [6]:
if __name__ == "__main__":
    directory = "Maithili"
    
    raw_texts = load_all_files(directory)

    cleaned_texts = clean_dataset(raw_texts)

    print("First cleaned sample:", cleaned_texts[0])

First cleaned sample: ताराशंकर बंधोपाध्यायक जन्म  जुलाई  ईमे पश्चिम बंगालक बीरभूम जिलाक लाभपुर नामक गाममे भेल छल हुनक पिताक नाम श्री हरिदास बंधोपाध्याय और माताक नाम श्रीमती प्रभावती देवी छल ओ परिवारमे सभसँ पैघ छलाह हुनका एकटा बहिन और दूटाभाय छलनि


In [7]:
data = Dataset.from_dict({'text': cleaned_texts})

In [8]:

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Set the padding token to be the same as eos_token if it doesn't exist
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize the cleaned dataset
def tokenize_function(examples):
    # Tokenize inputs and generate labels by shifting the input
    encodings = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)
    encodings['labels'] = encodings['input_ids'].copy()  # Copy the input_ids to labels
    return encodings
tokenized_data = data.map(tokenize_function, batched=True)

Map: 100%|██████████| 53951/53951 [00:10<00:00, 5163.23 examples/s]


In [9]:
print("First tokenized sample:", tokenized_data[0])

First tokenized sample: {'text': 'ताराशंकर बंधोपाध्यायक जन्म  जुलाई  ईमे पश्चिम बंगालक बीरभूम जिलाक लाभपुर नामक गाममे भेल छल हुनक पिताक नाम श्री हरिदास बंधोपाध्याय और माताक नाम श्रीमती प्रभावती देवी छल ओ परिवारमे सभसँ पैघ छलाह हुनका एकटा बहिन और दूटाभाय छलनि', 'input_ids': [11976, 97, 48077, 11976, 108, 48077, 11976, 114, 11976, 224, 11976, 243, 11976, 108, 28225, 105, 11976, 224, 11976, 100, 24231, 233, 11976, 103, 48077, 11976, 100, 24231, 235, 11976, 107, 48077, 11976, 107, 11976, 243, 28225, 250, 11976, 101, 24231, 235, 11976, 106, 220, 28225, 250, 24231, 223, 11976, 110, 48077, 11976, 230, 220, 28225, 230, 11976, 106, 24231, 229, 28225, 103, 11976, 114, 24231, 235, 11976, 248, 11976, 123, 11976, 106, 28225, 105, 11976, 224, 11976, 245, 48077, 11976, 110, 11976, 243, 28225, 105, 24231, 222, 11976, 108, 11976, 255, 24231, 224, 11976, 106, 28225, 250, 11976, 123, 11976, 110, 48077, 11976, 243, 28225, 110, 48077, 11976, 255, 11976, 103, 24231, 223, 11976, 108, 28225, 101, 48077, 11976

In [10]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))



Embedding(50258, 768)

In [11]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # Simulate a batch size of 8 (2 * 4)
    save_steps=10_000,
    logging_steps=500,
    logging_dir='./logs',
    overwrite_output_dir=True,
)




trainer = Trainer(
    model=model,                     # The model to be trained
    args=training_args,              # Training arguments
    train_dataset=tokenized_data,    # The training dataset
)

In [12]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [13]:
trainer.train()
trainer.save_model("./gpt2-finetuned")

  2%|▏         | 500/20232 [04:50<3:11:17,  1.72it/s]

{'loss': 0.4551, 'learning_rate': 4.8764333728746545e-05, 'epoch': 0.07}


  5%|▍         | 1000/20232 [09:41<3:04:44,  1.74it/s]

{'loss': 0.338, 'learning_rate': 4.752866745749308e-05, 'epoch': 0.15}


  7%|▋         | 1500/20232 [14:15<2:48:52,  1.85it/s]

{'loss': 0.3082, 'learning_rate': 4.629300118623962e-05, 'epoch': 0.22}


 10%|▉         | 2000/20232 [18:59<2:58:21,  1.70it/s]

{'loss': 0.2996, 'learning_rate': 4.505733491498616e-05, 'epoch': 0.3}


 12%|█▏        | 2500/20232 [23:48<2:52:21,  1.71it/s]

{'loss': 0.286, 'learning_rate': 4.382166864373271e-05, 'epoch': 0.37}


 15%|█▍        | 3000/20232 [28:39<2:46:36,  1.72it/s]

{'loss': 0.2811, 'learning_rate': 4.258600237247924e-05, 'epoch': 0.44}


 17%|█▋        | 3500/20232 [33:30<2:44:10,  1.70it/s]

{'loss': 0.2681, 'learning_rate': 4.1350336101225785e-05, 'epoch': 0.52}


 20%|█▉        | 4000/20232 [38:19<2:36:41,  1.73it/s]

{'loss': 0.2637, 'learning_rate': 4.011466982997232e-05, 'epoch': 0.59}


 22%|██▏       | 4500/20232 [43:10<2:33:42,  1.71it/s]

{'loss': 0.2623, 'learning_rate': 3.887900355871886e-05, 'epoch': 0.67}


 25%|██▍       | 5000/20232 [48:03<2:28:50,  1.71it/s]

{'loss': 0.2548, 'learning_rate': 3.7643337287465405e-05, 'epoch': 0.74}


 27%|██▋       | 5500/20232 [52:52<2:20:45,  1.74it/s]

{'loss': 0.2548, 'learning_rate': 3.640767101621194e-05, 'epoch': 0.82}


 30%|██▉       | 6000/20232 [57:43<2:16:45,  1.73it/s]

{'loss': 0.248, 'learning_rate': 3.517200474495848e-05, 'epoch': 0.89}


 32%|███▏      | 6500/20232 [1:02:33<2:14:15,  1.70it/s]

{'loss': 0.2435, 'learning_rate': 3.3936338473705025e-05, 'epoch': 0.96}


 35%|███▍      | 7000/20232 [1:07:23<2:08:01,  1.72it/s]

{'loss': 0.2384, 'learning_rate': 3.270067220245157e-05, 'epoch': 1.04}


 37%|███▋      | 7500/20232 [1:12:15<2:02:17,  1.74it/s]

{'loss': 0.2371, 'learning_rate': 3.14650059311981e-05, 'epoch': 1.11}


 40%|███▉      | 8000/20232 [1:17:06<1:59:25,  1.71it/s]

{'loss': 0.2377, 'learning_rate': 3.0229339659944645e-05, 'epoch': 1.19}


 42%|████▏     | 8500/20232 [1:21:57<1:54:53,  1.70it/s]

{'loss': 0.2364, 'learning_rate': 2.8993673388691184e-05, 'epoch': 1.26}


 44%|████▍     | 9000/20232 [1:26:47<1:49:07,  1.72it/s]

{'loss': 0.241, 'learning_rate': 2.7758007117437723e-05, 'epoch': 1.33}


 47%|████▋     | 9500/20232 [1:31:38<1:44:34,  1.71it/s]

{'loss': 0.2372, 'learning_rate': 2.6522340846184262e-05, 'epoch': 1.41}


 49%|████▉     | 10000/20232 [1:36:29<1:39:57,  1.71it/s]

{'loss': 0.2353, 'learning_rate': 2.52866745749308e-05, 'epoch': 1.48}


 52%|█████▏    | 10500/20232 [1:41:30<1:32:56,  1.75it/s] 

{'loss': 0.2308, 'learning_rate': 2.4051008303677343e-05, 'epoch': 1.56}


 54%|█████▍    | 11000/20232 [1:46:22<1:30:04,  1.71it/s]

{'loss': 0.2275, 'learning_rate': 2.2815342032423882e-05, 'epoch': 1.63}


 57%|█████▋    | 11500/20232 [1:51:12<1:23:14,  1.75it/s]

{'loss': 0.2296, 'learning_rate': 2.1579675761170425e-05, 'epoch': 1.71}


 59%|█████▉    | 12000/20232 [1:56:03<1:19:41,  1.72it/s]

{'loss': 0.2284, 'learning_rate': 2.0344009489916967e-05, 'epoch': 1.78}


 62%|██████▏   | 12500/20232 [2:00:55<1:16:37,  1.68it/s]

{'loss': 0.2295, 'learning_rate': 1.9108343218663506e-05, 'epoch': 1.85}


 64%|██████▍   | 13000/20232 [2:05:47<1:10:39,  1.71it/s]

{'loss': 0.2275, 'learning_rate': 1.7872676947410045e-05, 'epoch': 1.93}


 67%|██████▋   | 13500/20232 [2:10:37<1:04:13,  1.75it/s]

{'loss': 0.2289, 'learning_rate': 1.6637010676156584e-05, 'epoch': 2.0}


 69%|██████▉   | 14000/20232 [2:15:27<1:01:03,  1.70it/s]

{'loss': 0.2253, 'learning_rate': 1.5401344404903126e-05, 'epoch': 2.08}


 72%|███████▏  | 14500/20232 [2:20:17<54:39,  1.75it/s]  

{'loss': 0.2185, 'learning_rate': 1.4165678133649665e-05, 'epoch': 2.15}


 74%|███████▍  | 15000/20232 [2:25:06<49:55,  1.75it/s]

{'loss': 0.22, 'learning_rate': 1.2930011862396206e-05, 'epoch': 2.22}


 77%|███████▋  | 15500/20232 [2:29:55<45:34,  1.73it/s]

{'loss': 0.2191, 'learning_rate': 1.1694345591142744e-05, 'epoch': 2.3}


 79%|███████▉  | 16000/20232 [2:34:44<40:21,  1.75it/s]

{'loss': 0.2229, 'learning_rate': 1.0458679319889285e-05, 'epoch': 2.37}


 82%|████████▏ | 16500/20232 [2:39:33<35:45,  1.74it/s]

{'loss': 0.2284, 'learning_rate': 9.223013048635824e-06, 'epoch': 2.45}


 84%|████████▍ | 17000/20232 [2:44:22<31:18,  1.72it/s]

{'loss': 0.2158, 'learning_rate': 7.987346777382365e-06, 'epoch': 2.52}


 86%|████████▋ | 17500/20232 [2:49:11<26:06,  1.74it/s]

{'loss': 0.2247, 'learning_rate': 6.751680506128904e-06, 'epoch': 2.59}


 89%|████████▉ | 18000/20232 [2:54:01<21:22,  1.74it/s]

{'loss': 0.2142, 'learning_rate': 5.516014234875446e-06, 'epoch': 2.67}


 91%|█████████▏| 18500/20232 [2:58:51<16:33,  1.74it/s]

{'loss': 0.2227, 'learning_rate': 4.2803479636219856e-06, 'epoch': 2.74}


 94%|█████████▍| 19000/20232 [3:03:39<11:45,  1.75it/s]

{'loss': 0.2195, 'learning_rate': 3.0446816923685253e-06, 'epoch': 2.82}


 96%|█████████▋| 19500/20232 [3:08:29<07:01,  1.74it/s]

{'loss': 0.2226, 'learning_rate': 1.8090154211150655e-06, 'epoch': 2.89}


 99%|█████████▉| 20000/20232 [3:13:21<02:16,  1.70it/s]

{'loss': 0.2173, 'learning_rate': 5.733491498616055e-07, 'epoch': 2.97}


100%|██████████| 20232/20232 [3:15:48<00:00,  1.72it/s]


{'train_runtime': 11748.9668, 'train_samples_per_second': 13.776, 'train_steps_per_second': 1.722, 'train_loss': 0.24715846364514166, 'epoch': 3.0}


In [18]:
from transformers import pipeline
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load your fine-tuned model
model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned")
model.to('cuda')
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

input_text = "ताराशंकर बंधोपाध्यायक जन्म  जुलाई  भारतक एक भारतीय अभिनेत्री छ ताराशंकर बंधोपाध्यायक जन्म  जुलाई  भारतक एक भारतीय अभिनेत्री छ"  # prompt in Latin script
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# GPU if available
if torch.cuda.is_available():
    input_ids = input_ids.cuda()

# Generate text
output = model.generate(input_ids, max_length=200)  # max_length adjustment
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated Text: ", generated_text)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text:  ताराशंकर बंधोपाध्यायक जन्म  जुलाई  भारतक एक भारतीय अभिनेत्री छ ताराशंकर बंधोपाध्यायक जन्म  जुलाई  भारतक एक भारतीय अभिनेत्री छी
ताराशंकर बंधोपाध्यायक जन्म  जुलाई  भारतक एक भारतीय अभिनेत्री छ ताराशंकर बंधोपाध्यायक जन्म  जुलाई  भारतक एक भारतीय अभिनेत्री छी


In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("abhinavmaithil/maithiliwikidump202009uncleaned")

print("Path to dataset files:", path)

In [None]:
pip install kagglehub