In [None]:
import pickle
import torch
from torch.utils.data import Dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset, random_split

In [None]:
class MaithiliDataset(Dataset):
    """Fixed-length token windows over a flattened token stream.

    The nested ``tokenized_data`` is concatenated into one token sequence and
    exposed as windows of ``block_size`` tokens whose start positions are
    ``stride`` tokens apart. The stream is stored once and windows are sliced
    on demand, so memory stays proportional to the number of tokens rather
    than ``number_of_windows * block_size``.

    Args:
        tokenized_data: Iterable of iterables of integer token ids.
        block_size: Number of tokens per window (must be >= 1).
        stride: Distance in tokens between consecutive window starts
            (must be >= 1). The default of 1 yields maximally overlapping
            windows; ``stride=block_size`` yields non-overlapping windows.

    Raises:
        ValueError: If ``block_size`` or ``stride`` is smaller than 1.
    """

    def __init__(self, tokenized_data, block_size=512, stride=1):
        if block_size < 1:
            raise ValueError(f"block_size must be >= 1, got {block_size}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.block_size = block_size
        self.stride = stride

        # Flatten nested list of tokens into a single 1-D tensor.
        self._tokens = torch.tensor(
            [token for sublist in tokenized_data for token in sublist],
            dtype=torch.long,
        )

        # Only complete windows are kept; a stream shorter than block_size
        # produces an empty dataset.
        last_start = len(self._tokens) - block_size
        self._num_chunks = 0 if last_start < 0 else last_start // stride + 1

    @property
    def input_ids(self):
        """All windows as a list of token-id lists (materialised on access)."""
        return [self[i].tolist() for i in range(len(self))]

    def __len__(self):
        """Return the number of complete windows."""
        return self._num_chunks

    def __getitem__(self, idx):
        """Return window ``idx`` as a 1-D ``torch.long`` tensor.

        Negative indices count from the end. Raises ``IndexError`` when
        ``idx`` is out of range.
        """
        if not -self._num_chunks <= idx < self._num_chunks:
            raise IndexError("dataset index out of range")
        if idx < 0:
            idx += self._num_chunks
        start = idx * self.stride
        # clone() so callers get an independent tensor, not a view of the stream.
        return self._tokens[start:start + self.block_size].clone()

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load tokenized_data.pkl if it was produced by a trusted source.
with open("tokenized_data.pkl", "rb") as f:
    loaded_tokenized_data = pickle.load(f)

# model's input size
block_size = 512
dataset = MaithiliDataset(loaded_tokenized_data, block_size)

# Fail early with a clear message instead of a confusing error during training.
if len(dataset) == 0:
    raise ValueError(
        "No training chunks were produced: the corpus has fewer than "
        f"{block_size} tokens."
    )

# Split dataset 80/20. A seeded generator keeps the split identical across
# kernel restarts so training and evaluation results are reproducible.
# NOTE(review): the windows advance one token at a time, so neighbouring chunks
# overlap almost completely; a random split therefore places near-duplicates
# of training chunks in the validation set and the validation loss will look
# optimistic.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

print("Original tokenized data batches:", len(loaded_tokenized_data))
print("Total flattened tokens:", sum(len(batch) for batch in loaded_tokenized_data))
print("Dataset chunks:", len(dataset))
print("Train dataset size:", len(train_dataset))
print("Validation dataset size:", len(val_dataset))

In [None]:
# Load the pretrained checkpoint; tokenizer and model use the same name.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Move the model to the GPU when CUDA is present; otherwise it stays on CPU.
use_gpu = torch.cuda.is_available()
if use_gpu:
    model.cuda()

In [None]:
# Settings for the fine-tuning run, collected in one place before being
# handed to TrainingArguments.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_config = dict(
    output_dir='./results',        # where checkpoints are written
    num_train_epochs=3,
    per_device_train_batch_size=4,
    evaluation_strategy="epoch",   # evaluate once per epoch
    save_steps=10_000,             # checkpoint interval, in steps
    save_total_limit=2,            # maximum number of checkpoints kept
)
training_args = TrainingArguments(**training_config)

In [None]:
def collate_causal_lm(batch):
    """Stack equal-length token windows into a causal-LM training batch.

    The dataset yields bare 1-D tensors of token ids. Trainer's default
    collator expects mapping-like examples, so without a custom collator the
    model would receive neither ``input_ids`` nor ``labels`` and no loss could
    be computed.

    Args:
        batch: Sequence of 1-D token-id tensors, all of the same length.

    Returns:
        Dict with ``input_ids`` of shape (len(batch), window_length) and
        ``labels`` as an independent copy of it. GPT2LMHeadModel shifts the
        labels internally, so they are passed unshifted.
    """
    input_ids = torch.stack(list(batch))
    return {"input_ids": input_ids, "labels": input_ids.clone()}


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_causal_lm,
)

In [None]:
# Run fine-tuning with the Trainer built from model, training_args and the
# train/validation datasets.
trainer.train()

In [None]:
# Write the fine-tuned weights and the tokenizer files into one directory so
# both can later be reloaded from the same path.
fine_tuned_dir = './fine_tuned_maithili_model'
model.save_pretrained(fine_tuned_dir)
tokenizer.save_pretrained(fine_tuned_dir)

In [None]:
# Evaluate on the eval_dataset given to the Trainer (val_dataset) and show the
# returned metrics as the cell output.
trainer.evaluate()

In [None]:
input_text = ""  # prompt in Latin script

# An empty prompt encodes to zero tokens, which generate() cannot continue
# from; fall back to the tokenizer's beginning-of-sequence token so generation
# starts unconditionally. The token is stripped again when decoding.
prompt = input_text if input_text else tokenizer.bos_token
encoded = tokenizer(prompt, return_tensors='pt')

# Follow the model's actual device rather than re-checking for CUDA, so the
# inputs always sit on the same device as the weights.
input_ids = encoded["input_ids"].to(model.device)
attention_mask = encoded["attention_mask"].to(model.device)

# Text Generation
model.eval()  # disable dropout for inference
output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=100,  # max_length adjustment
    # GPT-2 has no pad token; reuse EOS explicitly instead of relying on
    # generate()'s implicit fallback.
    pad_token_id=tokenizer.eos_token_id,
)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated Text: ", generated_text)