In [1]:
import pickle
import torch
from torch.utils.data import Dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset, random_split
import torch
from torch.utils.data import Dataset, random_split
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class MaithiliDataset(Dataset):
    def __init__(self, tokenized_data, block_size=512):
        """
        Initializes the MaithiliDataset. 
        Args:
        - tokenized_data (list of dict): List of tokenized samples, each being a dictionary with 'input_ids' and 'attention_mask'.
        - block_size (int): The size of each chunk after sliding window.
        """
        self.input_ids = []
        self.attention_masks = []

        # Flatten and chunk the data (with sliding window) for both 'input_ids' and 'attention_mask'
        for data in tokenized_data:
            input_ids = data['input_ids']
            attention_mask = data['attention_mask']
            
            for i in range(0, len(input_ids) - block_size + 1, block_size):
                chunk_input_ids = input_ids[i:i + block_size]
                chunk_attention_mask = attention_mask[i:i + block_size]
                
                # Add the chunked data
                self.input_ids.append(chunk_input_ids)
                self.attention_masks.append(chunk_attention_mask)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx]),
            'attention_mask': torch.tensor(self.attention_masks[idx])
        }

# Load your tokenized data (this should be a list of dictionaries with 'input_ids' and 'attention_mask')
with open("tokenized_data.pkl", "rb") as f:
    loaded_tokenized_data = pickle.load(f)

# Specify block size (e.g., 1024)
block_size = 128

# Create the dataset from tokenized data
dataset = MaithiliDataset(loaded_tokenized_data, block_size)

# Split dataset into training and validation
train_size = int(0.7 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Output some basic statistics
print("Original tokenized data batches:", len(loaded_tokenized_data))
print("Dataset chunks:", len(dataset))
print("Train dataset size:", len(train_dataset))
print("Validation dataset size:", len(val_dataset))

Original tokenized data batches: 97
Dataset chunks: 388
Train dataset size: 271
Validation dataset size: 117


In [3]:
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# GPU if available
if torch.cuda.is_available():
    model.cuda()



In [4]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy="epoch",
)

In [5]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [6]:
trainer.train()

  0%|          | 0/204 [00:00<?, ?it/s]

ValueError: The model did not return a loss from the inputs, only the following keys: logits,past_key_values. For reference, the inputs it received are input_ids,attention_mask.

In [None]:
model.save_pretrained('./fine_tuned_maithili_model')
tokenizer.save_pretrained('./fine_tuned_maithili_model')

In [None]:
trainer.evaluate()

In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# Define the path to your saved model
model_path = "fine_tuned_maithili_model"

# Load the model and tokenizer from the saved directory
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

  return torch.load(checkpoint_file, map_location="cpu")


OSError: Can't load tokenizer for 'gpt2-finetuned'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'gpt2-finetuned' is the correct path to a directory containing all relevant files for a GPT2Tokenizer tokenizer.

In [None]:
input_text = ""  # prompt in Latin script
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# GPU if available
if torch.cuda.is_available():
    input_ids = input_ids.cuda()

# Text Generation
output = model.generate(input_ids, max_length=100)  # max_length adjustment
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated Text: ", generated_text)