In [101]:
import pandas as pd
import numpy as np 
import json
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import sentencepiece as spm
import os 

from transformers import PreTrainedTokenizerFast
from transformers import GPT2Config, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch
import math
from torch.nn.utils.rnn import pad_sequence

# Loading dataset

In [102]:
# normalising the dataset because by deafult all the colums get loaded into a single column in the dataframe
df = pd.read_json(r'dataset/train_mr.jsonl', lines=True)
df = pd.json_normalize(df['row'])

df_eval = pd.read_json(r'dataset/validation_mr.jsonl', lines=True)
df_eval = pd.json_normalize(df_eval['row'])

# loading tokenizer and converting the dataset 

In [103]:
sp = spm.SentencePieceProcessor()
sp.load(r'model/tokenizer/spm_tokenizer.model')

def encode(text):
    return sp.encode(text, out_type=int)

df['input_ids'] = df['input'].apply(encode)
df['target_ids'] = df['target'].apply(encode)

df_eval['input_ids'] = df_eval['input'].apply(encode)
df_eval['target_ids'] = df_eval['target'].apply(encode)

# Padding 

In [104]:
from torch.nn.utils.rnn import pad_sequence

class PadCollator:
    def __init__(self, pad_id=0, max_length=None):
        self.pad_id = pad_id
        self.max_length = max_length  # fixed max length for all batches

    def __call__(self, features):
        input_ids = [f["input_ids"].clone().detach().long() for f in features]
        labels = [f["labels"].clone().detach().long() for f in features]

        # Pad each sequence manually to fixed length
        if self.max_length is not None:
            input_ids = [self._pad_to_length(x, self.max_length, self.pad_id) for x in input_ids]
            labels = [self._pad_to_length(x, self.max_length, -100) for x in labels]
            input_ids = torch.stack(input_ids)
            labels = torch.stack(labels)
        else:
            # Dynamic padding (default)
            input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.pad_id)
            labels = pad_sequence(labels, batch_first=True, padding_value=-100)
            
        attention_mask = (input_ids != self.pad_id).long()
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }
    def _pad_to_length(self, tensor, length, pad_value):
        """Pad or truncate a tensor to a fixed length."""
        if tensor.size(0) < length:
            pad_size = length - tensor.size(0)
            return torch.cat([tensor, torch.full((pad_size,), pad_value, dtype=tensor.dtype)])
        else:
            return tensor[:length]

In [105]:
class TextDataset(Dataset):
    def __init__(self, df):
        self.inputs = df['input_ids'].tolist()
        self.targets = df['target_ids'].tolist()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.inputs[idx], dtype=torch.long),
            'labels': torch.tensor(self.targets[idx], dtype=torch.long)
        }

dataset = TextDataset(df)
dataset_eval = TextDataset(df_eval)

In [106]:
type(dataset_eval)

__main__.TextDataset

In [107]:
sp.get_piece_size()

800

In [108]:
from transformers import GPT2Config, GPT2LMHeadModel

config = GPT2Config(
    vocab_size=sp.get_piece_size()+10,
    n_layer=4,
    n_head=4,
    n_embd=256,
)
model = GPT2LMHeadModel(config)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

model.config.pad_token_id = sp.pad_id()

In [109]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    learning_rate=5e-4,
    save_steps=500,
    logging_steps=100,
    report_to='none'
)
data_collator = PadCollator(pad_id=sp.pad_id(), max_length=64)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
    eval_dataset=dataset_eval
)

In [110]:
trainer.train()



Step,Training Loss
100,5.8729
200,5.773
300,5.7716
400,5.758
500,5.7494
600,5.7347
700,5.7402
800,5.758
900,5.7103
1000,5.7199




TrainOutput(global_step=8625, training_loss=5.68243210060009, metrics={'train_runtime': 592.2657, 'train_samples_per_second': 58.251, 'train_steps_per_second': 14.563, 'total_flos': 41857744896000.0, 'train_loss': 5.68243210060009, 'epoch': 5.0})

In [111]:
model.save_pretrained(r"./results/trained_model2")
#tokenizer = PreTrainedTokenizerFast(tokenizer_file="my_tokenizer.model")
#tokenizer.save_pretrained("./trained_gpt2_sp")

In [112]:
eval_results = trainer.evaluate()



In [113]:
perplexity = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity:.2f}")

Perplexity: 289.39


In [114]:
print(eval_results)

{'eval_loss': 5.667771816253662, 'eval_runtime': 8.4094, 'eval_samples_per_second': 309.178, 'eval_steps_per_second': 38.647, 'epoch': 5.0}


In [115]:
import torch

sample = dataset[2]

batch = data_collator([sample])

input_ids = batch["input_ids"]
labels = batch["labels"]
attention_mask = batch["attention_mask"]


print("Input IDs shape:", input_ids.shape)
print("Labels shape:", labels.shape)

print("\nDecoded input text:")
print(sp.decode_ids(input_ids[0].tolist()))

decoded_labels = [t for t in labels[0].tolist() if t != -100]
print(sp.decode_ids(decoded_labels))

print("\nDecoded label text:")
print(sp.decode_ids(decoded_labels))

model.eval()
with torch.no_grad():
    outputs = model(input_ids=input_ids, labels=labels)

logits = outputs.logits
print("\nModel output (logits) shape:", logits.shape)

next_token_id = torch.argmax(logits[:, -1, :], dim=-1)
next_token = sp.decode_ids(next_token_id[0].tolist())
print("\nPredicted next token:", repr(next_token))

generated_ids = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=input_ids.shape[1] + 30,
    do_sample=True,
    top_p=0.9,
    top_k=50,
    temperature=0.8,
    repetition_penalty=1.2,
    pad_token_id=sp.pad_id()
)
generated_text = sp.decode_ids(generated_ids[0].tolist())
print("\nGenerated text:\n", generated_text)

Input IDs shape: torch.Size([1, 64])
Labels shape: torch.Size([1, 64])

Decoded input text:
सांगली - सांगली शहरातील 100 फुटी रोड परिसरात नवीन वर्षाची सुरुवात तोडफोड, जाळपोळीने झाली.
सांगलीत अज्ञातांकडून वाहनांची जाळपोळ

Decoded label text:
सांगलीत अज्ञातांकडून वाहनांची जाळपोळ

Model output (logits) shape: torch.Size([1, 64, 810])

Predicted next token: 'व'

Generated text:
 सांगली - सांगली शहरातील 100 फुटी रोड परिसरात नवीन वर्षाची सुरुवात तोडफोड, जाळपोळीने झाली.ोललावीवदसटेकवायलेपूनचाट जलीचाातंजपपार ज 


In [116]:
sp.get_piece_size()

800