In [1]:
import pandas as pd
import numpy as np 
import json
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import sentencepiece as spm
import os 
from datasets import Dataset
import math


from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import Trainer, TrainingArguments


  from .autonotebook import tqdm as notebook_tqdm





In [2]:
from torch.nn.utils.rnn import pad_sequence

class PadCollator:
    def __init__(self, pad_id=0, max_length=None):
        self.pad_id = pad_id
        self.max_length = max_length  # fixed max length for all batches

    def __call__(self, features):
        input_ids = [f["input_ids"].clone().detach().long() for f in features]
        labels = [f["labels"].clone().detach().long() for f in features]

        # Pad each sequence manually to fixed length
        if self.max_length is not None:
            input_ids = [self._pad_to_length(x, self.max_length, self.pad_id) for x in input_ids]
            labels = [self._pad_to_length(x, self.max_length, -100) for x in labels]
            input_ids = torch.stack(input_ids)
            labels = torch.stack(labels)
        else:
            # Dynamic padding (default)
            input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.pad_id)
            labels = pad_sequence(labels, batch_first=True, padding_value=-100)

        return {"input_ids": input_ids, "labels": labels}

    def _pad_to_length(self, tensor, length, pad_value):
        """Pad or truncate a tensor to a fixed length."""
        if tensor.size(0) < length:
            pad_size = length - tensor.size(0)
            return torch.cat([tensor, torch.full((pad_size,), pad_value, dtype=tensor.dtype)])
        else:
            return tensor[:length]

In [15]:
class TextDataset(Dataset):
    def __init__(self, df):
        self.inputs = df['input_ids'].tolist()
        self.targets = df['target_ids'].tolist()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.inputs[idx], dtype=torch.long),
            'labels': torch.tensor(self.targets[idx], dtype=torch.long)
        }

## Loading validation dataset

In [4]:
df_eval = pd.read_json(r'dataset/validation_mr.jsonl', lines=True)
df_eval = pd.json_normalize(df_eval['row'])

In [5]:
df_eval.head()

Unnamed: 0,id,input,target,url
0,1,‡§î‡§∞‡§Ç‡§ó‡§æ‡§¨‡§æ‡§¶ : ‡§™‡•ç‡§∞‡§§‡§ø‡§®‡§ø‡§ß‡•Ä '‡§§‡•ç‡§Ø‡§æ‡§Ç‡§®‡•Ä ‡§Æ‡§≤‡§æ ‡§ñ‡•Ç‡§™ ‡§õ‡§≥‡§≤‡§Ç‡§Ø.,"‡§Æ‡§≤‡§æ ‡§ñ‡•Ç‡§™ ‡§õ‡§≥‡§≤‡§Ç‡§Ø, ‡§§‡•ç‡§Ø‡§æ‡§Ç‡§®‡§æ‡§π‡•Ä ‡§õ‡§≥‡§æ",https://www.pudhari.news/news/Aurangabad/Polic...
1,2,‡§µ‡§ø‡§π‡§ø‡§∞‡•Ä‡§Æ‡§ß‡•ç‡§Ø‡•á ‡§Ö‡§ú‡•ç‡§û‡§æ‡§§ ‡§á‡§∏‡§Æ‡§æ‡§ö‡§æ ‡§Æ‡•É‡§§‡§¶‡•á‡§π ‡§Ü‡§¢‡§≥‡§≤‡•ç‡§Ø‡§æ‡§®‡•á ‡§ñ‡§≥‡§¨...,‡§Æ‡•ã‡§¶‡•Ä‡§Ç‡§ö‡•Ä ‡§Æ‡•Å‡§ñ‡•ç‡§Ø‡§Æ‡§Ç‡§§‡•ç‡§∞‡•Ä‡§™‡§¶‡§æ‡§ö‡•Ä ‡§ï‡§æ‡§∞‡§ï‡§ø‡§∞‡•ç‡§¶ ‡§¶‡•á‡§∂‡§æ‡§µ‡§∞‡§ö‡§æ ‡§°‡§æ‡§ó...,https://www.dainikprabhat.com/modis-chief-mini...
2,3,‡§Æ‡•Å‡§Ç‡§¨‡§à‡§É ‡§ï‡§æ‡§≤ ‡§∞‡§æ‡§§‡•ç‡§∞‡•Ä‡§™‡§æ‡§∏‡•Ç‡§® ‡§Æ‡•Å‡§Ç‡§¨‡§à‡§§ ‡§∏‡•Å‡§∞‡•Å ‡§Ö‡§∏‡§≤‡•á‡§≤‡•ç‡§Ø‡§æ ‡§ú‡•ã...,‡§Æ‡§π‡§æ‡§™‡•å‡§∞ ‡§µ‡§ø‡§∂‡•ç‡§µ‡§®‡§æ‡§• ‡§Æ‡§π‡§æ‡§°‡•á‡§∂‡•ç‡§µ‡§∞‡§æ‡§Ç‡§Ç‡§®‡§Ç‡§§‡§∞ ‡§â‡§¶‡•ç‡§ß‡§µ ‡§†‡§æ‡§ï‡§∞‡•á‡§Ç‡§ö...,https://maharashtradesha.com/uddhav-thackerays...
3,4,‡§ü‡•Ä‡§Æ ‡§Æ‡§π‡§æ‡§∞‡§æ‡§∑‡•ç‡§ü‡•ç‡§∞ ‡§¶‡•á‡§∂‡§æ : ‡§∞‡§æ‡§ú‡•ç‡§Ø‡§æ‡§§ ‡§µ‡§ø‡§ß‡§æ‡§®‡§∏‡§≠‡§æ ‡§®‡§ø‡§µ‡§°‡§£‡•Å‡§ï...,‡§∂‡§∞‡§¶ ‡§™‡§µ‡§æ‡§∞‡§æ‡§Ç‡§ö‡•ç‡§Ø‡§æ '‡§Ø‡§æ' ‡§≠‡•Ç‡§Æ‡§ø‡§ï‡•á‡§ö‡§æ ‡§∂‡§ø‡§µ‡§∏‡•á‡§®‡•á‡§≤‡§æ ‡§¨‡§∏‡§≤‡§æ ‡§¶‡§£‡§ï‡§æ,https://maharashtradesha.com/sharad-pawar-said...
4,5,‡§™‡•Å‡§£‡•á : ‡§™‡•ç‡§∞‡§§‡§ø‡§®‡§ø‡§ß‡•Ä ‡§∂‡§æ‡§≤‡•á‡§Ø ‡§µ‡§ø‡§¶‡•ç‡§Ø‡§æ‡§∞‡•ç‡§•‡•ç‡§Ø‡§æ‡§Ç‡§ö‡•Ä ‡§∏‡•Å‡§∞‡§ï‡•ç‡§∑‡§ø...,"‡§∂‡§æ‡§≥‡§æ, ‡§™‡§æ‡§≤‡§ï‡§æ‡§Ç‡§Æ‡•Å‡§≥‡•á ‡§ñ‡§æ‡§∏‡§ó‡•Ä ‡§µ‡§æ‡§π‡§§‡•Ç‡§ï ‡§´‡•ã‡§´‡§æ‡§µ‡§£‡§æ‡§∞",https://www.pudhari.news/news/Pune/Private-tra...


## Importing tokenizer and tokenizing dataset

In [16]:
sp = spm.SentencePieceProcessor()
sp.load('my_tokenizer.model')

def encode(text):
    return sp.encode(text, out_type=int)

df_eval['input_ids'] = df_eval['input'].apply(encode)
df_eval['target_ids'] = df_eval['target'].apply(encode)

## Loading GPT2 Model and testing the model 

In [19]:
model = GPT2LMHeadModel.from_pretrained("./trained_model2")

training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=4,
)
#eval_dataset = SentencePieceEvalDataset(df_eval, sp, max_length=64)
data_collator = PadCollator(pad_id=sp.pad_id(), max_length=64)
dataset_eval = TextDataset(df_eval)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,  # your PadCollator
    eval_dataset=dataset_eval,
)
results = trainer.evaluate()
print(results)

{'eval_loss': 8.093897819519043, 'eval_model_preparation_time': 0.001, 'eval_runtime': 16.5617, 'eval_samples_per_second': 156.988, 'eval_steps_per_second': 39.247}


## Calculating perplexity 

In [20]:
perplexity = math.exp(results["eval_loss"])
print(f"Perplexity: {perplexity:.2f}")

Perplexity: 3274.43


## Cross entropy

In [21]:
cross_entropy = results["eval_loss"]
print(f"Cross Entropy: {cross_entropy:.4f}")

Cross Entropy: 8.0939


## Bits per token 

In [22]:
bpt = cross_entropy / math.log(2)
print(f"Bits per Token: {bpt:.4f}")

Bits per Token: 11.6770


## Token-level Top-1 Accuracy

In [None]:
import torch
from torch.utils.data import DataLoader

def token_level_accuracy(model, dataloader, device="cuda"):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids)
            preds = torch.argmax(outputs.logits, dim=-1)
            mask = labels != sp.pad_id()
            correct += ((preds == labels) & mask).sum().item()
            total += mask.sum().item()
    return correct / total

dataloader = DataLoader(dataset_eval, batch_size=4, collate_fn=data_collator)
acc = token_level_accuracy(model, dataloader)
print(f"Top-1 Token Accuracy: {acc:.4%}")

In [30]:
import torch
import math
import time
import os, glob, psutil
import matplotlib.pyplot as plt
from transformers import GPT2LMHeadModel, TrainingArguments, Trainer
from datasets import Dataset

# -------------------------------
# CONFIGURATION
# -------------------------------
MODEL_PATH = "./trained_model2"
MAX_LENGTH = 64
BATCH_SIZE = 4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on {DEVICE}")

# -------------------------------
# LOAD MODEL
# -------------------------------
model = GPT2LMHeadModel.from_pretrained(MODEL_PATH).to(DEVICE)
model.eval()

# Suppose df_eval is your evaluation dataframe with a 'text' column
# and `sp` is your SentencePiece tokenizer
dataset_eval = Dataset.from_pandas(df_eval)

def tokenize_function(example):
    tokens = sp.encode(example["text"], out_type=int)
    return {"input_ids": tokens, "labels": tokens}

dataset_eval = dataset_eval.map(tokenize_function)

# -------------------------------
# 1Ô∏è‚É£ CROSS ENTROPY
# -------------------------------
training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=BATCH_SIZE,
    report_to="none"
)

def compute_metrics(eval_pred):
    loss = eval_pred.loss
    return {"cross_entropy": loss}

trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=dataset_eval,
    compute_metrics=compute_metrics,
)

results = trainer.evaluate()
cross_entropy = results["eval_loss"]
print(f"Cross Entropy: {cross_entropy:.4f}")

# -------------------------------
# 2Ô∏è‚É£ BITS PER TOKEN
# -------------------------------
bpt = cross_entropy / math.log(2)
print(f"Bits per Token: {bpt:.4f}")

# -------------------------------
# 3Ô∏è‚É£ TOKEN-LEVEL TOP-1 ACCURACY
# -------------------------------
from torch.utils.data import DataLoader

def collate_fn(batch):
    # simple padding for demonstration
    max_len = min(MAX_LENGTH, max(len(x["input_ids"]) for x in batch))
    input_ids = [x["input_ids"][:max_len] for x in batch]
    labels = [x["labels"][:max_len] for x in batch]
    input_ids = torch.nn.utils.rnn.pad_sequence([torch.tensor(x) for x in input_ids],
                                                batch_first=True, padding_value=sp.pad_id())
    labels = torch.nn.utils.rnn.pad_sequence([torch.tensor(x) for x in labels],
                                             batch_first=True, padding_value=sp.pad_id())
    return {"input_ids": input_ids, "labels": labels}

dataloader = DataLoader(dataset_eval, batch_size=BATCH_SIZE, collate_fn=collate_fn)

def token_level_accuracy(model, dataloader, device):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids)
            preds = torch.argmax(outputs.logits, dim=-1)
            mask = labels != sp.pad_id()
            correct += ((preds == labels) & mask).sum().item()
            total += mask.sum().item()
    return correct / total if total > 0 else 0

acc = token_level_accuracy(model, dataloader, DEVICE)
print(f"Top-1 Token Accuracy: {acc:.4%}")

# -------------------------------
# 4Ô∏è‚É£ PARAMETER COUNTS
# -------------------------------
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total Parameters: {total_params:,}")
print(f"Trainable Parameters: {trainable_params:,}")

# -------------------------------
# 5Ô∏è‚É£ INFERENCE LATENCY
# -------------------------------
sample_input = torch.tensor([sp.encode("Sample input text", out_type=int)]).to(DEVICE)
with torch.no_grad():
    for _ in range(3): model(sample_input)  # warmup

n_runs = 10
start = time.time()
for _ in range(n_runs):
    with torch.no_grad():
        model(sample_input)
end = time.time()
latency_ms = (end - start) / n_runs * 1000
print(f"Inference Latency: {latency_ms:.2f} ms per input")

# -------------------------------
# 6Ô∏è‚É£ THROUGHPUT
# -------------------------------
tokens = sample_input.numel()
throughput = tokens / (latency_ms / 1000)
print(f"Throughput: {throughput:.2f} tokens/s")

# -------------------------------
# 7Ô∏è‚É£ FLOPs ESTIMATE
# -------------------------------
flops_per_token = 6 * total_params
print(f"Approx FLOPs per token: {flops_per_token / 1e9:.2f} GFLOPs")

# -------------------------------
# 8Ô∏è‚É£ QUANTIZED MODEL SIZE (approx)
# -------------------------------
size_bytes = sum(os.path.getsize(f) for f in glob.glob(f"{MODEL_PATH}/**/*", recursive=True) if os.path.isfile(f))
print(f"Model disk size: {size_bytes / (1024 ** 2):.2f} MB")

# -------------------------------
# 9Ô∏è‚É£ PEAK CPU MEMORY USAGE
# -------------------------------
process = psutil.Process()
cpu_before = process.memory_info().rss
with torch.no_grad():
    model(sample_input)
cpu_after = process.memory_info().rss
print(f"Peak CPU memory usage (approx): {(cpu_after - cpu_before)/1e6:.2f} MB")

# -------------------------------
# üîü SAMPLING EFFICIENCY CURVE
# -------------------------------
lengths = [8, 16, 32, 64, 128]
efficiency = []

for L in lengths:
    inp = torch.tensor([sp.encode("test " * L, out_type=int)]).to(DEVICE)
    start = time.time()
    with torch.no_grad():
        model(inp)
    end = time.time()
    tok_per_sec = L / (end - start)
    efficiency.append(tok_per_sec)

plt.figure(figsize=(6,4))
plt.plot(lengths, efficiency, marker='o')
plt.xlabel("Sequence Length")
plt.ylabel("Tokens per Second")
plt.title("Sampling Efficiency Curve")
plt.grid(True)
plt.tight_layout()
plt.show()


Running on cpu


Map:   0%|          | 0/2600 [00:00<?, ? examples/s]


KeyError: 'text'