In [1]:
!pip install matplotlib

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
import pandas as pd
import numpy as np 
import json
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import sentencepiece as spm
import os 
from datasets import Dataset
import math
import time

from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import Trainer, TrainingArguments
import torch




In [3]:
from torch.nn.utils.rnn import pad_sequence

class PadCollator:
    def __init__(self, pad_id=0, max_length=None):
        self.pad_id = pad_id
        self.max_length = max_length

    def __call__(self, features):
        # FIX: safely copy tensors without warnings
        input_ids = [f["input_ids"].clone().detach() for f in features]
        labels = [f["labels"].clone().detach() for f in features]

        # Fixed-length padding or dynamic padding
        if self.max_length is not None:
            input_ids = [self._pad_to_length(x, self.max_length, self.pad_id) for x in input_ids]
            labels = [self._pad_to_length(x, self.max_length, -100) for x in labels]

            input_ids = torch.stack(input_ids)
            labels = torch.stack(labels)

        else:
            input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.pad_id)
            labels = pad_sequence(labels, batch_first=True, padding_value=-100)

        return {"input_ids": input_ids, "labels": labels}

    def _pad_to_length(self, tensor, length, pad_value):
        if tensor.size(0) < length:
            pad_size = length - tensor.size(0)
            return torch.cat([tensor, torch.full((pad_size,), pad_value, dtype=tensor.dtype)])
        else:
            return tensor[:length]


In [4]:
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, df):
        self.inputs = df['input_ids'].tolist()
        self.targets = df['target_ids'].tolist()
        self._column_names = ["input_ids", "labels"]   # <-- added

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.inputs[idx], dtype=torch.long),
            'labels': torch.tensor(self.targets[idx], dtype=torch.long)
        }

    @property
    def column_names(self):   # <-- added
        return self._column_names

## Loading validation dataset

In [5]:
df_eval = pd.read_json(r'dataset/validation_mr.jsonl', lines=True)
df_eval = pd.json_normalize(df_eval['row'])

In [6]:
df_eval.head()

Unnamed: 0,id,input,target,url
0,1,औरंगाबाद : प्रतिनिधी 'त्यांनी मला खूप छळलंय.,"मला खूप छळलंय, त्यांनाही छळा",https://www.pudhari.news/news/Aurangabad/Polic...
1,2,विहिरीमध्ये अज्ञात इसमाचा मृतदेह आढळल्याने खळब...,मोदींची मुख्यमंत्रीपदाची कारकिर्द देशावरचा डाग...,https://www.dainikprabhat.com/modis-chief-mini...
2,3,मुंबईः काल रात्रीपासून मुंबईत सुरु असलेल्या जो...,महापौर विश्वनाथ महाडेश्वरांंनंतर उद्धव ठाकरेंच...,https://maharashtradesha.com/uddhav-thackerays...
3,4,टीम महाराष्ट्र देशा : राज्यात विधानसभा निवडणुक...,शरद पवारांच्या 'या' भूमिकेचा शिवसेनेला बसला दणका,https://maharashtradesha.com/sharad-pawar-said...
4,5,पुणे : प्रतिनिधी शालेय विद्यार्थ्यांची सुरक्षि...,"शाळा, पालकांमुळे खासगी वाहतूक फोफावणार",https://www.pudhari.news/news/Pune/Private-tra...


## Importing tokenizer and tokenizing dataset

In [7]:
sp = spm.SentencePieceProcessor()
sp.load('my_tokenizer.model')

def encode(text):
    return sp.encode(text, out_type=int)

df_eval['input_ids'] = df_eval['input'].apply(encode)
df_eval['target_ids'] = df_eval['target'].apply(encode)

OSError: Not found: "my_tokenizer.model": No such file or directory Error #2

In [None]:
import torch
import numpy as np
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score

pad_id = sp.pad_id()

def compute_metrics(eval_pred):
    logits, labels = eval_pred

    # logits: (batch, seq, vocab)
    preds = np.argmax(logits, axis=-1)

    # flatten
    preds = preds.reshape(-1)
    labels = labels.reshape(-1)

    # remove padding tokens
    mask = labels != pad_id
    preds = preds[mask]
    labels = labels[mask]

    # compute accuracy
    accuracy = (preds == labels).mean()

    return {"token_accuracy": accuracy}

## Loading GPT2 Model and testing the model 

In [None]:
model = GPT2LMHeadModel.from_pretrained("./trained_model2")
model.to(device)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=4,
    remove_unused_columns=False,
)
#eval_dataset = SentencePieceEvalDataset(df_eval, sp, max_length=64)
data_collator = PadCollator(pad_id=sp.pad_id(), max_length=64)
dataset_eval = TextDataset(df_eval)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,  # your PadCollator
    eval_dataset=dataset_eval,
    compute_metrics=compute_metrics
)
results = trainer.evaluate()
print(results)

## Calculating perplexity 

In [None]:
perplexity = math.exp(results["eval_loss"])
print(f"Perplexity: {perplexity:.2f}")

In [None]:
def measure_latency_cpu(model, inputs, *, n_warmup=20, n_runs=100, batch_size=None):
    """
    CPU-only latency measurement.
    Returns median, mean, p95, p99, samples/sec.
    """
    model.eval()
    if torch.is_tensor(inputs):
        sample_input = (inputs,)
    else:
        sample_input = tuple(inputs)

    # Warmup
    with torch.no_grad():
        for _ in range(n_warmup):
            _ = model(*sample_input)

    # Measure
    times = []
    with torch.no_grad():
        for _ in range(n_runs):
            t0 = time.time()
            _ = model(*sample_input)
            t1 = time.time()
            times.append((t1 - t0) * 1000)  # ms

    arr = np.array(times)
    stats = {
        "median_ms": float(np.median(arr)),
        "mean_ms": float(np.mean(arr)),
        "p95_ms": float(np.percentile(arr, 95)),
        "p99_ms": float(np.percentile(arr, 99)),
        "std_ms": float(np.std(arr)),
        "samples_per_sec": None
    }

    # Per-sample throughput
    if batch_size is None:
        b = sample_input[0].shape[0]
    else:
        b = batch_size

    stats["samples_per_sec"] = 1000.0 / stats["median_ms"] * b
    stats["median_ms_per_sample"] = stats["median_ms"] / b
    return stats



In [None]:
import torchvision.models as models

model = models.resnet18().eval()      # CPU by default
x = torch.randn(4, 3, 224, 224)       # batch size = 4

stats = measure_latency_cpu(model, x, n_runs=50)
print(stats)



In [None]:
def throughput_vs_batch_cpu(model, input_shape, batch_list=None, n_runs=50):
    if batch_list is None:
        batch_list = [1, 2, 4, 8, 16]

    results = []
    for b in batch_list:
        x = torch.randn((b,) + tuple(input_shape))
        stats = measure_latency_cpu(model, x, n_warmup=5, n_runs=n_runs, batch_size=b)
        results.append({
            "batch": b,
            "samples_per_sec": stats["samples_per_sec"],
            "median_latency_ms": stats["median_ms"]
        })
    return results


In [None]:
from thop import profile
import torch

model = models.resnet18().eval()
dummy = torch.randn(1, 3, 224, 224)

macs, params = profile(model, inputs=(dummy,), verbose=False)
flops = macs * 2
print("MACs:", macs)
print("FLOPs (approx):", flops)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import auc
from scipy.optimize import curve_fit

def plot_sample_efficiency(sample_sizes, metrics, *,
                           xlabel='# samples', ylabel='val accuracy', logx=True):
    x = np.array(sample_sizes)
    y = np.array(metrics)

    plt.figure(figsize=(6,4))
    if logx:
        plt.xscale('log')
    plt.plot(x, y, marker='o', linewidth=2)
    plt.xlabel(xlabel); plt.ylabel(ylabel)
    plt.grid(True, alpha=0.4)
    plt.title("Sample Efficiency Curve")

    raw_auc = auc(x, y)
    norm_auc = raw_auc / (x.max() - x.min())
    print("AUC:", raw_auc, "Normalized AUC:", norm_auc)
    plt.show()

def power_law(N, a, b, alpha):
    return a - b * N**(-alpha)

def fit_power_law(sample_sizes, metrics):
    popt, _ = curve_fit(power_law, sample_sizes, metrics,
                        p0=[max(metrics), max(metrics)-min(metrics), 0.2])
    a, b, alpha = popt
    print("Power-law fit:", popt)
    return popt


In [None]:
from torchvision.models import resnet18
model = resnet18().eval()

# Latency
x = torch.randn(1, 3, 224, 224)
print(measure_latency_cpu(model, x))

# Throughput
print(throughput_vs_batch_cpu(model, (3,224,224)))

# FLOPs/MACs
from thop import profile
macs, params = profile(model, inputs=(x,), verbose=False)
print("MACs:", macs, "FLOPs:", macs*2)
