#### Inference on prompts dataset using all decoding strategies

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
model_name = 'gpt2-large'

model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = 'left' # required for parallelism (default is set to right padding for some reason)

model.to(device)
model.eval()

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

2025-08-12 09:57:59.309636: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754992679.475794      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754992679.538777      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-35): 36 x GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=3840, nx=1280)
          (c_proj): Conv1D(nf=1280, nx=1280)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=5120, nx=1280)
          (c_proj): Conv1D(nf=1280, nx=5120)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1280, out_features=50257, bias=False)
)

In [4]:
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# end of sequence token is the end of document token in GPT-2 
eod_token_id = tokenizer.eos_token_id

# Custom stopping criteria if end of doc reached 
class StopOnEOD(StoppingCriteria):
    def __init__(self, eod_token_id):
        self.eod_token_id = eod_token_id
    
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        return input_ids[0, -1].item() == self.eod_token_id

stop_criteria = StoppingCriteriaList([StopOnEOD(eod_token_id)])

In [None]:
args_dict = {
    'greedy': {'max_new_tokens': 200, 'do_sample': False, 'num_beams': 1, 'top_k': None, 'top_p': None, 'temperature': 1.0, 'stopping_criteria': stop_criteria},
    'beam-16': {'max_new_tokens': 200, 'do_sample': False, 'num_beams': 16, 'top_k': None, 'top_p': None, 'temperature': 1.0, 'stopping_criteria': stop_criteria},
    'top-k': {'max_new_tokens': 200, 'do_sample': True, 'num_beams': 1, 'top_k': 40, 'top_p': None, 'temperature': 1.0, 'stopping_criteria': stop_criteria},
    'top-p': {'max_new_tokens': 200, 'do_sample': True, 'num_beams': 1, 'top_k': None, 'top_p': 0.95, 'temperature': 1.0, 'stopping_criteria': stop_criteria},
    'pure-sampling': {'max_new_tokens': 200, 'do_sample': True, 'num_beams': 1,'top_k': None, 'top_p': None, 'temperature': 1.0, 'stopping_criteria': stop_criteria}
}

In [None]:
with open('data\prompts_1000.txt') as f:
    prompts = f.read().splitlines()

print(len(prompts))

1000


In [None]:
import json
from itertools import islice
from tqdm.auto import tqdm

filename = "generations\greedy_tokens.jsonl"

In [None]:
# advanced parallelism black magic
def batched(iterable, n):
    it = iter(iterable)
    while True:
        batch = list(islice(it, n))
        if not batch:
            break
        yield batch

chunk_size = 16   # tune for GPU memory
write_every = 64  # number of records to buffer before writing to jsonl
records_buffer = [] 

i = 0
with open(filename, "a", encoding="utf-8") as f:
    with torch.inference_mode(): # some new method better than no_grad()
        for batch in tqdm(batched(prompts, chunk_size)):
            # tokenize whole batch and move to device once
            inputs = tokenizer(batch, padding=True, return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}

            outputs = model.generate(**inputs, **args_dict['greedy'])
            gens = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            for gen in gens:
                i += 1
                records_buffer.append({"iteration": i, "generation": gen}) # append to the jsonl

            # flush buffer periodically to minimize blocking I/O
            if len(records_buffer) >= write_every:
                f.write("\n".join(json.dumps(r, ensure_ascii=False) for r in records_buffer) + "\n")
                f.flush()
                records_buffer = []

    # write reamining records in the end
    if records_buffer:
        f.write("\n".join(json.dumps(r, ensure_ascii=False) for r in records_buffer) + "\n")
        f.flush()

print("Done. Total iterations:", i)


0it [00:00, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Done — total iterations: 1000


### Metrics

#### Zipf

In [None]:
import json
import operator
from collections import Counter
import numpy as np
from scipy import stats

file = "generations\greedy_tokens.jsonl" 
counter = Counter()

with open(file, 'r') as f:
    for line in f:
        diction = json.loads(line.strip())  
        counter.update(diction['tokens'])

N = 5000
xs = np.arange(1, min(len(counter), N) + 1)
ys = np.array(sorted(counter.values(), key=operator.neg)[:N])

# Zipf
# freq(r) inversely prop to 1/r^a
a, b, r, p, std = stats.linregress(np.log(xs), np.log(ys)) # fit a line to log(rank) and log(n)
print(f"Zipf Coefficient: {-a}")

Zipf exponent: 1.0094403775275071


#### SelfBLEU4

In [None]:
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
import random
from tqdm import tqdm

def calculate_self_bleu4(file_path, n_sample):
    random.seed(0)
    
    # Load sentences
    all_sentences = []
    with open(file_path, "r") as f:
        for line in f:
            diction = json.loads(line.strip())
            all_sentences.append(diction['tokens'])
   
    # Sample
    if len(all_sentences) > n_sample:
        sample_indices = random.sample(range(len(all_sentences)), n_sample)
        sampled_sentences = [all_sentences[i] for i in sample_indices]
    else:
        sampled_sentences = all_sentences
        n_sample = len(all_sentences)
   
    smoothing_function = SmoothingFunction().method1
    weights = (0.25, 0.25, 0.25, 0.25)  
    bleu_scores = []
   
    # Calculate BLEU4 for each against others
    for i in tqdm(range(len(sampled_sentences)), desc="BLEU-4"):
        # Create refs (all sentences except current one)
        refs = sampled_sentences[:i] + sampled_sentences[i+1:]
        hyps = sampled_sentences[i]
       
        # Calculate BLEU-4 score
        score = sentence_bleu(
            references=refs,
            hypothesis=hyps,
            weights=weights,
            smoothing_function=smoothing_function
        )
        bleu_scores.append(score)
   
    avg_score = sum(bleu_scores) / len(bleu_scores)
    print(f"Self-BLEU-4: {avg_score:.4f}")
    return avg_score

# Usage
result = calculate_self_bleu4("generations\greedy_tokens.jsonl", n_sample=200)

BLEU-4: 100%|██████████| 200/200 [00:28<00:00,  7.07it/s]

Self-BLEU-4: 0.1827





#### Repetition%

In [None]:
from transformers import GPT2Tokenizer

def count_repetitions(file):
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large", do_lower_case=True)
    SEP = tokenizer.encode(tokenizer.bos_token)[0] # later we remove bos token when it is present
    
    with open(file, 'r') as f:
        dictions = [json.loads(line.strip()) for line in f]
    
    count = 0
    for obj in dictions:
        gen = obj['tokens']
        if gen[-1] == SEP:
            gen.pop(-1)
        
        # we have to check for repeated pattern at the end
        rev_gen = list(reversed(gen))
        last_n_repeats = [0] * 90
        
        for n in range(1, 91):
            n_repeat = 1
            while (len(rev_gen[n*n_repeat:n*(n_repeat+1)]) == n and 
                   rev_gen[n*n_repeat:n*(n_repeat+1)] == rev_gen[:n]):
                n_repeat += 1
            last_n_repeats[n - 1] = n_repeat
        
        max_repeated_n = max(range(90), key=lambda x: last_n_repeats[x])
        
        if (last_n_repeats[max_repeated_n] > 1 and 
            (max_repeated_n + 1 >= 3 or last_n_repeats[max_repeated_n] > 50)):
            count += 1
    
    print(f'{count/10.0}%') # for percentage


count_repetitions("generations\greedy_tokens.jsonl")

73.6%
