In [4]:
import os

from datasets import load_from_disk

# Directory holding the saved DatasetDict. The PADCHEST_EN_DIR environment
# variable overrides the location so the notebook can run on another machine;
# when it is unset the original path is used unchanged.
p = os.environ.get("PADCHEST_EN_DIR", "/home/alice/work/mosaic/data/padchest_EN")
ds = load_from_disk(p)
ds

DatasetDict({
    train: Dataset({
        features: ['report', 'labels', 'classes', 'findings', 'fs_examples'],
        num_rows: 1951
    })
    val: Dataset({
        features: ['report', 'labels', 'classes', 'findings', 'fs_examples'],
        num_rows: 100
    })
    test: Dataset({
        features: ['report', 'labels', 'classes', 'findings', 'fs_examples'],
        num_rows: 879
    })
})

In [4]:
# return shortest report
# Only the 'report' column is read, so the other features of each row are not
# decoded just to compare string lengths. Ties resolve to the first shortest
# report, exactly as when iterating over whole rows.
min(ds['train']['report'], key=len)

'Scoliosis.'

In [None]:
# Notebook version for computing PPL for a single sentence

from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
import torch
import pandas as pd
from tqdm import tqdm

# Example input

# Load model
def load_model(model_name, load_in_4bit=False, load_in_8bit=False,
               max_seq_length=2048, chat_template=None):
    """
    Load a model and tokenizer through Unsloth and switch the model to eval mode.

    Args:
        model_name: Checkpoint identifier passed to
            ``FastLanguageModel.from_pretrained``.
        load_in_4bit: Forwarded unchanged to ``from_pretrained``.
        load_in_8bit: Forwarded unchanged to ``from_pretrained``.
        max_seq_length: Forwarded to ``from_pretrained``. Defaults to 2048,
            the value that used to be hard-coded here.
        chat_template: Template name passed to ``get_chat_template``. When
            ``None`` (default) the template is picked from the checkpoint
            name: "llama-3.1" if the name contains "llama" (case-insensitive),
            otherwise "gemma-3".

    Returns:
        Tuple ``(model, tokenizer)`` where ``tokenizer`` is the object returned
        by ``get_chat_template``.
    """
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        load_in_4bit=load_in_4bit,
        load_in_8bit=load_in_8bit,
    )

    if chat_template is None:
        # Same heuristic as before: anything that is not a Llama checkpoint is
        # treated as Gemma 3.
        chat_template = "llama-3.1" if "llama" in model_name.lower() else "gemma-3"

    tokenizer = get_chat_template(tokenizer, chat_template=chat_template)
    model.eval()
    return model, tokenizer



# Usage
# Checkpoints used with this notebook. Only the one assigned to `model_name`
# is loaded; previously the Gemma name was assigned and immediately overwritten.
GEMMA_CHECKPOINT = "unsloth/gemma-3-4b-it-unsloth-bnb-4bit"
LLAMA_CHECKPOINT = "unsloth/Llama-3.1-8B-Instruct"

model_name = LLAMA_CHECKPOINT
model, tokenizer = load_model(model_name, load_in_4bit=True, load_in_8bit=False)




==((====))==  Unsloth 2025.5.7: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.2.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.647 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [18]:
import torch
import pandas as pd
from tqdm import tqdm

# Function to create growing substrings from the input sentence
def tokenize_to_substrings(tokenizer, sentence):
    """
    Splits sentence into a list of substrings, each adding one more token.

    The sentence is tokenized once; the i-th returned string is the decoding
    of the first i token ids. Every id produced by the tokenizer is kept, so
    any special token it adds also shows up in the decoded prefixes.

    Args:
        tokenizer: A text tokenizer, or a processor-style wrapper that exposes
            the text tokenizer as ``.tokenizer`` (e.g. Gemma3Processor).
        sentence: The text to split.

    Returns:
        list[str]: one decoded prefix per token, shortest first.

    NOTE(review): feeding these strings back through the same tokenizer may
    prepend special tokens a second time -- confirm before comparing scores.
    """
    # Resolve the text tokenizer once and use it for encoding AND decoding.
    # Processor wrappers are not guaranteed to accept raw text as their first
    # positional argument, so only the inner tokenizer is called.
    text_tokenizer = getattr(tokenizer, "tokenizer", tokenizer)

    encodings = text_tokenizer(sentence, return_tensors="pt")
    first_sequence = encodings.input_ids[0]  # single sequence
    # Plain Python ints work for tensors and lists alike.
    if hasattr(first_sequence, "tolist"):
        token_ids = first_sequence.tolist()
    else:
        token_ids = list(first_sequence)

    return [
        text_tokenizer.decode(token_ids[:end])
        for end in range(1, len(token_ids) + 1)
    ]

def _tokenize_to_substrings(tokenizer, sentence):
    """
    Splits sentence into substrings, adding one real token at a time.
    Ignores special tokens (like <bos>) at the beginning.
    """
    encodings = tokenizer(sentence, return_tensors="pt")
    input_ids = encodings.input_ids[0]  # single sequence

    # Identify real token positions (skip special tokens)
    special_ids = tokenizer.all_special_ids if hasattr(tokenizer, "all_special_ids") else []
    real_token_indices = [i for i, tid in enumerate(input_ids) if tid not in special_ids]

    substrings = []
    for i in range(1, len(real_token_indices) + 1):
        idxs = real_token_indices[:i]
        if hasattr(tokenizer, "tokenizer"):  # Gemma3Processor
            substr = tokenizer.tokenizer.decode(input_ids[idxs])
        else:
            substr = tokenizer.decode(input_ids[idxs])
        substrings.append(substr)
    return substrings


# Compute perplexity for a single sentence
def ppl_sentence(model, tokenizer, sentence, max_length=2048, stride=512):
    """
    Compute the perplexity of a single sentence with a sliding window.

    The sentence is tokenized once and scored in windows of at most
    ``max_length`` tokens that start every ``stride`` tokens. In each window
    only the tokens not covered by the previous window are used as targets;
    the rest are set to -100 so the loss ignores them.

    Args:
        model: Callable as ``model(input_ids, labels=..., attention_mask=...)``
            returning an object whose ``.loss`` is the mean negative
            log-likelihood over the non-ignored targets.
        tokenizer: A text tokenizer, or a processor-style wrapper exposing the
            text tokenizer as ``.tokenizer``.
        sentence: Text to score.
        max_length: Maximum number of tokens per window.
        stride: Step between window starts.

    Returns:
        float: ``exp`` of the token-weighted mean negative log-likelihood, or
        ``nan`` when the tokenized sentence has no token to predict (fewer
        than two tokens).
    """
    text_tokenizer = getattr(tokenizer, "tokenizer", tokenizer)
    encodings = text_tokenizer(sentence, return_tensors="pt")

    # Run on the model's own device; fall back to the previously hard-coded
    # "cuda" when the model does not expose one.
    device = getattr(model, "device", None)
    if device is None:
        device = "cuda"
    input_ids = encodings.input_ids.to(device)

    # A single un-padded sequence has no padding to hide. Rebuilding the mask
    # from pad_token_id (or a fallback of 0) would hide real tokens that share
    # that id, so use the tokenizer's mask if present, otherwise all ones.
    full_mask = getattr(encodings, "attention_mask", None)
    if full_mask is None:
        full_mask = torch.ones_like(input_ids)
    else:
        full_mask = full_mask.to(device)

    seq_len = input_ids.size(1)
    prev_end_loc = 0
    total_nll = 0.0
    total_targets = 0

    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # tokens not scored by an earlier window

        input_chunk = input_ids[:, begin_loc:end_loc]
        target_ids = input_chunk.clone()
        target_ids[:, :-trg_len] = -100

        # The model shifts labels by one internally, so position 0 of a window
        # is never a prediction target.
        n_targets = int((target_ids[:, 1:] != -100).sum())
        if n_targets > 0:
            with torch.no_grad():
                outputs = model(
                    input_chunk,
                    labels=target_ids,
                    attention_mask=full_mask[:, begin_loc:end_loc],
                )
            # Weight each window's mean loss by its target count; an
            # unweighted mean over-weights short trailing windows.
            total_nll = total_nll + outputs.loss.float() * n_targets
            total_targets += n_targets

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    if total_targets == 0:
        return float("nan")

    sentence_ppl = torch.exp(total_nll / total_targets).item()
    return sentence_ppl


In [16]:
# Shortest training report. Only the 'report' column is read, so the other
# features of each row are not decoded just to compare lengths.
report_to_analyze = min(ds['train']['report'], key=len)
substrings = tokenize_to_substrings(tokenizer, report_to_analyze)

# Print every growing prefix followed by its perplexity.
for substr in substrings:
    ppl_value = ppl_sentence(model, tokenizer, substr)
    print(f"{substr}\n{ppl_value:.2f}\n")


S
81385.27

Scol
11312.28

Scoliosis
561.07

Scoliosis.
524.70



In [14]:
# Shortest training report. Only the 'report' column is read, so the other
# features of each row are not decoded just to compare lengths.
report_to_analyze = min(ds['train']['report'], key=len)
substrings = tokenize_to_substrings(tokenizer, report_to_analyze)

# Print every growing prefix followed by its perplexity.
for substr in substrings:
    ppl_value = ppl_sentence(model, tokenizer, substr)
    print(f"{substr}\n{ppl_value:.2f}\n")


<|begin_of_text|>
802921.38

<|begin_of_text|>S
255628.59

<|begin_of_text|>Scol
47135.08

<|begin_of_text|>Scoliosis
3459.50

<|begin_of_text|>Scoliosis.
2206.88



In [19]:
# Trace perplexity over the first training report as it grows token by token.
report_to_analyze = ds['train'][0]['report']
substrings = tokenize_to_substrings(tokenizer, report_to_analyze)

# Each substring is scored independently: ppl_sentence tokenizes it again and
# runs its own forward pass. Prints the prefix, then its perplexity.
for substr in substrings:
    ppl_value = ppl_sentence(model, tokenizer, substr)
    print(f"{substr}\n{ppl_value:.2f}\n")

<|begin_of_text|>
802921.38

<|begin_of_text|>Minimal
53582.67

<|begin_of_text|>Minimal bi
41483.62

<|begin_of_text|>Minimal biap
41306.16

<|begin_of_text|>Minimal biapical
11264.05

<|begin_of_text|>Minimal biapical ple
7978.68

<|begin_of_text|>Minimal biapical pleural
2750.84

<|begin_of_text|>Minimal biapical pleural thick
1653.81

<|begin_of_text|>Minimal biapical pleural thickening
725.97

<|begin_of_text|>Minimal biapical pleural thickening.
640.19

<|begin_of_text|>Minimal biapical pleural thickening. S
610.22

<|begin_of_text|>Minimal biapical pleural thickening. Slight
410.41

<|begin_of_text|>Minimal biapical pleural thickening. Slight bl
395.42

<|begin_of_text|>Minimal biapical pleural thickening. Slight blunting
266.97

<|begin_of_text|>Minimal biapical pleural thickening. Slight blunting of
184.35

<|begin_of_text|>Minimal biapical pleural thickening. Slight blunting of the
139.30

<|begin_of_text|>Minimal biapical pleural thickening. Slight blunting of the posterior


In [17]:
# Trace perplexity over the first training report as it grows token by token.
report_to_analyze = ds['train'][0]['report']
substrings = tokenize_to_substrings(tokenizer, report_to_analyze)

# Each substring is scored independently: ppl_sentence tokenizes it again and
# runs its own forward pass. Prints the prefix, then its perplexity.
for substr in substrings:
    ppl_value = ppl_sentence(model, tokenizer, substr)
    print(f"{substr}\n{ppl_value:.2f}\n")

Minimal
3575.82

Minimal bi
8595.78

Minimal biap
13492.62

Minimal biapical
2270.43

Minimal biapical ple
1800.84

Minimal biapical pleural
769.66

Minimal biapical pleural thick
546.84

Minimal biapical pleural thickening
248.71

Minimal biapical pleural thickening.
235.90

Minimal biapical pleural thickening. S
245.59

Minimal biapical pleural thickening. Slight
173.22

Minimal biapical pleural thickening. Slight bl
170.34

Minimal biapical pleural thickening. Slight blunting
118.95

Minimal biapical pleural thickening. Slight blunting of
84.75

Minimal biapical pleural thickening. Slight blunting of the
65.73

Minimal biapical pleural thickening. Slight blunting of the posterior
63.17

Minimal biapical pleural thickening. Slight blunting of the posterior left
72.75

Minimal biapical pleural thickening. Slight blunting of the posterior left cost
58.50

Minimal biapical pleural thickening. Slight blunting of the posterior left costoph
47.26

Minimal biapical pleural thickening. Sligh

In [7]:
# Longest training report. Only the 'report' column is read, so the other
# features of each row are not decoded just to compare lengths.
report_to_analyze = max(ds['train']['report'], key=len)
substrings = tokenize_to_substrings(tokenizer, report_to_analyze)

# Print every growing prefix followed by its perplexity.
for substr in substrings:
    ppl_value = ppl_sentence(model, tokenizer, substr)
    print(f"{substr}\n{ppl_value:.2f}\n")


<|begin_of_text|>
802921.38

<|begin_of_text|>At
184122.36

<|begin_of_text|>At the
7017.59

<|begin_of_text|>At the right
5992.49

<|begin_of_text|>At the right inf
8659.86

<|begin_of_text|>At the right infrah
16891.87

<|begin_of_text|>At the right infrahilar
13840.65

<|begin_of_text|>At the right infrahilar level
6189.01

<|begin_of_text|>At the right infrahilar level,
2484.61

<|begin_of_text|>At the right infrahilar level, one
2030.26

<|begin_of_text|>At the right infrahilar level, one is
1532.35

<|begin_of_text|>At the right infrahilar level, one is visual
1830.02

<|begin_of_text|>At the right infrahilar level, one is visualized
1088.27

<|begin_of_text|>At the right infrahilar level, one is visualized in
808.40

<|begin_of_text|>At the right infrahilar level, one is visualized in ground
1134.35

<|begin_of_text|>At the right infrahilar level, one is visualized in ground glass
867.85

<|begin_of_text|>At the right infrahilar level, one is visualized in ground glass pattern
7