In [50]:
%load_ext autoreload
%autoreload 2

from transformers import AutoModelForCausalLM, AutoTokenizer
from lm_polygraph.utils.model import WhiteboxModel, BlackboxModel
from lm_polygraph.utils.manager import estimate_uncertainty
from lm_polygraph.estimators import MaximumTokenProbability, LexicalSimilarity, SemanticEntropy, PointwiseMutualInformation, EigValLaplacian,MeanPointwiseMutualInformation,SAR,MaximumSequenceProbability

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Selected low-computational estimator


In [None]:
LexicalSimilarity
PointwiseMutualInformation
MaximumSequenceProbability

## Selected high-computational estimator

In [None]:
SAR
SemanticEntropy

In [13]:
import numpy as np

In [61]:
stats = np.load('/data/home/wangys/lm-polygraph/examples/sar.npy',allow_pickle=True).item()

In [62]:
np.array(stats["sample_token_similarity"])[0]

array([[0.57795465, 0.5783484 , 0.578637  , 0.57867306, 0.5782978 ,
        0.57811266, 0.5780282 , 0.5785949 , 0.5791121 , 0.5779701 ,
        0.578808  , 0.58061755, 0.5733088 , 0.57541555, 0.57939076,
        0.5785188 , 0.5784118 , 0.57817084, 0.5782121 , 0.5781876 ,
        0.5775309 , 0.5781161 , 0.57702315, 0.56754553, 0.566034  ,
        0.57412   , 0.5653516 , 0.5790489 , 0.5779917 , 0.5778659 ,
        0.5677034 , 0.567616  , 0.567616  , 0.573672  , 0.5773175 ,
        0.57708836, 0.5776825 , 0.57818127, 0.57827777, 0.5773349 ,
        0.5781683 , 0.5753414 , 0.577256  , 0.57631296, 0.5708441 ,
        0.5604796 , 0.5604796 , 0.5707252 , 0.5795488 , 0.57802165,
        0.58168286, 0.567329  , 0.567329  , 0.567329  , 0.5805734 ,
        0.5782185 , 0.578472  , 0.5796611 , 0.57767975, 0.57721233,
        0.57695186, 0.57838976, 0.57776374, 0.5784785 , 0.5743262 ,
        0.5824831 , 0.5731482 , 0.57099926, 0.5775634 , 0.5770057 ,
        0.5699354 , 0.57480556, 0.58115304, 0.58

In [47]:
batch_sample_log_likelihoods = stats["sample_log_likelihoods"]
batch_sample_token_similarity = stats["sample_token_similarity"]
batch_sample_sentence_similarity = stats["sample_sentence_similarity"]
t = 0.001
SAR = []
for batch_data in zip(
    batch_sample_log_likelihoods,
    batch_sample_token_similarity,
    batch_sample_sentence_similarity,
):
    sample_log_likelihoods = batch_data[0]
    sample_token_similarity = batch_data[1]
    sample_sentence_similarity = batch_data[2]

    tokenSAR = []
    for log_likelihoods, token_similarity in zip(
        sample_log_likelihoods, sample_token_similarity
    ):
        log_likelihoods = np.array(log_likelihoods)
        R_t = 1 - token_similarity
        R_t_norm = R_t / R_t.sum()
        E_t = -log_likelihoods * R_t_norm
        tokenSAR.append(E_t.sum())

    tokenSAR = np.array(tokenSAR)
    probs_token_sar = np.exp(-tokenSAR)
    R_s = (
        probs_token_sar
        * sample_sentence_similarity
        * (1 - np.eye(sample_sentence_similarity.shape[0]))
    )
    sent_relevance = R_s.sum(-1) / t
    E_s = -np.log(sent_relevance + probs_token_sar)
    SAR.append(E_s.mean())
np.array(SAR)

array([-8.35613904])

### Initialize model

In [4]:
base_model = AutoModelForCausalLM.from_pretrained(
    '/data/home/wangys/model/Mistral-7B-Instruct-v0.2',
    device_map='cuda:7',
)
tokenizer = AutoTokenizer.from_pretrained('/data/home/wangys/model/Mistral-7B-Instruct-v0.2', token_type_ids=None,
                clean_up_tokenization_spaces=False)

model = WhiteboxModel(base_model,tokenizer)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

### Token level UE

In [12]:
estimator = SAR()
estimate_uncertainty(model, estimator, input_text='Who is George Bush?')

Some weights of the model checkpoint at /data/home/wangys/model/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
S

UncertaintyOutput(uncertainty=-8.356139042348568, input_text='Who is George Bush?', generation_text="George Bush may refer to one of two U.S. presidents named George Bush. Here's a brief overview of each:\n\n1. George H.W. Bush (b. 1924): He was the 41st President of the United States, serving from 1989 to 1993. Bush was a naval aviator in World War II and later served as a member of Congress from Texas. He also served as the", generation_tokens=[5163, 13668, 993, 3295, 298, 624, 302, 989, 500, 28723, 28735, 28723, 1258, 6640, 5160, 5163, 13668, 28723, 4003, 28742, 28713, 264, 6817, 23094, 302, 1430, 28747, 13, 13, 28740, 28723, 5163, 382, 28723, 28780, 28723, 13668, 325, 28726, 28723, 28705, 28740, 28774, 28750, 28781, 1329, 650, 403, 272, 28705, 28781, 28740, 303, 5120, 302, 272, 2969, 3543, 28725, 10732, 477, 28705, 28740, 28774, 28783, 28774, 298, 28705, 28740, 28774, 28774, 28770, 28723, 13668, 403, 264, 23850, 1182, 28710, 1028, 297, 3304, 3273, 3717, 304, 2062, 6117, 390, 264, 4

In [5]:
estimator = MaximumSequenceProbability()
estimate_uncertainty(model, estimator, input_text='Who is George Bush?')

Some weights of the model checkpoint at /data/home/wangys/model/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


UncertaintyOutput(uncertainty=13.410937309265137, input_text='Who is George Bush?', generation_text="George Bush may refer to one of two U.S. presidents named George Bush. Here's a brief overview of each:\n\n1. George H.W. Bush (b. 1924): He was the 41st President of the United States, serving from 1989 to 1993. Bush was a naval aviator in World War II and later served as a member of Congress from Texas. He also served as the", generation_tokens=[5163, 13668, 993, 3295, 298, 624, 302, 989, 500, 28723, 28735, 28723, 1258, 6640, 5160, 5163, 13668, 28723, 4003, 28742, 28713, 264, 6817, 23094, 302, 1430, 28747, 13, 13, 28740, 28723, 5163, 382, 28723, 28780, 28723, 13668, 325, 28726, 28723, 28705, 28740, 28774, 28750, 28781, 1329, 650, 403, 272, 28705, 28781, 28740, 303, 5120, 302, 272, 2969, 3543, 28725, 10732, 477, 28705, 28740, 28774, 28783, 28774, 298, 28705, 28740, 28774, 28774, 28770, 28723, 13668, 403, 264, 23850, 1182, 28710, 1028, 297, 3304, 3273, 3717, 304, 2062, 6117, 390, 264, 4

### Sequence level UE

In [None]:
estimator = SAR()
estimate_uncertainty(model, estimator, input_text='Who is George Bush?')

In [11]:
estimator = MeanPointwiseMutualInformation()
estimate_uncertainty(model, estimator, input_text='Who is George Bush?')

Some weights of the model checkpoint at /data/home/wangys/model/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


UncertaintyOutput(uncertainty=-17.852901331445374, input_text='Who is George Bush?', generation_text="George Bush may refer to one of two U.S. presidents named George Bush. Here's a brief overview of each:\n\n1. George H.W. Bush (b. 1924): He was the 41st President of the United States, serving from 1989 to 1993. Bush was a naval aviator in World War II and later served as a member of Congress from Texas. He also served as the", generation_tokens=[5163, 13668, 993, 3295, 298, 624, 302, 989, 500, 28723, 28735, 28723, 1258, 6640, 5160, 5163, 13668, 28723, 4003, 28742, 28713, 264, 6817, 23094, 302, 1430, 28747, 13, 13, 28740, 28723, 5163, 382, 28723, 28780, 28723, 13668, 325, 28726, 28723, 28705, 28740, 28774, 28750, 28781, 1329, 650, 403, 272, 28705, 28781, 28740, 303, 5120, 302, 272, 2969, 3543, 28725, 10732, 477, 28705, 28740, 28774, 28783, 28774, 298, 28705, 28740, 28774, 28774, 28770, 28723, 13668, 403, 264, 23850, 1182, 28710, 1028, 297, 3304, 3273, 3717, 304, 2062, 6117, 390, 264, 

In [59]:
stats = np.load('/data/home/wangys/lm-polygraph/examples/semantic_entropy.npy',allow_pickle=True).item()

  return torch.load(io.BytesIO(b))


In [60]:
import numpy as np
from collections import defaultdict
from typing import List, Dict, Optional

def semantic_entropy(stats: Dict[str, np.ndarray]) -> np.ndarray:
    """
    Estimates the semantic entropy for each sample in the input statistics.

    Parameters:
        stats (Dict[str, np.ndarray]): input statistics, which for multiple samples includes:
            * generated samples in 'sample_texts',
            * corresponding log probabilities in 'sample_log_probs',
            * matrix with semantic similarities in 'semantic_matrix_entail'
    Returns:
        np.ndarray: float semantic entropy for each sample in input statistics.
            Higher values indicate more uncertain samples.
    """
    
    loglikelihoods_list = stats["sample_log_probs"]
    entailment_id = stats.get("entailment_id", 1)
    is_entailment = stats["semantic_matrix_entail"] == entailment_id

    # Concatenating hypotheses with input texts
    hyps_list = [[' '.join([input_text, hyp]) for hyp in stats["sample_texts"][i]] for i, input_text in enumerate(stats["input_texts"])]
    
    return batched_call(hyps_list, loglikelihoods_list, is_entailment)

def batched_call(hyps_list: List[List[str]], loglikelihoods_list: List[List[float]], is_entailment: np.ndarray, log_weights: Optional[List[List[float]]] = None) -> np.array:
    if log_weights is None:
        log_weights = [None] * len(hyps_list)
    
    semantic_logits = {}
    sample_to_class = {}
    class_to_sample = defaultdict(list)

    # Determine classes for hypotheses
    for idx, hyps in enumerate(hyps_list):
        sample_to_class[idx], class_to_sample[idx] = determine_classes(hyps, is_entailment[idx])

        # Collect likelihoods per class
        class_likelihoods = [np.array(loglikelihoods_list[idx])[np.array(class_idx)] for class_idx in class_to_sample[idx]]
        class_lp = [np.logaddexp.reduce(likelihoods) for likelihoods in class_likelihoods]
        
        # Apply weights if provided
        if log_weights[idx] is None:
            log_weights[idx] = [0] * len(hyps)
        
        semantic_logits[idx] = -np.mean([class_lp[sample_to_class[idx][j]] * np.exp(log_weights[idx][j]) for j in range(len(hyps))])
    
    return np.array([semantic_logits[i] for i in range(len(hyps_list))])

def determine_classes(hyps: List[str], is_entailment: np.ndarray) -> (Dict[int, int], Dict[int, List[int]]):
    sample_to_class = {}
    class_to_sample = defaultdict(list)

    for i in range(len(hyps)):
        if i == 0:
            class_to_sample[0] = [0]
            sample_to_class[0] = 0
            continue
        
        for class_id, class_indices in class_to_sample.items():
            class_text_id = class_indices[0]
            if is_entailment[class_text_id, i] and is_entailment[i, class_text_id]:
                class_to_sample[class_id].append(i)
                sample_to_class[i] = class_id
                break
        else:
            new_class_id = len(class_to_sample)
            class_to_sample[new_class_id] = [i]
            sample_to_class[i] = new_class_id
    
    return sample_to_class, class_to_sample

# Example usage:
# stats = {
#     'sample_log_probs': np.array([[0.1, 0.2], [0.4, 0.5]]),
#     'sample_texts': [['text1', 'text2'], ['text3', 'text4']],
#     'input_texts': ['input1', 'input2'],
#     'semantic_matrix_entail': np.array([[[1, 0], [0, 1]], [[1, 1], [1, 1]]]),
#     'entailment_id': 1
# }
output = semantic_entropy(stats)
output

array([23.7672994])

In [6]:
estimator = LexicalSimilarity('rougeL')
estimate_uncertainty(model, estimator, input_text='Who is George Bush?')

Some weights of the model checkpoint at /data/home/wangys/model/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


UncertaintyOutput(uncertainty=-0.5354888443376555, input_text='Who is George Bush?', generation_text="George Bush may refer to one of two U.S. presidents named George Bush. Here's a brief overview of each:\n\n1. George H.W. Bush (b. 1924): He was the 41st President of the United States, serving from 1989 to 1993. Bush was a naval aviator in World War II and later served as a member of Congress from Texas. He also served as the", generation_tokens=[5163, 13668, 993, 3295, 298, 624, 302, 989, 500, 28723, 28735, 28723, 1258, 6640, 5160, 5163, 13668, 28723, 4003, 28742, 28713, 264, 6817, 23094, 302, 1430, 28747, 13, 13, 28740, 28723, 5163, 382, 28723, 28780, 28723, 13668, 325, 28726, 28723, 28705, 28740, 28774, 28750, 28781, 1329, 650, 403, 272, 28705, 28781, 28740, 303, 5120, 302, 272, 2969, 3543, 28725, 10732, 477, 28705, 28740, 28774, 28783, 28774, 298, 28705, 28740, 28774, 28774, 28770, 28723, 13668, 403, 264, 23850, 1182, 28710, 1028, 297, 3304, 3273, 3717, 304, 2062, 6117, 390, 264, 

In [58]:
estimator = SemanticEntropy()
estimate_uncertainty(model, estimator, input_text='Who is George Bush?')

Some weights of the model checkpoint at /data/home/wangys/model/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
S

UncertaintyOutput(uncertainty=20.165683420266305, input_text='Who is George Bush?', generation_text="George Bush may refer to one of two U.S. presidents named George Bush. Here's a brief overview of each:\n\n1. George H.W. Bush (b. 1924): He was the 41st President of the United States, serving from 1989 to 1993. Bush was a naval aviator in World War II and later served as a member of Congress from Texas. He also served as the", generation_tokens=[5163, 13668, 993, 3295, 298, 624, 302, 989, 500, 28723, 28735, 28723, 1258, 6640, 5160, 5163, 13668, 28723, 4003, 28742, 28713, 264, 6817, 23094, 302, 1430, 28747, 13, 13, 28740, 28723, 5163, 382, 28723, 28780, 28723, 13668, 325, 28726, 28723, 28705, 28740, 28774, 28750, 28781, 1329, 650, 403, 272, 28705, 28781, 28740, 303, 5120, 302, 272, 2969, 3543, 28725, 10732, 477, 28705, 28740, 28774, 28783, 28774, 298, 28705, 28740, 28774, 28774, 28770, 28723, 13668, 403, 264, 23850, 1182, 28710, 1028, 297, 3304, 3273, 3717, 304, 2062, 6117, 390, 264, 4

In [None]:
estimator = PointwiseMutualInformation()
estimate_uncertainty(model, estimator, input_text='Once upon a time there was a little girl who liked to')

In [8]:
ue_method = LexicalSimilarity()
input_text = "Who is George Bush?"
estimate_uncertainty(model, ue_method, input_text=input_text)

Some weights of the model checkpoint at /data/home/wangys/model/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


UncertaintyOutput(uncertainty=-0.5115664112993994, input_text='Who is George Bush?', generation_text="George Bush may refer to one of two U.S. presidents named George Bush. Here's a brief overview of each:\n\n1. George H.W. Bush (b. 1924): He was the 41st President of the United States, serving from 1989 to 1993. Bush was a naval aviator in World War II and later served as a member of Congress from Texas. He also served as the", generation_tokens=[5163, 13668, 993, 3295, 298, 624, 302, 989, 500, 28723, 28735, 28723, 1258, 6640, 5160, 5163, 13668, 28723, 4003, 28742, 28713, 264, 6817, 23094, 302, 1430, 28747, 13, 13, 28740, 28723, 5163, 382, 28723, 28780, 28723, 13668, 325, 28726, 28723, 28705, 28740, 28774, 28750, 28781, 1329, 650, 403, 272, 28705, 28781, 28740, 303, 5120, 302, 272, 2969, 3543, 28725, 10732, 477, 28705, 28740, 28774, 28783, 28774, 298, 28705, 28740, 28774, 28774, 28770, 28723, 13668, 403, 264, 23850, 1182, 28710, 1028, 297, 3304, 3273, 3717, 304, 2062, 6117, 390, 264, 

### BlackBox UE

In [None]:
model = BlackboxModel(
    'YOUR_OPENAI_TOKEN',
    'gpt-3.5-turbo'
)
estimator = EigValLaplacian(verbose=True)
estimate_uncertainty(model, estimator, input_text='When did Albert Einstein die?')

In [None]:
API_TOKEN = 'YOUR_API_TOKEN'
# for example let's take google/t5-small-ssm-nq model
MODEL_ID = 'google/t5-large-ssm-nqo'

model = BlackboxModel.from_huggingface(hf_api_token=API_TOKEN, hf_model_id=MODEL_ID, openai_api_key = None, openai_model_path = None)
ue_method = LexicalSimilarity()
input_text = "Who is George Bush?"
estimate_uncertainty(model, ue_method, input_text=input_text)

In [None]:
# for example let's take bigscience/bloomz-560m model
MODEL_ID = 'bigscience/bloomz-560m'

model = BlackboxModel.from_huggingface(hf_api_token=API_TOKEN, hf_model_id=MODEL_ID, openai_api_key = None, openai_model_path = None)
ue_method = LexicalSimilarity()
input_text = "Who is George Bush?"
estimate_uncertainty(model, ue_method, input_text=input_text)