In [1]:
import torch
import json
import numpy as np
from typing import List, Dict
from transformers import AutoModelForCausalLM, AutoTokenizer, GPT2Tokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = "cpu"
print(f"device used: {device}")
MODEL_NAME_GPT2 = "gpt2" 

gpt2_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_GPT2)
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token 

gpt2_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME_GPT2)
gpt2_model.to(device).eval() 

VERBALIZER = {
    "entailment": "true",
    "contradiction": "false",
    "neutral": "neither",
}
verbalizer_token_ids = {}

for label, word in VERBALIZER.items():
    token_id = gpt2_tokenizer.encode(word, add_special_tokens=False)[0]
    verbalizer_token_ids[label] = token_id

print("\nVerbalizer Token ID projection created:", verbalizer_token_ids)

device used: cuda


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

2025-12-01 04:17:42.960710: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764562663.143872      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764562663.201343      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]


Verbalizer Token ID projection created: {'entailment': 7942, 'contradiction': 9562, 'neutral': 710}


In [2]:
LABEL_MAPPING = {
    "entailment": "entailment",
    "contradiction": "contradiction",
    "neutral": "neutral"
}

def load_jsonl_data(file_path: str) -> List[Dict]:
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                item = json.loads(line.strip())
                if item.get("gold_label") in LABEL_MAPPING:
                    data.append({
                        'premise': item['sentence1'],  
                        'hypothesis': item['sentence2'], 
                        'gold_label': item['gold_label']
                    })
            except (json.JSONDecodeError, KeyError) as e:
                continue 
    return data

In [3]:
def predict_nli_gpt2(premise: str, hypothesis: str, model, tokenizer, device, verbalizer_token_ids) -> str:
    query = f'Premise：“{premise}”. Hypothesis：“{hypothesis}”. The relationship is：'
    
    inputs = tokenizer(
        query, 
        return_tensors="pt",
        truncation=True, 
        max_length=1024,
        padding="longest"
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad(): 
        outputs = model(**inputs) 
        
    logits = outputs.logits[0, -1, :]  
    probabilities = torch.softmax(logits, dim=-1)
    max_prob = -1.0
    predicted_label = "neutral" 
    
    for label, token_id in verbalizer_token_ids.items():
        prob = probabilities[token_id].item() 
        
        if prob > max_prob:
            max_prob = prob
            predicted_label = label
            
    return predicted_label


def run_evaluation(data: List[Dict], set_name: str, model, tokenizer, verbalizer_token_ids):
    correct_predictions = 0
    total_samples = len(data)

    print(f"\n--- Evaluation Starting {set_name} (Total samples: {total_samples}) ---")

    for i, sample in enumerate(data):
        predicted_label = predict_nli_gpt2(
            sample['premise'], 
            sample['hypothesis'], 
            model, 
            tokenizer, 
            device, 
            verbalizer_token_ids
        )

        if predicted_label == sample['gold_label']:
            correct_predictions += 1
            
        if (i + 1) % 500 == 0:
            print(f"Have processed {i + 1}/{total_samples} samples...")

    accuracy = correct_predictions / total_samples
    print(f"--- Evaluation Finished {set_name} ---")
    print(f"Accuracy: {accuracy:.4f} ({correct_predictions} / {total_samples})")
    
    return accuracy

In [4]:
MATCHED_FILE = "/kaggle/input/nlpindividualproject/dev_matched_sampled-1.jsonl"
MISMATCHED_FILE = "/kaggle/input/nlpindividualproject/dev_mismatched_sampled-1.jsonl"

matched_data = load_jsonl_data(MATCHED_FILE)
mismatched_data = load_jsonl_data(MISMATCHED_FILE)

matched_acc = run_evaluation(matched_data, "Matched Set", gpt2_model, gpt2_tokenizer, verbalizer_token_ids)
mismatched_acc = run_evaluation(mismatched_data, "Mismatched Set", gpt2_model, gpt2_tokenizer, verbalizer_token_ids)


--- Evaluation Starting Matched Set (Total samples: 2460) ---
Have processed 500/2460 samples...
Have processed 1000/2460 samples...
Have processed 1500/2460 samples...
Have processed 2000/2460 samples...
--- Evaluation Finished Matched Set ---
Accuracy: 0.3415 (840 / 2460)

--- Evaluation Starting Mismatched Set (Total samples: 2464) ---
Have processed 500/2464 samples...
Have processed 1000/2464 samples...
Have processed 1500/2464 samples...
Have processed 2000/2464 samples...
--- Evaluation Finished Mismatched Set ---
Accuracy: 0.3584 (883 / 2464)


In [5]:
from datasets import load_dataset

hallucination_dataset = load_dataset("potsawee/wiki_bio_gpt3_hallucination", split="evaluation")


def preprocess_hallucination_data(dataset) -> List[Dict]:
    processed_samples = []
    
    for entry in dataset:
        premise = entry['wiki_bio_text']
        gpt3_sentences = entry['gpt3_sentences']
        annotations = entry['annotation']
        
        for sentence, annotation in zip(gpt3_sentences, annotations):
            binary_label = 'Factual' if annotation == 0.0 else 'Non-Factual'    
            processed_samples.append({
                'premise': premise,
                'hypothesis': sentence,
                'gold_label': binary_label 
            })
            
    return processed_samples
if hallucination_dataset:
    evaluation_data = preprocess_hallucination_data(hallucination_dataset)
    print(f"Dataset preprocessing finished，total {len(evaluation_data)} sentence evaluation samples.")

README.md: 0.00B [00:00, ?B/s]

data/evaluation-00000-of-00001-e91191b8f(…):   0%|          | 0.00/2.56M [00:00<?, ?B/s]

Generating evaluation split:   0%|          | 0/238 [00:00<?, ? examples/s]

Dataset preprocessing finished，total 1908 sentence evaluation samples.


In [6]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [8]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from typing import List

def calculate_metrics(y_true: List[str], y_pred: List[str]):
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, 
        y_pred, 
        average='binary', 
        pos_label='Non-Factual', 
        zero_division=0
    )

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

print("calculate_metrics is used")

calculate_metrics is used


In [9]:

try:
    hallucination_dataset = load_dataset("potsawee/wiki_bio_gpt3_hallucination", split="evaluation")
except Exception:
    hallucination_dataset = None

if hallucination_dataset:
    evaluation_data = preprocess_hallucination_data(hallucination_dataset)
else:
    evaluation_data = []

def run_hallucination_evaluation(data: List[Dict], model, tokenizer, device, verbalizer_token_ids):
    y_true = []
    y_pred = []
    total_samples = len(data)
    
    for i, sample in enumerate(data):
        nli_predicted_label = predict_nli_gpt2(
            sample['premise'], 
            sample['hypothesis'], 
            model, 
            tokenizer, 
            device,
            verbalizer_token_ids
        )
        
        binary_predicted_label = 'Factual' if nli_predicted_label == 'entailment' else 'Non-Factual'
        
        y_pred.append(binary_predicted_label)
        y_true.append(sample['gold_label'])

    if not y_true:
        print("NO SAMPLE FOUND")
        return None

    metrics = calculate_metrics(y_true, y_pred)
    return metrics

gpt2_hallucination_metrics = run_hallucination_evaluation(
    evaluation_data, 
    gpt2_model, 
    gpt2_tokenizer,
    device,
    verbalizer_token_ids
)

if gpt2_hallucination_metrics:
    print("\n--- GPT-2 halluciantion detection results ---")
    for metric, value in gpt2_hallucination_metrics.items():
        print(f"{metric.capitalize()}: {value:.4f}")


--- GPT-2 halluciantion detection results ---
Accuracy: 0.8905
Precision: 1.0000
Recall: 0.8905
F1: 0.9421


In [10]:
FLAN_FAILED_PREMISE = "Admiral of the Fleet Matthew Aylmer, 1st Baron Aylmer (ca. 1650 - 18 August 1720) was a Royal Navy officer. He was one of the captains who sent a letter to Prince William of Orange, who had just landed at Torbay, assuring the Prince of the captains' support; the Prince's response ultimately led to the Royal Navy switching allegiance to the Prince and the Glorious Revolution of November 1688. Aylmer saw action at the Battle of Bantry Bay in May 1689, at the Battle of Beachy Head in July 1690, and again at the Battle of Barfleur in May 1692 during the Nine Years' War. Aylmer became Commander-in-Chief of the Navy on 12 November 1709. However, when Aylmer met a French squadron and convoy, he was only able to capture one merchantman and the 56-gun \"Superbe\": the new Harley ministry used this failure as an excuse to remove him as Commander-in-Chief and did so a few months later. Following the accession of George I and the appointment of the Townshend ministry, Aylmer was reappointed Commander-in-Chief on 5 November 1714. He was also appointed Governor of Greenwich Hospital: in this post he founded the Royal Hospital School for the sons of seamen."
FLAN_FAILED_HYPOTHESIS = "He was born in Dublin, the son of a barrister, and was educated at Trinity College, Dublin."

gpt2_case_prediction = predict_nli_gpt2(
    FLAN_FAILED_PREMISE, 
    FLAN_FAILED_HYPOTHESIS, 
    gpt2_model, 
    gpt2_tokenizer, 
    device, 
    verbalizer_token_ids
)

print(f"Gold Label: Neutral")
print(f"Flan-T5 prediction (Prompting): Entailment")
print(f"GPT-2 prediction (Prompting): {gpt2_case_prediction}")

Gold Label: Neutral
Flan-T5 prediction (Prompting): Entailment
GPT-2 prediction (Prompting): neutral
