In [25]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Current device used: {device}")

Current device used: cuda


In [27]:
MODEL_NAME = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
model.eval()
model.to(device)

# check if model is successfully loaded
print(f"Model {MODEL_NAME} has been loaded successfully")

Model google/flan-t5-base has been loaded successfully


In [28]:
VERBALIZER = {
    "entailment": "true",
    "contradiction": "false",
    "neutral": "neither",
}

In [29]:
verbalizer_words = list(VERBALIZER.values())
verbalizer_token_ids = {}

for label, word in VERBALIZER.items():
    token_id = tokenizer.encode(word, add_special_tokens=False)[0]
    verbalizer_token_ids[label] = token_id
    
print("\nVerbalizer Token ID projection has created:")
print(verbalizer_token_ids)


Verbalizer Token ID projection has created:
{'entailment': 1176, 'contradiction': 6136, 'neutral': 7598}


In [30]:
target_ids = list(verbalizer_token_ids.values())

In [5]:
def predict_nli_prompting(premise: str, hypothesis: str, model, tokenizer, verbalizer_token_ids, device) -> str:
    query = f'Premise：“{premise}”. Hypothesis：“{hypothesis}”. The relationship between them is：'
    
    input_ids = tokenizer(query, return_tensors="pt", truncation=True, max_length=512).input_ids
    input_ids = input_ids.to(device)
    
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_new_tokens=1,
            output_scores=True,
            return_dict_in_generate=True
        )
    logits = output.scores[0].squeeze() 
    
    probabilities = torch.softmax(logits, dim=-1)
    
    max_prob = -1.0
    predicted_label = "neutral" 
    
    for label, token_id in verbalizer_token_ids.items():
        prob = probabilities[token_id].item() 
        
        if prob > max_prob:
            max_prob = prob
            predicted_label = label
            
    return predicted_label

In [7]:
import json
import os
from typing import List, Dict

In [19]:
LABEL_MAPPING = {
    "entailment": "entailment",
    "contradiction": "contradiction",
    "neutral": "neutral"
}

def load_jsonl_data(file_path: str) -> List[Dict]:
    data = []
    print(f"Data Loading: {file_path}")
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                item = json.loads(line.strip())
                if item.get("gold_label") in LABEL_MAPPING:
                    data.append({
                        'premise': item['sentence1'],
                        'hypothesis': item['sentence2'],
                        'gold_label': item['gold_label']
                    })
            except json.JSONDecodeError as e:
                print(f"Skip invalid JSON row: {e}")
    return data

def run_evaluation(data: List[Dict], set_name: str, model, tokenizer, verbalizer_token_ids):
    correct_predictions = 0
    total_samples = len(data)

    print(f"\n--- Evaluation Starting {set_name} (Total samples: {total_samples}) ---")

    for i, sample in enumerate(data):
        predicted_label = predict_nli_prompting(
            sample['premise'], 
            sample['hypothesis'], 
            model, 
            tokenizer, 
            verbalizer_token_ids,
            
        )

        if predicted_label == sample['gold_label']:
            correct_predictions += 1
        
        if (i + 1) % 500 == 0:
            print(f"Have processed {i + 1}/{total_samples} samples...")

    accuracy = correct_predictions / total_samples
    print(f"--- Evaluation Finished {set_name} ---")
    print(f"Accuracy: {accuracy:.4f} ({correct_predictions} / {total_samples})")
    
    return accuracy


MATCHED_FILE = "/kaggle/input/nlpindividualproject/dev_matched_sampled-1.jsonl"
MISMATCHED_FILE = "/kaggle/input/nlpindividualproject/dev_mismatched_sampled-1.jsonl"

matched_data = load_jsonl_data(MATCHED_FILE)
mismatched_data = load_jsonl_data(MISMATCHED_FILE)

matched_acc = run_evaluation(matched_data, "Matched Set", model, tokenizer, verbalizer_token_ids)
mismatched_acc = run_evaluation(mismatched_data, "Mismatched Set", model, tokenizer, verbalizer_token_ids)

Data Loading: /kaggle/input/nlpindividualproject/dev_matched_sampled-1.jsonl
Data Loading: /kaggle/input/nlpindividualproject/dev_mismatched_sampled-1.jsonl


NameError: name 'model' is not defined

In [110]:
from datasets import load_dataset

hallucination_dataset = load_dataset("potsawee/wiki_bio_gpt3_hallucination", split="evaluation")


def preprocess_hallucination_data(dataset) -> List[Dict]:
    processed_samples = []
    
    for entry in dataset:
        premise = entry['wiki_bio_text']
        gpt3_sentences = entry['gpt3_sentences']
        annotations = entry['annotation']
        
        for sentence, annotation in zip(gpt3_sentences, annotations):
            binary_label = 'Factual' if annotation == 0.0 else 'Non-Factual'    
            processed_samples.append({
                'premise': premise,
                'hypothesis': sentence,
                'gold_label': binary_label 
            })
            
    return processed_samples

if hallucination_dataset:
    evaluation_data = preprocess_hallucination_data(hallucination_dataset)
    print(f"Dataset preprocessing finished，total {len(evaluation_data)} sentence evaluation samples.")

README.md: 0.00B [00:00, ?B/s]

data/evaluation-00000-of-00001-e91191b8f(…):   0%|          | 0.00/2.56M [00:00<?, ?B/s]

Generating evaluation split:   0%|          | 0/238 [00:00<?, ? examples/s]

Dataset preprocessing finished，total 1908 sentence evaluation samples.


In [35]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [116]:
def calculate_metrics(y_true: list, y_pred: list):
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, 
        y_pred, 
        average='binary', 
        pos_label='Non-Factual' 
    )

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [37]:
import time

In [41]:
def nli_to_hallucination_label(nli_label: str) -> str:
    if nli_label == 'entailment':
        return 'Factual'
    else:
        return 'Non-Factual'

def run_hallucination_evaluation(data: List[Dict], model, tokenizer, verbalizer_token_ids, device):
    y_true = []
    y_pred = []
    total_samples = len(data)
    
    print(f"\n--- Starting Hallucination Detection (Total Samples: {total_samples}) ---")

    start_time = time.time()
    detailed_results = []
    
    for i, sample in enumerate(data):
        nli_predicted_label = predict_nli_prompting(
            sample['premise'], 
            sample['hypothesis'], 
            model, 
            tokenizer, 
            verbalizer_token_ids,
            device 
        )
        
        binary_predicted_label = nli_to_hallucination_label(nli_predicted_label)

        # --------------------------case sample get (test)-----------------------
        detailed_results.append({
            'premise': sample['premise'],
            'hypothesis': sample['hypothesis'],
            'gold_label': sample['gold_label'],
            'nli_prediction': nli_predicted_label, 
            'binary_prediction': binary_predicted_label,
            'is_correct': (binary_predicted_label == sample['gold_label'])
        })
        # --------------------------test end----------------------------------
        
        y_pred.append(binary_predicted_label)
        y_true.append(sample['gold_label'])
        
        if (i + 1) % 500 == 0:
            elapsed = time.time() - start_time
            print(f"Have processed {i + 1}/{total_samples} samples... Time cost: {elapsed:.2f}s")
            
    metrics = calculate_metrics(y_true, y_pred)
    
    print("--- Hallucination Detection Finished ---")
    
    return detailed_results, metrics


hallucination_results, hallucination_metrics = run_hallucination_evaluation(evaluation_data, model, tokenizer, verbalizer_token_ids, device)

print("\nFlan-T5 Hallucination Detection Results:")
for metric, value in hallucination_metrics.items():
    print(f"{metric.capitalize()}: {value:.4f}")


--- Starting Hallucination Detection (Total Samples: 1908) ---
Have processed 500/1908 samples... Time cost: 19.12s
Have processed 1000/1908 samples... Time cost: 37.87s
Have processed 1500/1908 samples... Time cost: 56.88s
--- Hallucination Detection Finished ---

Flan-T5 Hallucination Detection Results:
Accuracy: 0.3029
Precision: 1.0000
Recall: 0.3029
F1: 0.4650


In [42]:
# case： label is Non-Factual， but model predicted Factual
case_study_samples = []

for result in hallucination_results:
    if result['gold_label'] == 'Non-Factual' and result['binary_prediction'] == 'Factual':
        case_study_samples.append(result)
        break 

if case_study_samples:
    failed_case = case_study_samples[0]
    print("\n--- Case Found ---")
    print(f"Gold Label: {failed_case['gold_label']}")
    print(f"Model Predict: {failed_case['binary_prediction']}")
    print(f"Raw NLI: {failed_case['nli_prediction']} ")
    print("-" * 30)
    print(f"premise:\n {failed_case['premise']}")
    print(f"hypothesis:\n {failed_case['hypothesis']}")
else:
    print("No case found")


--- Case Found ---
Gold Label: Non-Factual
Model Predict: Factual
Raw NLI: entailment 
------------------------------
premise:
 Admiral of the Fleet Matthew Aylmer, 1st Baron Aylmer (ca. 1650 - 18 August 1720) was a Royal Navy officer. He was one of the captains who sent a letter to Prince William of Orange, who had just landed at Torbay, assuring the Prince of the captains' support; the Prince's response ultimately led to the Royal Navy switching allegiance to the Prince and the Glorious Revolution of November 1688. Aylmer saw action at the Battle of Bantry Bay in May 1689, at the Battle of Beachy Head in July 1690, and again at the Battle of Barfleur in May 1692 during the Nine Years' War. Aylmer became Commander-in-Chief of the Navy on 12 November 1709. However, when Aylmer met a French squadron and convoy, he was only able to capture one merchantman and the 56-gun "Superbe": the new Harley ministry used this failure as an excuse to remove him as Commander-in-Chief and did so a few

In [1]:
# Code ends here!
# Below is just a try for RoBERT, which is such computationally expensive, so I gave up to use it

In [79]:
from datasets import load_dataset
nli_train_dataset = load_dataset("multi_nli", split="train")
print(f"MultiNLI dataset has loaded，sample size：{len(nli_train_dataset)}")

MultiNLI dataset has loaded，sample size：392702


In [80]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

In [81]:
MODEL_NAME2 = 'bert-base-uncased'
robert_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME2)
robert_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME2, num_labels=3)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
robert_model.to(device)
print(f"Current device used: {device}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Current device used: cuda


In [107]:
from datasets import Dataset
import random

total_size = len(nli_train_dataset)
subset_percentage = 0.05 # 0.01 has been tested, which is so small, very bad performance 
subset_size = int(total_size * subset_percentage) 

nli_train_subset = nli_train_dataset.select(range(subset_size)) 

print(f"Initial Dataset size: {total_size}. Subset used size: {subset_size}.")

tokenized_inputs = {
    "input_ids": [],
    "attention_mask": [],
    "labels": [],
}

print(f"Tokenization Start，sample size in total: {len(nli_train_subset)}")

for i, example in enumerate(nli_train_subset): 
    encoding = robert_tokenizer(
        example["premise"], 
        example["hypothesis"], 
        truncation=True, 
        max_length=512,
        padding="max_length"
    )
    
    tokenized_inputs["input_ids"].append(encoding["input_ids"])
    tokenized_inputs["attention_mask"].append(encoding["attention_mask"])
    
    tokenized_inputs["labels"].append(example["label"]) 
    
    if (i + 1) % 500 == 0:
        print(f"Have processed {i + 1}/{subset_size} samples...")

print("Tokenization finished...")

tokenized_train_dataset = Dataset.from_dict(tokenized_inputs)

# tokenized_train_dataset = tokenized_train_dataset.rename_column("labels", "labels")

tokenized_train_dataset.set_format("torch")

print("Dataset Preprocessing Finished")

training_args = TrainingArguments(
    output_dir="./roberta_nli_finetuned",
    num_train_epochs=NEW_EPOCHS,
    per_device_train_batch_size=NEW_BATCH_SIZE,
    dataloader_num_workers=0,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=1000,
    save_total_limit=1,
    report_to="none", 
)

trainer = Trainer(
    model=robert_model,
    args=training_args,
    train_dataset=tokenized_train_dataset, 
    tokenizer=robert_tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=robert_tokenizer), 
    compute_metrics=compute_metrics
)

Initial Dataset size: 392702. Subset used size: 19635.
Tokenization Start，sample size in total: 19635
Have processed 500/19635 samples...
Have processed 1000/19635 samples...
Have processed 1500/19635 samples...
Have processed 2000/19635 samples...
Have processed 2500/19635 samples...
Have processed 3000/19635 samples...
Have processed 3500/19635 samples...
Have processed 4000/19635 samples...
Have processed 4500/19635 samples...
Have processed 5000/19635 samples...
Have processed 5500/19635 samples...
Have processed 6000/19635 samples...
Have processed 6500/19635 samples...
Have processed 7000/19635 samples...
Have processed 7500/19635 samples...
Have processed 8000/19635 samples...
Have processed 8500/19635 samples...
Have processed 9000/19635 samples...
Have processed 9500/19635 samples...
Have processed 10000/19635 samples...
Have processed 10500/19635 samples...
Have processed 11000/19635 samples...
Have processed 11500/19635 samples...
Have processed 12000/19635 samples...
Have p

  trainer = Trainer(


In [92]:
# !pip3 install evaluate
from transformers import TrainingArguments, Trainer,DataCollatorWithPadding
import numpy as np
import evaluate

In [108]:
trainer.train()

Step,Training Loss
1000,0.3673


TrainOutput(global_step=1228, training_loss=0.3435480726658327, metrics={'train_runtime': 2615.8279, 'train_samples_per_second': 15.012, 'train_steps_per_second': 0.469, 'total_flos': 1.033246391417856e+16, 'train_loss': 0.3435480726658327, 'epoch': 2.0})

In [109]:
from datasets import Dataset
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding

LABEL_TO_ID = {
    "contradiction": 0,
    "entailment": 1,
    "neutral": 2
}

def map_labels_to_ids(data_list):
    mapped_data = []
    for item in data_list:
        if item['gold_label'] in LABEL_TO_ID:
            item['labels'] = LABEL_TO_ID[item['gold_label']]
            mapped_data.append(item)
    return mapped_data

mapped_matched_data = map_labels_to_ids(matched_data)
hf_matched_data = Dataset.from_list(mapped_matched_data)

def tokenize_eval_data(examples):
    return robert_tokenizer(
        examples["premise"], 
        examples["hypothesis"], 
        truncation=True, 
        max_length=512
    )

tokenized_matched_eval = hf_matched_data.map(tokenize_eval_data, batched=True)
tokenized_matched_eval = tokenized_matched_eval.remove_columns([col for col in tokenized_matched_eval.column_names if col not in ['labels', 'input_ids', 'attention_mask']])
tokenized_matched_eval.set_format("torch")


mapped_mismatched_data = map_labels_to_ids(mismatched_data)
hf_mismatched_data = Dataset.from_list(mapped_mismatched_data)

tokenized_mismatched_eval = hf_mismatched_data.map(tokenize_eval_data, batched=True)
tokenized_mismatched_eval = tokenized_mismatched_eval.remove_columns([col for col in tokenized_mismatched_eval.column_names if col not in ['labels', 'input_ids', 'attention_mask']])
tokenized_mismatched_eval.set_format("torch")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy_metric = evaluate.load("accuracy")
    return accuracy_metric.compute(predictions=predictions, references=labels)


matched_results = trainer.evaluate(eval_dataset=tokenized_matched_eval) 
mismatched_results = trainer.evaluate(eval_dataset=tokenized_mismatched_eval) 

print("\n--- RoBERTa NLI Result ---")
print(f"Matched Set Accuracy: {matched_results.get('eval_accuracy'):.4f}")
print(f"Mismatched Set Accuracy: {mismatched_results.get('eval_accuracy'):.4f}")

Map:   0%|          | 0/2460 [00:00<?, ? examples/s]

Map:   0%|          | 0/2464 [00:00<?, ? examples/s]


--- RoBERTa NLI Result ---
Matched Set Accuracy: 0.1171
Mismatched Set Accuracy: 0.1246


In [112]:


def tokenize_hallucination_data(examples):
    return robert_tokenizer(
        examples["premise"], 
        examples["hypothesis"], 
        truncation=True, 
        max_length=512
    )

from datasets import Dataset
hf_hallucination_data = Dataset.from_list(evaluation_data)

tokenized_hallucination_eval = hf_hallucination_data.map(tokenize_hallucination_data, batched=True)

tokenized_hallucination_eval = tokenized_hallucination_eval.remove_columns([col for col in tokenized_hallucination_eval.column_names if col not in ['gold_label', 'input_ids', 'attention_mask']])
tokenized_hallucination_eval = tokenized_hallucination_eval.rename_column("gold_label", "labels")
tokenized_hallucination_eval.set_format("torch", columns=['input_ids', 'attention_mask'])

Map:   0%|          | 0/1908 [00:00<?, ? examples/s]

In [119]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from typing import List

In [120]:
# 1: Entailment (Factual), 0: Contradiction (Non-Factual), 2: Neutral (Non-Factual)
ID_TO_BINARY_LABEL = {
    1: 'Factual',
    0: 'Non-Factual',
    2: 'Non-Factual'
}

def convert_nli_to_binary(nli_predictions: np.ndarray) -> list:
    binary_preds = []
    predicted_ids = np.argmax(nli_predictions, axis=1)
    
    for pred_id in predicted_ids:
        binary_preds.append(ID_TO_BINARY_LABEL[pred_id])
        
    return binary_preds

prediction_output = trainer.predict(tokenized_hallucination_eval)

nli_predictions = prediction_output.predictions

roberta_binary_preds = convert_nli_to_binary(nli_predictions)
roberta_true_labels = hf_hallucination_data['gold_label'] 

roberta_hallucination_metrics = calculate_metrics(roberta_true_labels, roberta_binary_preds)

print("\n--- RoBERTa Hallucination Detection Result ---")
for metric, value in roberta_hallucination_metrics.items():
    print(f"{metric.capitalize()}: {value:.4f}")


--- RoBERTa Hallucination Detection Result ---
Accuracy: 0.2935
Precision: 1.0000
Recall: 0.2935
F1: 0.4538


In [124]:
def analyze_case_robert(premise: str, hypothesis: str, trainer):
    test_data = [{'premise': premise, 'hypothesis': hypothesis}]
    
    def tokenize_case(examples):
        return robert_tokenizer(
            examples["premise"], 
            examples["hypothesis"], 
            truncation=True, 
            max_length=512
        )

    from datasets import Dataset
    from transformers import DataCollatorWithPadding

    hf_test_data = Dataset.from_list(test_data).map(tokenize_case, batched=True)
    
    hf_test_data = hf_test_data.remove_columns([col for col in hf_test_data.column_names if col not in ['input_ids', 'attention_mask']])
    hf_test_data.set_format("torch")
    
    prediction_output = trainer.predict(test_dataset=hf_test_data)
    predicted_id = np.argmax(prediction_output.predictions[0])
    
    # 0, 1, 2 projected to Contradiction, Entailment, Neutral
    NLI_ID_TO_LABEL = {0: "contradiction", 1: "entailment", 2: "neutral"}
    
    predicted_label = NLI_ID_TO_LABEL[predicted_id]
    
    return predicted_label

FLAN_FAILED_PREMISE = "Admiral of the Fleet Matthew Aylmer, 1st Baron Aylmer (ca. 1650 - 18 August 1720) was a Royal Navy officer. He was one of the captains who sent a letter to Prince William of Orange, who had just landed at Torbay, assuring the Prince of the captains' support; the Prince's response ultimately led to the Royal Navy switching allegiance to the Prince and the Glorious Revolution of November 1688. Aylmer saw action at the Battle of Bantry Bay in May 1689, at the Battle of Beachy Head in July 1690, and again at the Battle of Barfleur in May 1692 during the Nine Years' War. Aylmer became Commander-in-Chief of the Navy on 12 November 1709. However, when Aylmer met a French squadron and convoy, he was only able to capture one merchantman and the 56-gun \"Superbe\": the new Harley ministry used this failure as an excuse to remove him as Commander-in-Chief and did so a few months later. Following the accession of George I and the appointment of the Townshend ministry, Aylmer was reappointed Commander-in-Chief on 5 November 1714. He was also appointed Governor of Greenwich Hospital: in this post he founded the Royal Hospital School for the sons of seamen."
FLAN_FAILED_HYPOTHESIS = "He was born in Dublin, the son of a barrister, and was educated at Trinity College, Dublin."

robert_prediction = analyze_case_robert(FLAN_FAILED_PREMISE, FLAN_FAILED_HYPOTHESIS, trainer)

print(f"Flan-T5 prediction: Entailment (Factual)")
print(f"RoBERTa prediction: {robert_prediction}")

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Flan-T5 prediction: Entailment (Factual)
RoBERTa prediction: entailment
