In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset
import re
from tqdm import tqdm
import numpy as np
from collections import Counter
import json
import random
import scipy.stats as stats
import pandas as pd

2025-10-20 10:46:46.824420: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Seed every random number generator used in this notebook (PyTorch, NumPy and
# Python's built-in `random`) with the same value so reruns are repeatable.
for _seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    _seed_rng(42)

In [3]:
# Model under evaluation: tokenizer (slow implementation) and bfloat16 weights,
# with layer placement left to `device_map="auto"`.
model_id = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")

# Shared text-generation pipeline. generate_cot() and reflect_on_chain() call a
# module-level `generator`, which was never created. No `device` argument is
# passed because the model is already placed by `device_map="auto"`.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)



OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-2-7b-chat-hf.
401 Client Error. (Request ID: Root=1-68f3dfd1-1b7c0f0614f9d8964d709eb9;d418ff8c-95cf-409e-a888-5fab30bb0e0f)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/resolve/main/config.json.
Access to model meta-llama/Llama-2-7b-chat-hf is restricted. You must have access to it and be authenticated to access it. Please log in.

In [4]:
# Evaluation set: the GSM8K "main" test split, shuffled with a constant seed and
# cut down to its first 200 problems so every run scores the same questions.
gsm8k_test = load_dataset("gsm8k", "main")['test']
dataset = gsm8k_test.shuffle(seed=42).select(range(200))

Downloading builder script:   0%|          | 0.00/1.81k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

Downloading and preparing dataset gsm8k/main (download: 4.69 MiB, generated: 4.46 MiB, post-processed: Unknown size, total: 9.15 MiB) to /home/sagemaker-user/.cache/huggingface/datasets/gsm8k/main/1.1.0/37bfb08b1d4fcbb01f06b03d9e1ef5f1fcbd4d3af3d08842c50d7305091285ba...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/242k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Dataset gsm8k downloaded and prepared to /home/sagemaker-user/.cache/huggingface/datasets/gsm8k/main/1.1.0/37bfb08b1d4fcbb01f06b03d9e1ef5f1fcbd4d3af3d08842c50d7305091285ba. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Sampling temperatures (slight variations); the multi-chain conditions
# generate one chain per entry.
temperatures = [
    0.5,
    0.7,
    0.9,
]
num_chains = len(temperatures)  # number of chains per question

In [6]:
# Chain-of-thought prompt; `{question}` is the only format field.
# The backslash is doubled because a bare "\b" inside a normal string literal is
# a backspace character, and the braces are doubled so that str.format() emits
# the literal text \boxed{final_answer}.
cot_prompt_template = """Q: {question}
Let's think step by step. End with \\boxed{{final_answer}}."""

# Reflection (judge) prompt; `{chain}` is the only format field.
# The braces of the JSON schema example are doubled; left single, str.format()
# treats them as a replacement field and raises KeyError.
reflection_prompt_template = """Review this Chain-of-Thought for math:
1. Hallucination score (0-10, 0=none).
2. Logic pass (true/false).
Chain: {chain}
JSON output: {{"hallucination_score": int, "logic_pass": bool}}"""

In [7]:
# A number with an optional sign, optional thousands separators and optional
# decimals. The lookbehind keeps the "-" in "10-5" from being read as a sign.
_NUMBER_RE = re.compile(r'(?<!\d)-?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?')
# A bare "a/b" fraction, e.g. the content of \boxed{3/4}.
_FRACTION_RE = re.compile(r'(-?\d+(?:\.\d+)?)\s*/\s*(-?\d+(?:\.\d+)?)')


def _parse_numeric(fragment):
    """Return the number written in `fragment` as a float, or None if it holds none.

    A fragment that is exactly a simple fraction ("3/4") is divided out;
    otherwise the last number in the fragment is used, with thousands
    separators removed.
    """
    fraction = _FRACTION_RE.fullmatch(fragment.strip())
    if fraction:
        numerator, denominator = (float(part) for part in fraction.groups())
        if denominator:
            return numerator / denominator
    found = _NUMBER_RE.findall(fragment)
    return float(found[-1].replace(',', '')) if found else None


def extract_answer(text):
    """Extract the final numeric answer from a solution or a generated chain.

    Sources are tried in this order:
      1. the content of a ``\\boxed{...}`` expression (last usable one wins),
      2. the remainder of a line starting with a ``####`` marker,
      3. the last number anywhere in the text.

    NOTE(security): the previous implementation passed the boxed content to
    ``eval``. That content is model output (untrusted), so it is now parsed as
    a plain number instead. This also fixes ``eval("1,234")`` returning a tuple.

    Args:
        text: Text to search; may be empty or None.

    Returns:
        The answer as a float, or None when no number can be found.
    """
    if not text:
        return None

    # [^{}]* stops at the first closing brace, so trailing braces later in the
    # text cannot be swallowed into the captured answer.
    for boxed in reversed(re.findall(r'\\boxed\{([^{}]*)\}', text)):
        value = _parse_numeric(boxed)
        if value is not None:
            return value

    marker = re.search(r'####\s*([^\n]*)', text)
    if marker:
        value = _parse_numeric(marker.group(1))
        if value is not None:
            return value

    return _parse_numeric(text)


In [8]:
def generate_cot(question, temp, max_new_tokens=512):
    """Sample one chain-of-thought completion for `question`.

    Args:
        question: Problem text inserted into `cot_prompt_template`.
        temp: Sampling temperature passed to the generator.
        max_new_tokens: Upper bound on generated tokens. Set explicitly so the
            chain is not cut off by whatever default length the pipeline or
            model config happens to use.

    Returns:
        The generated continuation only (prompt excluded), stripped of
        surrounding whitespace.
    """
    prompt = cot_prompt_template.format(question=question)
    outputs = generator(
        prompt,
        do_sample=True,
        temperature=temp,
        top_p=0.95,
        max_new_tokens=max_new_tokens,
        # Ask the pipeline for the continuation alone instead of slicing the
        # prompt off the returned string by character count.
        return_full_text=False,
    )
    return outputs[0]['generated_text'].strip()

In [9]:
def reflect_on_chain(chain):
    """Ask the model to judge a reasoning chain and return its parsed verdict.

    Args:
        chain: Chain-of-thought text to review.

    Returns:
        A dict ``{"hallucination_score": <int|float>, "logic_pass": <bool>}``.
        When the completion holds no usable JSON verdict, the neutral fallback
        ``{"hallucination_score": 5, "logic_pass": False}`` is returned.
    """
    fallback = {"hallucination_score": 5, "logic_pass": False}
    prompt = reflection_prompt_template.format(chain=chain)
    completion = generator(
        prompt,
        # `temperature` only takes effect when sampling is enabled.
        do_sample=True,
        temperature=0.2,
        max_new_tokens=128,
        # Parse the completion only: the prompt itself contains a "{...}" schema
        # example (and the chain may contain braces), which would otherwise be
        # picked up as the start of the JSON verdict.
        return_full_text=False,
    )[0]['generated_text']

    # The verdict is a flat object, so look for brace pairs without nesting and
    # take the first one that parses and has the expected fields.
    for candidate in re.finditer(r'\{[^{}]*\}', completion):
        try:
            verdict = json.loads(candidate.group(0))
        except ValueError:  # json.JSONDecodeError is a ValueError
            continue

        score = verdict.get("hallucination_score")
        logic = verdict.get("logic_pass")
        if isinstance(logic, str) and logic.strip().lower() in ("true", "false"):
            logic = logic.strip().lower() == "true"
        # bool is a subclass of int, so reject it explicitly as a score.
        score_ok = isinstance(score, (int, float)) and not isinstance(score, bool)
        if score_ok and isinstance(logic, bool):
            return {"hallucination_score": score, "logic_pass": logic}

    return fallback

In [10]:
def run_baseline(example):  # Single chain, no reflection
    """Score one example with a single sampled chain and no reflection step.

    Args:
        example: Mapping with 'question' and 'answer' entries.

    Returns:
        Dict with 'accuracy' (extracted answer equals the answer extracted from
        the reference), 'chain_length' (length of `tokenizer.encode(chain)`)
        and 'hallucination_score' (always None for this condition).
    """
    problem = example['question']
    reference = extract_answer(example['answer'])
    # One chain at the fixed mid-range temperature.
    reasoning = generate_cot(problem, temp=0.7)
    predicted = extract_answer(reasoning)
    return {
        'accuracy': predicted == reference,
        'chain_length': len(tokenizer.encode(reasoning)),
        'hallucination_score': None,
    }


In [None]:
def run_self_consistency(example):  # Multiple chains, voting, no reflection
    """Score one example by majority vote over one chain per temperature.

    Chains whose answer could not be extracted (None) are excluded from the
    vote, so a failed extraction can neither win the majority nor be counted as
    a match against the reference.

    Args:
        example: Mapping with 'question' and 'answer' entries.

    Returns:
        Dict with 'accuracy' (majority answer equals the reference answer),
        'consistency' (share of all chains that voted for the majority answer)
        and 'chain_length' (mean of `len(tokenizer.encode(chain))`).
    """
    question = example['question']
    ground_truth = extract_answer(example['answer'])
    final_answers = []
    chain_lengths = []
    for temp in temperatures:
        chain = generate_cot(question, temp)
        final_answers.append(extract_answer(chain))
        chain_lengths.append(len(tokenizer.encode(chain)))

    vote_count = Counter(ans for ans in final_answers if ans is not None)
    if vote_count:
        majority_ans, majority_votes = vote_count.most_common(1)[0]
    else:
        majority_ans, majority_votes = None, 0

    is_correct = majority_ans is not None and majority_ans == ground_truth
    # Denominator is the number of chains generated, including unparsable ones.
    total_chains = len(final_answers)
    consistency = majority_votes / total_chains if total_chains else 0
    avg_length = np.mean(chain_lengths)
    return {'accuracy': is_correct, 'consistency': consistency, 'chain_length': avg_length}


In [None]:
def run_reflection_only(example):  # Single chain + reflection filter
    """Score one example with a single chain gated by the reflection verdict.

    The chain counts as correct only if its answer matches the reference AND
    the reflection passes the gate (hallucination score below 4 and
    `logic_pass` is True).

    Args:
        example: Mapping with 'question' and 'answer' entries.

    Returns:
        Dict with 'accuracy' (bool), 'reflection_pass_rate' (1 or 0),
        'hallucination_score' and 'chain_length'
        (`len(tokenizer.encode(chain))`).
    """
    question = example['question']
    ground_truth = extract_answer(example['answer'])
    chain = generate_cot(question, temp=0.7)
    reflection = reflect_on_chain(chain)
    ans = extract_answer(chain)

    # Read the verdict defensively: it is parsed from model output, so a field
    # may be missing or have the wrong type.
    score = reflection.get('hallucination_score')
    if isinstance(score, bool) or not isinstance(score, (int, float)):
        score = 5  # same neutral fallback reflect_on_chain uses on parse failure
    passed = score < 4 and reflection.get('logic_pass') is True

    # Computed once and kept strictly boolean so np.mean() can aggregate it.
    filtered_correct = passed and ans is not None and ans == ground_truth
    return {'accuracy': filtered_correct, 'reflection_pass_rate': 1 if passed else 0,
            'hallucination_score': score, 'chain_length': len(tokenizer.encode(chain))}


In [None]:
def run_full(example):  # Multiple chains + reflection + filtered voting
    """Score one example with one chain per temperature, reflection and voting.

    Two votes are taken over the extracted answers (unparsable answers, i.e.
    None, never vote):
      * unfiltered: all chains;
      * filtered: only chains whose reflection passes the gate (hallucination
        score below 4 and `logic_pass` is True). If no chain passes, the
        unfiltered majority is used instead.

    Args:
        example: Mapping with 'question' and 'answer' entries.

    Returns:
        Dict with 'accuracy' (unfiltered vote correct), 'filtered_accuracy',
        'hallucination_reduction' (filtered minus unfiltered correctness, i.e.
        -1, 0 or 1), 'avg_hallucination', 'reflection_pass_rate' (share of
        chains passing the gate, the same gate run_reflection_only reports),
        'consistency' (share of chains voting for the unfiltered majority) and
        'chain_length' (mean of `len(tokenizer.encode(chain))`).
    """
    question = example['question']
    ground_truth = extract_answer(example['answer'])
    final_answers = []
    hall_scores = []
    gate_passed = []
    chain_lengths = []
    for temp in temperatures:
        chain = generate_cot(question, temp)
        reflection = reflect_on_chain(chain)

        # The verdict is parsed from model output; tolerate missing/mistyped fields.
        score = reflection.get('hallucination_score')
        if isinstance(score, bool) or not isinstance(score, (int, float)):
            score = 5  # same neutral fallback reflect_on_chain uses on parse failure

        final_answers.append(extract_answer(chain))
        hall_scores.append(score)
        gate_passed.append(score < 4 and reflection.get('logic_pass') is True)
        chain_lengths.append(len(tokenizer.encode(chain)))

    total_chains = len(final_answers)

    # Unfiltered vote
    vote_count = Counter(ans for ans in final_answers if ans is not None)
    if vote_count:
        majority_ans, majority_votes = vote_count.most_common(1)[0]
    else:
        majority_ans, majority_votes = None, 0
    unfiltered_correct = majority_ans is not None and majority_ans == ground_truth

    # Filtered vote
    filtered_answers = [ans for ans, ok in zip(final_answers, gate_passed) if ok and ans is not None]
    filtered_vote = Counter(filtered_answers).most_common(1)[0][0] if filtered_answers else majority_ans
    filtered_correct = filtered_vote is not None and filtered_vote == ground_truth

    pass_rate = sum(gate_passed) / total_chains if total_chains else 0
    consistency = majority_votes / total_chains if total_chains else 0
    # Both operands are plain bools, so this is the per-example accuracy change
    # caused by filtering.
    reduction = int(filtered_correct) - int(unfiltered_correct)

    return {'accuracy': unfiltered_correct, 'filtered_accuracy': filtered_correct, 'hallucination_reduction': reduction,
            'avg_hallucination': np.mean(hall_scores), 'reflection_pass_rate': pass_rate, 'consistency': consistency,
            'chain_length': np.mean(chain_lengths)}


In [None]:
# Experimental conditions: name -> function that scores a single example.
conditions = {'baseline': run_baseline,
    'self_consistency': run_self_consistency,
    'reflection_only': run_reflection_only,
    'full': run_full
}

# Run every condition on every example and keep the per-example metric dicts.
experiment_results = {cond: [] for cond in conditions}
for example in tqdm(dataset):
    for cond, func in conditions.items():
        metrics = func(example)
        experiment_results[cond].append(metrics)


def _mean_or_none(values):
    """Mean of the non-None entries of `values` as a float, or None if there are none."""
    kept = [float(v) for v in values if v is not None]
    return float(np.mean(kept)) if kept else None


# Aggregate and compare
summary = {}
# Per-example correctness as 0.0/1.0 so it can be averaged and differenced.
per_example_acc = {cond: [float(bool(r['accuracy'])) for r in results]
                   for cond, results in experiment_results.items()}

for cond, results in experiment_results.items():
    summary[cond] = {
        'accuracy': float(np.mean(per_example_acc[cond])),
        'std_acc': float(np.std(per_example_acc[cond])),
        # Only run_full reports a filtered vote; None for the other conditions.
        'filtered_accuracy': _mean_or_none(r.get('filtered_accuracy') for r in results),
        'avg_consistency': _mean_or_none(r.get('consistency') for r in results),
        'avg_reflection_pass': _mean_or_none(r.get('reflection_pass_rate') for r in results),
        # The metric keys are 'avg_hallucination' / 'hallucination_score' (the
        # former check for a key named 'hallucination' never matched). None
        # scores, as reported by the baseline, are skipped.
        'avg_hallucination': _mean_or_none(r.get('avg_hallucination', r.get('hallucination_score')) for r in results),
        'hallucination_reduction': float(np.mean([r.get('hallucination_reduction', 0) for r in results])),
        'avg_chain_length': float(np.mean([r['chain_length'] for r in results]))
    }

# Statistical tests (e.g., baseline vs full)
# Inputs are float arrays: NumPy does not support `-` between boolean arrays,
# and a paired test works on the per-example differences.
baseline_acc = np.asarray(per_example_acc['baseline'], dtype=float)
full_acc = np.asarray(per_example_acc['full'], dtype=float)
t_stat, p_value = stats.ttest_rel(baseline_acc, full_acc)
summary['stats_vs_baseline_to_full'] = {'t_stat': float(t_stat), 'p_value': float(p_value)}

# 'accuracy' of the full condition is its unfiltered vote; also test the
# reflection-filtered vote against the baseline.
full_filtered_acc = np.asarray([float(bool(r['filtered_accuracy'])) for r in experiment_results['full']], dtype=float)
t_stat_filtered, p_value_filtered = stats.ttest_rel(baseline_acc, full_filtered_acc)
summary['stats_vs_baseline_to_full_filtered'] = {'t_stat': float(t_stat_filtered), 'p_value': float(p_value_filtered)}

# Present as table
df = pd.DataFrame(summary).T
print(df)

# Save
with open('experiment_summary.json', 'w') as f:
    json.dump(summary, f)