# Topological features of attention maps for detecting hallucinated code generations

In [None]:
# !pip install --upgrade pip setuptools wheel 
# !pip install -U bitsandbytes

In [None]:
# !cmake --version

In [3]:
# !pip install cmake 
# !pip install human_eval ripser
# !pip install lightgbm
# !pip install accelerate

In [4]:
#!/usr/bin/env python
# coding: utf-8
import os
import sys
import json
import numpy as np
import torch
# import xgboost as xgb
from tqdm import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    set_seed
)
from human_eval.data import read_problems, write_jsonl
from human_eval.execution import check_correctness
from ripser import ripser

import lightgbm as lgb


In [11]:

# Configuration - edit these values to target other models/benchmarks
MODEL_NAME = "codellama/CodeLlama-7b-hf"  # Can switch to other models
BENCHMARK = "human_eval"  # Options: "human_eval" or "mbpp"
TEMPERATURE = 0.8
TOP_P = 0.95
MAX_NEW_TOKENS = 256
NUM_SHOTS = 0  # Zero-shot setting (HumanEval standard)
NUM_SAMPLES = 5  # Generations per problem for classifier training
# One sampling seed per generation. Derived from NUM_SAMPLES so the two
# settings cannot drift apart; with NUM_SAMPLES = 5 this is [0, 1, 2, 3, 4].
SEEDS = list(range(NUM_SAMPLES))

# Hugging Face cache directory and tokenizer threading.
# These variables do not sandbox code execution; they only configure the
# Hugging Face libraries.
# NOTE(review): transformers is imported in an earlier cell, so HF_HOME may
# already have been read before this assignment - confirm it takes effect.
os.environ["HF_HOME"] = "/tmp"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

def setup_environment():
    """Initialize model, tokenizer, and benchmark problems.

    Reads the module-level ``MODEL_NAME`` and ``BENCHMARK`` settings.

    Returns:
        tuple: ``(model, tokenizer, problems)`` where ``model`` is the causal LM
        in eval mode, ``tokenizer`` uses EOS as its padding token, and
        ``problems`` is whatever ``read_problems()`` returns for HumanEval.

    Raises:
        NotImplementedError: if ``BENCHMARK`` is ``"mbpp"`` (not wired up yet).
        ValueError: if ``BENCHMARK`` is any other unsupported value.
    """
    # Fail fast: reject an unusable BENCHMARK setting before spending time and
    # GPU memory on loading the model.
    if BENCHMARK == "mbpp":
        # MBPP loading would go here (simplified for this example)
        raise NotImplementedError("MBPP support requires additional setup")
    if BENCHMARK != "human_eval":
        raise ValueError(f"Unsupported benchmark: {BENCHMARK}")

    print(f"Loading model: {MODEL_NAME}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # The tokenizer is given EOS as its pad token so padding is always defined.
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="cuda:0",
        torch_dtype=torch.float16,
        # Attention weights are only materialised by the eager attention
        # implementation; request it explicitly instead of relying on a
        # library-side fallback from the fused kernels.
        attn_implementation="eager",
        # load_in_8bit=True,  # Add 8-bit quantization
        output_attentions=True,
        return_dict_in_generate=True
    ).eval()

    print(f"Loading benchmark: {BENCHMARK}")
    problems = read_problems()

    return model, tokenizer, problems
def generate_with_attention(model, tokenizer, prompt, seed):
    """Sample one completion and rebuild a single [T, T] attention matrix.

    Args:
        model: causal LM whose ``generate`` returns attentions and scores.
        tokenizer: tokenizer matching ``model``.
        prompt: prompt text to complete.
        seed: value passed to ``set_seed`` before sampling.

    Returns:
        dict with keys:
            "full_sequence": CPU tensor of prompt + generated token ids.
            "generated_text": decoded completion (special tokens skipped).
            "attention_matrix": CPU float32 tensor of shape [T, T], averaged
                over layers and heads, T = length of the full sequence.
            "token_probs": list with the probability of each generated token
                under the returned per-step scores.
            "prompt_len": number of prompt tokens.
    """
    set_seed(seed)

    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = encoded["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=True,
            temperature=TEMPERATURE,
            top_p=TOP_P,
            num_return_sequences=1,
            output_attentions=True,
            output_scores=True,
            return_dict_in_generate=True
        )

    def _mean_over_layers_and_heads(per_layer):
        # Each layer tensor is assumed to be [1, heads, Q, K]; drop the batch
        # dim, stack to [layers, heads, Q, K] and average the first two axes.
        stacked = torch.stack([layer.squeeze(0) for layer in per_layer], dim=0)
        return stacked.mean(dim=(0, 1))

    full_seq = outputs.sequences[0]
    generated_ids = full_seq[prompt_len:]
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
    total_len = len(full_seq)

    # Full attention matrix, zero wherever no attention row is written below.
    # NOTE(review): presumably the last generated token is never fed back to
    # the model, so its row stays all-zero - confirm.
    full_attention = torch.zeros(
        (total_len, total_len), device=model.device, dtype=torch.float32
    )

    # Step 0: the prompt forward pass fills the top-left [P, P] block.
    full_attention[:prompt_len, :prompt_len] = _mean_over_layers_and_heads(
        outputs.attentions[0]
    )

    # Steps 1..: every later step contributes one row, starting at row P.
    for offset, step_attentions in enumerate(outputs.attentions[1:]):
        row = prompt_len + offset
        averaged = _mean_over_layers_and_heads(step_attentions)  # [1, row + 1]
        full_attention[row, :row + 1] = averaged[0, :]

    # Probability of each sampled token under that step's scores.
    probs = [
        torch.softmax(logits[0], dim=-1)[generated_ids[i]].item()
        for i, logits in enumerate(outputs.scores)
    ]

    return {
        "full_sequence": full_seq.cpu(),
        "generated_text": generated_text,
        "attention_matrix": full_attention.cpu().float(),  # [T, T]
        "token_probs": probs,
        "prompt_len": prompt_len
    }


def build_distance_matrix(attention, prompt_len):
    """Convert an attention matrix to a symmetrized distance matrix.

    The distance between tokens i and j is ``1 - max(a[i, j], a[j, i])`` and
    the diagonal is zero.

    Args:
        attention: square 2-D attention matrix; a torch tensor or anything
            ``np.asarray`` can convert.
        prompt_len: number of leading positions that belong to the prompt.

    Returns:
        tuple: ``(dist, prompt_mask, gen_mask)`` where ``dist`` is an
        ``(n, n)`` float64 array, ``prompt_mask`` is a boolean array that is
        True for the first ``prompt_len`` positions, and ``gen_mask`` is its
        complement.

    Raises:
        ValueError: if ``attention`` is not a square 2-D matrix.
    """
    # Torch tensors are moved to host memory first; widening to float64 is
    # exact and matches the precision of the returned distances.
    if hasattr(attention, "detach"):
        attention = attention.detach().cpu().double().numpy()
    attn = np.asarray(attention, dtype=np.float64)

    if attn.ndim != 2 or attn.shape[0] != attn.shape[1]:
        raise ValueError(
            f"attention must be a square 2-D matrix, got shape {attn.shape}"
        )
    n = attn.shape[0]

    # Symmetrize attention: w_{i,j} = 1 - max(a_{i,j}, a_{j,i}), computed for
    # all pairs at once instead of a Python loop over n * n cells.
    dist = 1.0 - np.maximum(attn, attn.T)

    # Zero diagonal
    np.fill_diagonal(dist, 0)

    # Create masks for prompt (P) and generated (G) tokens
    prompt_mask = np.zeros(n, dtype=bool)
    prompt_mask[:prompt_len] = True
    gen_mask = ~prompt_mask

    return dist, prompt_mask, gen_mask


def compute_topological_features(dist, prompt_mask, gen_mask):
    """Summarize the persistence diagrams of a token distance matrix.

    Args:
        dist: square distance matrix handed to ``ripser``.
        prompt_mask: boolean array marking prompt positions.
        gen_mask: boolean array marking generated positions.

    Returns:
        dict with the largest H0 persistence, the summed H1 persistence, the
        number of H1 bars, the prompt/generated token counts, and the two
        persistence values divided by the generated and prompt counts
        respectively (0 when the corresponding count is 0).
    """
    prompt_count = prompt_mask.sum()
    gen_count = gen_mask.sum()

    # Persistent homology up to dimension 1, treating `dist` as distances.
    diagrams = ripser(dist, distance_matrix=True, maxdim=1)['dgms']
    h0_bars = diagrams[0][:-1]  # drop the last H0 bar (the infinite one)
    h1_bars = diagrams[1]

    if len(h0_bars) > 0:
        h0_max = np.max(h0_bars[:, 1] - h0_bars[:, 0])
    else:
        h0_max = 0

    if len(h1_bars) > 0:
        h1_total = np.sum(h1_bars[:, 1] - h1_bars[:, 0])
    else:
        h1_total = 0

    features = {
        'h0_max_persistence': h0_max,
        'h1_total_persistence': h1_total,
        'num_h1_bars': len(h1_bars),
        'prompt_size': prompt_count,
        'gen_size': gen_count
    }

    # Size-normalized variants: H0 by generated count, H1 by prompt count.
    features['h0_max_persistence_norm'] = h0_max / gen_count if gen_count > 0 else 0
    features['h1_total_persistence_norm'] = (
        h1_total / prompt_count if prompt_count > 0 else 0
    )

    return features


def extract_attention_features(attention, prompt_len):
    """Build the per-generation feature dict from an attention matrix.

    Args:
        attention: square attention matrix (torch tensor) over the full
            sequence.
        prompt_len: number of prompt tokens at the start of the sequence.

    Returns:
        dict with the two normalized topological features, the mean diagonal
        attention of the prompt and generated blocks, the prompt and generated
        lengths, and the number of H1 bars.
    """
    dist, prompt_mask, gen_mask = build_distance_matrix(attention, prompt_len)
    topo = compute_topological_features(dist, prompt_mask, gen_mask)

    # Diagonal blocks: prompt-to-prompt and generated-to-generated attention.
    prompt_block = attention[:prompt_len, :prompt_len]
    gen_block = attention[prompt_len:, prompt_len:]

    if prompt_len > 0:
        prompt_self = torch.diagonal(prompt_block).mean().item()
    else:
        prompt_self = 0

    if gen_block.shape[0] > 0:
        gen_self = torch.diagonal(gen_block).mean().item()
    else:
        gen_self = 0

    return {
        # Topological features (normalized)
        'mtd_h0_norm': topo['h0_max_persistence_norm'],
        'mtd_h1_norm': topo['h1_total_persistence_norm'],

        # Attention statistics
        'prompt_self_attn': prompt_self,
        'gen_self_attn': gen_self,

        # Basic features
        'prompt_len': prompt_len,
        'gen_len': attention.shape[0] - prompt_len,
        'h1_num_bars': topo['num_h1_bars']
    }
def detect_hallucination(generated_code, problem):
    """Label a completion as hallucinated when it does not pass the tests.

    Args:
        generated_code: completion text to evaluate.
        problem: the full problem dict handed to ``check_correctness``.

    Returns:
        bool: True when the completion fails, or when evaluating it raises.
    """
    try:
        outcome = check_correctness(
            problem=problem,
            completion=generated_code,
            timeout=3.0,
            completion_id="temp"
        )
        failed = not outcome["passed"]
    except Exception as e:
        # Any error while evaluating counts as a hallucination.
        print(f"Error in execution for problem: {e}")
        return True
    return failed


def collect_training_data(model, tokenizer, problems):
    """Generate completions and pair their features with pass/fail labels.

    For every problem, one completion is sampled per seed in
    ``SEEDS[:NUM_SAMPLES]``.

    Args:
        model: causal LM used for generation.
        tokenizer: tokenizer matching ``model``.
        problems: mapping of task id to problem dict (each with a "prompt").

    Returns:
        list of dicts with keys "features" (feature dict, also carrying
        'task_id' and 'seed'), "label" (1 = hallucinated, 0 = correct) and
        "code" (the generated text).
    """
    records = []
    seeds = SEEDS[:NUM_SAMPLES]

    for task_id, problem in tqdm(problems.items(), desc="Collecting data"):
        prompt = problem["prompt"]

        for seed in seeds:
            generation = generate_with_attention(model, tokenizer, prompt, seed)
            code = generation["generated_text"]

            features = extract_attention_features(
                generation["attention_matrix"],
                generation["prompt_len"]
            )

            # Mean log-probability of the sampled tokens; probabilities are
            # floored at 1e-10 so the logarithm stays finite.
            floored = [max(p, 1e-10) for p in generation["token_probs"]]
            features['mean_log_prob'] = np.mean(np.log(floored))
            features['task_id'] = task_id
            features['seed'] = seed

            hallucinated = detect_hallucination(code, problem)

            records.append({
                "features": features,
                "label": int(hallucinated),  # 1 = hallucinated, 0 = correct
                "code": code
            })

    return records
import pandas as pd 
def train_classifier(data):
    """Train a LightGBM classifier on the collected features.

    Args:
        data: iterable of dicts with a "features" dict and an integer "label"
            (1 = hallucinated, 0 = correct). The 'task_id' and 'seed' entries
            of "features" are bookkeeping and are not used as model inputs.

    Returns:
        tuple: ``(clf, feature_names)`` - the fitted ``LGBMClassifier`` and the
        sorted list of feature names, in the column order used for fitting.

    Raises:
        ValueError: if ``data`` is empty.
    """
    data = list(data)
    if not data:
        raise ValueError("Cannot train classifier: no training data provided")

    # Column order is fixed by the sorted feature names of the first item.
    feature_names = sorted(
        k for k in data[0]["features"].keys() if k not in ["task_id", "seed"]
    )

    # Fit on a DataFrame with named columns so that the named single-row
    # frames used at inference time line up with what the model was fitted on.
    X = pd.DataFrame(
        [[item["features"][k] for k in feature_names] for item in data],
        columns=feature_names,
        dtype=float,
    )
    y = np.array([item["label"] for item in data])

    # Split data (simple positional holdout: first 80% train, last 20% test).
    # NOTE(review): the split ignores task_id, so samples of one task can land
    # on both sides when items are grouped per task - confirm this is intended.
    split_idx = int(0.8 * len(X))
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]

    clf = lgb.LGBMClassifier(
        objective='binary',
        random_state=42,
        verbose=-1  # Silences LightGBM output, remove if you want to see training logs
    )
    clf.fit(X_train, y_train)

    # Evaluate; a very small dataset can leave the holdout part empty, in
    # which case there is no test accuracy to report.
    train_acc = clf.score(X_train, y_train)
    test_acc = clf.score(X_test, y_test) if len(X_test) > 0 else float("nan")

    print(f"Classifier trained. Train accuracy: {train_acc:.4f}, Test accuracy: {test_acc:.4f}")
    return clf, feature_names


def evaluate_pass_at_k(model, tokenizer, problems, classifier, feature_names):
    """Evaluate pass@1 with hallucination filtering.

    For each problem, one candidate is generated per seed in ``SEEDS``. Each
    candidate is scored as ``(1 - P(hallucination)) + mean token log-prob`` and
    only the best-scoring candidate is executed against the tests.

    Args:
        model: causal LM used for generation.
        tokenizer: tokenizer matching ``model``.
        problems: mapping of task id to problem dict; must be non-empty and
            provide "task_id", "prompt" and "test".
        classifier: fitted classifier exposing ``predict_proba``.
        feature_names: feature column order the classifier was fitted with.

    Returns:
        tuple: ``(results, pass_at_1)`` where ``results`` is a list of dicts
        with "task_id", "completion", "passed" and "hallucination_prob" (the
        classifier's probability for the selected candidate), and
        ``pass_at_1`` is the fraction of problems whose selected candidate
        passed.

    Raises:
        ValueError: if ``problems`` is empty or lacks required keys.
    """
    if not problems:
        raise ValueError("No problems to evaluate")

    # Sanity-check the problem schema on one sample before the long run.
    sample_problem = next(iter(problems.values()))
    required_keys = ["task_id", "prompt", "test"]
    missing_keys = [k for k in required_keys if k not in sample_problem]
    if missing_keys:
        raise ValueError(f"Problem dictionary missing required keys: {missing_keys}")

    results = []
    total_correct = 0

    # NOTE(review): the candidates use the same SEEDS as training-data
    # collection; if the classifier was trained on these same problems the
    # reported pass@1 is optimistic - confirm the intended protocol.
    for task_id, problem in tqdm(problems.items(), desc="Evaluating pass@1"):
        prompt = problem["prompt"]
        best_code = None
        best_score = -np.inf
        best_halluc_prob = None

        # Generate multiple candidates
        for seed in SEEDS:
            gen_result = generate_with_attention(model, tokenizer, prompt, seed)
            attn_features = extract_attention_features(
                gen_result["attention_matrix"],
                gen_result["prompt_len"]
            )

            # Mean token log-probability, floored at 1e-10 exactly as in
            # collect_training_data so the feature matches what the classifier
            # was trained on.
            probs_array = np.asarray(gen_result["token_probs"], dtype=float)
            mean_log_prob = np.mean(np.log(np.maximum(probs_array, 1e-10)))
            attn_features['mean_log_prob'] = mean_log_prob

            # Assemble the feature row in the classifier's column order.
            feature_values = []
            for k in feature_names:
                if k in attn_features:
                    feature_values.append(attn_features[k])
                else:
                    # Use default value for missing features
                    feature_values.append(0.0 if "norm" in k or "prob" in k else 1.0)
            X = pd.DataFrame([feature_values], columns=feature_names)
            halluc_prob = classifier.predict_proba(X)[0][1]

            # Score = confidence in non-hallucination + log probability
            score = (1 - halluc_prob) + mean_log_prob

            # Always keep the first candidate so that a non-comparable (NaN)
            # score cannot leave the problem without a completion.
            if best_code is None or score > best_score:
                best_score = score
                best_code = gen_result["generated_text"]
                best_halluc_prob = halluc_prob

        # Execute best candidate
        result = check_correctness(
            problem=problem,
            completion=best_code,
            timeout=3.0,
            completion_id=task_id
        )

        is_correct = result["passed"]
        total_correct += int(is_correct)

        results.append({
            "task_id": task_id,
            "completion": best_code,
            "passed": is_correct,
            # Probability recorded for the selected candidate itself, not
            # reconstructed from whichever candidate was generated last.
            "hallucination_prob": (
                float(best_halluc_prob) if best_halluc_prob is not None else None
            )
        })

    pass_at_1 = total_correct / len(problems)
    print(f"Final pass@1 after hallucination filtering: {pass_at_1:.4f}")
    return results, pass_at_1


In [6]:

# Setup: load the model, tokenizer and benchmark problems.
# The `main()` wrapper and the `if __name__ == "__main__"` guard are left
# disabled, so this statement runs at cell level and binds three
# module-level names.
model, tokenizer, problems = setup_environment()

Loading model: codellama/CodeLlama-7b-hf


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading benchmark: human_eval


In [None]:

# # Step 1: Collect training data with hallucination labels
# print("\n=== Collecting training data ===")
# training_data = collect_training_data(model, tokenizer, problems)

# # Save raw data for analysis
# with open("training_data.json", "w") as f:
#     json.dump([{**item, "features": {k: float(v) if isinstance(v, np.float32) else v 
#                                     for k,v in item["features"].items()}} 
#                 for item in training_data], f)



=== Collecting training data ===


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Collecting data:   1%|▏                       | 1/164 [00:48<2:11:13, 48.30s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Collecting data:   1%|▎                       | 2/164 [01:37<2:12:25, 49.05s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` t

In [12]:
import joblib

TRAINING_DATA_PATH = "training_data.json"


def _json_default(value):
    """Convert NumPy scalars to built-in Python numbers for ``json.dump``."""
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Step 1: Reuse cached training data when present; otherwise collect it (slow)
# and cache it, so this cell also works on a fresh kernel / fresh checkout.
if os.path.exists(TRAINING_DATA_PATH):
    with open(TRAINING_DATA_PATH, "r") as f:
        training_data = json.load(f)
    print(f"Loaded {len(training_data)} training samples")
else:
    print("\n=== Collecting training data ===")
    training_data = collect_training_data(model, tokenizer, problems)
    with open(TRAINING_DATA_PATH, "w") as f:
        json.dump(training_data, f, default=_json_default)
    print(f"Collected {len(training_data)} training samples")

# Step 2: Train hallucination classifier
print("\n=== Training hallucination classifier ===")
classifier, feature_names = train_classifier(training_data)

# Persist the classifier together with the feature order it expects.
joblib.dump({
    "classifier": classifier,
    "feature_names": feature_names
}, "hallucination_detector.joblib")

Loaded 820 training samples

=== Training hallucination classifier ===
Classifier trained. Train accuracy: 1.0000, Test accuracy: 0.9817




['hallucination_detector.joblib']

In [None]:

# Step 3: Pick the best-scoring candidate per problem and measure pass@1
print("\n=== Evaluating final performance ===")
results, pass_at_1 = evaluate_pass_at_k(
    model, tokenizer, problems, classifier, feature_names
)

# Write one JSON line per problem
write_jsonl("samples.jsonl", results)
print(f"Results saved to samples.jsonl. Final pass@1: {pass_at_1:.4f}")


=== Evaluating final performance ===


Evaluating pass@1:   0%|                                | 0/164 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating pass@1:   1%|▏                     | 1/164 [00:45<2:03:32, 45.47s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating pass@1:   1%|▎                     | 2/164 [01:34<2:08:18, 47.52s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad