In [1]:
import os
import json
import random
from typing import List, Dict

# ===================================================================
# Phase 2, Step 1: Understanding and Preparing the Data    
# ===================================================================

# --- 1. Download the dataset from GitHub ---
# Use '!' to execute shell commands in Jupyter Notebook
# Download only if the folder does not exist to avoid duplication
repo_url = "https://github.com/Narabzad/prompt-sensitivity.git"
repo_dir = "prompt-sensitivity"

if not os.path.exists(repo_dir):
    print(f"Folder '{repo_dir}' does not exist. Downloading from GitHub...")
    !git clone {repo_url}
    print("✅ Dataset downloaded.")
else:
    print(f"✅ Folder '{repo_dir}' already exists. No need to download.")


# --- 2. Write data loading and sampling function (v2: correct grouping version) ---
def load_sensitivity_dataset(file_path: str, sample_size: int) -> List[List[str]]:
    """
    Read a .jsonl file, group prompts by 'reference_prompt_id', and return a
    reproducible random sample of those groups.

    Lines that are not valid JSON, that do not decode to a JSON object, or that
    lack a truthy 'reference_prompt_id' / 'prompt' value are skipped.

    Sampling uses a private ``random.Random(42)`` instance, so the selection is
    the same on every call and the module-level ``random`` state used by the
    rest of the notebook is left untouched.

    :param file_path: Path to the .jsonl file.
    :param sample_size: Number of prompt groups to sample. Values larger than
        the number of groups are clamped to it; negative values are treated as 0.
    :return: A list where each element is a list of prompt variants. Empty if
        the file is missing or contains no usable records.
    """
    if not os.path.exists(file_path):
        print(f"❌ Error: File not found {file_path}")
        return []

    print(f"\nReading and grouping data from '{file_path}'...")

    # Key: reference_prompt_id; value: prompt variants in file order.
    grouped_prompts: Dict[str, List[str]] = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                # Skip lines with format errors (including blank lines).
                continue

            # Valid JSON that is not an object (e.g. a list or a number) has no
            # .get(); skip it instead of crashing the whole load.
            if not isinstance(data, dict):
                continue

            ref_id = data.get('reference_prompt_id')
            prompt_text = data.get('prompt')
            if ref_id and prompt_text:
                grouped_prompts.setdefault(ref_id, []).append(prompt_text)

    if not grouped_prompts:
        print("⚠️ Warning: Failed to build any prompt groups from the file.")
        return []

    print(f"Found {len(grouped_prompts)} unique prompt groups in the file.")

    # Sample from the grouped prompts; clamp so random.sample never sees an
    # out-of-range size.
    all_group_ids = list(grouped_prompts.keys())
    sample_size = max(0, min(sample_size, len(all_group_ids)))
    print(f"Randomly sampling {sample_size} groups.")

    # A dedicated generator seeded with 42 yields the same selection as seeding
    # the global RNG, without resetting global random state as a side effect.
    rng = random.Random(42)
    sampled_group_ids = rng.sample(all_group_ids, sample_size)

    # Build the final prompt list from the sampled group IDs
    final_prompt_groups = [grouped_prompts[gid] for gid in sampled_group_ids]

    print(f"✅ Successfully loaded and sampled {len(final_prompt_groups)} prompt groups.")
    return final_prompt_groups


# --- 3. Execute and inspect the data ---
# This time we increase the sample size to 50 groups
SAMPLE_SIZE = 50

# Location of the HotpotQA test split inside the cloned repository.
hotpotqa_test_path = os.path.join(
    repo_dir,
    "prompt_set",
    "hotpotqa",
    "hotpotqa_mistral_dataset_test.jsonl",
)
hotpotqa_prompt_groups = load_sensitivity_dataset(
    hotpotqa_test_path,
    SAMPLE_SIZE,
)

# Show every variant of the first sampled group as a quick sanity check;
# nothing is printed when the loader returned no groups.
if len(hotpotqa_prompt_groups) > 0:
    print("\n--- Sample Preview (All Variants of the First Prompt Group) ---")
    for i, prompt in enumerate(hotpotqa_prompt_groups[0]):
        print(f"  Variant {i+1}: {prompt}")

✅ Folder 'prompt-sensitivity' already exists. No need to download.

Reading and grouping data from 'prompt-sensitivity/prompt_set/hotpotqa/hotpotqa_mistral_dataset_test.jsonl'...
Found 3441 unique prompt groups in the file.
Randomly sampling 50 groups.
✅ Successfully loaded and sampled 50 prompt groups.

--- Sample Preview (All Variants of the First Prompt Group) ---
  Variant 1: Can you identify a film released in 2017 that features a character with the first name of the sea and a surname that matches the body part being referred to
  Variant 2: what was this actor's final appearance in a cinematic work, which occurred during that same year?
What is the title of a 2017 movie that shares its name with a specific body part
  Variant 3: what role was last performed by Bill Paxton before his passing?
Can you find a film from 2017 whose title matches the word for an indentation below one's belly button
  Variant 4: in this film, which actor gave their final screen appearance?
What is the n

In [2]:
# Dataset JSON debug block: check the actual structure of a single line
import json

# ===================================================================
# Debug block: check the actual structure of a single line
# ===================================================================
file_path = "prompt-sensitivity/prompt_set/hotpotqa/hotpotqa_mistral_dataset_test.jsonl"
print(f"Checking file: {file_path}\n")

try:
    # Read with an explicit UTF-8 encoding, the same one load_sensitivity_dataset
    # uses for this file, so the result does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        # Read only the first line of the file for analysis
        first_line = f.readline()

    print("--- Raw text of the first line in the file ---")
    print(first_line)

    print("\n--- Keys after parsing the line as JSON ---")
    # Parse the single line into a Python dictionary
    data = json.loads(first_line)
    # Print all keys in this dictionary
    print(data.keys())

except Exception as e:
    # Debug helper: report any failure (missing file, bad JSON, non-object
    # record) instead of aborting the notebook run.
    print(f"\n❌ Error occurred: {e}")

Checking file: prompt-sensitivity/prompt_set/hotpotqa/hotpotqa_mistral_dataset_test.jsonl

--- Raw text of the first line in the file ---
{"prompt_id": "alt_hpqa_f6a03027_1", "reference_prompt_id": "hpqa_f6a03027", "prompt": "In the same metropolitan area as the Taj Mahal", "expected_answer": ["Delhi"], "model_answers": {"llama3.1:8b": "Agra India.", "mistral-nemo:latest": "Agra"}, "llm_response": "Agra", "exact_match": 0, "model": "mistral-nemo"}


--- Keys after parsing the line as JSON ---
dict_keys(['prompt_id', 'reference_prompt_id', 'prompt', 'expected_answer', 'model_answers', 'llm_response', 'exact_match', 'model'])


In [3]:
import os
import json
import random
import numpy as np
import time  # <--- 【Fix 1】: Added import time
from typing import List, Dict
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity
from predictionguard import PredictionGuard
from tqdm.auto import tqdm
from IPython.display import display, Markdown
import pandas as pd

# ===================================================================
# Phase 2, Full Pipeline: Setup, Testing, and Scoring (Revised)
# ===================================================================

# --- 1. Configuration ---
SENSITIVITY_CONFIG = dict(
    # Dataset file and number of prompt groups handed to load_sensitivity_dataset.
    DATA_FILE_PATH="prompt-sensitivity/prompt_set/hotpotqa/hotpotqa_mistral_dataset_test.jsonl",
    SAMPLE_SIZE=50,
    # Chat models evaluated one after another in the main loop.
    CHAT_MODELS_TO_TEST=[
        "DeepSeek-R1-Distill-Qwen-32B",
        "Hermes-3-Llama-3.1-70B",
        "Hermes-3-Llama-3.1-8B",
        "neural-chat-7b-v3-3",
    ],
    # Model used to embed responses before computing cosine similarity.
    EMBEDDING_MODEL_ID="bge-m3",
    # Passed as max_completion_tokens on every chat completion request.
    API_MAX_TOKENS=128,
    # Pause (passed to time.sleep) after every chat request.
    API_SLEEP_INTERVAL=0.6,
)

# --- 2. Data-loading function (unchanged) ---
def load_sensitivity_dataset(file_path: str, sample_size: int) -> List[List[str]]:
    """
    Read a .jsonl file, group prompts by 'reference_prompt_id', and return a
    reproducible random sample of those groups.

    Lines that are not valid JSON, that do not decode to a JSON object, or that
    lack a truthy 'reference_prompt_id' / 'prompt' value are skipped.

    Sampling uses a private ``random.Random(42)`` instance, so the selection is
    the same on every call and the module-level ``random`` state is untouched.

    :param file_path: Path to the .jsonl file.
    :param sample_size: Number of prompt groups to sample. Values larger than
        the number of groups are clamped to it; negative values are treated as 0.
    :return: A list where each element is a list of prompt variants. Empty if
        the file is missing or contains no usable records.
    """
    if not os.path.exists(file_path):
        print(f"❌ Error: File not found {file_path}")
        return []

    print(f"\nReading and grouping data from '{file_path}'...")

    # Key: reference_prompt_id; value: prompt variants in file order.
    grouped_prompts: Dict[str, List[str]] = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                # Malformed (or blank) line: skip it.
                continue

            # Valid JSON that is not an object has no .get(); skip it instead of
            # letting an AttributeError abort the whole load.
            if not isinstance(data, dict):
                continue

            ref_id, prompt_text = data.get('reference_prompt_id'), data.get('prompt')
            if ref_id and prompt_text:
                grouped_prompts.setdefault(ref_id, []).append(prompt_text)

    if not grouped_prompts:
        # Report the empty result as a problem rather than as a successful load.
        print("⚠️ Warning: Failed to build any prompt groups from the file.")
        return []

    print(f"Found {len(grouped_prompts)} distinct prompt groups in the file.")
    all_group_ids = list(grouped_prompts.keys())
    # Clamp so random.sample never receives an out-of-range size.
    sample_size = max(0, min(sample_size, len(all_group_ids)))
    print(f"Randomly sampling {sample_size} groups.")

    # A dedicated generator seeded with 42 yields the same selection as seeding
    # the global RNG, without resetting global random state as a side effect.
    rng = random.Random(42)
    sampled_group_ids = rng.sample(all_group_ids, sample_size)
    final_prompt_groups = [grouped_prompts[gid] for gid in sampled_group_ids]

    print(f"✅ Successfully loaded and sampled {len(final_prompt_groups)} prompt groups.")
    return final_prompt_groups

# --- 3. Prompt Sensitivity Tester Class (v2 Revised) ---
class PromptSensitivityTester:
    """
    Score how consistent a chat model's answers are across the prompt variants
    of a group.

    For every group, each variant is sent to the chat model, the answers are
    embedded, and the mean pairwise cosine similarity of those embeddings is
    taken as the group's score. A model's overall score is the mean over groups.
    """

    def __init__(self, client: "PredictionGuard", config: Dict):
        """
        :param client: API client exposing ``chat.completions.create`` and
            ``embeddings.create``.
        :param config: Settings dict; reads "EMBEDDING_MODEL_ID",
            "API_MAX_TOKENS" and "API_SLEEP_INTERVAL".
        """
        self.client = client
        self.config = config
        self.embedding_model = config["EMBEDDING_MODEL_ID"]

    def _extract_content(self, resp: object) -> str:
        """
        Pull the assistant message text out of a chat completion response.

        Accepts either a dict-shaped response or an object with a ``choices``
        attribute; anything else is stringified. Always returns a string: a
        null ``content`` becomes "" and an unrecognised structure becomes a
        message starting with "ERROR:" (which get_embeddings filters out).
        """
        try:
            if isinstance(resp, dict):
                content = resp.get("choices", [{}])[0].get("message", {}).get("content", "")
            elif hasattr(resp, "choices") and len(resp.choices) > 0:
                content = resp.choices[0].message.content
            else:
                return str(resp)
        except (AttributeError, IndexError, KeyError, TypeError):
            return f"ERROR: Unrecognized response format {str(resp)}"
        # A present-but-null content field would otherwise leak None to callers
        # that expect text.
        return "" if content is None else str(content)

    def get_responses(self, model_id: str, prompt_group: List[str]) -> List[str]:
        """
        Send every prompt in the group to ``model_id`` and collect the answers.

        The returned list is aligned with ``prompt_group``. A failed request
        contributes an "ERROR: ..." placeholder instead of raising, so one bad
        call does not abort the group. The method sleeps for
        config["API_SLEEP_INTERVAL"] after every request, successful or not.
        """
        responses = []
        for prompt in prompt_group:
            try:
                resp = self.client.chat.completions.create(
                    model=model_id,
                    messages=[{"role": "user", "content": prompt}],
                    max_completion_tokens=self.config["API_MAX_TOKENS"],
                )
                content = self._extract_content(resp)
                responses.append(content)
            except Exception as e:
                # Deliberate best-effort: record the failure and keep going.
                error_msg = f"ERROR: {e}"
                print(f"  ⚠️ API request failed: {error_msg}")
                responses.append(error_msg)
            time.sleep(self.config["API_SLEEP_INTERVAL"])
        return responses

    def get_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Embed the usable texts and return one row per usable text.

        Empty/None entries and entries starting with "ERROR:" are dropped
        before the request, so the result can have fewer rows than ``texts``.
        Returns an empty array when nothing is usable, when the response is not
        a mapping with a list under 'data', or when the request fails.
        """
        try:
            valid_texts = [t for t in texts if t and not t.startswith("ERROR:")]
            if not valid_texts:
                return np.array([])

            resp = self.client.embeddings.create(
                model=self.embedding_model,
                input=valid_texts
            )
            # Only a response carrying a list under 'data' is usable.
            if 'data' in resp and isinstance(resp['data'], list):
                return np.array([item['embedding'] for item in resp['data']])
            else:
                print(f"  ⚠️ Embedding API response format unexpected: {resp}")
                return np.array([])

        except Exception as e:
            print(f"  ⚠️ Embedding API request failed: {e}")
            return np.array([])

    def calculate_avg_similarity(self, embeddings: np.ndarray) -> float:
        """
        Mean cosine similarity over all distinct pairs of embedding rows.

        Returns 0.0 when there are fewer than two rows, because no pair exists.
        """
        if embeddings.shape[0] < 2:
            return 0.0
        sim_matrix = cosine_similarity(embeddings)
        # Strict upper triangle: each unordered pair once, no self-similarity.
        indices = np.triu_indices_from(sim_matrix, k=1)
        # Two or more rows always yield at least one pair, so the mean is defined.
        return float(np.mean(sim_matrix[indices]))

    def run_evaluation_for_model(self, model_id: str, all_prompt_groups: List[List[str]]) -> float:
        """
        Evaluate ``model_id`` on every prompt group and return its mean score.

        Groups that end up with fewer than two usable responses are left out of
        the average: they have no pairwise similarity, and counting them as 0.0
        would report request failures as prompt sensitivity. Returns 0.0 when
        no group could be scored.
        """
        model_scores = []
        skipped_groups = 0
        print(f"\n▶️ Starting evaluation for model: {model_id}")
        progress_bar = tqdm(all_prompt_groups, desc=f"  Evaluation Progress ({model_id})")
        for group in progress_bar:
            responses = self.get_responses(model_id, group)
            embeddings = self.get_embeddings(responses)
            # Covers both the empty array and a single surviving response.
            if embeddings.shape[0] < 2:
                skipped_groups += 1
                continue
            model_scores.append(self.calculate_avg_similarity(embeddings))
        overall_avg_score = float(np.mean(model_scores)) if model_scores else 0.0
        if skipped_groups:
            print(f"  ⚠️ Skipped {skipped_groups} group(s) with fewer than two usable responses.")
        print(f"  📊 Average robustness score for {model_id} = {overall_avg_score:.4f}")
        return overall_avg_score

# --- 4. Main execution flow (unchanged) ---
# (Re)initialise whenever no usable client exists. Checking for None as well as
# for a missing name means a failed attempt (which leaves client = None) is
# retried when the cell is run again, instead of being skipped forever.
if globals().get("client") is None:
    try:
        with open("config.json", "r", encoding="utf-8") as f:
            config = json.load(f)
        os.environ["PREDICTIONGUARD_API_KEY"] = config["PREDICTIONGUARD_API_KEY"]
        client = PredictionGuard()
        print("✅ Prediction Guard client initialized successfully.")
    except Exception as e:
        # Keep the notebook running; the evaluation below is skipped when
        # client is None.
        print(f"❌ Initialization failed: {e}")
        client = None

if client:
    prompt_groups_for_testing = load_sensitivity_dataset(
        SENSITIVITY_CONFIG["DATA_FILE_PATH"], SENSITIVITY_CONFIG["SAMPLE_SIZE"]
    )
    if prompt_groups_for_testing:
        tester = PromptSensitivityTester(client, SENSITIVITY_CONFIG)
        leaderboard = []
        models_to_test = SENSITIVITY_CONFIG["CHAT_MODELS_TO_TEST"]
        # Every model is scored on the same sampled prompt groups.
        for model_id in models_to_test:
            score = tester.run_evaluation_for_model(model_id, prompt_groups_for_testing)
            leaderboard.append({"model": model_id, "robustness_score": score})
        # Rank models from most to least consistent.
        leaderboard_df = pd.DataFrame(leaderboard).sort_values(
            by="robustness_score", ascending=False
        ).reset_index(drop=True)
        display(Markdown("\n--- \n ## 📊 Prompt Sensitivity Robustness Leaderboard"))
        display(Markdown("`robustness_score` (robustness) **higher is better**, meaning the model’s responses are less affected by minor prompt variations."))
        display(leaderboard_df)

  from .autonotebook import tqdm as notebook_tqdm


✅ Prediction Guard client initialized successfully.

Reading and grouping data from 'prompt-sensitivity/prompt_set/hotpotqa/hotpotqa_mistral_dataset_test.jsonl'...
Found 3441 distinct prompt groups in the file.
Randomly sampling 50 groups.
✅ Successfully loaded and sampled 50 prompt groups.

▶️ Starting evaluation for model: DeepSeek-R1-Distill-Qwen-32B


  Evaluation Progress (DeepSeek-R1-Distill-Qwen-32B): 100%|██████████| 50/50 [53:32<00:00, 64.26s/it]


  📊 Average robustness score for DeepSeek-R1-Distill-Qwen-32B = 0.7039

▶️ Starting evaluation for model: Hermes-3-Llama-3.1-70B


  Evaluation Progress (Hermes-3-Llama-3.1-70B): 100%|██████████| 50/50 [33:03<00:00, 39.67s/it]


  📊 Average robustness score for Hermes-3-Llama-3.1-70B = 0.6974

▶️ Starting evaluation for model: Hermes-3-Llama-3.1-8B


  Evaluation Progress (Hermes-3-Llama-3.1-8B): 100%|██████████| 50/50 [12:43<00:00, 15.27s/it]


  📊 Average robustness score for Hermes-3-Llama-3.1-8B = 0.6323

▶️ Starting evaluation for model: neural-chat-7b-v3-3


  Evaluation Progress (neural-chat-7b-v3-3): 100%|██████████| 50/50 [16:27<00:00, 19.75s/it]

  📊 Average robustness score for neural-chat-7b-v3-3 = 0.6520






--- 
 ## 📊 Prompt Sensitivity Robustness Leaderboard

`robustness_score` (robustness) **higher is better**, meaning the model’s responses are less affected by minor prompt variations.

Unnamed: 0,model,robustness_score
0,DeepSeek-R1-Distill-Qwen-32B,0.7039
1,Hermes-3-Llama-3.1-70B,0.697386
2,neural-chat-7b-v3-3,0.651963
3,Hermes-3-Llama-3.1-8B,0.632337
