In [1]:
import os
import json
import pandas as pd
from predictionguard import PredictionGuard
from IPython.display import display

# Initialize and perform deep model exploration 
    
# --- 1. Initialize Prediction Guard client ---
# `client` defaults to None so the exploration step below can simply test it;
# it is only replaced once every setup step has succeeded.
client = None
try:
    # config.json is opened relative to the current working directory
    # (normally the directory holding this .ipynb file).
    with open("config.json", "r") as config_file:
        api_config = json.load(config_file)
    # The API key is handed to the SDK through the process environment.
    os.environ["PREDICTIONGUARD_API_KEY"] = api_config["PREDICTIONGUARD_API_KEY"]
    client = PredictionGuard()
except Exception as e:
    # Missing file, malformed JSON, missing key or an SDK error all end up here.
    print(f"❌ Initialization failed: {e}")
else:
    print("✅ Client initialized successfully")

# --- 2. Deep exploration: Get and display full information for all models ---
# Lists every model the API reports and renders the complete table.
# Skipped (with a warning) when the client could not be created above.
if client:
    print("\n--- Querying Prediction Guard for available models ---")
    try:
        # Raw model records: the 'data' entry of the list response (empty list if absent)
        models_data = client.models.list().get('data', [])
        if models_data:
            models_df = pd.DataFrame(models_data)

            # Spread the per-model 'capabilities' entries into individual 'caps_*' columns
            if 'capabilities' in models_df.columns:
                caps_df = models_df['capabilities'].apply(pd.Series).add_prefix('caps_')
                models_df = pd.concat([models_df.drop('capabilities', axis=1), caps_df], axis=1)

            print(f"✅ Successfully retrieved {len(models_df)} available models. Displaying full information...")

            # Lift pandas' truncation limits only while this table is rendered.
            # option_context restores the previous settings afterwards, so the
            # rest of the notebook is not left with unlimited row/column output.
            with pd.option_context('display.max_columns', None,
                                   'display.max_rows', None,
                                   'display.max_colwidth', None):
                display(models_df)
        else:
            print("--- ⚠️ Failed to retrieve any model data from API ---")

    except Exception as e:
        # Best-effort exploration: report the problem instead of aborting the cell.
        print(f"❌ Error querying model list: {e}")
else:
    print("\n⚠️ 'client' object not initialized, unable to perform model exploration. Please check the API key and configuration.")

✅ Client initialized successfully

--- Querying Prediction Guard for available models ---
✅ Successfully retrieved 10 available models. Displaying full information...


Unnamed: 0,id,object,created,owned_by,description,max_context_length,prompt_format,caps_chat_completion,caps_chat_with_image,caps_completion,caps_embedding,caps_embedding_with_image,caps_tokenize,caps_detokenize,caps_rerank,caps_tool_calling
0,bge-m3,model,1730332800,Beijing Academy of Artificial Intelligence,"BGE M3 is distinguished for its versatility in Multi-Functionality, Multi-Linguality, and Multi-Granularity.",8192,none,False,False,False,True,False,True,False,False,False
1,bge-reranker-v2-m3,model,1730332800,Beijing Academy of Artificial Intelligence,"BGE Reranker v2 M3 is distinguished for its versatility in Multi-Functionality, Multi-Linguality, and Multi-Granularity.",8192,none,False,False,False,False,False,False,False,True,False
2,bridgetower-large-itm-mlm-itc,model,1730332800,BridgeTower,BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning,8192,none,False,False,False,True,True,False,False,False,False
3,DeepSeek-R1-Distill-Qwen-32B,model,1730332800,Deepseek,Deepseek R1 is a family of open-source large language models (LLMs) designed for high-quality code generation and understanding tasks.,20480,none,True,False,True,False,False,True,False,False,False
4,Hermes-3-Llama-3.1-70B,model,1730332800,NousResearch,Hermes 3 is a generalist language model based on Llama 3.1 70B.,20480,none,True,False,True,False,False,True,False,False,True
5,Hermes-3-Llama-3.1-8B,model,1730332800,NousResearch,Hermes 3 is a generalist language model based on Llama 3.1 8B.,32768,none,True,False,True,False,False,True,False,False,True
6,multilingual-e5-large-instruct,model,1742828621,intfloat,Open-source multilingual text embeddings model.,512,none,False,False,False,True,False,True,False,False,False
7,neural-chat-7b-v3-3,model,1730332800,Intel,Neural Chat is an open-source A fine-tuned model based on Mistral with good coverage of domain and language.,32768,none,True,False,True,False,False,True,False,False,False
8,Qwen2.5-Coder-14B-Instruct,model,1730332800,Qwen,Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).,20480,none,True,False,True,False,False,True,False,False,False
9,Qwen2.5-VL-7B-Instruct,model,1730332800,llava hugging face,Open-source multimodal chatbot trained by fine-tuning LLaMa/Vicuna.,16384,none,True,True,True,False,False,True,False,False,False


In [None]:
"""
Evaluate factuality of model generations on FACTS Grounding 1.0 Public (860 examples).
--------------------------------------------------------------------    
1) Load dataset (examples.csv)
2) For each sample:
      • Build messages = [system_instruction, user_request]
      • Call PG chat API → get answer
      • Call PG factuality API (or optional Judge LLM) → get score
3) Aggregate metrics & write results to CSV
--------------------------------------------------------------------
All configurable items are grouped in CONFIG dict at top.
"""

import os
import json
import uuid
import time
import pandas as pd
from typing import List, Dict, Any
from tqdm import tqdm
from tenacity import retry, wait_random_exponential, stop_after_attempt
from predictionguard import PredictionGuard

# --------------------------- CONFIGURATION --------------------------- #
# Single mapping holding every tunable of the evaluation run.
CONFIG = dict(
    # --- input / output files ---------------------------------------- #
    DATA_PATH="examples.csv",                     # dataset CSV read by main()
    OUTPUT_CSV="facts_grounding_results-2.csv",   # per-example results written by main()

    # --- Prediction Guard access & generation ------------------------ #
    CONFIG_FILE="config.json",                    # JSON file holding PREDICTIONGUARD_API_KEY
    MODEL_NAMES=[                                 # generation models, evaluated one after another
        "Hermes-3-Llama-3.1-8B",
        "Hermes-3-Llama-3.1-70B",
        "DeepSeek-R1-Distill-Qwen-32B",
        "neural-chat-7b-v3-3",
    ],
    TEMPERATURE=0.2,                              # sampling temperature passed to the chat call
    MAX_TOKENS=512,                               # generation length cap passed to the chat call

    # --- factuality scoring ------------------------------------------ #
    FACT_THRESHOLD=0.80,                          # score ≥ threshold counts as grounded
    USE_SECOND_JUDGE=False,                       # intended switch for the LLM-as-Judge second opinion
    JUDGE_MODEL_NAME="Qwen2.5-Coder-14B-Instruct",

    # --- sample size -------------------------------------------------- #
    N_SAMPLES=24,                                 # rows to evaluate; 0 or None runs the whole dataset
)

# -------------------------------------------------------------------- #

# 1) Load the API key and create a module-level client.
#    The path comes from CONFIG so it cannot drift from the one main() reads;
#    main() builds its own client from the same file.
with open(CONFIG["CONFIG_FILE"], "r", encoding="utf-8") as f:
    api_config = json.load(f)
# The SDK client is created with no arguments, so the key is passed via the environment.
os.environ["PREDICTIONGUARD_API_KEY"] = api_config["PREDICTIONGUARD_API_KEY"]
client = PredictionGuard()

# --------------------------- UTILITIES ------------------------------ #
from typing import Any

def _extract_score(res: Any) -> float:
    """
    Robustly extract 'score' from Prediction Guard factuality response.

    Supports all known schemas:
      • res.checks[0].score                (Old SDK dataclass)
      • res['checks'][0]['score']          (dict-like)
      • res['data'][0]['score']            (REST JSON)
      • res['score']                       (edge case)
    """
    # dataclass / namedtuple style
    if hasattr(res, "checks"):
        check0 = res.checks[0]
        return getattr(check0, "score", 0.0)

    # dict-like schemas
    if isinstance(res, dict):
        if "score" in res:
            return res["score"]
        if "checks" in res and res["checks"]:
            return res["checks"][0].get("score", 0.0)
        if "data" in res and res["data"]:
            return res["data"][0].get("score", 0.0)

    raise ValueError(f"Unexpected factuality response schema: {res}")

# reraise=True: once the attempts are exhausted the last real exception is
# raised instead of tenacity's RetryError wrapper, so callers that log the
# error see the actual cause.
@retry(wait=wait_random_exponential(min=1, max=20),
       stop=stop_after_attempt(6),
       reraise=True)
def pg_factuality(client: PredictionGuard, reference: str, text: str) -> float:
    """
    Call the Prediction Guard factuality endpoint and return its score.

    SDK versions differ: some expose factuality.create(), others
    factuality.check(); create() is preferred when both exist.
    The call is retried up to 6 times with randomized exponential back-off.

    Args:
        client: Prediction Guard client exposing a `factuality` attribute.
        reference: Text the candidate is checked against.
        text: Candidate text to score.

    Returns:
        The score extracted from the response by _extract_score.

    Raises:
        AttributeError: If the factuality endpoint has neither create() nor check().
        Exception: Whatever the final failed attempt raised (API error or a
            ValueError from _extract_score).
    """
    endpoint = client.factuality
    if hasattr(endpoint, "create"):
        res = endpoint.create(reference=reference, text=text)
    elif hasattr(endpoint, "check"):
        res = endpoint.check(reference=reference, text=text)
    else:
        raise AttributeError("PredictionGuard.factuality has neither create() nor check()")
    return _extract_score(res)
# -------------------------------------------------------------------- #

def chunk_text(text: str, max_tokens: int = 14000) -> str:
    """
    Rough, tokenizer-agnostic clip to avoid exceeding context limits.

    The text is split on whitespace and at most `max_tokens` words are kept.
    Words are a proxy for tokens, not an exact token count, and every run of
    whitespace (including newlines) is collapsed to a single space.

    Args:
        text: Document text. Non-string input (None, or the NaN pandas uses
            for an empty CSV cell) is treated as an empty document.
        max_tokens: Maximum number of whitespace-separated words to keep.
            None keeps everything; zero or negative keeps nothing.

    Returns:
        The clipped text, words joined by single spaces.
    """
    # Missing cells arrive as float NaN / None; calling .split() on them would
    # abort the whole evaluation run.
    if not isinstance(text, str):
        return ""
    # A negative limit would otherwise slice from the END of the word list.
    if max_tokens is not None and max_tokens <= 0:
        return ""
    words = text.split()
    return " ".join(words[:max_tokens])


def _chat_response_text(resp: Any) -> str:
    """Return the generated text of the first choice in a chat/completions response.

    Every level (response, choice, message) may be a dict or an
    attribute-style object. Raises ValueError when no text can be found,
    including when 'choices' is missing or empty.
    """
    def _field(obj: Any, key: str) -> Any:
        # Mapping lookup for dicts, attribute lookup for everything else.
        if isinstance(obj, dict):
            return obj.get(key)
        return getattr(obj, key, None)

    choices = _field(resp, "choices")
    if choices:
        first = choices[0]
        # chat endpoint: choices[0].message.content
        message = _field(first, "message")
        if message:
            content = _field(message, "content")
            if content is not None:
                return content
        # completions endpoint: choices[0].text
        text = _field(first, "text")
        if text is not None:
            return text

    raise ValueError(f"Unrecognized chat/completion response schema: {resp}")


# reraise=True: after the last attempt the underlying exception is raised
# rather than tenacity's RetryError, so the caller's error text is meaningful.
@retry(wait=wait_random_exponential(min=1, max=20),
       stop=stop_after_attempt(6),
       reraise=True)
def pg_chat(client: PredictionGuard,
            messages: List[Dict[str, str]],
            model: str,
            max_tokens: int,
            temperature: float) -> str:
    """
    Robust chat wrapper:
      • Prefer client.chat.completions.create()
      • Fallback to client.completions.create() (Old SDK)
      • Parse both dataclass & dict schemas

    The whole call (request + parsing) is retried up to 6 times with
    randomized exponential back-off.

    Args:
        client: Prediction Guard client.
        messages: Chat messages, each a dict with "role" and "content".
        model: Name of the generation model.
        max_tokens: Generation length cap (sent as max_completion_tokens to the
            chat endpoint, max_tokens to the completions fallback).
        temperature: Sampling temperature.

    Returns:
        The text of the first choice.

    Raises:
        ValueError: If the response matches no known schema or carries no text.
        Exception: Whatever the final failed API attempt raised.
    """
    # -------- call API -------- #
    if hasattr(client, "chat"):          # New SDK
        resp = client.chat.completions.create(
            model=model,
            messages=messages,
            max_completion_tokens=max_tokens,
            temperature=temperature,
        )
    else:                                # Old SDK
        # Flatten the chat messages into one prompt (simplest fallback approach)
        prompt = "\n".join(m["content"] for m in messages)
        resp = client.completions.create(
            model=model,
            prompt=prompt,
            max_tokens=max_tokens,
            temperature=temperature,
        )

    # -------- extract answer -------- #
    return _chat_response_text(resp)
# -------------------------------------------------------------------- #

def _parse_judge_score(raw: Any) -> float:
    """Turn the judge model's reply into a score clamped to [0, 1].

    Accepts a bare JSON object or one embedded in surrounding text / code
    fences. Anything that cannot be parsed yields 0.0 (best-effort, as before).
    """
    import re  # local import: `re` is not among this file's top-level imports

    if not isinstance(raw, str):
        return 0.0

    # Try the whole reply first, then the first {...} span found inside it.
    candidates = [raw.strip()]
    embedded = re.search(r"\{.*?\}", raw, flags=re.DOTALL)
    if embedded:
        candidates.append(embedded.group(0))

    for candidate in candidates:
        try:
            score = float(json.loads(candidate)["score"])
        except (ValueError, TypeError, KeyError, OverflowError):
            continue
        if score != score:               # NaN cannot be clamped meaningfully
            return 0.0
        return min(1.0, max(0.0, score))
    return 0.0


def pg_judge(client: PredictionGuard, judge_model: str,
             question: str, reference: str, answer: str) -> float:
    """
    Optional 2nd-stage Judge LLM: returns score in [0,1].
    Prompt asks Judge to give a JSON object holding a numeric score only.

    The request goes through pg_chat so the judge benefits from the same
    response-schema handling (dict and attribute-style) as generation.
    pg_chat already retries the API call, so this function carries no retry
    decorator of its own (that would multiply the attempts).

    Args:
        client: Prediction Guard client.
        judge_model: Name of the model acting as judge.
        question: Original user request. Currently not included in the prompt;
            kept so the signature stays stable.
        reference: Reference document the answer is compared against.
        answer: Model answer to be judged.

    Returns:
        Score clamped to [0, 1]; 0.0 when the judge reply cannot be parsed.

    Raises:
        Exception: Whatever pg_chat raises once its retries are exhausted.
    """
    judge_prompt = (
        "You are a fact evaluator. "
        "Compare the ANSWER against the REFERENCE DOCUMENT in detail. "
        "Return ONLY a JSON object like {\"score\": <float 0-1>}."
        "\n\n[REFERENCE DOCUMENT]\n"
        f"{reference}\n\n[ANSWER]\n{answer}\n"
    )
    raw_reply = pg_chat(client,
                        [{"role": "user", "content": judge_prompt}],
                        model=judge_model,
                        max_tokens=50,
                        temperature=0.0)
    return _parse_judge_score(raw_reply)
# -------------------------------------------------------------------- #

def evaluate_single_model(model_name: str,
                          df_eval: pd.DataFrame,
                          client: PredictionGuard) -> pd.DataFrame:
    """
    Run factuality evaluation for one model and return a result DataFrame
    with an extra 'model' column.

    For every row of `df_eval` the model answers the request, then the answer
    is scored against the (clipped) context document by the factuality API.
    Failures of either step are recorded in the row instead of aborting the run.

    Args:
        model_name: Generation model to evaluate.
        df_eval: Rows with 'context_document', 'system_instruction' and
            'user_request' columns.
        client: Prediction Guard client.

    Returns:
        One row per example with columns id, model, score_pg, grounded_pg and
        answer. When CONFIG["USE_SECOND_JUDGE"] is truthy a 'score_judge'
        column (score from pg_judge, or None on failure) is added as well.
    """
    use_judge = bool(CONFIG.get("USE_SECOND_JUDGE"))
    rows = []
    for idx, row in tqdm(df_eval.iterrows(),
                         total=len(df_eval),
                         desc=f"Evaluating {model_name}"):
        context = chunk_text(row["context_document"])
        # NOTE(review): `context` is used only as the factuality reference; it is
        # not part of the messages sent to the model. Confirm this is intended —
        # if user_request does not embed the document, the model answers without it.
        messages = [
            {"role": "system", "content": row["system_instruction"]},
            {"role": "user",   "content": row["user_request"]}
        ]

        # ---- generation ---- #
        # Success is tracked with a flag rather than by inspecting the answer
        # text, so an answer that happens to start with "[ERROR]" is still scored.
        generated = True
        try:
            answer = pg_chat(client, messages,
                             model=model_name,
                             max_tokens=CONFIG["MAX_TOKENS"],
                             temperature=CONFIG["TEMPERATURE"])
        except Exception as e:
            generated = False
            answer = f"[ERROR] {e}"
        if generated and not isinstance(answer, str):
            # A response without text content cannot be scored.
            generated = False
            answer = "[ERROR] empty or non-text response"

        # ---- factuality ---- #
        score_pg = None
        if generated:
            try:
                score_pg = pg_factuality(client,
                                         reference=context,
                                         text=answer)
            except Exception as e:
                # tqdm.write keeps the progress bar intact while logging.
                tqdm.write(f"❌ factuality API failed: {e}")

        record = {
            "id": idx,
            "model": model_name,
            "score_pg": score_pg,
            "grounded_pg": (score_pg is not None and
                            score_pg >= CONFIG["FACT_THRESHOLD"]),
            "answer": answer
        }

        # ---- optional LLM-as-Judge ---- #
        if use_judge:
            score_judge = None
            if generated:
                try:
                    score_judge = pg_judge(client,
                                           CONFIG["JUDGE_MODEL_NAME"],
                                           question=row["user_request"],
                                           reference=context,
                                           answer=answer)
                except Exception as e:
                    tqdm.write(f"❌ judge LLM failed: {e}")
            record["score_judge"] = score_judge

        rows.append(record)

    return pd.DataFrame(rows)

def main():
    """
    Run the full evaluation: load the API key and dataset, evaluate every
    configured model, write per-example results and a leaderboard CSV.

    Reads CONFIG for all paths and settings. The per-example CSV is rewritten
    after each model finishes, so a failure part-way through a long run keeps
    the results gathered so far.

    Raises:
        ValueError: If CONFIG["MODEL_NAMES"] is empty.
    """
    # === init PG ===
    with open(CONFIG["CONFIG_FILE"], "r") as f:
        api_cfg = json.load(f)
    os.environ["PREDICTIONGUARD_API_KEY"] = api_cfg["PREDICTIONGUARD_API_KEY"]
    client = PredictionGuard()

    # === load data ===
    df_all = pd.read_csv(CONFIG["DATA_PATH"])

    # 0 / None / negative means "use everything"; a request larger than the
    # dataset is capped, since DataFrame.sample raises when asked for more
    # rows than exist (without replacement).
    n_samples = CONFIG.get("N_SAMPLES")
    if n_samples and n_samples > 0:
        df_all = df_all.sample(min(int(n_samples), len(df_all)), random_state=42)

    print(f"✅ Loaded {len(df_all)} examples for evaluation\n")

    # === evaluate each model ===
    model_names = CONFIG["MODEL_NAMES"]
    if isinstance(model_names, str):
        # A single model given as a bare string would otherwise be iterated
        # character by character.
        model_names = [model_names]
    if not model_names:
        raise ValueError("CONFIG['MODEL_NAMES'] is empty: nothing to evaluate")

    all_results = []
    final_df = None
    for m in model_names:
        all_results.append(evaluate_single_model(m, df_all, client))
        # Checkpoint: persist everything gathered so far after each model.
        final_df = pd.concat(all_results, ignore_index=True)
        final_df.to_csv(CONFIG["OUTPUT_CSV"], index=False)

    print(f"\n📄 Per-example results saved to {CONFIG['OUTPUT_CSV']}")

    # === build leaderboard ===
    leaderboard = (final_df
                   .dropna(subset=["score_pg"])        # exclude failed samples
                   .groupby("model")
                   .agg(avg_score=("score_pg", "mean"),
                        pass_rate=("grounded_pg", "mean"),
                        n=("score_pg", "count"))
                   .assign(pass_rate=lambda d: d["pass_rate"] * 100)   # fraction → percent
                   .sort_values("avg_score", ascending=False)
                   .reset_index())

    # Optional CONFIG override; defaults to the original fixed file name.
    lb_file = CONFIG.get("LEADERBOARD_CSV", "leaderboard.csv")
    leaderboard.to_csv(lb_file, index=False)

    print("\n=== Leaderboard ===")
    print(leaderboard.to_string(index=False,
                                formatters={
                                    "avg_score": "{:.3f}".format,
                                    "pass_rate": "{:.1f}%".format
                                }))
    print(f"\n🏁 Leaderboard saved to {lb_file}")

# Entry point: run the evaluation and report the elapsed time in minutes.
if __name__ == "__main__":
    # perf_counter is meant for measuring durations and, unlike time.time(),
    # is not thrown off by system clock adjustments during a long run.
    tic = time.perf_counter()
    main()
    print(f"\n⏱️  Done in {(time.perf_counter() - tic)/60:.1f} min")


✅ Loaded 24 examples for evaluation



Evaluating Hermes-3-Llama-3.1-8B: 100%|██████████| 24/24 [07:16<00:00, 18.17s/it]
Evaluating Hermes-3-Llama-3.1-70B: 100%|██████████| 24/24 [05:26<00:00, 13.62s/it]
Evaluating DeepSeek-R1-Distill-Qwen-32B: 100%|██████████| 24/24 [11:17<00:00, 28.22s/it]
Evaluating neural-chat-7b-v3-3: 100%|██████████| 24/24 [04:38<00:00, 11.60s/it]
Evaluating Qwen2.5-Coder-14B-Instruct: 100%|██████████| 24/24 [04:09<00:00, 10.40s/it]


📄 Per-example results saved to facts_grounding_results-2.csv

=== Leaderboard ===
                       model avg_score pass_rate  n
DeepSeek-R1-Distill-Qwen-32B     0.855     83.3% 24
  Qwen2.5-Coder-14B-Instruct     0.855     87.5% 24
         neural-chat-7b-v3-3     0.854     83.3% 24
       Hermes-3-Llama-3.1-8B     0.854     87.5% 24
      Hermes-3-Llama-3.1-70B     0.844     83.3% 24

🏁 Leaderboard saved to leaderboard.csv

⏱️  Done in 32.8 min



