In [None]:
import json
import os
import time
from collections import defaultdict, deque

import numpy as np
import pandas as pd
import requests
from scipy.stats import entropy
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances

In [None]:

# --- GLOBAL CONFIGURATION ---
# Name of the sentence-transformers checkpoint used to embed candidate
# stakeholder / vulnerable-group labels for similarity-based de-duplication.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
MAX_REPEATS_PER_MODEL = 3  # Maximum times to call each model/temp combo

# The encoder is created once at module level; ProjectSimulation.process_batch
# reads the `embedding_model` global directly, so this cell must run before any
# simulation is started.
print(f"Loading BERT model ({EMBEDDING_MODEL_NAME})...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

Loading BERT model (all-MiniLM-L6-v2)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]



special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# --- API CONFIGURATION ---
# Credentials are read from environment variables so that real keys never have
# to be typed into (and saved with) the notebook. When a variable is unset or
# empty, the original placeholder string is kept, which makes a missing key
# easy to spot in the provider's error message.
API_KEYS = {
    "deepinfra": os.environ.get("DEEPINFRA_API_KEY") or "ADD_DEEP_INFRA_KEY",
    "openai": os.environ.get("OPENAI_API_KEY") or "ADD_OPENAI_KEY",
    "anthropic": os.environ.get("ANTHROPIC_API_KEY") or "ADD_CLAUDE_KEY",
    "google": os.environ.get("GEMINI_API_KEY") or "ADD_GEMINI_KEY",
}

# Map models to their providers
# Keys are the model identifiers used throughout the notebook (and sent to the
# provider as-is); values select which LiveAPIProvider.call_* method handles
# the request. A model missing from this table is reported as
# "Unknown provider" by LiveAPIProvider.call_api and yields no data.
MODEL_TO_PROVIDER = {
    # OpenAI
    "gpt-5-mini-2025-08-07": "openai",
    "gpt-5.1-2025-11-13": "openai",

    # Anthropic (Claude)
    "claude-haiku-4-5": "anthropic",
    "claude-sonnet-4-5": "anthropic",

    # Google (Gemini)
    "gemini-2.5-flash": "google",
    "gemini-3-pro-preview": "google",

    # DeepInfra (all others)
    "openai/gpt-oss-20b": "deepinfra",
    "openai/gpt-oss-120b": "deepinfra",

    "Qwen/Qwen3-30B-A3B": "deepinfra",
    "mistralai/Mixtral-8x7B-Instruct-v0.1": "deepinfra",
    "microsoft/WizardLM-2-8x22B": "deepinfra",
    "microsoft/phi-4": "deepinfra",
    "meta-llama/Llama-4-Scout-17B-16E-Instruct": "deepinfra",
    "deepseek-ai/DeepSeek-V3.1-Terminus": "deepinfra",
    "google/gemma-3-27b-it": "deepinfra",
    "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": "deepinfra",
}

In [None]:

# --- PROMPTS ---
# System message sent with every request (Gemini receives it prepended to the
# user prompt instead of as a separate system role).
SYSTEM_PROMPT = """You are an expert in ethics requirements engineering.
Your task is to identify and justify which stakeholders and vulnerable groups
should be interviewed to uncover ethical issues for a given AI or software system.
Return your final answer in the specified JSON format only."""

# Shape the models are asked to return. LiveAPIProvider.get_next_batch reads
# the "role" and "group" fields from it. The text is passed to str.format as a
# *value* (json_schema=...), so its literal braces need no escaping.
JSON_SCHEMA = """{
  "stakeholders": [{"role": "", "justification": ""}],
  "vulnerable_groups": [{"group": "", "justification": ""}],
  "reflection_summary": ""
}"""

# User prompt template. It is rendered with str.format and exposes exactly two
# placeholders, {project_description} and {json_schema}; do not add other
# literal braces without doubling them.
# Punctuation that had been stored mis-encoded (dashes, apostrophe, quotation
# marks) is restored, quotation marks are normalised to straight quotes because
# curly and straight forms were mixed inside the same phrase, and the
# "seperate" typo is corrected, so the models receive clean text.
USER_PROMPT_TEMPLATE = """
PROJECT DESCRIPTION:
<<<
{project_description}
>>>

TASK INSTRUCTIONS:

[CREATIVE PROMPT SECTION]
Imagine you are an ethics-aware requirements engineer preparing stakeholder interviews.
Given the project description above, creatively—but realistically—propose the key
stakeholders and specific vulnerable end-users to interview to elicit ethical requirements.
Use the questions below as a guide.

QUESTIONS:
1- To whom could the AI system have legal obligations?
2- Who might be [positively or negatively] impacted by the AI system's decisions or activities?
3- Who is likely to express concerns about the decisions and activities of the AI system?
4- Who has been involved in the past when concerns about AI systems needed to be addressed?
5- Who can help the AI system address specific impacts and meet its responsibilities?
6- Who would be disadvantaged if excluded from the engagement with the AI system?

[ATOMICITY RULES – STRICT]
- Each entry must contain ONE atomic actor type only.
- No Role Conflation (e.g., NO "teachers and administrators", "developers and data analysts"), INSTEAD "teachers", "school administrators", "developers", and "data analysts" each as a separate role.
- No umbrella categories (e.g., NO "regulatory bodies (e.g., education, health, and data protection authorities)" INSTEAD 'education department', 'health department', and 'data protection authority' each in an entry). Another example is "children mental health professionals" should become "child psychologists" or "child psychiatrists"
- If a role could be separated into two distinct units, separate them.

[CANONICALIZATION RULES]
- Use the most standard, domain-recognizable title.
- Avoid synonyms, paraphrases, stylistic variations, or repeated variants.
- Merge duplicated roles under a single canonical label.

[REALISM FILTER]
- Only include roles that commonly exist in real-world educational, healthcare, government, or industry systems.
- Do not invent new committees, institutions, technical roles, or regulatory bodies.


[REFLECTION PROMPT SECTION]
Reflect on your initial list:
1. Did you omit any category of stakeholders (e.g., organisational stakeholders, national/international stakeholders, developers, auditors, or affected citizens)?
2. Did you consider indirect or silent stakeholders (e.g., data subjects, bystanders)?
3. Are any of your listed groups unrealistic or redundant?

Now revise your final concise list of:
- Stakeholders to interview (with 1–2 sentence justification each)
- Vulnerable end-users or groups to include (with 1–2 sentence justification each)

OUTPUT FORMAT:
Return your final output in JSON exactly as follows:
{json_schema}
"""


In [None]:

# --- MODEL REGISTRY ---
# Per-model metadata consumed by ProjectSimulation:
#   "tier"       - integer label copied onto every agent and used to bucket
#                  spend in the per-tier cost statistics.
#   "cost"       - price factor; a call's cost is computed as
#                  (tokens / 1_000_000) * cost, i.e. it is applied per million
#                  tokens (reported with a "$" prefix).
#   "variations" - one entry per sampling temperature (the dict key). Each
#                  entry becomes a separate agent with:
#                    "score"      - ranking value used for normal selection,
#                    "pure_score" - ranking value used in rescue mode after
#                                   repeated low-yield calls.
# NOTE(review): the "CLUSTER n" section headers below do not match the
# grouping returned by build_cluster_config() (e.g. gpt-oss-20b is listed under
# "CLUSTER 4" here but is assigned to cluster 3 there, and the Claude models
# are listed under "CLUSTER 1" here but assigned to cluster 4 there). Only
# build_cluster_config() is used at runtime - confirm which grouping is
# intended and update the stale one.
MODEL_REGISTRY = {
    # --- CLUSTER 4 (High Efficiency / Low Cost) ---
    "openai/gpt-oss-20b": {
        "tier": 0, "cost": 0.19,
        "variations": {
            0.3: {"score": 0.85, "pure_score": 0.79},
            0.7: {"score": 0.83, "pure_score": 0.76},
            0.0: {"score": 0.81, "pure_score": 0.71},
        }
    },
    "openai/gpt-oss-120b": {
        "tier": 1, "cost": 0.30,
        "variations": {
            0.7: {"score": 0.77, "pure_score": 0.66},
            0.3: {"score": 0.76, "pure_score": 0.65},
            0.0: {"score": 0.74, "pure_score": 0.62},
        }
    },
    # --- CLUSTER 3 (High Intelligence / High Cost) ---
    "gpt-5-mini-2025-08-07": {
        "tier": 0, "cost": 2.00,
        "variations": {
            0.0: {"score": 0.83, "pure_score": 0.81},
        }
    },
    "gpt-5.1-2025-11-13": {
        "tier": 1, "cost": 10.00,
        "variations": {
            0.0: {"score": 0.67, "pure_score": 0.83},
        }
    },
    # --- CLUSTER 2 (Mid-Range Open Weights) ---
    "Qwen/Qwen3-30B-A3B": {
        "tier": 0, "cost": 0.28,
        "variations": {
            0.7: {"score": 0.64, "pure_score": 0.47},
            0.3: {"score": 0.62, "pure_score": 0.44},
            0.0: {"score": 0.50, "pure_score": 0.26},
        }
    },
    "mistralai/Mixtral-8x7B-Instruct-v0.1": {
        "tier": 1, "cost": 0.54,
        "variations": {
            0.7: {"score": 0.62, "pure_score": 0.45},
            0.3: {"score": 0.58, "pure_score": 0.39},
            0.0: {"score": 0.58, "pure_score": 0.38},
        }
    },
    "microsoft/WizardLM-2-8x22B": {
        "tier": 2, "cost": 0.48,
        "variations": {
            0.7: {"score": 0.63, "pure_score": 0.45},
            0.3: {"score": 0.60, "pure_score": 0.41},
            0.0: {"score": 0.57, "pure_score": 0.38},
        }
    },
    "microsoft/phi-4": {
        "tier": 3, "cost": 0.14,
        "variations": {
            0.7: {"score": 0.66, "pure_score": 0.49},
            0.3: {"score": 0.58, "pure_score": 0.38},
            0.0: {"score": 0.45, "pure_score": 0.18},
        }
    },
    "meta-llama/Llama-4-Scout-17B-16E-Instruct": {
        "tier": 4, "cost": 0.30,
        "variations": {
            0.7: {"score": 0.60, "pure_score": 0.40},
            0.3: {"score": 0.59, "pure_score": 0.40},
            0.0: {"score": 0.45, "pure_score": 0.18},
        }
    },
    # --- CLUSTER 1 (Diverse Proprietary & Specialized) ---
    "deepseek-ai/DeepSeek-V3.1-Terminus": {
        "tier": 0, "cost": 0.79,
        "variations": {
            0.3: {"score": 0.64, "pure_score": 0.48},
            0.7: {"score": 0.63, "pure_score": 0.48},
            0.0: {"score": 0.62, "pure_score": 0.45},
        }
    },
    "gemini-2.5-flash": {
        "tier": 1, "cost": 2.50,
        "variations": {
            0.7: {"score": 0.63, "pure_score": 0.53},
            0.3: {"score": 0.63, "pure_score": 0.53},
            0.0: {"score": 0.52, "pure_score": 0.37},
        }
    },
    "claude-sonnet-4-5": {
        "tier": 2, "cost": 15.00,
        "variations": {
            0.7: {"score": 0.43, "pure_score": 0.65},
            0.3: {"score": 0.40, "pure_score": 0.60},
            0.0: {"score": 0.30, "pure_score": 0.44},
        }
    },
    "google/gemma-3-27b-it": {
        "tier": 3, "cost": 0.16,
        "variations": {
            0.7: {"score": 0.64, "pure_score": 0.47},
            0.3: {"score": 0.61, "pure_score": 0.42},
            0.0: {"score": 0.59, "pure_score": 0.40},
        }
    },
    "claude-haiku-4-5": {
        "tier": 3, "cost": 5.00,
        "variations": {
            0.7: {"score": 0.63, "pure_score": 0.62},
            0.3: {"score": 0.61, "pure_score": 0.59},
            0.0: {"score": 0.42, "pure_score": 0.29},
        }
    },
    "gemini-3-pro-preview": {
        "tier": 4, "cost": 12.00,
        "variations": {
            0.7: {"score": 0.36, "pure_score": 0.44},
            0.3: {"score": 0.35, "pure_score": 0.43},
            0.0: {"score": 0.16, "pure_score": 0.14},
        }
    },
    "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": {
        "tier": 4, "cost": 0.60,
        "variations": {
            0.7: {"score": 0.56, "pure_score": 0.35},
            0.3: {"score": 0.52, "pure_score": 0.30},
            0.0: {"score": 0.54, "pure_score": 0.33},
        }
    }
}



def build_cluster_config():
    """Return the cluster-id -> model-name list mapping used by the simulation.

    A fresh dict with fresh lists is built on every call, so callers may mutate
    the result freely. Clusters are inserted in the order 4, 3, 2, 1.
    """
    cluster_members = (
        (4, (
            "openai/gpt-oss-120b",
            "claude-haiku-4-5",
            "claude-sonnet-4-5",
        )),
        (3, (
            "openai/gpt-oss-20b",
            "gpt-5-mini-2025-08-07",
            "gpt-5.1-2025-11-13",
        )),
        (2, (
            "Qwen/Qwen3-30B-A3B",
            "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "microsoft/WizardLM-2-8x22B",
            "microsoft/phi-4",
            "meta-llama/Llama-4-Scout-17B-16E-Instruct",
        )),
        (1, (
            "deepseek-ai/DeepSeek-V3.1-Terminus",
            "gemini-2.5-flash",
            "google/gemma-3-27b-it",
            "gemini-3-pro-preview",
            "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
        )),
    )
    return {cluster_id: list(models) for cluster_id, models in cluster_members}


In [None]:
class LiveAPIProvider:
    """Makes live LLM requests for one project and extracts the proposed roles.

    Each model name is routed (via MODEL_TO_PROVIDER) to one of four HTTP
    back-ends. Every back-end sends SYSTEM_PROMPT plus the rendered
    USER_PROMPT_TEMPLATE, retries failed requests with exponential back-off,
    and hands the text reply to parse_json_response.

    Attributes:
        api_keys: dict with the keys "openai", "anthropic", "google" and
            "deepinfra".
        project_description: text substituted into the user prompt.
        max_repeats: maximum number of get_next_batch calls per
            (model, temperature) pair.
        call_counts: number of calls already made per (model, temperature).
        max_comp: completion-token limit sent to every provider.
    """

    def __init__(self, api_keys, project_description, max_repeats):
        self.api_keys = api_keys
        self.project_description = project_description
        self.max_repeats = max_repeats
        self.call_counts = defaultdict(int)
        self.max_comp = 20000

    # ------------------------------------------------------------------
    # Shared private helpers
    # ------------------------------------------------------------------
    def _build_user_prompt(self):
        """Render the user prompt for this provider's project description."""
        return USER_PROMPT_TEMPLATE.format(
            project_description=self.project_description,
            json_schema=JSON_SCHEMA
        )

    def _post_with_retries(self, label, model_name, temperature, url, headers,
                           payload, timeout, extract, max_retries):
        """POST `payload` and parse the reply, retrying on any exception.

        `extract` maps the decoded JSON body to (content, tokens_used). Sleeps
        2**attempt seconds between attempts. Returns the result of
        parse_json_response, or (None, 0) once every attempt has raised. A reply
        that arrives but cannot be parsed is NOT retried.
        """
        for attempt in range(max_retries):
            try:
                print(f"    🔄 Calling {label}: {model_name} @ temp={temperature}...")
                response = requests.post(url, headers=headers, json=payload, timeout=timeout)
                response.raise_for_status()
                content, tokens_used = extract(response.json())
                return self.parse_json_response(content, tokens_used)
            except Exception as e:
                print(f"    ✗ {label} call failed (attempt {attempt + 1}/{max_retries}): {e}")
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)

        return None, 0

    @staticmethod
    def _extract_chat_completion(result):
        """Pull (content, tokens) out of an OpenAI-style chat completion body."""
        content = result['choices'][0]['message']['content']
        # Falls back to a rough 4-characters-per-token estimate when the
        # provider omits usage information.
        tokens_used = result.get('usage', {}).get('total_tokens', len(content) / 4)
        return content, tokens_used

    @staticmethod
    def _extract_anthropic(result):
        """Pull (content, tokens) out of an Anthropic Messages API body."""
        content = result['content'][0]['text']
        usage = result.get('usage', {})
        tokens_used = usage.get('input_tokens', 0) + usage.get('output_tokens', 0)
        return content, tokens_used

    @staticmethod
    def _extract_gemini(result):
        """Pull (content, tokens) out of a Gemini generateContent body."""
        content = result['candidates'][0]['content']['parts'][0]['text']
        tokens_used = result.get('usageMetadata', {}).get('totalTokenCount', len(content) / 4)
        return content, tokens_used

    @staticmethod
    def _collect_labels(entries, field):
        """Return the non-empty string values of `field` from a list of dicts.

        Models do not always honour the requested schema, so anything that is
        not a list of dicts carrying a string label is skipped instead of
        raising.
        """
        if not isinstance(entries, list):
            return []
        labels = []
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            label = entry.get(field)
            if isinstance(label, str) and label:
                labels.append(label)
        return labels

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def call_api(self, model_name, temperature, max_retries=3):
        """Route API call to the appropriate provider.

        Returns (parsed_json, tokens_used), or (None, 0) when the model has no
        known provider or the call failed.
        """
        handlers = {
            "openai": self.call_openai,
            "anthropic": self.call_anthropic,
            "google": self.call_google,
            "deepinfra": self.call_deepinfra,
        }
        handler = handlers.get(MODEL_TO_PROVIDER.get(model_name))
        if handler is None:
            print(f"    ✗ Unknown provider for model: {model_name}")
            return None, 0
        return handler(model_name, temperature, max_retries)

    def call_openai(self, model_name, temperature, max_retries=3):
        """Call OpenAI API"""
        headers = {
            "Authorization": f"Bearer {self.api_keys['openai']}",
            "Content-Type": "application/json"
        }
        # NOTE(review): `temperature` is only logged here; it is not part of the
        # payload. Presumably the targeted reasoning models accept only their
        # default temperature - confirm before adding it.
        payload = {
            "model": model_name,
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": self._build_user_prompt()}
            ],
            "max_completion_tokens": self.max_comp,
            "reasoning_effort": "high",
            "response_format": {"type": "json_object"}
        }
        return self._post_with_retries(
            "OpenAI", model_name, temperature,
            "https://api.openai.com/v1/chat/completions",
            headers, payload, 90, self._extract_chat_completion, max_retries
        )

    def call_anthropic(self, model_name, temperature, max_retries=3):
        """Call Anthropic (Claude) API"""
        headers = {
            "x-api-key": self.api_keys['anthropic'],
            "anthropic-version": "2023-06-01",
            "Content-Type": "application/json"
        }
        payload = {
            "model": model_name,
            "max_tokens": self.max_comp,
            "temperature": temperature,
            "system": SYSTEM_PROMPT,
            "messages": [
                {"role": "user", "content": self._build_user_prompt()}
            ]
        }
        return self._post_with_retries(
            "Anthropic", model_name, temperature,
            "https://api.anthropic.com/v1/messages",
            headers, payload, 90, self._extract_anthropic, max_retries
        )

    def call_google(self, model_name, temperature, max_retries=3):
        """Call Google (Gemini) API"""
        # The system prompt is prepended to the user prompt in a single part.
        full_prompt = f"{SYSTEM_PROMPT}\n\n{self._build_user_prompt()}"
        payload = {
            "contents": [{
                "parts": [{"text": full_prompt}]
            }],
            "generationConfig": {
                "temperature": temperature,
                "maxOutputTokens": self.max_comp
            }
        }
        # The key travels in a header rather than in the URL query string:
        # request exceptions include the URL in their message, and that message
        # is printed by the retry loop, which would expose the key in the
        # notebook output.
        headers = {
            "x-goog-api-key": self.api_keys['google'],
            "Content-Type": "application/json"
        }
        url = f"https://generativelanguage.googleapis.com/v1beta/models/{model_name}:generateContent"
        return self._post_with_retries(
            "Google", model_name, temperature,
            url, headers, payload, 60, self._extract_gemini, max_retries
        )

    def call_deepinfra(self, model_name, temperature, max_retries=3):
        """Call DeepInfra API"""
        headers = {
            "Authorization": f"Bearer {self.api_keys['deepinfra']}",
            "Content-Type": "application/json"
        }
        payload = {
            "model": model_name,
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": self._build_user_prompt()}
            ],
            "temperature": temperature,
            "max_tokens": self.max_comp
        }
        return self._post_with_retries(
            "DeepInfra", model_name, temperature,
            "https://api.deepinfra.com/v1/openai/chat/completions",
            headers, payload, 180, self._extract_chat_completion, max_retries
        )

    def parse_json_response(self, content, tokens_used):
        """Parse JSON from API response.

        The span from the first '{' to the last '}' is decoded, so surrounding
        prose or code fences are tolerated. Returns (parsed, tokens_used) on
        success and (None, 0) when no object is present or decoding fails.
        NOTE(review): tokens consumed by an unparseable reply are reported as 0,
        so they are not charged against the simulation's cost cap.
        """
        start_idx = content.find('{')
        end_idx = content.rfind('}') + 1
        if start_idx == -1 or end_idx <= start_idx:
            print(f"    ✗ No JSON object found in response")
            return None, 0

        try:
            parsed_json = json.loads(content[start_idx:end_idx])
        except json.JSONDecodeError:
            print(content)
            print(f"    ✗ Failed to parse JSON from response")
            return None, 0

        print(f"    ✓ API call successful ({tokens_used:.0f} tokens)")
        return parsed_json, tokens_used

    def get_next_batch(self, model_name, temp):
        """Get next batch by making a live API call if under repeat limit.

        Returns (items, tokens) where items are the stakeholder roles followed
        by the vulnerable groups. Returns ([], 0) when the (model, temp) pair
        has reached max_repeats or the call produced no usable result. The call
        counter is incremented for every attempted call, successful or not.
        """
        key = (model_name, temp)

        if self.call_counts[key] >= self.max_repeats:
            return [], 0

        result, tokens = self.call_api(model_name, temp)
        self.call_counts[key] += 1

        if not result:
            return [], 0

        items = self._collect_labels(result.get('stakeholders'), 'role')
        items += self._collect_labels(result.get('vulnerable_groups'), 'group')
        return items, tokens

In [None]:
class ProjectSimulation:
    """Budget-capped elicitation loop over clustered (model, temperature) agents.

    Each iteration picks one agent, asks the provider for a batch of candidate
    labels, and accepts a label only when its embedding is not too similar to
    anything already in the knowledge base. Agents that return nothing or too
    few new labels are marked saturated. After RESCUE_STRIKES_LIMIT consecutive
    low-yield calls the selection switches to the highest "pure_score" agent;
    after TERMINATION_LIMIT the run stops.

    Attributes:
        provider: object exposing get_next_batch(model_name, temp) ->
            (list_of_labels, tokens).
        pid / config_name: labels copied into every log row.
        kb_text / kb_embeddings: accepted labels and their stacked embeddings.
        master_log: one dict per processed label (accepted or rejected).
        stats: running counters; holds one "cost_tier_<n>" entry per tier.
        clusters: cluster id -> list of agent dicts built from MODEL_REGISTRY.
    """

    def __init__(self, provider, project_name, cluster_config_map, config_name=""):
        self.provider = provider
        self.pid = project_name
        self.config_name = config_name
        self.kb_text = []
        self.kb_embeddings = None
        self.last_cluster = None
        self.master_log = []
        self.consecutive_low_yields = 0
        self.RESCUE_STRIKES_LIMIT = 3
        self.TERMINATION_LIMIT = 5

        self.stats = {
            "accepted": 0, "rejected": 0, "cost": 0.0,
            "cost_tier_0": 0.0, "cost_tier_1": 0.0, "cost_tier_2": 0.0,
            "entropy": 0.0, "model_counts": defaultdict(int),
            "api_calls": 0
        }

        self.clusters = {}
        for cid, m_list in cluster_config_map.items():
            self.clusters[cid] = []
            for m_name in m_list:
                # Models absent from the registry are silently ignored.
                if m_name not in MODEL_REGISTRY:
                    continue
                meta = MODEL_REGISTRY[m_name]
                # Make sure every tier that can be selected has a cost bucket.
                self.stats.setdefault(f"cost_tier_{meta['tier']}", 0.0)
                for temp, scores in meta["variations"].items():
                    self.clusters[cid].append({
                        "name": m_name, "temp": temp, "tier": meta["tier"],
                        "score": scores["score"], "pure_score": scores["pure_score"],
                        "cost": meta["cost"], "saturated": False
                    })

    def find_highest_pure_score_model(self):
        """Return (cluster_id, agent) with the best pure_score among unsaturated agents.

        Ties go to the agent encountered first. Returns (None, None) when every
        agent is saturated.
        """
        best_cid, best_agent = None, None
        for cid, agent_list in self.clusters.items():
            for agent in agent_list:
                if agent["saturated"]:
                    continue
                if best_agent is None or agent["pure_score"] > best_agent["pure_score"]:
                    best_cid, best_agent = cid, agent
        return best_cid, best_agent

    def get_active_agent(self, cluster_id):
        """Return the unsaturated agent with the highest score in a cluster, or None."""
        valid_agents = [a for a in self.clusters[cluster_id] if not a["saturated"]]
        if not valid_agents:
            return None
        # max() keeps the first agent among equal scores, like a stable sort.
        return max(valid_agents, key=lambda a: a["score"])

    def get_next_move(self):
        """Choose the next (cluster_id, agent) to query, or (None, None) to stop."""
        if self.consecutive_low_yields >= self.TERMINATION_LIMIT:
            return None, None

        # Rescue mode: ignore clusters and go straight to the best pure_score.
        if self.consecutive_low_yields >= self.RESCUE_STRIKES_LIMIT:
            cid, best_agent = self.find_highest_pure_score_model()
            if best_agent:
                return cid, best_agent
            return None, None

        candidates = []
        for cid in self.clusters:
            agent = self.get_active_agent(cid)
            if agent:
                candidates.append((cid, agent))
        if not candidates:
            return None, None

        # Prefer a different cluster than the previous call; fall back to the
        # same cluster only when it is the sole one with agents left.
        target_pool = [c for c in candidates
                       if self.last_cluster is None or c[0] != self.last_cluster]
        if not target_pool:
            target_pool = candidates
        return max(target_pool, key=lambda c: c[1]["score"])

    def normalize(self, text): return text.lower().strip()

    def process_batch(self, batch, model_name, temp, tier, sim_threshold):
        """De-duplicate a batch against the knowledge base.

        A label is rejected when its highest cosine similarity to an accepted
        label exceeds `sim_threshold`; otherwise it is accepted and added to the
        knowledge base immediately, so later labels in the same batch are
        compared against it too. Every processed label is appended to
        master_log. Non-string and empty labels are skipped.

        Returns:
            (accepted_count, rejected_count) for this batch.
        """
        acc = 0
        rej = 0
        for item in batch:
            # Guard against malformed provider output.
            if not isinstance(item, str):
                continue
            clean = self.normalize(item)
            if not clean:
                continue
            new_emb = embedding_model.encode([clean])

            status = "Accepted"
            sim_score = 0.0
            conflict = None

            if self.kb_embeddings is not None:
                sims = cosine_similarity(new_emb, self.kb_embeddings)[0]
                idx = np.argmax(sims)
                sim_score = float(sims[idx])
                conflict = self.kb_text[idx]
                if sim_score > sim_threshold:
                    status = "Rejected"

            self.master_log.append({
                "Config": self.config_name, "Project": self.pid,
                "Model": model_name.split('/')[-1], "Temp": temp, "Tier": tier,
                "Item": clean, "Status": status,
                "Sim": round(sim_score, 4), "Conflict": conflict
            })

            if status == "Accepted":
                print(f"      ✓ {item}")
                acc += 1
                self.kb_text.append(clean)
                if self.kb_embeddings is None:
                    self.kb_embeddings = new_emb
                else:
                    self.kb_embeddings = np.vstack([self.kb_embeddings, new_emb])
            else:
                rej += 1

        return acc, rej

    def calculate_metrics(self):
        """Return (dispersion, normalized_entropy) of the accepted labels.

        Dispersion is the mean of the full pairwise cosine-distance matrix.
        NOTE(review): that mean includes the zero diagonal - confirm this is
        intended. Entropy is computed over KMeans cluster sizes (at most 5
        clusters) and divided by log(n_clusters). Returns (0.0, 0.0) with fewer
        than two labels; entropy falls back to 0.0 if clustering fails.
        """
        if self.kb_embeddings is None or len(self.kb_text) < 2:
            return 0.0, 0.0
        dists = cosine_distances(self.kb_embeddings)
        dispersion = np.mean(dists)
        try:
            n_clusters = min(5, len(self.kb_text))
            kmeans = KMeans(n_clusters=n_clusters, n_init='auto', random_state=42).fit(self.kb_embeddings)
            probs = np.bincount(kmeans.labels_, minlength=n_clusters) / len(self.kb_text)
            ent = entropy(probs, base=np.e) / np.log(n_clusters)
        except Exception as exc:
            # Best-effort metric: keep the run's results, but say why the
            # entropy is missing instead of hiding the error.
            print(f"    ⚠ Entropy computation failed: {exc}")
            ent = 0.0
        return dispersion, ent

    def run(self, COST_CAP, N_S_VG_CAP, SATURATION_YIELD_THRESHOLD, SIM_THRESHOLD):
        """Run the elicitation loop until a cap is hit or no agent remains.

        Args:
            COST_CAP: stop once accumulated cost reaches this value.
            N_S_VG_CAP: stop once this many labels have been accepted.
            SATURATION_YIELD_THRESHOLD: an agent accepting fewer labels than
                this in one call is marked saturated.
            SIM_THRESHOLD: similarity above which a label is rejected.

        Returns:
            (stats, master_log).
        """
        iteration = 0
        while True:
            iteration += 1

            if self.stats["cost"] >= COST_CAP:
                print(f"\n⚠ Cost cap reached: ${self.stats['cost']:.4f}")
                break
            if self.stats["accepted"] >= N_S_VG_CAP:
                print(f"\n⚠ Item cap reached: {self.stats['accepted']} items")
                break

            cid, agent_data = self.get_next_move()
            if not agent_data:
                print("\n⚠ No more agents available")
                break

            m_name = agent_data["name"]
            m_temp = agent_data["temp"]

            print(f"\n[Iteration {iteration}] Cluster {cid} selected: {m_name.split('/')[-1]} @ temp={m_temp}")

            # Make live API call
            # NOTE(review): this counter also increases when the provider
            # declines to call because its repeat limit was reached.
            batch, tokens = self.provider.get_next_batch(m_name, m_temp)
            self.stats["api_calls"] += 1

            cost = (tokens / 1_000_000) * agent_data["cost"]
            self.stats["cost"] += cost

            # Bucket spend by tier; works for any tier number, not just 0-2.
            tier_key = f"cost_tier_{agent_data['tier']}"
            self.stats[tier_key] = self.stats.get(tier_key, 0.0) + cost

            log_key = f"{m_name.split('/')[-1]}@{m_temp}"
            self.stats["model_counts"][log_key] += 1

            if not batch:
                print(f"    ⚠ No data returned - marking as saturated")
                agent_data["saturated"] = True
                self.consecutive_low_yields += 1
            else:
                print(f"    Processing {len(batch)} items...")
                acc, rej = self.process_batch(batch, m_name, m_temp, agent_data['tier'], SIM_THRESHOLD)
                self.stats["accepted"] += acc
                self.stats["rejected"] += rej
                print(f"    → Accepted: {acc}, Rejected: {rej}, Total: {self.stats['accepted']}, Cost: ${self.stats['cost']:.4f}")

                if acc < SATURATION_YIELD_THRESHOLD:
                    print(f"    ⚠ Low yield ({acc} < {SATURATION_YIELD_THRESHOLD}) - marking as saturated")
                    agent_data["saturated"] = True
                    self.consecutive_low_yields += 1
                else:
                    self.consecutive_low_yields = 0

            self.last_cluster = cid

        d, e = self.calculate_metrics()
        self.stats["dispersion"] = d
        self.stats["entropy"] = e
        return self.stats, self.master_log


In [None]:
def generate_frequency_table_with_rejected(logs_df):
    """
    Build a mention-count table for the accepted (canonical) roles.

    Every distinct accepted item counts once, plus one for each rejected row
    whose 'Conflict' value equals that item. No embeddings are recomputed.

    Returns a DataFrame with columns 'Canonical_Role' and 'Total_Mentions',
    sorted by 'Total_Mentions' descending, or an empty DataFrame when the log
    holds no accepted rows.
    """
    accepted_rows = logs_df[logs_df['Status'] == 'Accepted']
    rejected_rows = logs_df[logs_df['Status'] == 'Rejected']

    if accepted_rows.empty:
        print("No accepted items found.")
        return pd.DataFrame()

    print(f"\nGenerating frequency table...")
    print(f"  Accepted items: {len(accepted_rows)}")
    print(f"  Rejected items: {len(rejected_rows)}")

    # How many rejected rows point at each accepted item.
    hits_per_role = rejected_rows['Conflict'].value_counts().to_dict()

    records = [
        {
            'Canonical_Role': role,
            'Total_Mentions': hits_per_role.get(role, 0) + 1,
        }
        for role in accepted_rows['Item'].unique()
    ]

    ranked = pd.DataFrame(records).sort_values('Total_Mentions', ascending=False)
    return ranked.reset_index(drop=True)

def generate_rejection_details(logs_df):
    """
    List, for each accepted role, the rejected items that collided with it.

    Returns a DataFrame with columns 'Conflict', 'Item' and 'Count', ordered by
    'Conflict' ascending and then 'Count' descending, or an empty DataFrame
    when nothing was rejected.
    """
    only_rejected = logs_df.loc[logs_df['Status'] == 'Rejected']
    if only_rejected.empty:
        return pd.DataFrame()

    # One row per (accepted role, rejected wording) pair with its frequency.
    pair_sizes = only_rejected.groupby(['Conflict', 'Item']).size()
    pair_counts = pair_sizes.reset_index(name='Count')

    return pair_counts.sort_values(['Conflict', 'Count'], ascending=[True, False])

# --- EXECUTION ---
def run_experiment(project_name, project_description,
                   cost_cap=5.0, item_cap=200, yield_threshold=3, sim_threshold=0.8):
    """Run one live elicitation experiment and write its results to CSV.

    Builds a LiveAPIProvider and a ProjectSimulation over the clusters from
    build_cluster_config(), runs the simulation, prints a summary, and - when
    at least one item was logged - writes three files to the working directory:
    live_api_results_detailed.csv, frequency_table_with_rejected.csv (only if
    the frequency table is non-empty) and live_api_results_summary.csv.

    Args:
        project_name: label used in output and log rows.
        project_description: text inserted into the prompt.
        cost_cap: stop once accumulated cost reaches this value.
        item_cap: stop once this many items have been accepted.
        yield_threshold: an agent accepting fewer items than this in one call
            is marked saturated.
        sim_threshold: cosine similarity above which an item is rejected.
    """
    print("="*80)
    print("LIVE API EXPERIMENT ")
    print("="*80)
    print(f"\nProject: {project_name}")
    print(f"Max repeats per model/temp: {MAX_REPEATS_PER_MODEL}")

    # Initialize provider (NO pre-generation)
    provider = LiveAPIProvider(API_KEYS, project_description, MAX_REPEATS_PER_MODEL)

    # Configuration
    cluster_config = build_cluster_config()

    print(f"\nSimulation Parameters:")
    print(f"  Cost Cap: ${cost_cap}")
    print(f"  Item Cap: {item_cap}")
    print(f"  Yield Threshold: {yield_threshold}")
    print(f"  Similarity Threshold: {sim_threshold}")
    print("="*80)

    # Run simulation (API calls happen during simulation)
    sim = ProjectSimulation(provider, project_name, cluster_config, config_name="4_role_Clusters")
    stats, logs = sim.run(cost_cap, item_cap, yield_threshold, sim_threshold)

    # Display results
    runs_list = [f"{m}: {c}" for m, c in sorted(stats['model_counts'].items(), key=lambda x: x[1], reverse=True)]
    runs_str = ", ".join(runs_list)

    print(f"\n{'='*80}")
    print(f"FINAL RESULTS")
    print(f"{'='*80}")
    print(f"Total API Calls: {stats['api_calls']}")
    print(f"Items Accepted: {stats['accepted']}")
    print(f"Items Rejected: {stats['rejected']}")
    print(f"Total Cost: ${stats['cost']:.4f}")
    # Print every tier bucket present in the stats, in numeric order, so that
    # tiers beyond 0-2 are not left out of the breakdown.
    tier_keys = sorted(
        (key for key in stats if key.startswith("cost_tier_")),
        key=lambda key: int(key.rsplit("_", 1)[1])
    )
    for key in tier_keys:
        print(f"  Tier {key.rsplit('_', 1)[1]} Cost: ${stats[key]:.4f}")
    print(f"Dispersion: {stats['dispersion']:.4f}")
    print(f"Entropy: {stats['entropy']:.4f}")
    print(f"\nModel Usage: {runs_str}")
    print(f"{'='*80}\n")

    # Save results (nothing is written when no item was processed)
    if logs:
        logs_df = pd.DataFrame(logs)
        logs_df.to_csv("live_api_results_detailed.csv", index=False)
        print("✓ Detailed logs saved to live_api_results_detailed.csv")

        print("\n" + "="*80)
        print("FREQUENCY ANALYSIS (Including Rejected Mappings)")
        print("="*80)

        # Main frequency table with rejected counts
        freq_table = generate_frequency_table_with_rejected(logs_df)
        if not freq_table.empty:
            freq_table.to_csv("frequency_table_with_rejected.csv", index=False)
            # Report the file that was actually written.
            print("✓ Role frequency saved to frequency_table_with_rejected.csv")

            # Top 20 by total mentions
            print("\nTop 20 Most Mentioned Roles (Accepted + Rejected):")
            top_roles = freq_table.head(20)[['Canonical_Role', 'Total_Mentions']]
            print(top_roles.to_string(index=False))

        # Save summary
        summary = pd.DataFrame([{
            "API_Calls": stats['api_calls'],
            "Items": stats['accepted'],
            "Rejected": stats['rejected'],
            "Cost": stats['cost'],
            "Dispersion": stats['dispersion'],
            "Entropy": stats['entropy'],
            "Model_Usage": runs_str
        }])
        summary.to_csv("live_api_results_summary.csv", index=False)
        print("✓ Summary saved to live_api_results_summary.csv")

# Driver cell: runs one experiment with a sample project. This issues live,
# billable API calls and writes the result CSV files to the working directory.
if __name__ == "__main__":
    project_name = "AI-powered student assessment"
    # Free-text description; it is inserted verbatim into the user prompt.
    project_description = """
    An AI-powered student assessment system that uses machine learning to evaluate
    student performance, predict learning outcomes, and provide personalized recommendations
    for educational interventions. The system analyzes student data including test scores,
    attendance records, and behavioral patterns to identify at-risk students.
    """

    run_experiment(project_name,project_description)


LIVE API EXPERIMENT 

Project: AI-powered student assessment
Max repeats per model/temp: 3

Simulation Parameters:
  Cost Cap: $5.0
  Item Cap: 200
  Yield Threshold: 3
  Similarity Threshold: 0.8

[Iteration 1] Cluster 3 selected: gpt-oss-20b @ temp=0.3
    🔄 Calling DeepInfra: openai/gpt-oss-20b @ temp=0.3...
    ✓ API call successful (5292 tokens)
    Processing 30 items...
      ✓ Student
      ✓ Parent
      ✓ Teacher
      ✓ School administrator
      ✓ School counselor
      ✓ School psychologist
      ✓ School board member
      ✓ School district superintendent
      ✓ School district IT staff
      ✓ School district data privacy officer
      ✓ School district legal counsel
      ✓ School district compliance officer
      ✓ School district risk management officer
      ✓ School district procurement officer
      ✓ School district finance officer
      ✓ School district curriculum specialist
      ✓ School district assessment specialist
  