In [1]:
import numpy as np
import pandas as pd
from scipy import stats

class ChatParkSimulation:
    """Simulate a multi-arm, multi-stage comparison of LLMs and human experts.

    Every arm (an LLM or a human expert) answers the same simulated questions
    in each stage and receives three scores (Knowledge, Empathy, Usefulness),
    each clamped to the range 1-5. After each stage, arms whose stage mean
    falls below ``drop_threshold`` in any category are deactivated. The trial
    stops early when an arm's stage means exceed ``efficacy_threshold`` in
    every category, or when no arm is left active.

    Randomness comes from NumPy's global generator, so call
    ``np.random.seed(...)`` beforehand for reproducible runs.

    Args:
        num_llms: Number of LLM arms (named ``LLM_1`` ... ``LLM_n``).
        num_human_experts: Number of expert arms (``Expert_1`` ...).
        num_stages: Maximum number of stages to run.
        num_questions_per_stage: Questions asked of every active arm per stage.
        drop_threshold: An arm is dropped when any stage-mean score is
            strictly below this value.
        efficacy_threshold: The trial stops for efficacy when an arm's
            stage-mean scores are all strictly above this value.
    """

    # Column layout of ``self.data``: one row per (stage, arm, question).
    DATA_COLUMNS = ['Stage', 'Arm', 'Question', 'Knowledge', 'Empathy', 'Usefulness']
    # The subset of columns holding the numeric ratings.
    SCORE_COLUMNS = ['Knowledge', 'Empathy', 'Usefulness']

    def __init__(self, num_llms, num_human_experts, num_stages, num_questions_per_stage,
                 drop_threshold=3.5, efficacy_threshold=4.5):
        self.num_llms = num_llms
        self.num_human_experts = num_human_experts
        self.num_arms = num_llms + num_human_experts
        self.num_stages = num_stages
        self.num_questions_per_stage = num_questions_per_stage
        self.drop_threshold = drop_threshold
        self.efficacy_threshold = efficacy_threshold

        # Initialize arms (LLMs + human experts)
        self.arms = [f"LLM_{i+1}" for i in range(num_llms)] + [f"Expert_{i+1}" for i in range(num_human_experts)]

        # Set up data storage (empty until the first stage records responses)
        self.data = pd.DataFrame(columns=self.DATA_COLUMNS)

        # Set up arm status (active/inactive)
        self.arm_status = {arm: True for arm in self.arms}

    def generate_question(self):
        """Return a placeholder question string with a random id in [0, 1000)."""
        # In a real scenario, this would pull from a database of patient questions
        return f"Question about Parkinson's disease #{np.random.randint(1000)}"

    def generate_response(self, arm):
        """Simulate one response and return ``(knowledge, empathy, usefulness)``.

        A shared base quality is drawn first (mean 3.5 for arms whose name
        contains ``'LLM'``, mean 4 otherwise), then each score adds its own
        noise and is clamped to [1, 5]. All three values are floats.
        """
        # Simulate response quality based on arm type (LLM vs Expert)
        if 'LLM' in arm:
            base_quality = np.random.normal(3.5, 0.5)  # LLMs slightly lower base quality
        else:
            base_quality = np.random.normal(4, 0.5)  # Experts slightly higher base quality

        # The draw order (knowledge, empathy, usefulness) is kept fixed so that
        # seeded runs stay reproducible.
        knowledge = float(np.clip(base_quality + np.random.normal(0, 0.5), 1, 5))
        empathy = float(np.clip(base_quality + np.random.normal(0, 0.5), 1, 5))
        usefulness = float(np.clip(base_quality + np.random.normal(0, 0.5), 1, 5))

        return knowledge, empathy, usefulness

    def run_stage(self, stage):
        """Ask every active arm each question of ``stage`` and append the rows to ``self.data``."""
        stage_records = []
        for _ in range(self.num_questions_per_stage):
            question = self.generate_question()
            for arm in self.arms:
                if not self.arm_status[arm]:
                    continue
                knowledge, empathy, usefulness = self.generate_response(arm)
                stage_records.append({
                    'Stage': stage,
                    'Arm': arm,
                    'Question': question,
                    'Knowledge': knowledge,
                    'Empathy': empathy,
                    'Usefulness': usefulness
                })

        if not stage_records:
            # Nothing was asked or no arm is active: leave the data untouched.
            return

        stage_frame = pd.DataFrame(stage_records, columns=self.DATA_COLUMNS)
        if self.data.empty:
            # Concatenating onto the empty placeholder frame triggers a pandas
            # FutureWarning and can degrade column dtypes, so adopt the first
            # non-empty stage frame directly.
            self.data = stage_frame
        else:
            self.data = pd.concat([self.data, stage_frame], ignore_index=True)

    def analyze_stage(self, stage):
        """Apply the drop/stop rules to ``stage``; return True if the trial should stop.

        Side effects: deactivates arms in ``self.arm_status`` and prints each
        decision. A stage with no recorded rows drops nothing and can only
        stop the trial through the futility check.
        """
        stage_data = self.data[self.data['Stage'] == stage]

        if not stage_data.empty:
            arm_means = stage_data.groupby('Arm')[self.SCORE_COLUMNS].mean()

            # Simple decision rule: drop arms whose mean score is below the
            # drop threshold in any category
            for arm in self.arms:
                if not self.arm_status[arm] or arm not in arm_means.index:
                    continue
                if (arm_means.loc[arm] < self.drop_threshold).any():
                    self.arm_status[arm] = False
                    print(f"Stage {stage}: Dropping arm {arm} due to low scores")

            # Check if we should stop for efficacy (an arm beats the efficacy
            # threshold in all categories). When several arms qualify, report
            # the one with the highest overall mean rather than the first
            # name in index order.
            qualifying = arm_means.loc[(arm_means > self.efficacy_threshold).all(axis=1)]
            if not qualifying.empty:
                best_arm = qualifying.mean(axis=1).idxmax()
                print(f"Stage {stage}: Stopping for efficacy. Best arm: {best_arm}")
                return True

        # Check if we should stop for futility (if all arms are inactive)
        if not any(self.arm_status.values()):
            print(f"Stage {stage}: Stopping for futility. No arms remain active.")
            return True

        return False

    def run_trial(self):
        """Run stages until a stop rule fires or ``num_stages`` is reached, then print a summary.

        The summary table lists per-arm means over all recorded data. The best
        arm is chosen among arms that are still active; if every arm was
        dropped, all arms are considered.
        """
        for stage in range(1, self.num_stages + 1):
            print(f"Running Stage {stage}")
            self.run_stage(stage)
            if self.analyze_stage(stage):
                break

        # Final analysis
        print("\nFinal Results:")
        if self.data.empty:
            # No stage produced any rows, so there is nothing to rank.
            print("No responses were recorded.")
            return

        final_data = self.data.groupby('Arm')[self.SCORE_COLUMNS].mean()
        print(final_data)

        # A dropped arm must not be declared the winner while others survive.
        surviving = [arm for arm in final_data.index if self.arm_status[arm]]
        candidates = final_data.loc[surviving] if surviving else final_data
        best_arm = candidates.mean(axis=1).idxmax()
        print(f"\nBest performing arm: {best_arm}")

# Seed NumPy's global generator first so the simulated trial is repeatable
np.random.seed(42)

# Trial design: three LLM arms vs. two expert arms, up to five stages of twenty questions
trial_config = dict(
    num_llms=3,
    num_human_experts=2,
    num_stages=5,
    num_questions_per_stage=20,
)
sim = ChatParkSimulation(**trial_config)
sim.run_trial()

Running Stage 1
Stage 1: Dropping arm LLM_1 due to low scores
Stage 1: Dropping arm LLM_2 due to low scores
Stage 1: Dropping arm LLM_3 due to low scores
Running Stage 2
Running Stage 3
Running Stage 4
Running Stage 5

Final Results:
          Knowledge   Empathy  Usefulness
Arm                                      
Expert_1   3.892595  3.998176    3.917999
Expert_2   4.158458  4.060552    4.114116
LLM_1      3.314113  3.492615    3.500321
LLM_2      3.195856  3.142112    3.264809
LLM_3      3.500844  3.535780    3.399151

Best performing arm: Expert_2


  self.data = pd.concat([self.data, pd.DataFrame(stage_data)], ignore_index=True)


: 