In [None]:
# %% import pandas as pd
import numpy as np
import pystan
from scipy.stats import kendalltau
import arviz as az
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

df = pd.read_csv('Elicitation Formats/rank-rank/rank-rank_1_4.csv')  # Replace with your actual filename

# %% Convert string representations of lists to lists
import ast

# These three columns hold Python list literals stored as strings; parse each back into a list.
for list_column in ('options', 'votes', 'predictions'):
    df[list_column] = df[list_column].apply(ast.literal_eval)

# Constants
N = 10  # Number of voters (rows of the distance matrices)
J = df['questions'].nunique()  # Number of distinct questions (columns of the distance matrices)
K = len(df['options'].iloc[0])  # Number of options, read from the first row (assumed uniform)

# Distance matrices: entry [n, j] is 1 - Kendall tau between a response and the option order.
kendall_tau_votes = np.zeros((N, J))
kendall_tau_predictions = np.zeros((N, J))

for j in range(J):
    # Rows are matched on question id j + 1, so ids are expected to run from 1 to J.
    question_responses = df[df['questions'] == j + 1]
    for i, response in question_responses.iterrows():
        tau_vote, _ = kendalltau(response['votes'], response['options'])
        tau_prediction, _ = kendalltau(response['predictions'], response['options'])
        n = i % N  # the row's index label modulo N picks the voter slot
        kendall_tau_votes[n, j] = 1 - tau_vote
        kendall_tau_predictions[n, j] = 1 - tau_prediction

# Stan Data
stan_data = dict(
    N=N,
    K=K,
    J=J,
    kendall_tau_votes=kendall_tau_votes,
    kendall_tau_predictions=kendall_tau_predictions,
)

In [None]:
def run_model_with_priors(stan_data, priors):
    """Compile and sample the two-group (expert / non-expert) mixture model.

    Each voter's row of Kendall-tau distances (votes and predictions) is modelled
    as zero-mean normal with a group-specific scale; the group membership is
    marginalised out with mixture weights ``prob_group`` that have a
    Dirichlet(alpha) prior.

    Parameters
    ----------
    stan_data : mapping
        Data for the Stan program (``N``, ``J``, ``kendall_tau_votes``,
        ``kendall_tau_predictions``; other entries are passed through as given).
        It is copied, never modified, so the same dict can be reused for other
        models or prior sets.
    priors : mapping
        ``'vote_expert'``, ``'vote_nonexpert'``, ``'pred_expert'`` and
        ``'pred_nonexpert'`` map to ``(location, scale)`` pairs that are
        interpolated into the normal priors on the dispersions; ``'alpha'`` is a
        length-2 sequence of Dirichlet concentrations.

    Returns
    -------
    The fit object returned by ``StanModel.sampling``.

    Raises
    ------
    ValueError
        If ``priors['alpha']`` does not have exactly two entries.
    """
    stan_model_code = f"""
    data {{
      int<lower=1> N;
      int<lower=1> J;
      real kendall_tau_votes[N, J];
      real kendall_tau_predictions[N, J];
      vector[2] alpha;  // Alpha as data
    }}

    parameters {{
      real<lower=0> dispersion_vote_expert;
      real<lower=0> dispersion_vote_nonexpert;
      real<lower=0> dispersion_pred_expert;
      real<lower=0> dispersion_pred_nonexpert;
      simplex[2] prob_group;
    }}

    model {{
      dispersion_vote_expert ~ normal({priors['vote_expert'][0]}, {priors['vote_expert'][1]});
      dispersion_vote_nonexpert ~ normal({priors['vote_nonexpert'][0]}, {priors['vote_nonexpert'][1]});
      dispersion_pred_expert ~ normal({priors['pred_expert'][0]}, {priors['pred_expert'][1]});
      dispersion_pred_nonexpert ~ normal({priors['pred_nonexpert'][0]}, {priors['pred_nonexpert'][1]});
      
      prob_group ~ dirichlet(alpha);  // Use alpha from data

      for (n in 1:N) {{
         vector[2] log_lik;
         log_lik[1] = normal_lpdf(kendall_tau_votes[n] | 0, dispersion_vote_expert) + 
                      normal_lpdf(kendall_tau_predictions[n] | 0, dispersion_pred_expert);
         log_lik[2] = normal_lpdf(kendall_tau_votes[n] | 0, dispersion_vote_nonexpert) + 
                      normal_lpdf(kendall_tau_predictions[n] | 0, dispersion_pred_nonexpert);
         target += log_sum_exp(log_lik + log(prob_group));
       }}
     }}
   generated quantities {{
        vector[N] log_lik;
        for (n in 1:N) {{
            vector[2] log_lik_n;
            log_lik_n[1] = normal_lpdf(kendall_tau_votes[n] | 0, dispersion_vote_expert) +
                           normal_lpdf(kendall_tau_predictions[n] | 0, dispersion_pred_expert);
            log_lik_n[2] = normal_lpdf(kendall_tau_votes[n] | 0, dispersion_vote_nonexpert) +
                           normal_lpdf(kendall_tau_predictions[n] | 0, dispersion_pred_nonexpert);
            log_lik[n] = log_sum_exp(log_lik_n + log(prob_group));
        }}
    }}
    """

    alpha = priors['alpha']
    # The Stan program declares `vector[2] alpha`; reject anything else instead of
    # silently truncating (e.g. a three-group prior set passed by mistake).
    if len(alpha) != 2:
        raise ValueError(f"priors['alpha'] must have exactly 2 entries, got {len(alpha)}")

    # Convert alpha list to a numpy array for Stan
    alpha_array = np.array([alpha[0], alpha[1]])

    # Work on a shallow copy so the caller's dict is not mutated by adding 'alpha'.
    model_data = dict(stan_data)
    model_data['alpha'] = alpha_array

    sm = pystan.StanModel(model_code=stan_model_code)
    fit = sm.sampling(data=model_data,
                      iter=8000,
                      warmup=2000,
                      chains=4,
                      control={
                          'adapt_delta': 0.95,
                          'max_treedepth': 15
                      })
    return fit

# Prior configurations, keyed by a label. Each dispersion entry is the
# (location, scale) pair of its normal prior; 'alpha' holds the Dirichlet
# concentrations for the group probabilities.
prior_sets = {
    'case1': dict(
        vote_expert=(0.1, 0.2),
        vote_nonexpert=(0.8, 0.2),
        pred_expert=(0.4, 0.3),
        pred_nonexpert=(0.8, 0.3),
        alpha=[2, 4],
    ),
}

# Fit the model once per prior configuration and keep every fit under its label.
results = {}
for prior_name in prior_sets:
    priors = prior_sets[prior_name]
    print(f"Running model with {prior_name} priors...")
    results[prior_name] = run_model_with_priors(stan_data, priors)


In [None]:
# Pull the first column of the Stan summary table for every parameter row
# (used below as the posterior mean of each parameter).
summary = results['case1'].summary()
param_names = summary['summary_rownames']
posterior_means = summary['summary'][:, 0]
param_mean_dict = {name: mean for name, mean in zip(param_names, posterior_means)}

# Parameters handed to the simulation: the four dispersions keep their Stan
# names, and the first mixture weight is exposed as 'prob_expert'.
learned_params = {
    name: param_mean_dict[name]
    for name in (
        'dispersion_vote_expert',
        'dispersion_vote_nonexpert',
        'dispersion_pred_expert',
        'dispersion_pred_nonexpert',
    )
}
learned_params['prob_expert'] = param_mean_dict['prob_group[1]']


In [None]:
import pandas as pd
import random
from typing import List, Tuple
from itertools import permutations
from scipy.stats import kendalltau
import numpy as np
from tqdm import tqdm

class Vote:
    """Plain record of one simulated response to one question.

    Attributes are stored exactly as passed in:
    question_number, options, ranking, predicted_probs, is_expert.
    Note that ``Voter.vote`` in this notebook passes the single predicted
    ranking (not a probability dict) as ``predicted_probs``.
    """

    def __init__(self, question_number: int, options: List[int], ranking: List[int], predicted_probs: dict, is_expert: bool):
        # What was asked.
        self.question_number, self.options = question_number, options
        # What the voter answered.
        self.ranking, self.predicted_probs = ranking, predicted_probs
        # Who answered.
        self.is_expert = is_expert

class Voter:
    """Simulated voter who is either an expert or a non-expert."""

    def __init__(self, is_expert: bool, params):
        self.is_expert = is_expert
        self.params = params

    def vote(self, question_number: int, options: List[int], ground_truth: List[int], all_worlds: List[Tuple[int]]):
        """Produce this voter's ranking and predicted ranking for one question.

        Experts centre their signal distribution on ``ground_truth``;
        non-experts centre it on a uniformly drawn element of ``all_worlds``.
        The vote is the ranking with the highest normalised signal probability,
        and the prediction is the highest-scoring *other* ranking under the
        conditional probabilities computed with ``mode='pred'``.

        Returns a ``Vote`` holding the question number, options, chosen signal,
        predicted ranking and the voter's expert flag.
        """
        centroid = ground_truth if self.is_expert else random.choice(all_worlds)

        # Mallows-style weight of every candidate signal around the centroid.
        raw_weights = {
            candidate: signal_probability(candidate, centroid, self.is_expert, self.params, mode='vote')
            for candidate in all_worlds
        }

        # Rescale the weights so they sum to 1, then take the most probable signal.
        weight_sum = sum(raw_weights.values())
        scaled_weights = {candidate: weight / weight_sum for candidate, weight in raw_weights.items()}
        signal = max(scaled_weights, key=scaled_weights.get)

        # Probability of each possible signal s_j given the voter's own signal.
        conditional_probs = {
            s_j: compute_conditional_prob(s_j, signal, all_worlds, ground_truth, self.is_expert, self.params, mode='pred')
            for s_j in all_worlds
        }

        predicted_probs = self.predict(signal, conditional_probs, all_worlds, ground_truth)

        # The prediction is the ranking with the largest predicted probability.
        prediction = max(predicted_probs, key=predicted_probs.get)

        return Vote(question_number, options, signal, prediction, self.is_expert)

    def predict(self, signal, conditional_probs, all_worlds, ground_truth):
        """Return ``conditional_probs`` without the voter's own ``signal``, renormalised to sum to 1.

        ``all_worlds`` and ``ground_truth`` are accepted but not used.
        """
        others = {}
        for world, prob in conditional_probs.items():
            if world != signal:
                others[world] = prob

        remaining_mass = sum(others.values())
        return {world: prob / remaining_mass for world, prob in others.items()}


def mallows_distance(ranking1, ranking2):
    """Return ``1 - tau``, where tau is the Kendall rank correlation of the two rankings.

    Identical rankings give 0; the value grows as the rankings disagree.
    """
    correlation = kendalltau(ranking1, ranking2)[0]
    return 1 - correlation

def normalization_constant(phi, m):
    """Return the product over i = 1..m-1 of (1 + phi + ... + phi**i).

    For m <= 1 the product is empty and the result is 1.
    """
    z = 1
    i = 1
    while i < m:
        geometric_sum = sum([phi**j for j in range(i + 1)])
        z = z * geometric_sum
        i += 1
    return z

def signal_probability(signal, world, is_expert, params, mode='vote'):
    """Return phi**d / Z for ``signal`` around ``world``.

    ``d`` is ``mallows_distance(signal, world)``, ``Z`` is
    ``normalization_constant(phi, len(signal))`` and ``phi`` is the dispersion
    looked up in ``params``: the vote dispersions when ``mode == 'vote'``, the
    prediction dispersions for any other mode, with the expert or non-expert
    variant chosen by ``is_expert``.
    """
    if mode == 'vote':
        expert_key, nonexpert_key = 'dispersion_vote_expert', 'dispersion_vote_nonexpert'
    else:
        expert_key, nonexpert_key = 'dispersion_pred_expert', 'dispersion_pred_nonexpert'

    phi = params[expert_key if is_expert else nonexpert_key]
    distance = mallows_distance(signal, world)
    return phi**distance / normalization_constant(phi, len(signal))


# Module-level memo for compute_posterior; entries are keyed on every input
# that the cached value depends on (see the key construction below).
computed_posteriors = {}

def compute_posterior(signal, world, all_worlds, ground_truth, is_expert, params, mode='vote'):
    """Return the posterior probability of ``world`` given ``signal`` under a uniform prior.

    posterior = P(signal | world) * prior / sum_w P(signal | w) * prior,
    with prior = 1 / len(all_worlds) and likelihoods from ``signal_probability``.

    Results are memoised in ``computed_posteriors``. The key includes a snapshot
    of ``params`` so that calling with different dispersions never returns a
    value cached for an earlier parameter set. ``all_worlds`` is not part of the
    key; it is assumed to be the same collection for a given signal.

    ``world`` must be hashable (e.g. a tuple), since it is used in the cache key.
    """
    try:
        # Keys are unique, so sorting never compares the values themselves.
        params_key = tuple(sorted(params.items()))
    except (AttributeError, TypeError):
        # Not a plain mapping of hashable values: fall back to object identity.
        params_key = id(params)

    key = (tuple(signal), world, tuple(ground_truth), is_expert, mode, params_key)
    if key in computed_posteriors:
        return computed_posteriors[key]

    prior = 1 / len(all_worlds)
    likelihood = signal_probability(signal, world, is_expert, params, mode=mode)
    # Evidence: total probability of the signal across all candidate worlds.
    total_signal_prob = sum(signal_probability(signal, w, is_expert, params, mode=mode) * prior for w in all_worlds)
    posterior = likelihood * prior / total_signal_prob
    computed_posteriors[key] = posterior
    return posterior




def compute_conditional_prob(s_j, s_k, all_worlds, ground_truth, is_expert, params, mode='vote', num_samples=1000):
    """Monte Carlo estimate of P(s_j | s_k).

    Worlds are drawn (with replacement) from the posterior over worlds given
    the observed signal ``s_k``; the estimate is the average of
    ``signal_probability(s_j, world)`` over the ``num_samples`` draws.

    The likelihood of each distinct sampled world is computed once and reused,
    and accumulation still follows the sample order, so the returned value is
    the same as evaluating every draw separately.

    Raises
    ------
    ValueError
        If ``num_samples`` is less than 1 (the average would be undefined).
    """
    if num_samples < 1:
        raise ValueError("num_samples must be a positive integer")

    weights = [compute_posterior(s_k, w, all_worlds, ground_truth, is_expert, params, mode=mode) for w in all_worlds]
    sampled_worlds = random.choices(all_worlds, weights=weights, k=num_samples)

    # signal_probability is deterministic, so cache it per distinct world
    # (worlds are already required to be hashable by compute_posterior's cache).
    likelihood_by_world = {}
    total_prob = 0.0
    for world_i in sampled_worlds:
        if world_i not in likelihood_by_world:
            likelihood_by_world[world_i] = signal_probability(s_j, world_i, is_expert, params, mode=mode)
        total_prob += likelihood_by_world[world_i]

    return total_prob / num_samples




def simulate_voting(num_voters: int, subset: List[int], ground_truth: List[int], params) -> List[Vote]:
    """Simulate one question answered by a population of experts and non-experts.

    The number of experts is drawn from Binomial(num_voters, params['prob_expert'])
    and the expert positions are chosen uniformly without replacement. Every
    voter then votes on question 1 over all permutations of ``subset``.

    Parameters
    ----------
    num_voters : number of voters to simulate.
    subset : the alternatives being ranked.
    ground_truth : the reference ranking passed to each voter.
    params : mapping with 'prob_expert' plus whatever the voters' probability
        functions read.

    Returns
    -------
    List of ``Vote`` objects, one per voter, in voter order.
    """
    prob_expert = params['prob_expert']
    num_experts = np.random.binomial(num_voters, prob_expert)

    # Start with everyone as a non-expert, then promote a random subset.
    voters = [Voter(is_expert=False, params=params) for _ in range(num_voters)]
    expert_indices = np.random.choice(num_voters, size=num_experts, replace=False)
    for idx in expert_indices:
        voters[idx].is_expert = True

    # Report the expert / non-expert split for each complete block of 100 voters.
    for i in range(100, num_voters + 1, 100):
        experts_in_chunk = sum(voter.is_expert for voter in voters[i-100:i])
        non_experts_in_chunk = 100 - experts_in_chunk
        print(f"In samples {i-99} to {i}: Experts = {experts_in_chunk}, Non-experts = {non_experts_in_chunk}")

    all_worlds = list(permutations(subset))
    question_number = 1

    votes = []
    for voter in tqdm(voters, desc="Simulating Votes"):
        votes.append(voter.vote(question_number, subset, ground_truth, all_worlds))

    return votes


def write_to_csv(votes):
    """Write the votes to 'groups2_mallows_simulated_data.csv', one row per vote.

    Each row carries the vote's question number, options, ranking, prediction
    and expert flag, followed by the constant domain (1) and treatment (6).
    """
    column_names = ['questions', 'options', 'votes', 'predictions', 'is_expert', 'domain', 'treatment']
    rows = [
        [vote.question_number, vote.options, vote.ranking, vote.predicted_probs, vote.is_expert, 1, 6]
        for vote in votes
    ]
    pd.DataFrame(rows, columns=column_names).to_csv('groups2_mallows_simulated_data.csv', index=False)


# Run the simulation with the parameters learned from the real data.
num_voters = 1000
m = 5  # Total number of alternatives

# From here on `m` is the list of alternatives 1..m.
m = list(range(1, m + 1))
# The ground truth is taken to be the alternatives in ascending order;
# substitute the actual ground truth here if one is available.
ground_truth = sorted(m)

votes = simulate_voting(num_voters, m, ground_truth, learned_params)

write_to_csv(votes)

In [None]:
import pandas as pd
import numpy as np
import ast
from scipy.stats import kendalltau

# Read back the simulated votes written by the simulation cell.
df_sim = pd.read_csv('groups2_mallows_simulated_data.csv')

# The list-valued columns come back as strings; parse them into Python objects.
for list_column in ('options', 'votes', 'predictions'):
    df_sim[list_column] = df_sim[list_column].apply(ast.literal_eval)

# Constants
N = 10  # Number of simulated voters kept per question
J = df_sim['questions'].nunique()  # Number of distinct questions
K = len(df_sim['options'].iloc[0])  # Number of items, read from the first row

# Distance matrices: entry [n, j] is 1 - Kendall tau against the option order.
kendall_tau_votes = np.zeros((N, J))
kendall_tau_predictions = np.zeros((N, J))

for j in range(J):
    # Only the first N responses to question j + 1 are used.
    question_responses = df_sim[df_sim['questions'] == j + 1].head(N)
    for i, response in question_responses.iterrows():
        n = i % N  # the row's index label modulo N picks the voter slot
        tau_vote, _ = kendalltau(response['votes'], response['options'])
        tau_prediction, _ = kendalltau(response['predictions'], response['options'])
        kendall_tau_votes[n, j] = 1 - tau_vote
        kendall_tau_predictions[n, j] = 1 - tau_prediction

# Data dictionary for the Stan model, built from the synthetic votes.
stan_data_synth = dict(
    N=N,
    K=K,
    J=J,
    kendall_tau_votes=kendall_tau_votes,
    kendall_tau_predictions=kendall_tau_predictions,
)


In [None]:
# Refit the same model, with the same priors, on the synthetic data and store
# the fit next to the original one for the recovery comparison.
results['synth_case1'] = fit_synth = run_model_with_priors(stan_data_synth, prior_sets['case1'])


In [None]:
def extract_mean_params(fit):
    """Map each row name of ``fit.summary()`` to the first column of its summary table.

    The first column is used by the callers as the posterior mean.
    """
    table = fit.summary()
    row_names = table['summary_rownames']
    first_column = table['summary'][:, 0]
    return {name: value for name, value in zip(row_names, first_column)}

# Summary-table means for the fit on real data and the fit on synthetic data.
original_params = extract_mean_params(results['case1'])
synth_params = extract_mean_params(results['synth_case1'])

# One record per parameter: both estimates and the error relative to the original.
comparison_records = []

tracked_parameters = (
    'dispersion_vote_expert', 'dispersion_vote_nonexpert',
    'dispersion_pred_expert', 'dispersion_pred_nonexpert',
    'prob_group[1]', 'prob_group[2]',
)
for key in tracked_parameters:
    baseline, recovered = original_params[key], synth_params[key]
    rel_error = abs(baseline - recovered) / baseline
    comparison_records.append(dict(
        parameter=key,
        original=baseline,
        synthetic=recovered,
        relative_error=rel_error,
    ))
    print(f"{key}: Relative Error = {rel_error:.4f}")

# Persist the comparison table.
df_compare = pd.DataFrame(comparison_records)
df_compare.to_csv('group2_parameter_recovery_comparison_mallows.csv', index=False)


In [None]:
# %% import pandas as pd
import numpy as np
import pystan
from scipy.stats import kendalltau
import arviz as az
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

df = pd.read_csv('Elicitation Formats/rank-rank/rank-rank_1_4.csv')

# %% Convert string representations of lists to lists
import ast

# These three columns hold Python list literals stored as strings; parse each back into a list.
for list_column in ('options', 'votes', 'predictions'):
    df[list_column] = df[list_column].apply(ast.literal_eval)

# Constants
N = 10  # Number of voters (rows of the distance matrices)
J = df['questions'].nunique()  # Number of distinct questions (columns of the distance matrices)
K = len(df['options'].iloc[0])  # Number of options, read from the first row (assumed uniform)

# Distance matrices: entry [n, j] is 1 - Kendall tau between a response and the option order.
kendall_tau_votes = np.zeros((N, J))
kendall_tau_predictions = np.zeros((N, J))

for j in range(J):
    # Rows are matched on question id j + 1, so ids are expected to run from 1 to J.
    question_responses = df[df['questions'] == j + 1]
    for i, response in question_responses.iterrows():
        tau_vote, _ = kendalltau(response['votes'], response['options'])
        tau_prediction, _ = kendalltau(response['predictions'], response['options'])
        n = i % N  # the row's index label modulo N picks the voter slot
        kendall_tau_votes[n, j] = 1 - tau_vote
        kendall_tau_predictions[n, j] = 1 - tau_prediction

# Stan Data
stan_data = dict(
    N=N,
    K=K,
    J=J,
    kendall_tau_votes=kendall_tau_votes,
    kendall_tau_predictions=kendall_tau_predictions,
)


In [None]:
def run_model_with_priors(stan_data, priors):
    """Compile and sample the three-group (expert / intermediate / non-expert) mixture model.

    Each voter's row of Kendall-tau distances (votes and predictions) is modelled
    as zero-mean normal with a group-specific scale; the group membership is
    marginalised out with mixture weights ``prob_group`` that have a
    Dirichlet(alpha) prior.

    Parameters
    ----------
    stan_data : mapping
        Data for the Stan program (``N``, ``J``, ``kendall_tau_votes``,
        ``kendall_tau_predictions``; other entries are passed through as given).
        It is copied, never modified, so the same dict can be reused for other
        models or prior sets.
    priors : mapping
        ``'vote_expert'``, ``'vote_intermediate'``, ``'vote_nonexpert'``,
        ``'pred_expert'``, ``'pred_intermediate'`` and ``'pred_nonexpert'`` map
        to ``(location, scale)`` pairs that are interpolated into the normal
        priors on the dispersions; ``'alpha'`` is a length-3 sequence of
        Dirichlet concentrations.

    Returns
    -------
    The fit object returned by ``StanModel.sampling``.

    Raises
    ------
    ValueError
        If ``priors['alpha']`` does not have exactly three entries.
    """
    stan_model_code = f"""
    data {{
      int<lower=1> N;
      int<lower=1> J;
      real kendall_tau_votes[N, J];
      real kendall_tau_predictions[N, J];
      vector[3] alpha;  // Alpha as data
    }}

    parameters {{
      real<lower=0> dispersion_vote_expert;
      real<lower=0> dispersion_vote_intermediate;
      real<lower=0> dispersion_vote_nonexpert;
      real<lower=0> dispersion_pred_expert;
      real<lower=0> dispersion_pred_intermediate;
      real<lower=0> dispersion_pred_nonexpert;
      simplex[3] prob_group;
    }}

    model {{
      dispersion_vote_expert ~ normal({priors['vote_expert'][0]}, {priors['vote_expert'][1]});
      dispersion_vote_intermediate ~ normal({priors['vote_intermediate'][0]}, {priors['vote_intermediate'][1]});
      dispersion_vote_nonexpert ~ normal({priors['vote_nonexpert'][0]}, {priors['vote_nonexpert'][1]});
      dispersion_pred_expert ~ normal({priors['pred_expert'][0]}, {priors['pred_expert'][1]});
      dispersion_pred_intermediate ~ normal({priors['pred_intermediate'][0]}, {priors['pred_intermediate'][1]});
      dispersion_pred_nonexpert ~ normal({priors['pred_nonexpert'][0]}, {priors['pred_nonexpert'][1]});
      
      prob_group ~ dirichlet(alpha);  // Use alpha from data

      for (n in 1:N) {{
         vector[3] log_lik;
         log_lik[1] = normal_lpdf(kendall_tau_votes[n] | 0, dispersion_vote_expert) + 
                      normal_lpdf(kendall_tau_predictions[n] | 0, dispersion_pred_expert);
         log_lik[2] = normal_lpdf(kendall_tau_votes[n] | 0, dispersion_vote_intermediate) + 
                      normal_lpdf(kendall_tau_predictions[n] | 0, dispersion_pred_intermediate);
         log_lik[3] = normal_lpdf(kendall_tau_votes[n] | 0, dispersion_vote_nonexpert) + 
                      normal_lpdf(kendall_tau_predictions[n] | 0, dispersion_pred_nonexpert);
         target += log_sum_exp(log_lik + log(prob_group));
       }}
     }}
   generated quantities {{
        vector[N] log_lik;
        for (n in 1:N) {{
            vector[3] log_lik_n;
            log_lik_n[1] = normal_lpdf(kendall_tau_votes[n] | 0, dispersion_vote_expert) +
                           normal_lpdf(kendall_tau_predictions[n] | 0, dispersion_pred_expert);
            log_lik_n[2] = normal_lpdf(kendall_tau_votes[n] | 0, dispersion_vote_intermediate) +
                           normal_lpdf(kendall_tau_predictions[n] | 0, dispersion_pred_intermediate);
            log_lik_n[3] = normal_lpdf(kendall_tau_votes[n] | 0, dispersion_vote_nonexpert) +
                           normal_lpdf(kendall_tau_predictions[n] | 0, dispersion_pred_nonexpert);
            log_lik[n] = log_sum_exp(log_lik_n + log(prob_group));
        }}
    }}
    """

    alpha = priors['alpha']
    # The Stan program declares `vector[3] alpha`; reject anything else instead of
    # silently truncating a longer sequence.
    if len(alpha) != 3:
        raise ValueError(f"priors['alpha'] must have exactly 3 entries, got {len(alpha)}")

    # Convert alpha list to a numpy array for Stan
    alpha_array = np.array([alpha[0], alpha[1], alpha[2]])

    # Work on a shallow copy so the caller's dict is not mutated by adding 'alpha'.
    model_data = dict(stan_data)
    model_data['alpha'] = alpha_array

    sm = pystan.StanModel(model_code=stan_model_code)
    fit = sm.sampling(data=model_data,
                      iter=8000,
                      warmup=2000,
                      chains=4,
                      control={
                          'adapt_delta': 0.95,
                          'max_treedepth': 15
                      })
    return fit


# Prior configurations for the three-group model, keyed by a label. Each
# dispersion entry is the (location, scale) pair of its normal prior; 'alpha'
# holds the Dirichlet concentrations for the group probabilities.
prior_sets = {
    'case1': dict(
        vote_expert=(0.1, 0.2),
        vote_intermediate=(0.4, 0.2),
        vote_nonexpert=(0.8, 0.2),
        pred_expert=(0.4, 0.3),
        pred_intermediate=(0.4, 0.3),
        pred_nonexpert=(0.8, 0.3),
        alpha=[2, 2, 4],
    ),
}

# Fit the model once per prior configuration and keep every fit under its label.
results = {}
for prior_name in prior_sets:
    priors = prior_sets[prior_name]
    print(f"Running model with {prior_name} priors...")
    results[prior_name] = run_model_with_priors(stan_data, priors)


In [None]:
# Pull the first column of the Stan summary table for every parameter row
# (used below as the posterior mean of each parameter).
summary = results['case1'].summary()
param_names = summary['summary_rownames']
posterior_means = summary['summary'][:, 0]
param_mean_dict = {name: mean for name, mean in zip(param_names, posterior_means)}

# Parameters for the 3-group simulation: the six dispersions keep their Stan
# names, and the three mixture weights are exposed under readable names.
learned_params = {
    name: param_mean_dict[name]
    for name in (
        'dispersion_vote_expert',
        'dispersion_vote_intermediate',
        'dispersion_vote_nonexpert',
        'dispersion_pred_expert',
        'dispersion_pred_intermediate',
        'dispersion_pred_nonexpert',
    )
}
learned_params.update(
    prob_expert=param_mean_dict['prob_group[1]'],
    prob_intermediate=param_mean_dict['prob_group[2]'],
    prob_nonexpert=param_mean_dict['prob_group[3]'],
)


In [None]:
import pandas as pd
import random
from typing import List, Tuple
from itertools import permutations
from scipy.stats import kendalltau
import numpy as np
from tqdm import tqdm

class Vote:
    """Plain record of one simulated response to one question.

    Attributes are stored exactly as passed in:
    question_number, options, ranking, predicted_probs, is_expert.
    In this three-group cell ``Voter.vote`` passes the predicted ranking as
    ``predicted_probs`` and the voter's integer group as ``is_expert``.
    """

    def __init__(self, question_number: int, options: List[int], ranking: List[int], predicted_probs: dict, is_expert: bool):
        # What was asked.
        self.question_number, self.options = question_number, options
        # What the voter answered.
        self.ranking, self.predicted_probs = ranking, predicted_probs
        # Who answered.
        self.is_expert = is_expert

class Voter:
    """Simulated voter belonging to one of three groups."""

    def __init__(self, group: int, params):
        # 0 = expert, 1 = intermediate, 2 = non-expert
        self.group = group
        self.params = params

    def vote(self, question_number: int, options: List[int], ground_truth: List[int], all_worlds: List[Tuple[int]]):
        """Produce this voter's ranking and predicted ranking for one question.

        Group 0 centres its signal distribution on ``ground_truth``; every other
        group centres it on a uniformly drawn element of ``all_worlds``. The
        vote is the ranking with the highest normalised signal probability, and
        the prediction is the highest-scoring *other* ranking under the
        conditional probabilities computed with ``mode='pred'``.

        Returns a ``Vote`` whose last field carries the voter's group.
        """
        if self.group == 0:
            centroid = ground_truth
        else:
            centroid = random.choice(all_worlds)

        # Mallows-style weight of every candidate signal around the centroid.
        signal_probs = {}
        for candidate in all_worlds:
            signal_probs[candidate] = signal_probability(candidate, centroid, self.group, self.params, mode='vote')

        # Rescale the weights so they sum to 1, then take the most probable signal.
        weight_sum = sum(signal_probs.values())
        scaled_probs = {}
        for candidate, weight in signal_probs.items():
            scaled_probs[candidate] = weight / weight_sum
        signal = max(scaled_probs, key=scaled_probs.get)

        # Probability of each possible signal s_j given the voter's own signal.
        conditional_probs = {}
        for s_j in all_worlds:
            conditional_probs[s_j] = compute_conditional_prob(
                s_j, signal, all_worlds, ground_truth, self.group, self.params, mode='pred'
            )

        predicted_probs = self.predict(signal, conditional_probs, all_worlds, ground_truth)
        prediction = max(predicted_probs, key=predicted_probs.get)

        return Vote(question_number, options, signal, prediction, self.group)


    def predict(self, signal, conditional_probs, all_worlds, ground_truth):
        """Return ``conditional_probs`` without the voter's own ``signal``, renormalised to sum to 1.

        ``all_worlds`` and ``ground_truth`` are accepted but not used.
        """
        others = {}
        for world, prob in conditional_probs.items():
            if world != signal:
                others[world] = prob

        remaining_mass = sum(others.values())
        return {world: prob / remaining_mass for world, prob in others.items()}


def mallows_distance(ranking1, ranking2):
    """Return ``1 - tau``, where tau is the Kendall rank correlation of the two rankings.

    Identical rankings give 0; the value grows as the rankings disagree.
    """
    correlation = kendalltau(ranking1, ranking2)[0]
    return 1 - correlation

def normalization_constant(phi, m):
    """Return the product over i = 1..m-1 of (1 + phi + ... + phi**i).

    For m <= 1 the product is empty and the result is 1.
    """
    z = 1
    i = 1
    while i < m:
        geometric_sum = sum([phi**j for j in range(i + 1)])
        z = z * geometric_sum
        i += 1
    return z

def signal_probability(signal, world, group, params, mode='vote'):
    """Return phi**d / Z for ``signal`` around ``world``.

    ``d`` is ``mallows_distance(signal, world)``, ``Z`` is
    ``normalization_constant(phi, len(signal))`` and ``phi`` is
    ``params['dispersion_<mode>_<level>']``, where the level is 'expert' for
    group 0, 'intermediate' for group 1 and 'nonexpert' for anything else.
    """
    if group == 0:
        level = 'expert'
    elif group == 1:
        level = 'intermediate'
    else:
        level = 'nonexpert'

    phi = params[f'dispersion_{mode}_{level}']
    distance = mallows_distance(signal, world)
    return phi**distance / normalization_constant(phi, len(signal))



# Module-level memo for compute_posterior; entries are keyed on every input
# that the cached value depends on (see the key construction below).
computed_posteriors = {}

def compute_posterior(signal, world, all_worlds, ground_truth, is_expert, params, mode='vote'):
    """Return the posterior probability of ``world`` given ``signal`` under a uniform prior.

    posterior = P(signal | world) * prior / sum_w P(signal | w) * prior,
    with prior = 1 / len(all_worlds) and likelihoods from ``signal_probability``.
    ``is_expert`` is forwarded unchanged to ``signal_probability``; in this cell
    the callers pass the voter's group there.

    Results are memoised in ``computed_posteriors``. The key includes a snapshot
    of ``params`` so that calling with different dispersions never returns a
    value cached for an earlier parameter set. ``all_worlds`` is not part of the
    key; it is assumed to be the same collection for a given signal.

    ``world`` must be hashable (e.g. a tuple), since it is used in the cache key.
    """
    try:
        # Keys are unique, so sorting never compares the values themselves.
        params_key = tuple(sorted(params.items()))
    except (AttributeError, TypeError):
        # Not a plain mapping of hashable values: fall back to object identity.
        params_key = id(params)

    key = (tuple(signal), world, tuple(ground_truth), is_expert, mode, params_key)
    if key in computed_posteriors:
        return computed_posteriors[key]

    prior = 1 / len(all_worlds)
    likelihood = signal_probability(signal, world, is_expert, params, mode=mode)
    # Evidence: total probability of the signal across all candidate worlds.
    total_signal_prob = sum(signal_probability(signal, w, is_expert, params, mode=mode) * prior for w in all_worlds)
    posterior = likelihood * prior / total_signal_prob
    computed_posteriors[key] = posterior
    return posterior




def compute_conditional_prob(s_j, s_k, all_worlds, ground_truth, is_expert, params, mode='vote', num_samples=1000):
    """Monte Carlo estimate of P(s_j | s_k).

    Worlds are drawn (with replacement) from the posterior over worlds given
    the observed signal ``s_k``; the estimate is the average of
    ``signal_probability(s_j, world)`` over the ``num_samples`` draws.
    ``is_expert`` is forwarded unchanged to the probability helpers; in this
    cell the callers pass the voter's group there.

    The likelihood of each distinct sampled world is computed once and reused,
    and accumulation still follows the sample order, so the returned value is
    the same as evaluating every draw separately.

    Raises
    ------
    ValueError
        If ``num_samples`` is less than 1 (the average would be undefined).
    """
    if num_samples < 1:
        raise ValueError("num_samples must be a positive integer")

    weights = [compute_posterior(s_k, w, all_worlds, ground_truth, is_expert, params, mode=mode) for w in all_worlds]
    sampled_worlds = random.choices(all_worlds, weights=weights, k=num_samples)

    # signal_probability is deterministic, so cache it per distinct world
    # (worlds are already required to be hashable by compute_posterior's cache).
    likelihood_by_world = {}
    total_prob = 0.0
    for world_i in sampled_worlds:
        if world_i not in likelihood_by_world:
            likelihood_by_world[world_i] = signal_probability(s_j, world_i, is_expert, params, mode=mode)
        total_prob += likelihood_by_world[world_i]

    return total_prob / num_samples




def simulate_voting(num_voters: int, subset: List[int], ground_truth: List[int], params) -> List[Vote]:
    """Simulate one question answered by a three-group population.

    Each voter's group (0 = expert, 1 = intermediate, 2 = non-expert) is drawn
    independently with probabilities ``params['prob_expert']``,
    ``params['prob_intermediate']`` and ``params['prob_nonexpert']``. Every
    voter then votes on question 1 over all permutations of ``subset``.

    Parameters
    ----------
    num_voters : number of voters to simulate.
    subset : the alternatives being ranked.
    ground_truth : the reference ranking passed to each voter.
    params : mapping with the three group probabilities plus whatever the
        voters' probability functions read.

    Returns
    -------
    List of ``Vote`` objects, one per voter, in voter order.
    """
    group_probabilities = [
        params['prob_expert'],
        params['prob_intermediate'],
        params['prob_nonexpert'],
    ]

    group_assignments = np.random.choice(
        [0, 1, 2],
        size=num_voters,
        p=group_probabilities
    )

    voters = [Voter(group=g, params=params) for g in group_assignments]

    # Report the group split for each complete block of 100 voters.
    for i in range(100, num_voters + 1, 100):
        chunk = voters[i-100:i]
        counts = [sum(v.group == g for v in chunk) for g in [0, 1, 2]]
        # \u27a4 is the arrow character; written as an escape so the message
        # survives a mis-decoded source file.
        print(f"Samples {i-99}-{i} \u27a4 Experts: {counts[0]}, Intermediates: {counts[1]}, Non-experts: {counts[2]}")

    all_worlds = list(permutations(subset))
    question_number = 1

    votes = []
    for voter in tqdm(voters, desc="Simulating Votes"):
        votes.append(voter.vote(question_number, subset, ground_truth, all_worlds))

    return votes



def write_to_csv(votes):
    """Write the votes to 'groups3_mallows_simulated_data.csv', one row per vote.

    Each row carries the vote's question number, options, ranking, prediction
    and its ``is_expert`` attribute (saved under the 'group' column), followed
    by the constant domain (1) and treatment (6).
    """
    column_names = ['questions', 'options', 'votes', 'predictions', 'group', 'domain', 'treatment']
    rows = [
        [vote.question_number, vote.options, vote.ranking, vote.predicted_probs, vote.is_expert, 1, 6]
        for vote in votes
    ]
    pd.DataFrame(rows, columns=column_names).to_csv('groups3_mallows_simulated_data.csv', index=False)


# Run the simulation with the parameters learned from the real data.
num_voters = 1000
m = 5  # Total number of alternatives

# From here on `m` is the list of alternatives 1..m.
m = list(range(1, m + 1))
# The ground truth is taken to be the alternatives in ascending order;
# substitute the actual ground truth here if one is available.
ground_truth = sorted(m)

votes = simulate_voting(num_voters, m, ground_truth, learned_params)

write_to_csv(votes)

In [None]:
import pandas as pd
import numpy as np
import ast
from scipy.stats import kendalltau

# Read back the simulated votes written by the simulation cell.
df_sim = pd.read_csv('groups3_mallows_simulated_data.csv')

# The list-valued columns come back as strings; parse them into Python objects.
for list_column in ('options', 'votes', 'predictions'):
    df_sim[list_column] = df_sim[list_column].apply(ast.literal_eval)

# Constants
N = 10  # Number of simulated voters kept per question
J = df_sim['questions'].nunique()  # Number of distinct questions
K = len(df_sim['options'].iloc[0])  # Number of items, read from the first row

# Distance matrices: entry [n, j] is 1 - Kendall tau against the option order.
kendall_tau_votes = np.zeros((N, J))
kendall_tau_predictions = np.zeros((N, J))

for j in range(J):
    # Only the first N responses to question j + 1 are used.
    question_responses = df_sim[df_sim['questions'] == j + 1].head(N)
    for i, response in question_responses.iterrows():
        n = i % N  # the row's index label modulo N picks the voter slot
        tau_vote, _ = kendalltau(response['votes'], response['options'])
        tau_prediction, _ = kendalltau(response['predictions'], response['options'])
        kendall_tau_votes[n, j] = 1 - tau_vote
        kendall_tau_predictions[n, j] = 1 - tau_prediction

# Data dictionary for the Stan model, built from the synthetic votes.
stan_data_synth = dict(
    N=N,
    K=K,
    J=J,
    kendall_tau_votes=kendall_tau_votes,
    kendall_tau_predictions=kendall_tau_predictions,
)


In [None]:
# Refit the same model, with the same priors, on the synthetic data and store
# the fit next to the original one for the recovery comparison.
results['synth_case1'] = fit_synth = run_model_with_priors(stan_data_synth, prior_sets['case1'])


In [None]:
def extract_mean_params(fit):
    """Map each row name of ``fit.summary()`` to the first column of its summary table.

    The first column is used by the callers as the posterior mean.
    """
    table = fit.summary()
    row_names = table['summary_rownames']
    first_column = table['summary'][:, 0]
    return {name: value for name, value in zip(row_names, first_column)}

# Summary-table means for the fit on real data and the fit on synthetic data.
original_params = extract_mean_params(results['case1'])
synth_params = extract_mean_params(results['synth_case1'])

# One record per parameter: both estimates and the error relative to the original.
comparison_records = []

param_keys = [
    'dispersion_vote_expert',
    'dispersion_vote_intermediate',
    'dispersion_vote_nonexpert',
    'dispersion_pred_expert',
    'dispersion_pred_intermediate',
    'dispersion_pred_nonexpert',
    'prob_group[1]',  # Expert
    'prob_group[2]',  # Intermediate
    'prob_group[3]',  # Non-expert
]

for key in param_keys:
    baseline, recovered = original_params[key], synth_params[key]
    rel_error = abs(baseline - recovered) / baseline
    comparison_records.append(dict(
        parameter=key,
        original=baseline,
        synthetic=recovered,
        relative_error=rel_error,
    ))
    print(f"{key}: Relative Error = {rel_error:.4f}")

# Persist the comparison table.
df_compare = pd.DataFrame(comparison_records)
df_compare.to_csv('group3_parameter_recovery_comparison_mallows.csv', index=False)
