# Setup
## Imports

In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# reloaded before each cell executes; edits to the project sources then take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [21]:
import matplotlib.pyplot as plt
import json
import pandas as pd

from generative_social_choice.utils.helper_functions import get_base_dir_path
from generative_social_choice.slates.voting_utils import gini
from generative_social_choice.ratings.utility_matrix import extract_voter_utilities_from_info_csv
from generative_social_choice.utils.postprocessing import (
    plot_sorted_utility_distributions,
    plot_likert_category_clustered_bar_chart,
    scalar_utility_metrics,
    plot_candidate_distribution_stacked,   
)
from generative_social_choice.slates.voting_algorithms import (
    GreedyTotalUtilityMaximization,
    ExactTotalUtilityMaximization,
    LPTotalUtilityMaximization,
    ReweightedRangeVoting,
    GeometricTransformation,
)


## Load data

In [11]:
from generative_social_choice.utils.helper_functions import get_results_paths

# Labelling-model identifier handed to get_results_paths for every run.
LABELLING_MODEL = "4o-mini"

# One utility-matrix file path per run (run ids 0-9, "llm" embeddings).
# Despite the "_dirs" suffix, each entry is the 'utility_matrix_file' path.
our_pipeline_result_dirs = []
for run_number in range(10):
    run_paths = get_results_paths(
        labelling_model=LABELLING_MODEL,
        baseline=False,
        run_id=run_number,
        embedding_type="llm",
    )
    our_pipeline_result_dirs.append(run_paths['utility_matrix_file'])
our_pipeline_result_dirs

[WindowsPath('G:/Other computers/My Computer/NTFS/dev/generative_social_choice/generative_social_choice/data/results/statements/0/generated_with_4o_using_llm_embeddings/4o-mini_for_labelling/utility_matrix.csv'),
 WindowsPath('G:/Other computers/My Computer/NTFS/dev/generative_social_choice/generative_social_choice/data/results/statements/1/generated_with_4o_using_llm_embeddings/4o-mini_for_labelling/utility_matrix.csv'),
 WindowsPath('G:/Other computers/My Computer/NTFS/dev/generative_social_choice/generative_social_choice/data/results/statements/2/generated_with_4o_using_llm_embeddings/4o-mini_for_labelling/utility_matrix.csv'),
 WindowsPath('G:/Other computers/My Computer/NTFS/dev/generative_social_choice/generative_social_choice/data/results/statements/3/generated_with_4o_using_llm_embeddings/4o-mini_for_labelling/utility_matrix.csv'),
 WindowsPath('G:/Other computers/My Computer/NTFS/dev/generative_social_choice/generative_social_choice/data/results/statements/4/generated_with_4o_

In [13]:
# One utility-matrix file path per baseline run (run ids "fish_0" .. "fish_9",
# "fish" embeddings).
# NOTE(review): baseline=False is passed here even though the result is stored
# as the baseline paths — confirm this is the intended argument.
baseline_result_dirs = []
for run_number in range(10):
    run_paths = get_results_paths(
        labelling_model=LABELLING_MODEL,
        baseline=False,
        run_id=f"fish_{run_number}",
        embedding_type="fish",
    )
    baseline_result_dirs.append(run_paths['utility_matrix_file'])
baseline_result_dirs

[WindowsPath('G:/Other computers/My Computer/NTFS/dev/generative_social_choice/generative_social_choice/data/results/statements/fish_0/generated_with_4o_using_fish_embeddings/4o-mini_for_labelling/utility_matrix.csv'),
 WindowsPath('G:/Other computers/My Computer/NTFS/dev/generative_social_choice/generative_social_choice/data/results/statements/fish_1/generated_with_4o_using_fish_embeddings/4o-mini_for_labelling/utility_matrix.csv'),
 WindowsPath('G:/Other computers/My Computer/NTFS/dev/generative_social_choice/generative_social_choice/data/results/statements/fish_2/generated_with_4o_using_fish_embeddings/4o-mini_for_labelling/utility_matrix.csv'),
 WindowsPath('G:/Other computers/My Computer/NTFS/dev/generative_social_choice/generative_social_choice/data/results/statements/fish_3/generated_with_4o_using_fish_embeddings/4o-mini_for_labelling/utility_matrix.csv'),
 WindowsPath('G:/Other computers/My Computer/NTFS/dev/generative_social_choice/generative_social_choice/data/results/stateme

In [14]:
# Load csvs
from pathlib import Path

# Read each run's utility matrix and relabel its columns so that the run
# number becomes the outer column level.
dfs_ours = []
for run_id, csv_path in enumerate(our_pipeline_result_dirs):
    df = pd.read_csv(csv_path, index_col=0, header=[0])
    # Two-level column labels: (run_id, statement_id)
    run_columns = pd.MultiIndex.from_product(
        [[run_id], df.columns],
        names=['run_id', 'statement_id'],
    )
    df.columns = run_columns
    dfs_ours.append(df)

# Place the per-run frames side by side in one wide frame
df_ours = pd.concat(dfs_ours, axis=1)
df_ours

run_id,0,0,0,0,0,0,0,0,0,0,...,9,9,9,9,9,9,9,9,9,9
statement_id,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,...,s39,s40,s41,s42,s43,s44,s45,s46,s47,s48
generation1,1.0,2.0,3.0,2.0,3.0,2.0,3.942932,2.840580,3.633299,2.949145,...,3.609351,3.197353,3.170728,3.828508,2.909827,3.054797,3.657096,3.006715,3.604248,3.215762
generation2,0.0,1.0,3.0,0.0,2.0,0.0,3.778286,2.946926,3.324725,3.002814,...,3.670735,3.090738,3.245388,3.499778,2.839152,3.001164,3.099498,2.958108,3.432330,3.264613
generation3,4.0,4.0,4.0,0.0,4.0,4.0,3.986486,3.936705,3.935221,3.924433,...,3.993985,3.943741,3.977463,3.996139,3.916983,3.940950,3.994327,3.944833,3.962288,3.995228
generation4,2.0,1.0,4.0,3.0,2.0,1.0,3.998020,2.813964,3.991555,2.214226,...,3.989284,3.251406,3.805581,3.984649,3.444030,3.439042,3.971363,2.002482,3.993974,3.988233
generation5,2.0,1.0,4.0,4.0,4.0,2.0,3.989969,3.790196,3.992708,3.916218,...,3.967461,3.840784,3.966082,3.984714,3.925103,3.938816,3.981071,3.916113,3.989769,3.965867
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
generation96,4.0,4.0,4.0,1.0,4.0,4.0,3.985740,3.873320,3.996167,3.907693,...,3.990511,3.951611,3.985165,3.983180,3.921185,3.893748,3.974451,3.941803,3.974463,3.980517
generation97,1.0,2.0,3.0,1.0,3.0,2.0,3.271004,2.944387,3.177310,2.925313,...,3.224383,3.002321,3.014461,3.605246,2.938522,2.946576,3.038197,2.968257,3.157370,3.091172
generation98,2.0,2.0,4.0,1.0,4.0,2.0,3.967588,3.133832,3.960430,3.637952,...,3.967969,3.736390,3.841854,3.965997,3.683667,2.590904,3.984286,3.311317,3.932757,3.987983
generation99,3.0,2.0,2.0,2.0,2.0,2.0,3.093474,1.908323,3.386969,2.120680,...,3.254601,2.689543,2.778984,3.111632,2.200140,2.081919,2.499701,2.234431,2.713442,3.014619


In [16]:

# Read each baseline run's utility matrix and relabel its columns so that the
# run number becomes the outer column level.
dfs_baseline = []
for run_id, csv_path in enumerate(baseline_result_dirs):
    df = pd.read_csv(csv_path, index_col=0, header=[0])
    # Two-level column labels: (run_id, statement_id)
    run_columns = pd.MultiIndex.from_product(
        [[run_id], df.columns],
        names=['run_id', 'statement_id'],
    )
    df.columns = run_columns
    dfs_baseline.append(df)

# Place the per-run baseline frames side by side in one wide frame
df_baseline = pd.concat(dfs_baseline, axis=1)

# Report both shapes together so the two merged frames can be compared
print(f"df_ours shape: {df_ours.shape}")
print(f"df_baseline shape: {df_baseline.shape}")

df_baseline

df_ours shape: (100, 480)
df_baseline shape: (100, 480)


run_id,0,0,0,0,0,0,0,0,0,0,...,9,9,9,9,9,9,9,9,9,9
statement_id,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,...,s39,s40,s41,s42,s43,s44,s45,s46,s47,s48
generation1,1.0,2.0,3.0,2.0,3.0,2.0,3.606878,2.374456,3.340033,3.496645,...,2.474659,3.841186,2.877509,2.923776,2.874346,3.305598,1.999577,3.759770,2.968578,3.464279
generation2,0.0,1.0,3.0,0.0,2.0,0.0,3.133320,2.266603,3.291070,3.541779,...,2.755765,3.018781,3.004990,2.969787,2.913503,3.213426,2.123531,3.758277,2.976942,3.199522
generation3,4.0,4.0,4.0,0.0,4.0,4.0,3.993128,3.886643,3.947126,3.986632,...,3.906482,3.990423,3.917975,3.973903,3.939382,3.987964,3.879820,3.990560,3.974247,3.984298
generation4,2.0,1.0,4.0,3.0,2.0,1.0,3.997281,1.884515,3.976752,3.918682,...,2.022967,3.974662,3.076907,2.227909,2.586662,3.934105,1.326481,3.994621,2.229146,3.916668
generation5,2.0,1.0,4.0,4.0,4.0,2.0,3.974187,2.907868,3.985886,3.990719,...,3.632825,3.982241,3.968195,3.429931,3.179526,3.991991,3.029434,3.975952,3.851411,3.942721
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
generation96,4.0,4.0,4.0,1.0,4.0,4.0,3.971385,3.815710,3.986407,3.973245,...,3.949032,3.968847,3.936251,3.932352,3.895415,3.977688,3.768560,3.985629,3.947697,3.952429
generation97,1.0,2.0,3.0,1.0,3.0,2.0,3.012188,1.990670,3.020738,3.002595,...,2.110535,2.942924,2.984580,2.467057,2.712076,3.107788,2.000237,3.253274,2.712322,3.045255
generation98,2.0,2.0,4.0,1.0,4.0,2.0,3.983599,1.968843,3.960472,3.943336,...,2.027830,3.872548,2.295713,2.290003,2.274354,3.987511,1.961220,3.941880,2.000546,3.985485
generation99,3.0,2.0,2.0,2.0,2.0,2.0,2.420106,2.017674,2.431835,2.904756,...,2.015156,2.253268,1.737717,2.030022,2.125922,2.555855,1.985011,3.310624,2.299399,2.840858


# Run Voting Algorithms

In [22]:
# Voting algorithm instances compared on every run. The evaluation loop calls
# `.vote(rated_votes=..., slate_size=...)` on each entry and labels its results
# with the entry's `.name`.
#
# Commented-out entries are alternative configurations that are currently
# disabled. NOTE(review): `SequentialPhragmenMinimax` and `all_instances` are
# not among the names imported in the import cell, so re-enabling those entries
# requires importing them first.
voting_algorithms_to_test = (
    GreedyTotalUtilityMaximization(),
    ExactTotalUtilityMaximization(),
    LPTotalUtilityMaximization(),
    # GreedyTotalUtilityMaximization(utility_transform=GeometricTransformation(p=1.5)),
    ExactTotalUtilityMaximization(utility_transform=GeometricTransformation(p=1.5)),
    # LPTotalUtilityMaximization(utility_transform=GeometricTransformation(p=1.5)),
    # GreedyTotalUtilityMaximization(utility_transform=GeometricTransformation(p=10.0)),
    # ExactTotalUtilityMaximization(utility_transform=GeometricTransformation(p=10.0)),
    # LPTotalUtilityMaximization(utility_transform=GeometricTransformation(p=10.0)),
    # *all_instances(SequentialPhragmenMinimax),
    # SequentialPhragmenMinimax(load_magnitude_method="marginal_slate", clear_reassigned_loads=True, redistribute_defected_candidate_loads=False),
    # SequentialPhragmenMinimax(load_magnitude_method="marginal_slate", clear_reassigned_loads=False, redistribute_defected_candidate_loads=False),
    # SequentialPhragmenMinimax(load_magnitude_method="marginal_previous", clear_reassigned_loads=True, redistribute_defected_candidate_loads=True),
    # SequentialPhragmenMinimax(),
    ReweightedRangeVoting(),
    ReweightedRangeVoting(k=0.5),
)

## Our Pipeline

In [27]:
from generative_social_choice.slates.voting_utils import voter_utilities

# Set slate size (number of statements to select)
SLATE_SIZE = 5


def _results_to_frame(results, field):
    """Gather one field of every result record into a single wide DataFrame.

    Args:
        results: List of result dicts, each holding 'run_id', 'algorithm' and
            the requested ``field``.
        field: Key of the values to collect from each record
            (e.g. 'utilities' or 'assignments').

    Returns:
        DataFrame with one column per record, labelled by a two-level
        (run_id, algorithm) column MultiIndex and concatenated along axis=1.
    """
    frames = []
    for result in results:
        frame = pd.DataFrame({
            (result['run_id'], result['algorithm']): result[field]
        })
        frame.columns = pd.MultiIndex.from_tuples(
            frame.columns,
            names=['run_id', 'algorithm']
        )
        frames.append(frame)
    return pd.concat(frames, axis=1)


# One record per (run, algorithm) pair
results_data = []

# Iterate through each run in df_ours
for run_id in df_ours.columns.get_level_values('run_id').unique():
    # Selecting the outer column level yields this run's utility matrix
    utility_matrix = df_ours[run_id]

    for voting_algorithm in voting_algorithms_to_test:
        # The algorithm receives a copy, so the matrix used for scoring below
        # is untouched by anything the algorithm does to its input.
        slate, assignments = voting_algorithm.vote(
            rated_votes=utility_matrix.copy(),
            slate_size=SLATE_SIZE
        )

        # Look up utilities from the 'candidate_id' column of `assignments`
        voter_utilities_series = voter_utilities(
            rated_votes=utility_matrix,
            assignments_series=assignments['candidate_id'],
            output_column_name='utility'
        )

        # Store the result with run_id and algorithm name
        results_data.append({
            'run_id': run_id,
            'algorithm': voting_algorithm.name,
            'utilities': voter_utilities_series,
            'assignments': assignments['candidate_id'],
            'slate': sorted(slate)
        })

# Wide frames with (run_id, algorithm) columns; both are built by the same
# helper instead of two copy-pasted loops.
df_voter_utilities = _results_to_frame(results_data, 'utilities')
df_voter_assignments = _results_to_frame(results_data, 'assignments')

# Create dataframe for slates
# Index is range(SLATE_SIZE), columns are (run_id, algorithm)
slate_data = {}
for result in results_data:
    col_key = (result['run_id'], result['algorithm'])
    # Pad slate with None if it's shorter than SLATE_SIZE, or truncate if longer
    slate_list = list(result['slate']) + [None] * (SLATE_SIZE - len(result['slate']))
    slate_data[col_key] = slate_list[:SLATE_SIZE]

df_slates = pd.DataFrame(
    slate_data,
    index=range(SLATE_SIZE)
)
df_slates.columns = pd.MultiIndex.from_tuples(
    df_slates.columns,
    names=['run_id', 'algorithm']
)

print(f"df_voter_utilities shape: {df_voter_utilities.shape}")
print(f"df_voter_assignments shape: {df_voter_assignments.shape}")
print(f"df_slates shape: {df_slates.shape}")


df_voter_utilities shape: (100, 60)
df_voter_assignments shape: (100, 60)
df_slates shape: (5, 60)


In [28]:
df_slates

run_id,0,0,0,0,0,0,1,1,1,1,...,8,8,8,8,9,9,9,9,9,9
algorithm,GreedyTotalUtilityMaximization(utility_transform=None),ExactTotalUtilityMaximization(utility_transform=None),LPTotalUtilityMaximization(utility_transform=None),ExactTotalUtilityMaximization(utility_transform=GeometricTransformation(p=1.5)),"ReweightedRangeVoting(k=1.0, max_rating=None)","ReweightedRangeVoting(k=0.5, max_rating=None)",GreedyTotalUtilityMaximization(utility_transform=None),ExactTotalUtilityMaximization(utility_transform=None),LPTotalUtilityMaximization(utility_transform=None),ExactTotalUtilityMaximization(utility_transform=GeometricTransformation(p=1.5)),...,LPTotalUtilityMaximization(utility_transform=None),ExactTotalUtilityMaximization(utility_transform=GeometricTransformation(p=1.5)),"ReweightedRangeVoting(k=1.0, max_rating=None)","ReweightedRangeVoting(k=0.5, max_rating=None)",GreedyTotalUtilityMaximization(utility_transform=None),ExactTotalUtilityMaximization(utility_transform=None),LPTotalUtilityMaximization(utility_transform=None),ExactTotalUtilityMaximization(utility_transform=GeometricTransformation(p=1.5)),"ReweightedRangeVoting(k=1.0, max_rating=None)","ReweightedRangeVoting(k=0.5, max_rating=None)"
0,s19,s19,s19,s19,s11,s11,s40,s14,s14,s14,...,s1,s1,s16,s16,s1,s1,s1,s1,s18,s18
1,s22,s22,s22,s22,s14,s14,s41,s40,s40,s40,...,s34,s34,s25,s25,s19,s19,s19,s19,s19,s19
2,s28,s28,s28,s28,s19,s19,s46,s46,s46,s46,...,s35,s35,s35,s35,s26,s26,s26,s26,s26,s26
3,s30,s30,s30,s30,s22,s22,s5,s5,s5,s5,...,s40,s40,s40,s40,s40,s40,s40,s40,s42,s42
4,s5,s5,s5,s5,s30,s30,s6,s6,s6,s6,...,s5,s5,s7,s7,s5,s5,s5,s5,s7,s7
