In [17]:
import pandas as pd
import numpy as np
from scipy import stats
from ast import literal_eval
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from collections import defaultdict

Read each model's fitting-results file and print its fit criteria (summed and mean log-likelihood and AIC); for the LQR model, also print the mean fitted parameters.

In [18]:
# Paths are relative to the directory the notebook is run from.
_PROJECT_ROOT = "../.."
# Directory holding the per-model fitting-result CSV files.
DATA_DIR = _PROJECT_ROOT + "/data/fitting_results"
# Figure directory -- presumably the output location for plots (TODO confirm).
FIGURE_DIR = _PROJECT_ROOT + "/figures"

In [19]:
# Map each agent (model) type to the dataframe with its aggregated fitting results.
all_model_dfs = {}
agent_types = [
    'lqr',
    'sparse_lqr',
    'sparse_max_discrete',
    'sparse_max_continuous',
    'null_model_1',
    'null_model_2',
    'hill_climbing',
]
for agent_type in agent_types:
    # one CSV per agent type, named after the agent
    results_path = f"{DATA_DIR}/fitting_results_{agent_type}.csv"
    df = pd.read_csv(results_path)
    all_model_dfs[agent_type] = df

    # report the row count plus the summed and mean log-likelihood / AIC of this model
    log_likelihoods = df['ll']
    aics = df['AIC']
    summary_lines = [
        f"agent type: {agent_type}, n={len(df)}",
        f"{agent_type} llh: {log_likelihoods.sum()}",
        f"{agent_type} AIC: {aics.sum()}",
        f"{agent_type} llh mean: {log_likelihoods.mean()}",
        f"{agent_type} AIC mean: {aics.mean()}",
    ]
    if agent_type == "lqr":
        # the mean fitted parameters are only printed for the plain LQR agent
        summary_lines.append(f"mean exp param: {df['exp_param'].mean()}")
        summary_lines.append(f"mean vm param: {df['vm_param'].mean()}")
    print("\n".join(summary_lines))

agent type: lqr, n=111
lqr llh: -12814.370781999998
lqr AIC: 26072.741563999996
lqr llh mean: -115.44478181981981
lqr AIC mean: 234.88956363963962
mean exp param: 0.009413725503555946
mean vm param: 2.0164565486785424
agent type: sparse_lqr, n=111
sparse_lqr llh: -8980.059542
sparse_lqr AIC: 18626.119084
sparse_lqr llh mean: -80.9014373153153
sparse_lqr AIC mean: 167.80287463063055
agent type: sparse_max_discrete, n=111
sparse_max_discrete llh: -8200.462375
sparse_max_discrete AIC: 17288.92475
sparse_max_discrete llh mean: -73.8780394144144
sparse_max_discrete AIC mean: 155.7560788288288
agent type: sparse_max_continuous, n=111
sparse_max_continuous llh: -8415.976799000002
sparse_max_continuous AIC: 17719.953598000004
sparse_max_continuous llh mean: -75.8196108018018
sparse_max_continuous AIC mean: 159.6392216036036
agent type: null_model_1, n=111
null_model_1 llh: -8873.0970816
null_model_1 AIC: 18634.1941632
null_model_1 llh mean: -79.93781154594593
null_model_1 AIC mean: 167.8756230

In [20]:
# Load the participant-id table and keep only its 'id' column.
ppid_table = pd.read_csv('../../data/experimental_data/experiment_ppids.csv')
pp_nrs = ppid_table['id']

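Get the number of participants best fit by each model (lowest AIC), as well as the strength of evidence for the best model over the second-best model for each participant. Note: the cell below sorts each participant's AICs but does not yet report the difference between the best and second-best model.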

In [21]:
# For every participant, find the model with the lowest AIC and count how many
# participants each model fits best.
models_by_best_fitting_pps = defaultdict(int)  # model name -> number of participants it fits best
participant_to_best_model = {}  # participant id -> name of its lowest-AIC model

for pp_id in pp_nrs:
    participant_fits = {}  # model name -> AIC of that model for this participant
    has_all_fits = True

    # all_model_dfs maps each agent type to its fitting-results dataframe
    for agent_type, df in all_model_dfs.items():
        # filter once and reuse the selection for both the sanity check and the value
        pp_aics = df.loc[df['pp_id'] == pp_id, 'AIC']

        # a participant needs exactly one row per model; otherwise report and skip them
        if len(pp_aics) != 1:
            print("len:", len(pp_aics))
            has_all_fits = False
            break

        # .iloc[0] extracts the scalar explicitly; calling float() directly on a
        # one-element Series is deprecated in recent pandas versions
        participant_fits[agent_type] = float(pp_aics.iloc[0])

    # explicit flag instead of re-checking whichever dataframe the inner loop ended on
    if not has_all_fits:
        continue

    # ascending AICs of all models for this participant (not used further in this cell)
    sorted_fits = sorted(participant_fits.values())
    # select the best-fitting (lowest-AIC) agent for this participant
    best_agent = min(participant_fits, key=participant_fits.get)

    # increment the number of pps best fit by the model
    models_by_best_fitting_pps[best_agent] += 1
    # store the best-fitting model for this participant
    participant_to_best_model[pp_id] = best_agent

print(models_by_best_fitting_pps)

defaultdict(<class 'int'>, {'null_model_2': 17, 'hill_climbing': 37, 'sparse_max_discrete': 33, 'sparse_lqr': 12, 'sparse_max_continuous': 11, 'lqr': 1})


Create CSV files of approximate log model evidence (-AIC/2 and -BIC/2, one column per model) for Bayesian model selection (done using SPM8)

In [22]:
# AIC-based log model evidence: one column per model, each value is -AIC / 2.
df_aics = pd.DataFrame()
for model_name, model_df in all_model_dfs.items():
    df_aics[model_name] = -model_df['AIC'] / 2
df_aics.to_csv(f"{DATA_DIR}/aic_lme.csv")  # save to csv

# number of free parameters per model, used in the BIC penalty term below
n_params = {"null_model_2": 2, "null_model_1": 4, "lqr": 2, "sparse_lqr": 3, "hill_climbing": 3, "sparse_max_continuous": 4, "sparse_max_discrete": 4}

# BIC-based log model evidence: BIC = k * ln(10) - 2 * ll, stored as -BIC / 2.
# NOTE(review): the ln(10) term presumably reflects 10 observations per participant -- confirm.
df_bics = pd.DataFrame()
for model_name, model_df in all_model_dfs.items():
    complexity_penalty = n_params[model_name] * np.log(10)
    bic = complexity_penalty - 2 * model_df["ll"]
    df_bics[model_name] = -bic / 2
df_bics.to_csv(f"{DATA_DIR}/bic_lme.csv")  # save to csv

In [23]:
# make a csv file with the best-fitting model and parameters for each participant
# Rows are collected in a list and concatenated once: DataFrame.append was removed
# in pandas 2.0, and growing a frame row by row is quadratic anyway.
best_rows = []
for pp_id in pp_nrs:
    # participants without a recorded best model (e.g. no complete set of fits) are skipped
    best_model_type = participant_to_best_model.get(pp_id)
    if best_model_type is None:
        continue
    df = all_model_dfs[best_model_type]  # get the dataframe for the best model
    row = df[df['pp_id'] == pp_id]  # get the relevant row using the pp id
    best_rows.append(row)

# pd.concat raises on an empty list, so fall back to an empty frame in that case
df_bestfit = pd.concat(best_rows, ignore_index=True) if best_rows else pd.DataFrame()
df_bestfit.to_csv(f"{DATA_DIR}/best_fitting_models.csv")

Get the mean best-fitting parameters for each model type, averaged over the participants who are best fit by that model

In [24]:
# Which models additionally report a step size / an attention cost.
step_size_models = ("hill_climbing", "sparse_max_continuous", "sparse_max_discrete")
attention_cost_models = ("sparse_lqr", "sparse_max_continuous", "sparse_max_discrete")

# Visit the model types in order of first appearance in the best-fit table.
for model_type in df_bestfit['agent_type'].unique():
    # keep only the participants whose best-fitting model is the current one
    is_current_model = df_bestfit['agent_type'] == model_type
    df_model = df_bestfit.loc[is_current_model]

    print(f"MODEL TYPE: {model_type}")
    # these two parameters are printed for every model type
    for label, column in (("exp param", "exp_param"), ("vm param", "vm_param")):
        print(f"{label}: {df_model[column].mean()}")

    if model_type in step_size_models:
        print(f"step size: {df_model['step_size'].mean()}")
    if model_type in attention_cost_models:
        print(f"attention cost: {df_model['attention_cost'].mean()}")
    if model_type == "null_model_1":  # n and b parameters for nm1
        # n is rounded to the nearest integer before averaging
        print(f"n: {df_model['n'].round().mean()}")
        print(f"b: {df_model['b'].mean()}")

MODEL TYPE: null_model_2
exp param: 0.02945709633235715
vm param: 5.549289613974336
MODEL TYPE: hill_climbing
exp param: 0.06982937336977825
vm param: 5.850748535172354
step size: 0.38101831604162145
MODEL TYPE: sparse_max_discrete
exp param: 0.08594615322697012
vm param: 4.1322034045095934
step size: 0.6517082678364109
attention cost: 14.843517028179017
MODEL TYPE: sparse_lqr
exp param: 0.04818076964280734
vm param: 4.6945031869458385
attention cost: 135.88634989998812
MODEL TYPE: sparse_max_continuous
exp param: 0.05239728619664927
vm param: 5.334390566884107
step size: 0.20743542412651053
attention cost: 13.66615299113521
MODEL TYPE: lqr
exp param: 0.0247858565556271
vm param: 4.218178722788898


## Compare Participant Scores by Best-Fitting Model

In [25]:
# Load the raw participant action log.
raw_pp_data_path = '../../data/experimental_data/experiment_actions.csv'
df_pps = pd.read_csv(raw_pp_data_path)

# For each participant keep the row with the largest value in the 'Unnamed: 0' column
# (presumably the saved row counter, i.e. the participant's last recorded row -- TODO confirm).
last_row_labels = df_pps.groupby("pp_id")['Unnamed: 0'].idxmax()
df_last = df_pps.loc[last_row_labels]

Group participant scores (the square root of each participant's total cost) by the model that explains that participant best

In [26]:
# best-fitting model name -> list of scores (sqrt of total_cost) of the participants it fits best
scores_by_best_model = defaultdict(list)
for pp_id, total_cost in zip(df_last['pp_id'], df_last['total_cost']):
    best_model = participant_to_best_model[pp_id]
    scores_by_best_model[best_model].append(np.sqrt(total_cost))

In [27]:
# Median score of the participants best fit by each of the two models being compared.
for label, model_key in (("sparse lqr", "sparse_lqr"), ("hill climbing", "hill_climbing")):
    print(f"{label} median: {np.median(scores_by_best_model[model_key])}")

sparse lqr median: 95.97784526310616
hill climbing median: 114.59424069297725


In [28]:
def bootstrap_ci(data, n=1000000, ci=95.0, rng=None):
    """Percentile-bootstrap confidence interval for the median of ``data``.

    Draws ``n`` resamples of ``data`` (same length, with replacement), takes the
    median of each resample, and returns the central ``ci`` percent of those medians.

    Parameters
    ----------
    data : 1-D sequence of numbers
        The observations to resample.
    n : int
        Number of bootstrap resamples.
    ci : float
        Width of the interval in percent. The default of 95 yields the 2.5th and
        97.5th percentiles of the bootstrap medians.
    rng : None, int or numpy.random.Generator
        ``None`` (default) draws from NumPy's global random state, so a prior
        ``np.random.seed`` call still applies. Pass an int seed or a Generator
        to make the result reproducible independently of global state.

    Returns
    -------
    (lower_bound, upper_bound)
        The two percentile bounds; ``(nan, nan)`` when ``data`` is empty.

    Notes
    -----
    Resamples are drawn in vectorised chunks instead of one per Python loop
    iteration, which is far faster for the default ``n``. The exact sequence of
    random draws may therefore differ from a one-resample-at-a-time loop.
    """
    data = np.asarray(data)
    n_obs = len(data)
    if n_obs == 0:
        # the median of an empty resample is undefined; skip the resampling entirely
        return np.nan, np.nan

    # the np.random module and Generator objects both expose choice(a, size=..., replace=...)
    sampler = np.random if rng is None else np.random.default_rng(rng)

    # cap the size of each resample matrix so memory stays bounded for large n
    max_cells_per_chunk = 5_000_000
    chunk_size = max(1, max_cells_per_chunk // n_obs)

    all_medians = np.empty(n)
    for start in range(0, n, chunk_size):
        stop = min(start + chunk_size, n)
        # one row per bootstrap resample
        resamples = sampler.choice(data, size=(stop - start, n_obs), replace=True)
        all_medians[start:stop] = np.median(resamples, axis=1)

    tail = (100.0 - ci) / 2.0
    lower_bound, upper_bound = np.percentile(all_medians, [tail, 100.0 - tail])

    return lower_bound, upper_bound

In [29]:
# Median score and its bootstrap confidence interval for the two models being compared.
models_to_compare = ("sparse_lqr", "hill_climbing")
for model_type in models_to_compare:
    scores = scores_by_best_model[model_type]
    median_score = np.median(scores)
    lower_bound, upper_bound = bootstrap_ci(scores)
    print(f"{model_type}: {median_score}, [{lower_bound}, {upper_bound}]")

sparse_lqr: 95.97784526310616, [27.803339825313234, 172.17916861758323]
hill_climbing: 114.59424069297725, [96.429404229208, 127.38928526371438]


In [30]:
# Kruskal-Wallis H-test comparing the scores of the two participant groups.
sparse_lqr_scores = scores_by_best_model['sparse_lqr']
hill_climbing_scores = scores_by_best_model['hill_climbing']
stats.kruskal(sparse_lqr_scores, hill_climbing_scores)

KruskalResult(statistic=0.5535135135135079, pvalue=0.45688561764287994)

Get descriptive stats like number of variables manipulated.

In [16]:
# Total number of participants across the two groups being compared.
print(sum(len(scores_by_best_model[model_key]) for model_key in ('sparse_lqr', 'hill_climbing')))

48


Compute the number of variables manipulated and input norm standard deviation for humans.