# Effects of Competition on the Illusion of Control

In [1]:
from dotenv import load_dotenv
load_dotenv()

import os
import numpy as np
import pandas as pd
from scipy import stats

### Reading & Preparing Data

In [2]:
def read_results(results_folder, run_id_list, experiment_id_list, model_id_list):
    r_df = pd.DataFrame()
    
    with os.scandir(results_folder) as runs:
        for r in runs:
            if r.is_file():
                continue
            run_id = r.path.split('/')[-1]
            if len(run_id_list) > 0 and not run_id in run_id_list:
                continue

            with os.scandir(r.path) as experiments:
                for e in experiments:
                    if e.is_file():
                        continue
                    experiment_id = e.path.split('/')[-1]
                    if len(experiment_id_list) > 0 and not experiment_id in experiment_id_list:
                        continue

                    with os.scandir(e.path) as models:
                        for m in models:
                            if m.is_file():
                                continue
                            model_id = m.path.split('/')[-1]
                            if len(model_id_list) > 0 and not model_id in model_id_list:
                                continue

                            with os.scandir(m.path) as iterations:
                                for i in iterations:
                                    if i.is_dir():
                                        continue
                                    i_ts = i.path.split('/')[-1].split('.')[0].split('_')[1]

                                    i_df = pd.read_csv(i.path)
                                    
                                    i_df = i_df.drop(i_df.columns[i_df.columns.str.contains('unnamed', case=False)], axis=1)
                                    i_df['run_id'] = run_id
                                    i_df['experiment_id'] = experiment_id
                                    i_df['model_id'] = model_id
                                    i_df['iteration_ts'] = i_ts

                                    if r_df.empty:
                                        r_df = i_df
                                    else:
                                        r_df = pd.concat([r_df, i_df])

    return r_df

In [3]:
run_id_list = ['competition_run']
experiment_id_list = ['competition']
model_id_list = []

results_df = read_results('.' + os.getenv('RESULTS_FOLDER'), run_id_list, experiment_id_list, model_id_list)

In [4]:
results_df.isna().sum()

model_id              0
model_name            0
model_provider        0
participant_gender    0
bet_1_raw             0
bet_1                 0
bet_2_raw             0
bet_2                 0
bet_3_raw             0
bet_3                 0
bet_4_raw             0
bet_4                 0
condition_id          0
condition_title       0
experiment_id         0
experiment_title      0
run_id                0
iteration_ts          0
dtype: int64

In [5]:
df = results_df[[
    'run_id', 'experiment_id', 'model_id', 'iteration_ts',
    'participant_gender',
    'condition_id',
    'bet_1', 'bet_2', 'bet_3', 'bet_4'
]].sort_values(by=['run_id', 'experiment_id', 'model_id', 'iteration_ts'])

df.loc[:, 'bet_avg'] = 0.25 * (df['bet_1'] + df['bet_2'] + df['bet_3'] + df['bet_4'])

### Analysis

In [38]:
def group_function(
    group_df: pd.DataFrame,
    condition_column: str,
    condition_control: str,
    metric_columns: list[str]
):
    if condition_column not in group_df.columns:
        raise RuntimeError(f'Column {condition_column} is not found in group DataFrame.')
    if group_df[group_df[condition_column] == condition_control].empty:
        raise RuntimeError(f'Condition {condition_control} is not found in {condition_column} column.')

    result = dict()
    
    group_control_df = group_df[group_df[condition_column] == condition_control]
    
    for m in metric_columns:
        result[(condition_control, m, 'N')] = round(group_control_df[m].count())
        result[(condition_control, m, 'Avg')] = round(group_control_df[m].mean(), 1)
        result[(condition_control, m, 'Std')] = round(group_control_df[m].std(), 1)

    for c in group_df[condition_column].unique():
        if c == condition_control:
            continue
        
        group_condition_df = group_df[group_df[condition_column] == c]

        for m in metric_columns:
            result[(c, m, 'N')] = round(group_condition_df[m].count())
            result[(c, m, 'Avg')] = round(group_condition_df[m].mean(), 1)
            result[(c, m, 'Std')] = round(group_condition_df[m].std(), 1)
        
            diff = result[(c, m, 'Avg')] - result[(condition_control, m, 'Avg')]
            r_diff = result[(c, m, 'Avg')] / result[(condition_control, m, 'Avg')] if result[(condition_control, m, 'Avg')] != 0 else np.nan

            if result[(condition_control, m, 'N')] > 1 and result[(c, m, 'N')] > 1 and result[(condition_control, m, 'Std')] > 0 and result[(c, m, 'Std')] > 0:
                t_stat, p_value = stats.ttest_ind_from_stats(
                    mean1=result[(c, m, 'Avg')], std1=result[(c, m, 'Std')], nobs1=result[(c, m, 'N')],
                    mean2=result[(condition_control, m, 'Avg')], std2=result[(condition_control, m, 'Std')], nobs2=result[(condition_control, m, 'N')],
                    equal_var=False,
                )
            else:
                t_stat, p_value = np.nan, np.nan

            result[(c, m, 'Diff')] = round(diff, 1)
            result[(c, m, 'R. Diff')] = round(r_diff, 3)
            result[(c, m, 'T-Stat')] = round(t_stat, 2)
            result[(c, m, 'P-Value')] = round(p_value, 4)

    return pd.Series(result)


def calc(
    df: pd.DataFrame,
    group_columns: list[str],
    condition_column: str,
    condition_control: str,
    metric_columns: list[str]
):
    return df.groupby(by=group_columns).apply(
        lambda g_df: group_function(g_df, condition_column, condition_control, metric_columns),
        include_groups=False
    )

In [39]:
calc(
    df=df,
    group_columns=['run_id', 'experiment_id', 'model_id', 'iteration_ts'],
    condition_column='condition_id',
    condition_control='dapper',
    metric_columns=['bet_1']
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,dapper,dapper,dapper,schnook,schnook,schnook,schnook,schnook,schnook,schnook
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,bet_1,bet_1,bet_1,bet_1,bet_1,bet_1,bet_1,bet_1,bet_1,bet_1
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,N,Avg,Std,N,Avg,Std,Diff,R. Diff,T-Stat,P-Value
run_id,experiment_id,model_id,iteration_ts,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
competition_run,competition,gpt4omini,1768073253,18.0,11.9,5.5,18.0,23.9,3.7,12.0,2.008,7.68,0.0
competition_run,competition,gpt4omini,1768073361,18.0,10.0,6.2,18.0,23.3,4.2,13.3,2.33,7.54,0.0
competition_run,competition,gpt4omini,1768073472,18.0,11.7,6.6,18.0,20.6,5.7,8.9,1.761,4.33,0.0001
competition_run,competition,gpt4omini,1768073579,18.0,11.9,4.2,18.0,22.5,5.8,10.6,1.891,6.28,0.0
competition_run,competition,gpt4omini,1768073694,18.0,11.1,4.4,18.0,22.2,5.2,11.1,2.0,6.91,0.0
competition_run,competition,gpt4omini,1768073805,18.0,10.7,4.5,18.0,23.3,5.1,12.6,2.178,7.86,0.0
competition_run,competition,gpt4omini,1768073910,18.0,10.9,4.6,18.0,22.5,5.2,11.6,2.064,7.09,0.0
competition_run,competition,gpt4omini,1768074027,18.0,11.4,5.4,18.0,22.2,6.5,10.8,1.947,5.42,0.0
competition_run,competition,gpt4omini,1768074129,18.0,11.4,5.9,18.0,22.5,4.9,11.1,1.974,6.14,0.0
competition_run,competition,gpt4omini,1768074230,18.0,15.0,7.3,18.0,23.1,4.9,8.1,1.54,3.91,0.0005
