In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Read the raw CSV of each prior study; the files are read in the order
# lorenz2011, gurcay2015, becker2017, becker2019 and bound to df_<study>.
prior_research_dir = '../data/prior_research_data'
df_lorenz2011, df_gurcay2015, df_becker2017, df_becker2019 = (
    pd.read_csv(f'{prior_research_dir}/{study_file}.csv')
    for study_file in ('lorenz2011', 'gurcay2015', 'becker2017', 'becker2019')
)

# Preparing Lorenz 2011

In [5]:
# Add the shared schema columns (study, influence, group_id, theta,
# pre_influence, post_influence, subject_id, task_id, original_condition)
# to the Lorenz 2011 frame. The frame is modified in place.
lorenz_session = df_lorenz2011['session']
lorenz_truth = df_lorenz2011['truth']

df_lorenz2011['study'] = "lorenz2011"
# influence is 1 for every row whose network is not 'Solo', else 0.
df_lorenz2011['influence'] = (df_lorenz2011['network'] != 'Solo').astype('int64')
df_lorenz2011['group_id'] = lorenz_session
df_lorenz2011['theta'] = lorenz_truth.astype(int)
# First and fifth responses serve as the pre-/post-influence estimates.
df_lorenz2011['pre_influence'] = df_lorenz2011['response_1']
df_lorenz2011['post_influence'] = df_lorenz2011['response_5']
# NOTE: subject_id is overwritten with "<subject_id>_<session>", so re-running
# this cell without reloading the CSV appends the session suffix a second time.
df_lorenz2011['subject_id'] = (
    df_lorenz2011['subject_id'].astype(str) + "_" + lorenz_session.astype(str)
)
# NOTE(review): the prefix says "lorenz2017" while the study label is
# "lorenz2011"; kept unchanged because it is part of the emitted task ids.
df_lorenz2011['task_id'] = "lorenz2017_task_" + lorenz_truth.astype(str)
df_lorenz2011['original_condition'] = df_lorenz2011['network']

# Preparing Gurcay 2015

In [6]:
# Add the shared schema columns to the Gurcay 2015 frame (modified in place).
gurcay_group = df_gurcay2015['group'].astype(str)
gurcay_truth = df_gurcay2015['true values']

df_gurcay2015['study'] = "gurcay2015"
# influence is 1 for every row whose condition is not 'C', else 0.
df_gurcay2015['influence'] = (df_gurcay2015['condition'] != 'C').astype('int64')
# Group ids get the study label as a suffix: "<group>_gurcay2015".
df_gurcay2015['group_id'] = gurcay_group + '_' + "gurcay2015"
df_gurcay2015['theta'] = gurcay_truth.astype(int)
df_gurcay2015['pre_influence'] = df_gurcay2015['est1']
df_gurcay2015['post_influence'] = df_gurcay2015['est2']
# Subject ids are "<subject.no>_<group>".
df_gurcay2015['subject_id'] = df_gurcay2015['subject.no'].astype(str) + "_" + gurcay_group
# Task ids combine the question number with the true value.
df_gurcay2015['task_id'] = (
    "gurcay2015_task_"
    + df_gurcay2015['question.no'].astype(str)
    + "_"
    + gurcay_truth.astype(str)
)
df_gurcay2015['original_condition'] = df_gurcay2015['condition']

# Preparing Becker 2017

In [7]:
# Add the shared schema columns to the Becker 2017 frame (modified in place).
becker2017_group = df_becker2017['group_number'].astype(str)
becker2017_truth = df_becker2017['truth']

df_becker2017['study'] = "becker2017"
# influence is 1 for every row whose network is not 'Solo', else 0.
df_becker2017['influence'] = (df_becker2017['network'] != 'Solo').astype('int64')
# Group ids get the study label as a suffix: "<group_number>_becker2017".
df_becker2017['group_id'] = becker2017_group + '_' + "becker2017"
df_becker2017['theta'] = becker2017_truth.astype(int)
# First and third responses serve as the pre-/post-influence estimates.
df_becker2017['pre_influence'] = df_becker2017['response_1']
df_becker2017['post_influence'] = df_becker2017['response_3']
# NOTE: subject_id is overwritten with "<subject_id>_<group_number>", so
# re-running this cell without reloading the CSV appends the suffix again.
df_becker2017['subject_id'] = df_becker2017['subject_id'].astype(str) + "_" + becker2017_group
# Becker has multiple ids for the same task, so the task id is built from the truth.
df_becker2017['task_id'] = "df_becker2017_" + becker2017_truth.astype(str)
df_becker2017['original_condition'] = df_becker2017['network']

# Preparing Becker 2019

In [8]:
# Add the shared schema columns to the Becker 2019 frame (modified in place).
# The group id is "<set>_<pair_id>_<network>_<experiment>_<party>_becker2019".
becker2019_group = df_becker2019['set'].astype(str)
for group_part in ('pair_id', 'network', 'experiment', 'party'):
    becker2019_group = becker2019_group + "_" + df_becker2019[group_part].astype(str)
becker2019_group = becker2019_group + '_' + "becker2019"

df_becker2019['study'] = "becker2019"
# influence is 1 for every row whose network is not 'Control', else 0.
df_becker2019['influence'] = (df_becker2019['network'] != 'Control').astype('int64')
df_becker2019['group_id'] = becker2019_group
df_becker2019['theta'] = df_becker2019['truth'].astype(int)
# First and third responses serve as the pre-/post-influence estimates.
df_becker2019['pre_influence'] = df_becker2019['response_1']
df_becker2019['post_influence'] = df_becker2019['response_3']
# Subject ids are "<user_id>_<group_id>".
df_becker2019['subject_id'] = df_becker2019['user_id'].astype(str) + "_" + becker2019_group
df_becker2019['task_id'] = "df_becker2019_" + df_becker2019['q'].astype(str)
df_becker2019['original_condition'] = df_becker2019['network']

One task id (`df_becker2019_Unemployment`) is listed with two different thetas in the dataset (4 and -46). We keep only the observations where theta = 4 and remove those where theta = -46.

In [9]:
# Keep every Becker 2019 row whose theta is not -46.
theta_is_kept = df_becker2019['theta'] != -46
df_becker2019 = df_becker2019.loc[theta_is_kept]

Remove the tasks that have negative pre-influence estimates (Immigration, Military and Soldiers).

In [10]:
# Drop every row belonging to one of the three excluded Becker 2019 tasks.
excluded_task_ids = [
    'df_becker2019_Immigration',
    'df_becker2019_Military',
    'df_becker2019_Soldiers',
]
df_becker2019 = df_becker2019[~df_becker2019['task_id'].isin(excluded_task_ids)]

# Merging all of the datasets

In [11]:
# Stack the four harmonised study frames into one long table restricted to the
# shared schema, add a group-by-task key, write the table to CSV and display it.
columns = ['study', 'influence', 'group_id', 'theta', 'pre_influence', 'post_influence',
           'subject_id', 'task_id', 'original_condition']

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. Each frame keeps its own row labels (no ignore_index), exactly
# as the chained append did, and concat returns a fresh frame so the column
# assignment below does not write into a slice of another frame.
df_prior_work = pd.concat(
    [study_frame[columns]
     for study_frame in (df_lorenz2011, df_gurcay2015, df_becker2017, df_becker2019)]
)

# One "trial" is a group working on a task: "<group_id>_<task_id>".
df_prior_work['group_task'] = df_prior_work['group_id'].astype(str)+'_'+df_prior_work['task_id'].astype(str)

# Disabled validity flag, kept for reference (the missing `&` has been added):
#df_prior_work['valid'] = (~df_prior_work['pre_influence'].isnull() & ~df_prior_work['post_influence'].isnull())

# NOTE(review): the inputs are read from '../data/...' but this writes under
# './data/...'; confirm the output directory is the intended one.
df_prior_work.to_csv('./data/empirical_data/prior_research.csv',index=False)
df_prior_work

Unnamed: 0,study,influence,group_id,theta,pre_influence,post_influence,subject_id,task_id,original_condition,group_task
0,lorenz2011,0,081111_1557,184,430.0,422.0,1_081111_1557,lorenz2017_task_184,Solo,081111_1557_lorenz2017_task_184
1,lorenz2011,0,081111_1557,184,520.0,22.0,2_081111_1557,lorenz2017_task_184,Solo,081111_1557_lorenz2017_task_184
2,lorenz2011,0,081111_1557,184,50.0,4.0,3_081111_1557,lorenz2017_task_184,Solo,081111_1557_lorenz2017_task_184
3,lorenz2011,0,081111_1557,184,15.0,10.0,4_081111_1557,lorenz2017_task_184,Solo,081111_1557_lorenz2017_task_184
4,lorenz2011,0,081111_1557,184,1750.0,1650.0,5_081111_1557,lorenz2017_task_184,Solo,081111_1557_lorenz2017_task_184
...,...,...,...,...,...,...,...,...,...,...
4348,becker2019,1,C_3_Social_1_Dem_becker2019,4,6.0,6.0,589_C_3_Social_1_Dem_becker2019,df_becker2019_Unemployment,Social,C_3_Social_1_Dem_becker2019_df_becker2019_Unem...
4349,becker2019,1,C_3_Social_1_Dem_becker2019,14,12.0,12.0,596_C_3_Social_1_Dem_becker2019,df_becker2019_Taxes,Social,C_3_Social_1_Dem_becker2019_df_becker2019_Taxes
4350,becker2019,1,C_3_Social_1_Dem_becker2019,14,16.0,17.0,551_C_3_Social_1_Dem_becker2019,df_becker2019_Taxes,Social,C_3_Social_1_Dem_becker2019_df_becker2019_Taxes
4351,becker2019,1,C_3_Social_1_Dem_becker2019,224,320.0,260.0,598_C_3_Social_1_Dem_becker2019,df_becker2019_Election,Social,C_3_Social_1_Dem_becker2019_df_becker2019_Elec...


In [12]:
# Print, for each study, the number of distinct subjects, tasks, groups and
# group-task combinations ("trials").
summary_fields = (
    ('individuals', 'subject_id'),
    ('tasks', 'task_id'),
    ('groups', 'group_id'),
    ('trials', 'group_task'),
)
for study in df_prior_work['study'].unique():
    df_study = df_prior_work.loc[df_prior_work['study'] == study]
    print('Study', study)
    for summary_label, summary_column in summary_fields:
        # dropna=False counts a missing value as its own entry, like len(unique()).
        print(summary_label, df_study[summary_column].nunique(dropna=False))
    print("##########")

Study lorenz2011
individuals 144
tasks 6
groups 12
trials 71
##########
Study gurcay2015
individuals 278
tasks 16
groups 21
trials 336
##########
Study becker2017
individuals 1360
tasks 28
groups 34
trials 152
##########
Study becker2019
individuals 1103
tasks 4
groups 32
trials 128
##########


In [13]:
# Number of distinct subject ids across all studies (a missing id would count once).
print('individuals', df_prior_work['subject_id'].nunique(dropna=False))

individuals 2885


In [14]:
# Number of distinct group ids across all studies (a missing id would count once).
print('groups', df_prior_work['group_id'].nunique(dropna=False))

groups 99


In [15]:
# Number of distinct task ids across all studies (a missing id would count once).
print('tasks', df_prior_work['task_id'].nunique(dropna=False))

tasks 54


In [16]:
# Number of distinct group-task combinations across all studies.
print('group_task', df_prior_work['group_task'].nunique(dropna=False))

group_task 687


In [17]:
# Number of distinct group-task combinations among rows flagged influence == 1.
influenced_group_tasks = df_prior_work.loc[df_prior_work['influence'] == 1, 'group_task']
print('group_task with social influence', influenced_group_tasks.nunique(dropna=False))

group_task with social influence 582
