In [71]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

In [72]:
# Load the raw survey export (path is relative to this notebook) and preview it.
# NOTE(review): the preview shows rows 0 and 1 holding question text and import
# metadata rather than responses; later cells drop them.
df = pd.read_csv(filepath_or_buffer='../data/online-v0-prolific-sample-130.csv')
df.head(n=5)

Unnamed: 0,StartDate,EndDate,Status,IPAddress,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,RecipientLastName,...,Q_attn_midway,Q_reduce_meat,Q_demos_age,Q_demos_gender,Q_demos_income,Q_demos_race,Q_comments,PROLIFIC_PID,STUDY_ID,treatment
0,Start Date,End Date,Response Type,IP Address,Progress,Duration (in seconds),Finished,Recorded Date,Response ID,Recipient Last Name,...,Do you want to help us find out who is paying ...,Do you ever reduce meat consumption on the bas...,How old are you?,What is your gender?,What was your total household income before ta...,Choose one or more races that you consider you...,Any extra comments? (optional),PROLIFIC_PID,STUDY_ID,treatment
1,"{""ImportId"":""startDate"",""timeZone"":""America/De...","{""ImportId"":""endDate"",""timeZone"":""America/Denv...","{""ImportId"":""status""}","{""ImportId"":""ipAddress""}","{""ImportId"":""progress""}","{""ImportId"":""duration""}","{""ImportId"":""finished""}","{""ImportId"":""recordedDate"",""timeZone"":""America...","{""ImportId"":""_recordId""}","{""ImportId"":""recipientLastName""}",...,"{""ImportId"":""QID56""}","{""ImportId"":""QID53""}","{""ImportId"":""QID30""}","{""ImportId"":""QID57""}","{""ImportId"":""QID25""}","{""ImportId"":""QID24""}","{""ImportId"":""QID58_TEXT""}","{""ImportId"":""PROLIFIC_PID""}","{""ImportId"":""STUDY_ID""}","{""ImportId"":""treatment""}"
2,2022-06-12 16:12:09,2022-06-12 16:12:26,0,*******,100,16,1,2022-06-12 16:12:26,R_3QVIXPx1zfkZNWH,*******,...,,,,,,,,5c818f0d4f23cf000132ad6f,62a6647d859008af5141202e,
3,2022-06-12 16:12:44,2022-06-12 16:14:57,0,*******,100,133,1,2022-06-12 16:14:57,R_3I4OtkJA2u88n7O,*******,...,5,1,3,1,4,1,,628d114b1acd87ea1b37874e,62a6647d859008af5141202d,unlabeled
4,2022-06-12 16:13:28,2022-06-12 16:15:12,0,*******,100,103,1,2022-06-12 16:15:12,R_3il2zP80lXOBuK5,*******,...,5,2,3,1,6,1,that one attention check to select both was du...,5c8e72856243bb0001dcdea2,62a6647d859008af5141202d,labeled


In [73]:
# Show every column name in the raw export.
print(df.columns.tolist())

['StartDate', 'EndDate', 'Status', 'IPAddress', 'Progress', 'Duration (in seconds)', 'Finished', 'RecordedDate', 'ResponseId', 'RecipientLastName', 'RecipientFirstName', 'RecipientEmail', 'ExternalReference', 'LocationLatitude', 'LocationLongitude', 'DistributionChannel', 'UserLanguage', 'ProlificID ', 'Q_attn_1', 'Q_attn_2', 'Q_pasta_1_unlabeled', 'Q_pasta_2_unlabeled', 'Q_pasta_3_unlabeled', 'Q_pasta_4_unlabeled', 'Q_pasta_5_unlabeled', 'Q_sandwich_1', 'Q_sandwich_2', 'Q_sandwich_3', 'Q_sandwich_4', 'Q_sandwich_5', 'Q_pasta_1_labeled', 'Q_pasta_2_labeled', 'Q_pasta_3_labeled', 'Q_pasta_5_labeled', 'Q_pasta_6_labeled', 'Q_sandwich_1_labeled', 'Q_sandwich_2_labeled', 'Q_sandwich_3_labeled', 'Q_sandwich_4_labeled', 'Q_sandwich_5_labeled', 'Q_attn_check', 'Q_current_hunger', 'Q_urban', 'Q_politics', 'Q_priorities_1', 'Q_priorities_4', 'Q_priorities_5', 'Q_priorities_6', 'Q_priorities_7', 'Q_special_diet', 'Q_special_diet_6_TEXT', 'Q_past_vegetarian', 'Q_past_vegan', 'Q_recent_meal_veget'

Data fixups

In [81]:
# Oops! Two labeled pasta questions carry the wrong number in the export:
#   Q_pasta_5_labeled should be Q_pasta_4_labeled
#   Q_pasta_6_labeled should be Q_pasta_5_labeled
# The guard on the stale name makes this cell safe to re-run after the rename.
pasta_label_fixes = {
    'Q_pasta_5_labeled': 'Q_pasta_4_labeled',
    'Q_pasta_6_labeled': 'Q_pasta_5_labeled',
}
if 'Q_pasta_6_labeled' in df.columns:
    print('renaming Q_pasta_5_labeled, Q_pasta_6_labeled')
    # rename() maps each original label once, so both fixes can go in one call.
    df = df.rename(columns=pasta_label_fixes)

## Data preprocessing


Transform into 3 tables:

0. Codebook

Maps column names to questions/descriptive text


1. Respondent data table:

Contains all relevant non-choice data for a respondent

2. Menu choices table

```
ResponseId, labeled, QID, menu (sandwich/pasta), comparison type (1-5), choice (1,2)
```

In [82]:
# Columns carried over from the raw export into the respondent table, grouped by topic.
respondent_columns = (
    # response identifiers and metadata
    ['ResponseId', 'PROLIFIC_PID', 'Finished', 'Duration (in seconds)', 'RecordedDate']
    # attention checks
    + ['Q_attn_1', 'Q_attn_2', 'Q_attn_check', 'Q_attn_midway']
    # background questions
    + ['Q_current_hunger', 'Q_urban', 'Q_politics']
    + ['Q_priorities_%d' % item for item in (1, 4, 5, 6, 7)]
    # diet history and attitudes
    + ['Q_special_diet', 'Q_special_diet_6_TEXT']
    + ['Q_past_vegetarian', 'Q_past_vegan', 'Q_recent_meal_veget', 'Q_recent_meal_vegan']
    + ['Q_vegetarian_not_me', 'Q_vegan_not_me', 'Q_good_for_env', 'Q_how_important_env', 'Q_reduce_meat']
    # demographics
    + ['Q_demos_%s' % field for field in ('age', 'gender', 'income', 'race')]
    # free-text comments and the treatment column
    + ['Q_comments', 'treatment']
)

In [83]:
# Menu-choice question columns are those whose name starts with one of these prefixes.
choice_Q_prefix = ['Q_pasta', 'Q_sandwich']
# str.startswith accepts a tuple of prefixes, so no inner loop is needed.
choice_Qs = [col for col in df.columns if col.startswith(tuple(choice_Q_prefix))]

### Codebook

In [84]:
# Codebook: indexed by QID (column name), with a single 'text' column.
# The text comes from row 0 of the raw export.
codebook_df = (
    df.iloc[0]
    .loc[respondent_columns + choice_Qs]
    .to_frame(name='text')
)
codebook_df.head(10)

Unnamed: 0,text
ResponseId,Response ID
PROLIFIC_PID,PROLIFIC_PID
Finished,Finished
Duration (in seconds),Duration (in seconds)
RecordedDate,Recorded Date
Q_attn_1,Please enter the number you see here into the ...
Q_attn_2,Help us keep track of who is paying attention....
Q_attn_check,Which item would you prefer?
Q_attn_midway,Do you want to help us find out who is paying ...
Q_current_hunger,How would you rank your current hunger level o...


### Respondent table

In [85]:
# Respondent table: keep the respondent-level columns and discard the two
# non-response rows at index labels 0 and 1.
respondent_df = df[respondent_columns].drop(index=[0, 1])
# Number of distinct responses before any cleaning.
starting_respondent_N = respondent_df['ResponseId'].nunique(dropna=True)
print('N=%s total responses before data cleaning' % (starting_respondent_N,))
respondent_df.head(n=3)

N=130 total responses before data cleaning


Unnamed: 0,ResponseId,PROLIFIC_PID,Finished,Duration (in seconds),RecordedDate,Q_attn_1,Q_attn_2,Q_attn_check,Q_attn_midway,Q_current_hunger,...,Q_vegan_not_me,Q_good_for_env,Q_how_important_env,Q_reduce_meat,Q_demos_age,Q_demos_gender,Q_demos_income,Q_demos_race,Q_comments,treatment
2,R_3QVIXPx1zfkZNWH,5c818f0d4f23cf000132ad6f,1,16,2022-06-12 16:12:26,21,6,,,,...,,,,,,,,,,
3,R_3I4OtkJA2u88n7O,628d114b1acd87ea1b37874e,1,133,2022-06-12 16:14:57,21,5,12.0,5.0,4.0,...,5.0,3.0,5.0,1.0,3.0,1.0,4.0,1.0,,unlabeled
4,R_3il2zP80lXOBuK5,5c8e72856243bb0001dcdea2,1,103,2022-06-12 16:15:12,21,5,2.0,5.0,2.0,...,3.0,2.0,4.0,2.0,3.0,1.0,6.0,1.0,that one attention check to select both was du...,labeled


In [86]:
# Only include completed surveys.
# 'Finished' is compared against the string '1'; anything else (including a
# missing value) counts as incomplete.
is_finished = respondent_df['Finished'] == '1'
N_uncompleted = int((~is_finished).sum())
# .copy() makes the filtered table an independent frame, so adding columns to it
# in later cells does not raise SettingWithCopyWarning.
respondent_df = respondent_df[is_finished].copy()
print('dropping %s incomplete responses' % N_uncompleted)

dropping 1 incomplete responses


In [87]:
def _attn_answer_as_int(answer):
    """Return `answer` as an int, or None when it is missing or not an integer-like value."""
    try:
        return int(answer)
    except (TypeError, ValueError, OverflowError):
        # NaN (unanswered), None, or text that int() cannot parse.
        return None


def passed_attn_checks(row):
    """Return True when a respondent row passes every attention check.

    The checks run in order and stop at the first failure, printing the
    ResponseId and the name of the failed check:
      - Q_attn_1 must equal 21
      - Q_attn_2 must equal 5
      - Q_attn_check must be the string '1,2'
      - Q_attn_midway must equal 5

    A missing or non-numeric answer counts as a failed check instead of
    raising, so one blank answer cannot abort a whole `DataFrame.apply`.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series for one respondent)
        Must provide 'PROLIFIC_PID', 'ResponseId' and the four attention
        check columns listed above.

    Returns
    -------
    bool
    """
    # Manual override, checked before anything else.
    if row['PROLIFIC_PID'] == '5589a860fdf99b11b2d0404d': # they DM'd to say they fat fingered last attn check
        return True
    if _attn_answer_as_int(row['Q_attn_1']) != 21:
        print(row['ResponseId'], 'failed Q_attn_1')
        return False
    if _attn_answer_as_int(row['Q_attn_2']) != 5:
        print(row['ResponseId'], 'failed Q_attn_2')
        return False
    if row['Q_attn_check'] != '1,2':
        print(row['ResponseId'], 'failed Q_attn_check')
        return False
    if _attn_answer_as_int(row['Q_attn_midway']) != 5:
        print(row['ResponseId'], 'failed Q_attn_midway')
        return False
    return True

In [88]:
# Flag every respondent row, then collect the ResponseIds of those who passed.
respondent_df['passed_attn_checks'] = respondent_df.apply(passed_attn_checks, axis='columns')
passed_attn_checks_ResponseId_list = (
    respondent_df
    .loc[respondent_df['passed_attn_checks'] == True, 'ResponseId']
    .tolist()
)
print('%s/%s respondents passed all attention checks' % (
    len(passed_attn_checks_ResponseId_list),
    len(respondent_df),
))
passed_attn_checks_ResponseId_list

R_3QVIXPx1zfkZNWH failed Q_attn_2
R_3il2zP80lXOBuK5 failed Q_attn_check
R_PvXWPrLkXyhJjxf failed Q_attn_2
R_1kTuAvHs1eI18ko failed Q_attn_2
R_2PnhjASWnDceNtA failed Q_attn_2
R_2CUMXBtM9T9891B failed Q_attn_check
R_1QE4d9AOcRg1vSu failed Q_attn_2
122/129 respondents passed all attention checks


['R_3I4OtkJA2u88n7O',
 'R_2azWqLxVBzcA2KZ',
 'R_10IsfPbYwY0wERg',
 'R_1n1pb9SbHwD3XOr',
 'R_3iExvfnLn47ukrC',
 'R_3htHJhGptJExUft',
 'R_1NhbuKqiEsRy2DD',
 'R_3hmWa2jK7lzK0ub',
 'R_24eEsnPSi0l8d1s',
 'R_4JIewS5n6lOLSvv',
 'R_3qW4GUVHVpDPZNv',
 'R_2P5UFinveb1H6gh',
 'R_2OZ3xFEfQVOh6lG',
 'R_32ONX06Kysa0vsG',
 'R_3NVCmFVPrinTXN8',
 'R_3QDDtSrTsOejt55',
 'R_3HNcjW336H2njv6',
 'R_1LZj0t9MAuLfwrq',
 'R_Ra9BGKCIVvWFuud',
 'R_24x89ccbVwJkHhP',
 'R_86zZbDSJgKQxoxH',
 'R_1Cw1PBVeVHzrhG3',
 'R_1GhjboAvfG7rE5j',
 'R_eWgmOQtdDVIKZl7',
 'R_3feH8sy4afR3svz',
 'R_1dsm1BorD6Za2tp',
 'R_3suJdCibtHH4Vj3',
 'R_11gZNQ4U6jMcSHs',
 'R_28S72uaINIb2rqU',
 'R_1MZ2Lf2eSAxceS1',
 'R_3qU5RdsIg6vP3Gl',
 'R_29aa6OibOBCvJKQ',
 'R_2CHgn1mmiKcR2SX',
 'R_1Namekh4pLeMf3T',
 'R_3fZhmaMuabhFptf',
 'R_1et3vn32HIHCaJ6',
 'R_vGYJVmKlJIMxFM5',
 'R_r0D8RTlo2ovaEjD',
 'R_30leMsof6DNKb1K',
 'R_3sgZqmf1CJs3elj',
 'R_7ZNSaMaK6OA1cEF',
 'R_2QPrXAvSc5h6dNU',
 'R_3lSMKvXWjFEywu8',
 'R_1NhlhMVyDliqpKe',
 'R_1lxW2cdS76ntJbE',
 'R_eqEpPX

### Menu choices table

```
ResponseId, labeled (0/1), QID, menu (sandwich/pasta), choice_type (1-5), choice (1,2)
```

In [89]:
# Wide data: one row per respondent, one column per menu-choice question.
# The rows at index labels 0 and 1 are not responses, and respondents who
# failed the attention checks are excluded.
choices_wide_df = (
    df.loc[:, ['ResponseId'] + choice_Qs]
    .drop(index=[0, 1])
    .loc[lambda wide: wide['ResponseId'].isin(passed_attn_checks_ResponseId_list)]
)
choices_wide_df.head(n=3)

Unnamed: 0,ResponseId,Q_pasta_1_unlabeled,Q_pasta_2_unlabeled,Q_pasta_3_unlabeled,Q_pasta_4_unlabeled,Q_pasta_5_unlabeled,Q_sandwich_1,Q_sandwich_2,Q_sandwich_3,Q_sandwich_4,...,Q_pasta_1_labeled,Q_pasta_2_labeled,Q_pasta_3_labeled,Q_pasta_4_labeled,Q_pasta_5_labeled,Q_sandwich_1_labeled,Q_sandwich_2_labeled,Q_sandwich_3_labeled,Q_sandwich_4_labeled,Q_sandwich_5_labeled
3,R_3I4OtkJA2u88n7O,,1.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,...,,,,,,,,,,
5,R_2azWqLxVBzcA2KZ,,,,,,,,,,...,2.0,1.0,2.0,1.0,,2.0,1.0,,2.0,2.0
6,R_10IsfPbYwY0wERg,2.0,1.0,,1.0,1.0,2.0,2.0,,1.0,...,,,,,,,,,,


In [90]:
# Reshape wide -> long: one row per (ResponseId, QID) pair with its recorded choice.
choices_df = choices_wide_df.melt(
    id_vars=['ResponseId'],
    value_vars=choice_Qs,
    var_name='QID',
    value_name='choice',
)
choices_df.head()

Unnamed: 0,ResponseId,QID,choice
0,R_3I4OtkJA2u88n7O,Q_pasta_1_unlabeled,
1,R_2azWqLxVBzcA2KZ,Q_pasta_1_unlabeled,
2,R_10IsfPbYwY0wERg,Q_pasta_1_unlabeled,2.0
3,R_1n1pb9SbHwD3XOr,Q_pasta_1_unlabeled,
4,R_3iExvfnLn47ukrC,Q_pasta_1_unlabeled,


In [91]:
# Derive question attributes from the QID, whose '_'-separated parts are
# Q, <menu>, <choice_type> and, for some questions, a labeled/unlabeled suffix.
# Target layout:
# ResponseId, labeled (0/1), QID, menu (sandwich/pasta), choice_type (1-5), choice (1,2)
qid_parts = choices_df['QID'].str.split('_')
choices_df['menu'] = qid_parts.str[1]
choices_df['choice_type'] = qid_parts.str[2]
# QIDs with no fourth part give NaN here, which compares unequal and becomes 0.
choices_df['labeled'] = qid_parts.str[3].eq('labeled').astype('int64')

# Only keep responses for answered questions, reporting how many rows survive.
n_total_Qs = len(choices_df)
answered_df = choices_df.dropna()
n_answered_Qs = len(answered_df)
print('dropping unanswered Qs; respondents answer %s = %s/%s of total Qs' % (
    n_answered_Qs / n_total_Qs, n_answered_Qs, n_total_Qs
))
choices_df = answered_df
choices_df.head()

dropping unanswered Qs; respondents answer 0.4 = 976/2440 of total Qs


Unnamed: 0,ResponseId,QID,choice,menu,choice_type,labeled
2,R_10IsfPbYwY0wERg,Q_pasta_1_unlabeled,2,pasta,1,0
6,R_1NhbuKqiEsRy2DD,Q_pasta_1_unlabeled,1,pasta,1,0
7,R_3hmWa2jK7lzK0ub,Q_pasta_1_unlabeled,2,pasta,1,0
10,R_3qW4GUVHVpDPZNv,Q_pasta_1_unlabeled,2,pasta,1,0
11,R_2P5UFinveb1H6gh,Q_pasta_1_unlabeled,1,pasta,1,0


In [92]:
# Recode choice as a 0/1 indicator.
#   raw coding:  1 = X (labeled), 2 = Y
#   new coding:  1 = X (labeled), 0 = Y
# Any value other than 1 becomes 0.
choices_df['choice'] = choices_df['choice'].map(lambda raw: int(int(raw) == 1))
choices_df.head()

Unnamed: 0,ResponseId,QID,choice,menu,choice_type,labeled
2,R_10IsfPbYwY0wERg,Q_pasta_1_unlabeled,0,pasta,1,0
6,R_1NhbuKqiEsRy2DD,Q_pasta_1_unlabeled,1,pasta,1,0
7,R_3hmWa2jK7lzK0ub,Q_pasta_1_unlabeled,0,pasta,1,0
10,R_3qW4GUVHVpDPZNv,Q_pasta_1_unlabeled,0,pasta,1,0
11,R_2P5UFinveb1H6gh,Q_pasta_1_unlabeled,1,pasta,1,0


## Invalid simple model

Warning: just messing around here! This model is not valid because it is
- not controlling for effects by ResponseId -- the many responses per ResponseId are mixed together as if they were independent
- mixing all of the choice types together


In [93]:
# Pooled logit of choice on the labeled indicator, using every row of choices_df.
simple_model = (
    smf.logit(formula="choice ~ labeled", data=choices_df)
    .fit()
)
print(simple_model.summary())

Optimization terminated successfully.
         Current function value: 0.689626
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:                 choice   No. Observations:                  976
Model:                          Logit   Df Residuals:                      974
Method:                           MLE   Df Model:                            1
Date:                Thu, 16 Jun 2022   Pseudo R-squ.:                0.003037
Time:                        14:18:48   Log-Likelihood:                -673.08
converged:                       True   LL-Null:                       -675.13
Covariance Type:            nonrobust   LLR p-value:                   0.04286
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0250      0.091      0.274      0.784      -0.154       0.204
labeled       -0.2599      0.

In [94]:
# Odds ratios: exponentiate the logit coefficients and their confidence bounds.
simple_model_odds = np.exp(simple_model.params).to_frame(name='OR')
# These are the coefficient p-values (results.pvalues), not z statistics, so the
# column is labeled accordingly.
simple_model_odds['p-value'] = simple_model.pvalues
simple_model_odds[['2.5%', '97.5%']] = np.exp(simple_model.conf_int())
print(simple_model_odds)

                 OR   z-value     2.5%     97.5%
Intercept  1.025316  0.784197  0.85733  1.226218
labeled    0.771092  0.043072  0.59942  0.991931


## Main effects by comparison type

In [97]:
# Fit the same logit separately for each comparison type and print, per type,
# the model summary followed by an odds-ratio table.
for choice_type in choices_df.choice_type.unique():
    print('\n-------- Model for comparison type %s ---------\n' % choice_type)
    c_df = choices_df[choices_df.choice_type == choice_type]
    choice_type_model = smf.logit("choice ~ labeled", data=c_df).fit()
    print(choice_type_model.summary())
    # Odds ratios: exponentiate the coefficients and their confidence bounds.
    choice_type_model_odds = np.exp(choice_type_model.params).to_frame(name='OR')
    # results.pvalues holds p-values, not z statistics, so label the column as such.
    choice_type_model_odds['p-value'] = choice_type_model.pvalues
    choice_type_model_odds[['2.5%', '97.5%']] = np.exp(choice_type_model.conf_int())
    print(choice_type_model_odds)


-------- Model for comparison type 1 ---------

Optimization terminated successfully.
         Current function value: 0.656433
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:                 choice   No. Observations:                  194
Model:                          Logit   Df Residuals:                      192
Method:                           MLE   Df Model:                            1
Date:                Thu, 16 Jun 2022   Pseudo R-squ.:               0.0005294
Time:                        14:19:29   Log-Likelihood:                -127.35
converged:                       True   LL-Null:                       -127.42
Covariance Type:            nonrobust   LLR p-value:                    0.7134
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.4940      0.211     -2.336      0.019     