# Probe data


In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import statsmodels.formula.api as smf

In [16]:
pd.set_option('max_columns', None)

sample_df = pd.read_csv('../data/sample-preprocessed.csv', dtype='str')
print(sample_df.shape)
sample_df.head(3)

(17, 49)


Unnamed: 0,Duration (in seconds),Finished,RecordedDate,ResponseId,intro-1,intro-2,download,download-fail-expl,download-fail-screen_Id,download-fail-screen_Name,download-fail-screen_Size,download-fail-screen_Type,q-demos-age,Q-demos-hispanic,Q-demos-race,Q-demos-education,Q-demos-income,Q-demos-gender,Q-sexual-orientation,Q-demos-state,Q-amazon-use-howmany,Q-amazon-use-hh-size,Q-amazon-use-how-oft,Q-substance-use_1,Q-substance-use_2,Q-substance-use_3,Q-personal_1,Q-personal_2,Q-life-changes,Q-control,Q-altruism,Q-bonus-05,Q-bonus-20,Q-bonus-50,Q-data-value-05,Q-data-value-20,Q-data-value-50,Q-data-value-100,Q-data-value-any,Q-data-value-any_1_TEXT,Q-sell-YOUR-data,Q-sell-consumer-data,Q-small-biz-use,Q-census-use,Q-research-society,Q-attn-check,showdata,incentive,RandomID
0,45,1,2022-09-13 08:39:39,R_2ykJU7y9XoyCZTk,4,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,altruism,
1,288,1,2022-09-13 08:44:33,R_BV7CBQjtGOjHZF7,4,1,2.0,test problem flow by alex,F_YVTItdPN6hSEAkp,Screen Shot 2022-09-12 at 4.56.40 PM.png,53964.0,image/png,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True,altruism,
2,239,1,2022-09-13 08:48:35,R_w4seX0QuU1Lz477,4,1,1.0,,,,,,2.0,2.0,16.0,4.0,2.0,2.0,2.0,22.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,3.0,,,,1.0,,,,,,,,4.0,4.0,13.0,3.0,3.0,123.0,False,bonus-20,199989274.0


In [13]:
N = sample_df['ResponseId'].nunique()
print('N=%s total responses' % N)

N=17 total responses


In [None]:
codebook = {
    # intro-1, intro-2 ask for consent or early exit
    # intro-1
    'intro-2': {'1': 'Continue', '2': 'Exit'},
}

### Meta data flow checks

Check the data by counting useful things.

How many did consent to the survey? (vs exited early)

How many had a download failure? What was the problem? Let's fix it.

How many failed the attention check?

In [15]:
# Consent

# There are two intro Qs asking about requirements and consent.
# intro-1 and intro-2
# If intro-2 resulted in early exit, intro-2 should be empty
n_did_consent_responses = sample_df[sample_df['intro-2']=='1']['ResponseId'].nunique()
print('%0.3f (%s/%s) did consent' % (n_did_consent_responses/N, n_did_consent_responses, N))

0.353 (6/17) did consent


Download failure

In [21]:
download_cols = [c for c in sample_df.columns if c.startswith('download')]
download_fail_df = sample_df[['ResponseId', 'RandomID'] + download_cols]
download_fail_df = download_fail_df[download_fail_df['download']=='2']
n_download_fail = len(download_fail_df)
print('%0.3f (%s/%s) failed download step' % (n_download_fail/N, n_download_fail, N))
download_fail_df

0.118 (2/17) failed download step


Unnamed: 0,ResponseId,RandomID,download,download-fail-expl,download-fail-screen_Id,download-fail-screen_Name,download-fail-screen_Size,download-fail-screen_Type
1,R_BV7CBQjtGOjHZF7,,2,test problem flow by alex,F_YVTItdPN6hSEAkp,Screen Shot 2022-09-12 at 4.56.40 PM.png,53964,image/png
15,R_2bOclY6yJ6uno5H,504695955.0,2,test if randomID will be shown,F_2YXoy4cy2dLgd4T,Screen Shot 2022-09-14 at 2.17.45 PM.png,3981,image/png


Attention check

'Q-attn-check':
> This is an attention check. Help us find people who are not paying attention. Select all answers to this question.


In [29]:
def did_pass_attn_check(resp):
    return True if len(str(resp).split(',')) == 3 else False

sample_df['passed_attn_check'] = sample_df['Q-attn-check'].apply(did_pass_attn_check)
print('%s passed the attention check' % sample_df['passed_attn_check'].sum())
sample_df.head()

3 passed the attention check


Unnamed: 0,Duration (in seconds),Finished,RecordedDate,ResponseId,intro-1,intro-2,download,download-fail-expl,download-fail-screen_Id,download-fail-screen_Name,download-fail-screen_Size,download-fail-screen_Type,q-demos-age,Q-demos-hispanic,Q-demos-race,Q-demos-education,Q-demos-income,Q-demos-gender,Q-sexual-orientation,Q-demos-state,Q-amazon-use-howmany,Q-amazon-use-hh-size,Q-amazon-use-how-oft,Q-substance-use_1,Q-substance-use_2,Q-substance-use_3,Q-personal_1,Q-personal_2,Q-life-changes,Q-control,Q-altruism,Q-bonus-05,Q-bonus-20,Q-bonus-50,Q-data-value-05,Q-data-value-20,Q-data-value-50,Q-data-value-100,Q-data-value-any,Q-data-value-any_1_TEXT,Q-sell-YOUR-data,Q-sell-consumer-data,Q-small-biz-use,Q-census-use,Q-research-society,Q-attn-check,showdata,incentive,RandomID,passed_attn_check
0,45,1,2022-09-13 08:39:39,R_2ykJU7y9XoyCZTk,4,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,altruism,,False
1,288,1,2022-09-13 08:44:33,R_BV7CBQjtGOjHZF7,4,1.0,2.0,test problem flow by alex,F_YVTItdPN6hSEAkp,Screen Shot 2022-09-12 at 4.56.40 PM.png,53964.0,image/png,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True,altruism,,False
2,239,1,2022-09-13 08:48:35,R_w4seX0QuU1Lz477,4,1.0,1.0,,,,,,2.0,2.0,16.0,4.0,2.0,2.0,2.0,22.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,3.0,,,,1.0,,,,,,,,4.0,4.0,13.0,3.0,3.0,123.0,False,bonus-20,199989274.0,True
3,335,1,2022-09-13 11:11:52,R_2rBGztPmiRoLHWj,4,1.0,1.0,,,,,,2.0,2.0,1.0,4.0,2.0,1.0,1.0,22.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,,,,1.0,,,,,,,,,2.0,2.0,1.0,1.0,1.0,123.0,False,bonus-05,689693001.0,True
4,1321,1,2022-09-13 12:49:38,R_1OkQY5Lrzb43QtM,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,control,,False


## Data sharing

How many did, vs did not, consent to share their data? 

Add boolean field: `share` coded as 1=share; 0=no share

Make a table:

```
incentive, yes, no, n
```

And make RandomIDs retrievable for those who shared data and were in a bonus incentive experimental arm.

In [42]:
incentives_qs = ['Q-control', 'Q-altruism', 'Q-bonus-05', 'Q-bonus-20', 'Q-bonus-50']

def get_share(row):
    """Returns bolean indicated whether participant consented to share."""
    for incentive_q in incentives_qs:
        if row[incentive_q] in ['0', '1']:
            return int(row[incentive_q])
    return np.nan

sample_df['share'] = sample_df.apply(get_share, axis=1)
sample_df.head()

Unnamed: 0,Duration (in seconds),Finished,RecordedDate,ResponseId,intro-1,intro-2,download,download-fail-expl,download-fail-screen_Id,download-fail-screen_Name,download-fail-screen_Size,download-fail-screen_Type,q-demos-age,Q-demos-hispanic,Q-demos-race,Q-demos-education,Q-demos-income,Q-demos-gender,Q-sexual-orientation,Q-demos-state,Q-amazon-use-howmany,Q-amazon-use-hh-size,Q-amazon-use-how-oft,Q-substance-use_1,Q-substance-use_2,Q-substance-use_3,Q-personal_1,Q-personal_2,Q-life-changes,Q-control,Q-altruism,Q-bonus-05,Q-bonus-20,Q-bonus-50,Q-data-value-05,Q-data-value-20,Q-data-value-50,Q-data-value-100,Q-data-value-any,Q-data-value-any_1_TEXT,Q-sell-YOUR-data,Q-sell-consumer-data,Q-small-biz-use,Q-census-use,Q-research-society,Q-attn-check,showdata,incentive,RandomID,passed_attn_check,share
0,45,1,2022-09-13 08:39:39,R_2ykJU7y9XoyCZTk,4,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,altruism,,False,
1,288,1,2022-09-13 08:44:33,R_BV7CBQjtGOjHZF7,4,1.0,2.0,test problem flow by alex,F_YVTItdPN6hSEAkp,Screen Shot 2022-09-12 at 4.56.40 PM.png,53964.0,image/png,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True,altruism,,False,
2,239,1,2022-09-13 08:48:35,R_w4seX0QuU1Lz477,4,1.0,1.0,,,,,,2.0,2.0,16.0,4.0,2.0,2.0,2.0,22.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,3.0,,,,1.0,,,,,,,,4.0,4.0,13.0,3.0,3.0,123.0,False,bonus-20,199989274.0,True,1.0
3,335,1,2022-09-13 11:11:52,R_2rBGztPmiRoLHWj,4,1.0,1.0,,,,,,2.0,2.0,1.0,4.0,2.0,1.0,1.0,22.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,,,,1.0,,,,,,,,,2.0,2.0,1.0,1.0,1.0,123.0,False,bonus-05,689693001.0,True,1.0
4,1321,1,2022-09-13 12:49:38,R_1OkQY5Lrzb43QtM,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,control,,False,


In [70]:
shares_count_df = sample_df.groupby(
    ['incentive']
)['ResponseId'].count().rename('allocated').to_frame()

shares_count_df['yes'] = shares_count_df.index.map(
    sample_df[sample_df['share']==1].groupby('incentive')['ResponseId'].count()
).fillna(0)
shares_count_df['no'] = shares_count_df.index.map(
    sample_df[sample_df['share']==0].groupby('incentive')['ResponseId'].count()
).fillna(0)
shares_count_df['n'] = shares_count_df.apply(lambda row: sum(row[['yes','no']]), axis=1)
shares_count_df.loc['total'] = [shares_count_df[c].sum() for c in shares_count_df.columns]
shares_count_df.astype(int)

Unnamed: 0_level_0,allocated,yes,no,n
incentive,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
altruism,5,0,1,1
bonus-05,3,1,0,1
bonus-20,3,1,0,1
bonus-50,2,0,0,0
control,4,0,1,1
total,34,2,2,4


RandomID list for bonus payments

In [37]:
for bonus in ['Q-bonus-05', 'Q-bonus-20', 'Q-bonus-50']:
    print('bonus incentive list for %s:\n' % bonus)
    sample_df[sample_df[bonus]==1]['RandomID'].to_list()

bonus incentive list for Q-bonus-05:

bonus incentive list for Q-bonus-20:

bonus incentive list for Q-bonus-50:

