In [1]:
import random

import pandas as pd

# initialize sanity checking metrics

In [2]:
def count_frachuman_perslot(distribs, n_slots=5):
    fracs = {}
    for i in range(n_slots):
        d_ishuman = [e[i]>0 for e in distribs]
        fracs[i] = (sum(d_ishuman)/len(d_ishuman))
    return fracs

def count_numhuman_perdistrib(distribs):
    return pd.DataFrame([sum([e>0 for e in d]) for d in distribs]).value_counts()

In [3]:
count_frachuman_perslot([(1,1,1,1,1),(0,1,1,0,1)])

{0: 0.5, 1: 1.0, 2: 1.0, 3: 0.5, 4: 1.0}

In [4]:
count_numhuman_perdistrib([(1,1,1,1,1),(0,1,1,0,1)])

3    1
5    1
dtype: int64

# test different distribution creators

In [5]:
n_cases = 60
n_slots = 5

def generate_iid(n_cases, n_slots):
    generated = []
    # create each option IID
    for _ in range(n_cases):
        generated.append(tuple( [random.getrandbits(1) for s in range(n_slots)] ))
    return generated

generated = generate_iid(n_cases, n_slots)
print(count_frachuman_perslot(generated))
print(count_numhuman_perdistrib(generated))

{0: 0.5166666666666667, 1: 0.48333333333333334, 2: 0.43333333333333335, 3: 0.48333333333333334, 4: 0.48333333333333334}
3    20
2    16
1    12
4    10
0     2
dtype: int64


In [6]:
n_cases = 60
n_slots = 5

def generate_slotbalanced(n_cases, n_slots):
    generated = []
    # guarantee balanced distrib of frac_human per SLOT
    quota_h = [int(n_cases/2) for _ in range(n_slots)]
    quota_b = [int(n_cases/2) for _ in range(n_slots)]
    for _ in range(n_cases):
        attempt = [random.getrandbits(1) for s in range(n_slots)]
        for i in range(len(attempt)):
            if quota_h[i]==0:
                attempt[i] = 0
            if quota_b[i]==0:
                attempt[i] = 1
            quota_h[i] -= attempt[i]
            quota_b[i] -= (1-attempt[i])
        generated.append(tuple( attempt ))
    return generated

generated = generate_slotbalanced(n_cases, n_slots)
print(count_frachuman_perslot(generated))
print(count_numhuman_perdistrib(generated))

{0: 0.5, 1: 0.5, 2: 0.5, 3: 0.5, 4: 0.5}
3    21
2    16
1    10
4    10
0     2
5     1
dtype: int64


In [7]:
n_cases = 60
n_slots = 5

def generate_distribbalanced(n_cases, n_slots):
    generated = []
    # guarantee balanced distrib of frac_human per DISTRIB
    for num_h in range(n_slots+1):
        # generate multiple distrib each containing num_h human options
        unshuffled = [1 for _ in range(num_h)]+[0 for _ in range(n_slots-num_h)]
        for i in range(int(n_cases/(n_slots+1))):
            generated.append(tuple( random.sample(unshuffled, k=n_slots) ))
    return generated

generated = generate_distribbalanced(n_cases, n_slots)
print(count_frachuman_perslot(generated))
print(count_numhuman_perdistrib(generated))

{0: 0.4666666666666667, 1: 0.4666666666666667, 2: 0.5, 3: 0.5833333333333334, 4: 0.48333333333333334}
0    10
1    10
2    10
3    10
4    10
5    10
dtype: int64


In [8]:
n_cases = 60
n_slots = 5

def generate_generalbalanced(n_cases, n_slots):
    generated = []
    # guarantee balanced distrib of frac_human per DISTRIB+SLOT
    quota_h = [int(n_cases/2) for _ in range(n_slots)]
    quota_b = [int(n_cases/2) for _ in range(n_slots)]
    for num_h in range(n_slots+1):
        # generate multiple distrib each containing num_h human options
        unshuffled = [1 for _ in range(num_h)]+[0 for _ in range(n_slots-num_h)]
        for i in range(int(n_cases/(n_slots+1))):
            attempt = random.sample(unshuffled, k=n_slots)
            for i in range(len(attempt)):
                if quota_h[i]==0:
                    attempt[i] = 0
                if quota_b[i]==0:
                    attempt[i] = 1
                quota_h[i] -= attempt[i]
                quota_b[i] -= (1-attempt[i])
            generated.append(tuple( attempt ))
    return generated

generated = generate_generalbalanced(n_cases, n_slots)
print(count_frachuman_perslot(generated))
print(count_numhuman_perdistrib(generated))

{0: 0.5, 1: 0.5, 2: 0.5, 3: 0.5, 4: 0.5}
3    12
5    12
0    10
1    10
2    10
4     6
dtype: int64


In [9]:
n_cases = 60
n_slots = 5

def generate_thoroughly(n_cases, n_slots):
    generated = []
    # brute force through every possible combination (do not include all-same-author options)
    counter = 0
    for _ in range(n_cases):
        attempt = tuple([int(c) for c in ('{0:0'+str(n_slots)+'b}').format(counter)][-5:])
        while sum(attempt)==0 or sum(attempt)>=n_slots:
            counter += 1
            attempt = tuple([int(c) for c in ('{0:0'+str(n_slots)+'b}').format(counter)][-5:])
        generated.append( attempt )
        counter += 1
    return generated

generated = generate_thoroughly(n_cases, n_slots)
print(count_frachuman_perslot(generated))
print(count_numhuman_perdistrib(generated))

{0: 0.5, 1: 0.5, 2: 0.5, 3: 0.5, 4: 0.5}
2    20
3    20
1    10
4    10
dtype: int64


# create CSV of survey links

In [None]:
url_prefix = 'https://'
url = 'cornell.ca1.qualtrics.com/jfe/form/XXXXXXXXXXXXXXXXXX' # # TODO: qualtrics survey mturk distribution link, removed for sharing purposes

# want the format in a csv file:
#> SURVEY_LINK
#> Hit1_SURVEY_LINK_data
#> Hit2_SURVEY_LINK_data
#> Hit3_SURVEY_LINK_data

# This creates a size-30 set of parameters for every possible permutation of human vs. bot
generated = generate_thoroughly(30, 5)

# Transform parameters into a list of dict objects defining sets of parameters
def transform_params(gen, x_value, randomize_ae_values=False, randomize_ae_range_incl=(1,5), seed=0):
    gen = gen.copy()
    param_manual = ['param_x']
    param_tuples = ['param_a', 'param_b', 'param_c', 'param_d', 'param_e']
    # first apply randomized values for param_a through param_e if applicable
    if randomize_ae_values:
        # for each of the elements in param_tuples, grab indices for all values!=0 and split into equal size buckets
        # so that each int within randomize_ae_range_incl represents one bucket
        random.seed(seed)
        ae_label_range = list(range(randomize_ae_range_incl[0], randomize_ae_range_incl[1]+1))
        for p_x in range(len(param_tuples)):
            # pick out which indices are relevant to each param name
            p_ixs = [i for i in range(len(gen)) if gen[i][p_x]!=0]
            # Check if the labels can be evenly assigned. If not, quit
            num_repeat = len(p_ixs)/len(ae_label_range)
            if num_repeat!=int(num_repeat):
                raise ValueError(f'you sure this is OK?: {p_ixs}, {ae_label_range}')
            num_repeat = int(num_repeat)
            # assign labels to shuffled indices
            random.shuffle(p_ixs)
            p_ixs_labels = list(zip(p_ixs, ae_label_range*num_repeat))
            for ix, newval in p_ixs_labels:
                gen[ix] = tuple([(gen[ix][i] if i!=p_x else newval) for i in range(len(gen[ix]))])
    # then create the list
    output = []
    for e in gen:
        output.append(dict(zip(param_manual+param_tuples, [x_value]+list(e))))
    return output

generated_params = []
# generated_params = generated_params+transform_params(generated, 0, randomize_ae_values=True, seed=10)
# generated_params = generated_params+transform_params(generated, 1, randomize_ae_values=True, seed=10)
generated_params = generated_params+transform_params(generated, 0, randomize_ae_values=True, seed=11)
generated_params = generated_params+transform_params(generated, 1, randomize_ae_values=True, seed=11)
# generated_params = generated_params+transform_params(generated, 0, randomize_ae_values=True, seed=12)
# generated_params = generated_params+transform_params(generated, 1, randomize_ae_values=True, seed=12)
# generated_params = generated_params+transform_params(generated, 0, randomize_ae_values=True, seed=13)
# generated_params = generated_params+transform_params(generated, 1, randomize_ae_values=True, seed=13)

def to_url(url, params, htmlescaped=False):
    str_param = '&'.join([e+'='+str(params[e]) for e in params.keys()])
    if htmlescaped:
        str_param = '&amp;'.join([e+'='+str(params[e]) for e in params.keys()])
    return url+'?'+str_param

csv_output = pd.DataFrame({'SURVEY_LINK':[], 'DISPLAY_LINK':[]})
for e in generated_params:
    csv_output = pd.concat([
        csv_output,
        pd.DataFrame({'SURVEY_LINK':[
            to_url(url, e), 
        ], 'DISPLAY_LINK':[
            to_url(url, e, htmlescaped=True), 
        ]})
    ], ignore_index=True)

csv_output.to_csv('mturk_urls.csv', index=False)
csv_output[:5]