In [1]:
import os
import sys
import pandas as pd
import numpy as np
import json 

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from Question import Question, Category, Options
from XMLdoc import XMLdoc, Module, Task, Document, HIT, Set
from createSurvey import *

from PIL import Image
import imagehash
import base64
import io

In [2]:
def encode_b64_image(filename, type = "HTML"):
    """Read an image file and, by default, embed it in an HTML ``<img>`` tag.

    Parameters
    ----------
    filename : str
        Path of the image file to read. Its extension selects the MIME
        subtype used in the data URI.
    type : str, default "HTML"
        ``"HTML"`` returns an ``<img class="img-thumbnail">`` tag whose ``src``
        is a base64 ``data:`` URI of the file. Any other value returns the raw
        file bytes unchanged.

    Returns
    -------
    str or bytes
        The ``<img>`` tag (``type == "HTML"``) or the raw file content.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    # Function-scope import so this cell does not depend on the import cell.
    from html import escape

    with open(filename, "rb") as f:
        data = f.read()

    if type != "HTML":
        return data

    # Extensions whose registered image subtype differs from the extension.
    subtype_aliases = {"jpg": "jpeg", "jpe": "jpeg", "tif": "tiff", "svg": "svg+xml"}
    # Lower-case so that e.g. ".PNG" yields "image/png" rather than "image/PNG".
    ext = os.path.splitext(filename)[1][1:].lower()
    subtype = subtype_aliases.get(ext, ext)

    imgsrc = "data:image/{0};base64,{1}".format(subtype, base64.b64encode(data).decode())
    # The filename lands inside a double-quoted attribute, so escape it.
    return '<img class="img-thumbnail" src="{0}" alt="{1}">'.format(imgsrc, escape(filename, quote = True))

In [3]:
imagedir = "images"

# os.listdir returns entries in arbitrary order. The label tasks and the
# image-to-task table are numbered by position in this list, so sort it to keep
# that numbering reproducible across machines and runs. Sub-directories are
# skipped because encode_b64_image can only open regular files.
sample_images = sorted(
    os.path.join(imagedir, entry)
    for entry in os.listdir(imagedir)
    if os.path.isfile(os.path.join(imagedir, entry))
)
sample_images_encoded = [encode_b64_image(file) for file in sample_images]

sample_images

['images\\FluVaccine_Bing_Chrome_20200623.PNG',
 'images\\FluVaccine_Bing_Edge_20200623.PNG',
 'images\\FluVaccine_Bing_Mozilla_20200623.PNG',
 'images\\FluVaccine_Google_Chrome_20200623.PNG',
 'images\\FluVaccine_Google_Edge_20200623.PNG',
 'images\\FluVaccine_Google_Mozilla_20200623.PNG']

In [10]:
# Shared <head> fragment carrying the page CSS. It is prepended to the content
# of every Task built in this notebook (consent, instructions and label tasks).
header = '''
<head>
<style type = "text/css">
    div {
        margin: 2em;
    }
    
    .footer {
        position: relative;
    }
</style>
</head>
'''

# HTML body of the consent page; it becomes the content of task 0.
# Markup fixes only: the "Risks" paragraph is now closed with </p> (was <p/>)
# and the "CONTACT INFORMATION" heading closes with </h2> (was </h3>).
# NOTE(review): the wording is kept verbatim because consent text is normally
# approved as-is. Two likely typos to confirm against the approved form:
# "we may not be to remove your data" and "may come these Research Results".
consent_form = '''
<h1>Microsoft Research Project Participation Consent Form</h1>
<h2>INTRODUCTION</h2>
Thank you for taking the time to consider volunteering in a Microsoft Corporation research project.  This form explains what would happen if you join this research project. 
Please read it carefully and take as much time as you need. Email the study team to ask about anything that is not clear.  
Participation in this study is voluntary and you may withdraw at any time. 
  
<h2>TITLE OF RESEARCH PROJECT</h2>
Search Results Task
  
<h3>Principal Investigator</h3>
David Rothschild
  
<h2>PURPOSE</h2>
The purpose of this project is to collect information about search results from search engines.

<h2>PROCEDURES</h2>
<p>During this session, you will be given an image of the first page of search results for a particular search query.</p>
<p>Your task is to count and categorize the links returned in the search results.</p>
<p>The task should take less than a minute.</p>

<h3>PAYMENT FOR PARTICIPATION</h3>
<p>You will receive $0.15 for completing this session.</p>

<h2>PERSONAL INFORMATION</h2>
<p>Aside from your WorkerID, we do not collect any personal information in this project. </p>
<p>Your WorkerID and response will be temporarily recorded and used for the purpose of analyzing per-user task performance.</p>
<p>Your WorkerID will not be shared outside of Microsoft Research and the confines of this study without your permission, and will be promptly deleted after compensation has been successfully provided (30 days or less). De-identified data may be used for future research or given to another investigator for future use without additional consent. </p>
<p>Responses from all participants will be aggregated and stored for a period of up to 5 years. Once your WorkerID is disassociated from your responses we may not be to remove your data from the study without re-identifying you.</p>
<p>For additional information on how Microsoft handles your personal information, 
please see the <a href="https://privacy.microsoft.com/en-us/privacystatement">Microsoft Privacy Statement</a>.</p>

<h2>BENEFITS AND RISKS</h2>
<p>Benefits: The research team expects to use these results in research projects relating to search engines. You will receive any public benefit that may come these Research Results being shared with the greater scientific community. </p>
<p>Risks:  During your participation, you should experience no greater risks than in normal daily life. </p>
<p>You accept the risks described above and whatever consequences may come of those risks, however unlikely, 
unless caused by our negligence or intentional misconduct.  
You hereby release Microsoft and its affiliates from any claim you may have now or in the future arising from such risks or consequences.    
In addition, you agree that Microsoft will not be liable for any loss, damages or injuries 
that may come of improper use of the study prototype, equipment, facilities, or 
any other deviations from the instructions provided by the research team.   
Don’t participate in this study if you feel you may not be able to safely participate in any way 
including due to any physical or mental illness, condition or limitation.    
You agree to immediately notify the research team of any incident or issue or unanticipated risk or incident.</p>

<h2>CONTACT INFORMATION</h2>
<p>Should you have any questions concerning this project, or if you are harmed as a result of being in this study, please contact us at decisionresearchlab@outlook.com.</p>
<p>Should you have any questions about your rights as a research subject, please contact Microsoft Research Ethics Program Feedback at MSRStudyfeedback@microsoft.com.</p>
<p>Upon request, a copy of this consent form will be provided to you for your records. On behalf of Microsoft, we thank you for your contribution and look forward to your research session.</p>
'''

# HTML body of the instructions page; it becomes the content of task 1.
# It defines the vocabulary used by the counting questions: primary vs.
# secondary links, and organic vs. advertised results.
instructions = '''
<h1>Instructions</h1>
<p>In this task, you will be given an image of results returned by a search engine.</p>
<p>Using the image, you are asked to count the number of primary and secondary links that are organic (unpaid) search results, 
as well as number of primary and secondary links that are advertised (paid) search results.</p>

<h2>Definitions</h2>
<p>We define <b>primary links</b> to be links to unique <i>websites</i> returned by the search results.</p>
<p>Each primary link may or may not have one or more <b>secondary links</b> to unique <i>pages</i> on the primary website.</p>

<p>We define <b>organic</b> search results as the results that are returned by a search engine based on relevance to the user's search query, in contrast to paid <b>advertised</b> search results</p>
'''

In [11]:
# Survey dimensions: one label task per sample image.
ntasks = len(sample_images)
assignments_per_task = 5
tasks_per_assignment = 1

# NOTE(review): true division makes this a float, so the count prints as e.g.
# "30.0" and would show a fraction if the product is not evenly divisible.
total_hits = (ntasks * assignments_per_task)/tasks_per_assignment

print(f"Creating mturk survey with {ntasks} tasks, {assignments_per_task} assignments per task, {tasks_per_assignment} tasks per assignment")
print(f"Total HITS: {total_hits}")

# QUESTIONS

# Gate questions answered on the consent and instructions pages.
q_consent = Question(
    varname = "consent",
    questiontext = "Read the consent form provided. Do you understand and agree to participate in this study?",
    valuetype = "categorical",
    categories = ["I agree to participate in this study.", "I do not agree to participate in this study."],
    values = [1, 0],
)

q_instructions = Question(
    varname = "instructions",
    questiontext = "Please carefully read the instructions before proceeding with this task.",
    valuetype = "categorical",
    categories = ["I have read and understood the instructions."],
    values = [1],
)


def _link_count_question(varname, result_kind, link_level):
    """Build one numeric "how many links" question.

    ``result_kind`` is the word used for the result type in the prompt
    ("organic" or "advertised") and ``link_level`` is "primary" or
    "secondary". All four counting questions share the same value type,
    bonus setting and help text.
    """
    return Question(
        varname = varname,
        questiontext = f"How many of the {result_kind} links shown in the image are {link_level} links?",
        valuetype = "numeric",
        bonus = ["threshold:51", 1],  # a fresh list for every question
        helptext = "Organic search results are results that are returned by a search engine based on relevance to the user's search query, in contrast to paid advertisements.",
    )


# Counting questions: {organic, advertised} x {primary, secondary}.
q_organic_primary = _link_count_question("organic_primary", "organic", "primary")
q_organic_secondary = _link_count_question("organic_secondary", "organic", "secondary")
q_ad_primary = _link_count_question("ad_primary", "advertised", "primary")
q_ad_secondary = _link_count_question("ad_secondary", "advertised", "secondary")

# MODULES

# One module per page section; each groups the questions shown under a header.
module_consent = Module(
    name = "consent",
    header = "Consent",
    questions = [q_consent],
)

module_instructions = Module(
    name = "instructions",
    header = "Instructions",
    questions = [q_instructions],
)

# The two counting modules are attached together to every label task.
module_organic = Module(
    name = "organic",
    header = "Count the number of organic (unpaid) links in the search results",
    questions = [q_organic_primary, q_organic_secondary],
)

module_ad = Module(
    name = "ad",
    header = "Count the number of advertised (paid) links in the search results",
    questions = [q_ad_primary, q_ad_secondary],
)


# TASKS

# Fixed pages: task 0 is the consent form and task 1 is the instructions.
# Each page is the shared CSS header followed by its body wrapped in a "doc" div.
# NOTE(review): `modules` receives a single Module here, whereas the label
# tasks pass a list of Modules - confirm that Task accepts both forms.
consent_page = header + wrap_div(consent_form, divclass = "doc")
task_consent = Task(
    name = "consent",
    taskid = 0,
    modules = module_consent,
    content = consent_page,
)

instructions_page = header + wrap_div(instructions, divclass = "doc")
task_instructions = Task(
    name = "instructions",
    taskid = 1,
    modules = module_instructions,
    content = instructions_page,
)

def get_label_task(image):
    """Return the body of a label task: ``image`` on its own indented line.

    ``image`` is formatted into the text as-is; in this notebook it is the
    ``<img>`` tag string of an encoded screenshot.
    """
    return f"\n    {image}\n    "

# One label task per sample image. Task ids start at 10, leaving the low ids
# to the consent (0) and instructions (1) tasks.
tasklist_label = []
for task_index in range(ntasks):
    label_page = header + wrap_div(get_label_task(sample_images_encoded[task_index]), divclass = "doc")
    tasklist_label.append(
        Task(
            name = f"label_task{task_index}",
            taskid = task_index + 10,
            modules = [module_organic, module_ad],
            content = label_page,
        )
    )



Creating mturk survey with 6 tasks, 5 assignments per task, 1 tasks per assignment
Total HITS: 30.0


In [12]:
# HIT list from the survey dimensions. Label tasks are indexed from 10, and
# tasks 0 and 1 (consent, instructions) are passed as `perm_tasks`.
survey_hits = Survey(ntasks, assignments_per_task, tasks_per_assignment).get_hit_list(
    start_task_index = 10, perm_tasks = [0, 1])

XMLfile = XMLdoc(
    modules = [module_consent, module_instructions, module_organic, module_ad],
    tasks = [task_consent, task_instructions] + tasklist_label,
    hits = survey_hits,
)
XMLfile.add_documents([task.document for task in XMLfile.tasks])

# The same condition is attached to the instructions task and to every label
# task of each HIT.
# NOTE(review): the string presumably reads "task 0 / module consent /
# variable consent >= 1" - confirm against the condition syntax.
consent_given = "0*consent*consent>=1"
for hit in XMLfile.hits:
    hit.add_taskcondition(1, consent_given)
    for label_taskid in hit.task_list(start_task_index = 10):
        hit.add_taskcondition(label_taskid, consent_given)

In [13]:
# Export the assembled survey through XMLdoc.write_xml under this file name.
output_xml = "search_result_ads_sample.xml"
XMLfile.write_xml(output_xml)

In [14]:
# Lookup table from image file name to the id of its label task. The ids use
# the same +10 offset as the label tasks.
image_names = [os.path.basename(path) for path in sample_images]
sample_df = pd.DataFrame({"image": image_names})
sample_df["task"] = list(range(10, 10 + len(sample_images)))
# To persist the mapping, uncomment:
#sample_df.to_csv("sample_images.csv", index = False)

In [15]:
# Display the image-to-task lookup table.
sample_df

Unnamed: 0,image,task
0,FluVaccine_Bing_Chrome_20200623.PNG,10
1,FluVaccine_Bing_Edge_20200623.PNG,11
2,FluVaccine_Bing_Mozilla_20200623.PNG,12
3,FluVaccine_Google_Chrome_20200623.PNG,13
4,FluVaccine_Google_Edge_20200623.PNG,14
5,FluVaccine_Google_Mozilla_20200623.PNG,15
