# Qualtrics Data Cleaning 

In [1]:
import os

import numpy as np
import pandas as pd

# Input CSV path. Going by the file names, the active line points at a fake sample
# and the commented-out alternative at the raw export.
# NOTE(review): not referenced in the visible cells (responses come from the API) -- confirm it is still needed.
raw_data_filepath = '../data/survey-v0-sample-fake.csv'
# raw_data_filepath = '../data/survey-v0-sample-raw.csv'
# Destination CSV for the cleaned responses (written with df.to_csv further down).
CLEANED_FILEPATH = '../data/sample-preprocessed.csv'
# NOTE(review): not referenced in the visible cells -- presumably a table describing the fields; confirm.
FIELDS_FILEPATH = '../data/fields.csv'

# Whitelist of survey columns intended to survive cleaning, grouped by survey section.
# Commented-out names are columns that are deliberately excluded.
# NOTE(review): this list is not referenced in the visible cells; the cleaning step
# drops `blacklist_fields` instead -- confirm which list is authoritative.
cleaned_fields = [
    # Fields created by Qualtrics that we *DO NOT keep*
    #'StartDate', 'EndDate', 'Status', 'IPAddress', 'Progress',
    #'RecipientLastName', 'RecipientFirstName', 'RecipientEmail',
    #'ExternalReference', 'LocationLatitude', 'LocationLongitude',
    #'DistributionChannel', 'UserLanguage', 
    
    # Fields created by Qualtrics that we *DO keep*
    # NOTE(review): 'Finished' is also listed in `blacklist_fields`, so it is dropped
    # by the cleaning step despite being marked as kept here -- confirm the intent.
    'Duration (in seconds)', 'Finished', 'RecordedDate', 'ResponseId',
    
    # Fields to handle the uploaded file -- do not keep
    # 'Q43_Id', 'Q43_Name', 'Q43_Size', 'Q43_Type', 
    
    # Setup fields: the consent question and the continue-vs-exit question
    'intro-1', 'intro-2', 
    
    # Fields for guiding the participant through the download process
    'download', 
    # Fields for download process failure
    'download-fail-expl', 'download-fail-screen_Id', 'download-fail-screen_Name',
    'download-fail-screen_Size', 'download-fail-screen_Type', 
    
    # Fields for personal questions, e.g. demographics, Amazon usage, life changes
    # NOTE(review): 'q-demos-age' starts with a lowercase 'q' unlike its siblings --
    # presumably that is the actual Qualtrics column name; verify against the export.
    'q-demos-age', 'Q-demos-hispanic', 'Q-demos-race', 'Q-demos-education',
    'Q-demos-income', 'Q-demos-gender', 'Q-sexual-orientation', 'Q-demos-state', 
    'Q-amazon-use-howmany', 'Q-amazon-use-hh-size', 'Q-amazon-use-how-oft', 
    'Q-substance-use_1', 'Q-substance-use_2', 'Q-substance-use_3', 
    'Q-personal_1', 'Q-personal_2', 
    'Q-life-changes',
    
    # Fields for the question asking whether they will share data -- one per experiment arm
    # 'Q-fast-completion', unused
    'Q-control', 'Q-altruism', 'Q-bonus-05',
    'Q-bonus-20', 'Q-bonus-50', 
    
    # Fields for questions about perceived data value
    'Q-data-value-05', 'Q-data-value-20', 'Q-data-value-50', 'Q-data-value-100', 
    'Q-data-value-any', 'Q-data-value-any_1_TEXT', 
    
    # Fields for questions about how your data should be used
    'Q-sell-YOUR-data', 'Q-sell-consumer-data', 'Q-small-biz-use', 
    'Q-census-use', 'Q-research-society', 'Q-attn-check',
    
    # Free-text comments are not clean, so they are excluded
    # 'Q-comments',
    
    # Important embedded data fields that we set
    # Used to indicate experiment arm:
    'showdata',
    'incentive', 
    # We set this to connect responses to the MTurk workers we pay
    'RandomID',
    # We set these to make the API hacks work -- not needed for analysis
    # 'SurveyID', 'ResponseID', 'FQID', 'API_TOKEN',
]

# Columns removed from the downloaded responses before the cleaned file is saved.
# Each column is listed exactly once (the previous version repeated several names),
# grouped by where the column comes from so the list is easy to audit.
# NOTE(review): 'Finished' is dropped here although `cleaned_fields` marks it as a
# column to keep -- confirm which is intended.
blacklist_fields = [
    # Qualtrics-generated metadata (timestamps, location, recipient identity)
    'StartDate',
    'EndDate',
    'Status',
    'IPAddress',
    'Progress',
    'Finished',
    'RecipientLastName',
    'RecipientFirstName',
    'RecipientEmail',
    'ExternalReference',
    'LocationLatitude',
    'LocationLongitude',
    'DistributionChannel',
    'UserLanguage',

    # Embedded data used only by the survey/API plumbing.
    # Note: 'ResponseID' (upper-case "ID") is a different column from the
    # Qualtrics-generated 'ResponseId' listed in `cleaned_fields`.
    'API_TOKEN',
    'BATCH',
    'FQID',
    'SurveyID',
    'ResponseID',

    # Uploaded-file fields
    'Q43_Id',
    'Q43_Name',
    'Q43_Size',
    'Q43_Type',

    # Free-text answers
    'Q-comments',
#    'Q-fast-completion',
]

In [2]:
from QualtricsAPI.Setup import Credentials
# Identifier of the Qualtrics survey whose responses are downloaded below.
survey_id = "SV_cMiItXdF95DdF5A"

# Register the Qualtrics API credentials (XM Directory Users).
# Secrets are read from the environment so they never have to be typed into (and
# accidentally committed with) the notebook; the original placeholder values are
# kept as fallbacks when the variables are not set.
Credentials().qualtrics_api_credentials(
    # Generate at Account / Qualtrics IDs
    token=os.environ.get('QUALTRICS_API_TOKEN', 'token'),
    # data center ID is the subdomain in the URL after SSO with mit, like:
    # mit.co1.qualtrics.com
    data_center='co1',
    # found on Account / Qualtrics IDs
    directory_id=os.environ.get('QUALTRICS_DIRECTORY_ID', 'directory_id'))

In [3]:
from QualtricsAPI.Survey import Responses
# Download all responses for the survey as coded values (useLabels=False).
responses = Responses().get_survey_responses(survey=survey_id, useLabels=False)

# Keep only the rows with no value in the uploaded-file id column (Q43_Id),
# then remove every blacklisted column.
has_no_upload_id = responses['Q43_Id'].isna()
df = responses.loc[has_no_upload_id].drop(columns=blacklist_fields)

---

In [4]:
# Write the cleaned responses to CLEANED_FILEPATH (without the index column),
# reporting the row count before and a confirmation after.
save_message = 'saving data (N=%s) to %s...' % (len(df), CLEANED_FILEPATH)
print(save_message)
df.to_csv(CLEANED_FILEPATH, index=False)
print('...saved')

saving data (N=95) to ../data/sample-preprocessed.csv...
...saved


---

In [5]:
import csv
from lxml import etree
import botocore
import boto3
from boto.mturk.connection import MTurkConnection
from boto.mturk.question import HTMLQuestion

region_name = 'us-east-1' # modify region to match

# Read the AWS credentials from the standard environment variables so that real
# keys never need to be written into the notebook; the original placeholder
# values remain as fallbacks when the variables are not set.
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID', 'key')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY', 'secret')

# MTurk client used by the HIT/assignment helpers below.
client = boto3.client(
    'mturk',
#    endpoint_url=endpoint_url,
    region_name=region_name,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

In [6]:
def get_amazon_survey_HITs(title_keyword='online purchases'):
    """Returns the list of HIT dicts whose title contains `title_keyword`.

    Pages through `list_hits` on the module-level MTurk `client` (up to 5000
    HITs, 100 per page) and keeps the HITs whose 'Title' contains the keyword.

    Args:
        title_keyword: substring a HIT title must contain for the HIT to be
            treated as a survey HIT. Adjust it to match the HITs of interest.

    Returns:
        A list of dicts, one per matching HIT, in the order they were returned.
        The 'Question' entry is left out of each dict because it is very long;
        HITs without a 'Question' entry are returned unchanged.
    """
    hit_pager = client.get_paginator('list_hits')
    pages = hit_pager.paginate(PaginationConfig={'MaxItems': 5000, 'PageSize': 100})
    survey_HITs = []
    for page in pages:
        # Build trimmed copies rather than deleting keys in place, so a missing
        # 'Question' key cannot raise and the page data is left untouched.
        survey_HITs.extend(
            {key: value for key, value in hit.items() if key != 'Question'}
            for hit in page.get('HITs', [])
            if title_keyword in hit.get('Title', '')
        )
    return survey_HITs

# Fetch the matching survey HITs once; a later cell takes the first entry's HITId.
survey_HITs = get_amazon_survey_HITs()

In [7]:
# parsing mturk API responses
def parse_survey_answer(answer):
    """Returns just the code a worker entered, extracted from the HIT's XML answer.

    Needed because AWS stores the answer as XML. All tags are discarded, the
    literal text 'surveycode' is removed from what remains, and surrounding
    whitespace is stripped so that indentation in the XML or stray spaces typed
    by the worker do not break exact matching against the survey's RandomID.

    Args:
        answer: the assignment's answer as an XML string.

    Returns:
        The entered code as a string without leading/trailing whitespace.
    """
    import xml.etree.ElementTree as ET
    root = ET.fromstring(answer)
    # method='text' keeps only the text content of the document, no tags.
    text_only = ET.tostring(root, encoding='unicode', method='text')
    return text_only.replace('surveycode', '').strip()

# utilities for getting information about an assignment from qualtrics
def get_bonus_amount(qualtrics_row):
    """Returns the bonus amount in cents for a survey response.

    The amount is read from the row's 'incentive' code: 'bonus-05' -> 5,
    'bonus-20' -> 20, 'bonus-50' -> 50. Any incentive code that does not
    contain 'bonus', and any missing (non-string, e.g. NaN/None) incentive,
    yields 0.

    Args:
        qualtrics_row: a mapping-like row (e.g. a pandas Series) with an
            'incentive' entry.

    Returns:
        The bonus in cents as an int; 0 when the response is not in a bonus condition.
    """
    incentive = qualtrics_row['incentive']
    # A missing incentive is not a string; the substring test would raise on it.
    if not isinstance(incentive, str) or 'bonus' not in incentive:
        return 0
    return int(incentive.replace('bonus-', ''))

def did_pass_attention(qualtrics_row):
    """Tells whether a response passed the attention check.

    Looks at every entry of the row whose label contains 'attn'; there must be
    exactly three of them (asserted). Missing values count as 0, and the check
    is passed only when the three values add up to 3.

    Returns a truthy value when passed, a falsy one otherwise.
    """
    attention_columns = [label for label in qualtrics_row.index if 'attn' in label]
    # the check has three options and all of them must be selected to pass
    assert len(attention_columns) == 3
    selections = qualtrics_row[attention_columns].fillna(0).astype('int')
    # three selected options (1 each) add up to 3
    total_selected = np.sum(selections.values)
    return total_selected == 3


In [8]:
# hypothetical bonus columns

def get_assignments_for_HIT(HIT_id):
    """Collects the 'Submitted' assignments of HIT_id across all result pages.

    Uses the module-level MTurk `client` and returns the assignment dicts as
    one flat list, in the order the pages deliver them.
    """
    paginator = client.get_paginator('list_assignments_for_hit')
    page_iterator = paginator.paginate(
        HITId=HIT_id,
        AssignmentStatuses=['Submitted'],
        PaginationConfig={'MaxItems': 5000, 'PageSize': 100},
    )
    return [
        assignment
        for page in page_iterator
        for assignment in page['Assignments']
    ]

def get_worker_assignment_data(HIT_id, qualtrics_df):
    """Builds one payment record per submitted assignment of HIT_id.

    Every record is a dict of the form:
     {'worker_id': str,
      'random_id': str,
      'bonus_amount': int | np.nan,
      'passed_attention': bool | np.nan,
      'found_randomID_in_qualtrics': bool}

    notes:
    - an assignment is linked to a survey response by comparing the code the
      worker entered on the HIT with the RandomID column of qualtrics_df.
    - bonus_amount and passed_attention are np.nan when no response matches.
    - bonus_amount is an int, so `50` corresponds to a $.50 bonus, `5` to $.05, etc.
    - more than one matching response trips an assertion.
    """
    records = []
    for assn in get_assignments_for_HIT(HIT_id):
        entered_id = parse_survey_answer(assn['Answer'])
        record = {
            "worker_id": assn['WorkerId'],
            "random_id": entered_id,
        }
        # dc - 10/4/22
        # Note: some workers enter their WorkerId as the RandomID, e.g.:
        # 'worker_id': 'A28L0Q6S2GGBJQ',
        # 'random_id': 'A28L0Q6S2GGBJQ'
        # Such an assignment cannot be connected to a survey response, so there
        # is no way to check bonuses etc. for it. Edge cases remain, like
        # multiple submissions from the same worker who enters their worker ID
        # as the random ID each time.
        matches = qualtrics_df[qualtrics_df.RandomID == entered_id]
        if len(matches) != 0:
            assert len(matches) == 1
            response = matches.iloc[0]
            record['bonus_amount'] = get_bonus_amount(response)
            record['passed_attention'] = did_pass_attention(response)
            record['found_randomID_in_qualtrics'] = True
        else:
            # the entered code is not among our qualtrics responses
            record['found_randomID_in_qualtrics'] = False
            record['bonus_amount'] = np.nan
            record['passed_attention'] = np.nan
        records.append(record)
    return records

In [9]:
# Take the id of the first survey HIT in the list fetched earlier.
HIT_ID = survey_HITs[0]['HITId']
# Build the per-assignment payment records for that HIT from the cleaned responses in df.
assignment_results = get_worker_assignment_data(HIT_ID, df)

In [10]:
assignment_results

[{'worker_id': 'A1JI19KPIVNL3Y',
  'random_id': '239152963',
  'bonus_amount': 0,
  'passed_attention': True,
  'found_randomID_in_qualtrics': True},
 {'worker_id': 'A28L0Q6S2GGBJQ',
  'random_id': 'A28L0Q6S2GGBJQ',
  'found_randomID_in_qualtrics': False,
  'bonus_amount': nan,
  'passed_attention': nan},
 {'worker_id': 'ABGKJYEITBKIL',
  'random_id': '284052890',
  'bonus_amount': 20,
  'passed_attention': False,
  'found_randomID_in_qualtrics': True},
 {'worker_id': 'A9W3MMLGGWVT1',
  'random_id': 'A9W3MMLGGWVT1',
  'found_randomID_in_qualtrics': False,
  'bonus_amount': nan,
  'passed_attention': nan},
 {'worker_id': 'AQKNC6HX4QOIT',
  'random_id': '307955165',
  'bonus_amount': 0,
  'passed_attention': False,
  'found_randomID_in_qualtrics': True},
 {'worker_id': 'A3Q8YKHCMJL6ZF',
  'random_id': '284934782',
  'bonus_amount': 50,
  'passed_attention': False,
  'found_randomID_in_qualtrics': True},
 {'worker_id': 'A2N6XPJRQCVN5W',
  'random_id': '380946321',
  'bonus_amount': 50,
  

Reference
---

In [None]:
# Alphabetical list of the columns in the cleaned responses, for reference.
# Filtering rows does not change which columns exist, so no row filter (and no
# particular RandomID value) is needed to list them.
sorted(df.columns)