In [1]:
"""
This file cleans trialData.csv, outputs two files: the cleaned version, and the feedback
"""
import json
import pandas as pd
import numpy as np
import scipy.stats
import matplotlib.pyplot as plt
from os.path import basename

## Prepare data into dataframe

In [3]:
input_path = '../../ptdir/trialdata.csv'
# Creates the dataframe
columnNames = ["SubjectID", "#", "trialId", "jsonStr"]
# File Selector
trial = pd.read_csv(input_path, names=columnNames, header=None)

In [4]:
# Set up the values we want to extract from the jsonStr column. The jsonStr column contains
# the data 
keys = ['rt', 'trial_type', 'view_history', 'internal_node_id', 'time_elapsed',
        'trial_index', 'responses', 'questions', 'imgName', 'isRepeat',
        'isRandom', 'stimulus', 'key_press', 'payment', 'total_unique_trials', 
        'random_num', 'task_version']
data = {key: [] for key in keys}

In [5]:
# Extractor
for jsonStr in trial.jsonStr:
    jsonDict = json.loads(jsonStr)
    for key in keys:
        try:
            data[key].append(jsonDict[key])
        except KeyError:
            data[key].append(None)

In [6]:
# Inserts the data into the trial dataframe
for key in keys:
    trial[key] = data[key]
    
    
# These are the keys for the question prompt.
# set up the values we want to extract for the question
qKeys = ['prompt', 'options', 'required', 'horizontal', 'labels']
qData = {key: [] for key in qKeys}

In [7]:
# Extractor for question - potential for multiple questions in a single trial hence the 
# lists
for questions in trial.questions:
    if questions is not None:
        prompt = []
        options = []
        required = []
        horizontal = []
        labels = []
        # row in the Column
        for question in json.loads(questions):
            try:
                prompt.append(question[key])
            except KeyError:
                prompt.append(None)

            try:
                options.append(question[key])
            except KeyError:
                options.append(None)

            try:
                required.append(question[key])
            except KeyError:
                required.append(None)

            try:
                horizontal.append(question[key])
            except KeyError:
                horizontal.append(None)

            try:
                labels.append(question[key])
            except KeyError:
                labels.append(None)

        qData['prompt'].append(prompt)
        qData['options'].append(options)
        qData['required'].append(required)
        qData['horizontal'].append(horizontal)
        qData['labels'].append(labels)
    else:
        for key in qKeys:
            qData[key].append([None])

There are currently 38 people's data. (excluding Amanda's 2 debugs in pilot 1 and pilot 2, there are actually 36 AMTurkers' data)

In [8]:

# Inserts question dictionary values into trial dataframe
for key in qKeys:
    trial[key] = qData[key]

## Subject Identification 

In [10]:
# Debug ID: A6XBXQC3G59N8:3VP0C6EFSI3WXKQRF9ERIP60I906M0 - in pilot 1, sandbox mode
# Debug ID: 'debugV9SJY8:debugA4LEZF', in pilot 2, debug mode. 
trial = trial[trial.SubjectID != 'A6XBXQC3G59N8:3VP0C6EFSI3WXKQRF9ERIP60I906M0']
trial = trial[trial.SubjectID != 'debugV9SJY8:debugA4LEZF']

In [12]:

# We want a readable ID that we can quickly scan
subID_easy = {subId: None for subId in trial.SubjectID}
for i, key in enumerate(subID_easy):
    subID_easy[key] = int(i + 1)
trial.loc[:, 'SubjectID'] = trial.SubjectID.map(subID_easy)
trial

Unnamed: 0,SubjectID,#,trialId,jsonStr,rt,trial_type,view_history,internal_node_id,time_elapsed,trial_index,...,key_press,payment,total_unique_trials,random_num,task_version,prompt,options,required,horizontal,labels
172,2,0,1542250518533,"{""rt"": 12305, ""trial_type"": ""instructions"", ""v...",12305.0,instructions,"[{""page_index"":0,""viewing_time"":4433},{""page_i...",0.0-0.0,12322,0,...,,,,,,[None],[None],[None],[None],[None]
173,2,1,1542250531739,"{""rt"": 13198, ""trial_type"": ""survey-multi-choi...",13198.0,survey-multi-choice,,0.0-1.0-0.0,25529,1,...,,1.0,80.0,4.0,,"[None, None, None, None, None, None]","[None, None, None, None, None, None]","[None, None, None, None, None, None]","[None, None, None, None, None, None]","[None, None, None, None, None, None]"
174,2,2,1542250557617,"{""rt"": 25872, ""trial_type"": ""survey-text-req"",...",25872.0,survey-text-req,,0.0-2.0,51407,2,...,,,,,,[None],[None],[None],[None],[None]
175,2,3,1542250576588,"{""rt"": 17612, ""trial_type"": ""survey-text-req"",...",17612.0,survey-text-req,,0.0-3.0-0.0,69022,3,...,,,,,,[None],[None],[None],[None],[None]
176,2,4,1542250587256,"{""rt"": 8383, ""trial_type"": ""survey-text-req"", ...",8383.0,survey-text-req,,0.0-3.1-0.1,78762,4,...,,,,,,[None],[None],[None],[None],[None]
177,2,5,1542250608069,"{""rt"": 20813, ""trial_type"": ""survey-text-req"",...",20813.0,survey-text-req,,0.0-3.2-0.2,101859,5,...,,,,,,[None],[None],[None],[None],[None]
178,2,6,1542250614780,"{""rt"": 6706, ""trial_type"": ""face-likert-amanda...",6706.0,face-likert-amanda,,0.0-4.0,108570,6,...,,,,,,"[[1 <br/>(Completely unwilling to take risks),...","[[1 <br/>(Completely unwilling to take risks),...","[[1 <br/>(Completely unwilling to take risks),...","[[1 <br/>(Completely unwilling to take risks),...","[[1 <br/>(Completely unwilling to take risks),..."
179,2,7,1542250616362,"{""rt"": 1579, ""trial_type"": ""instructions"", ""vi...",1579.0,instructions,"[{""page_index"":0,""viewing_time"":1579}]",0.0-5.0,110152,7,...,,,,,,[None],[None],[None],[None],[None]
180,2,8,1542250622621,"{""rt"": 6257, ""trial_type"": ""face-likert-amanda...",6257.0,face-likert-amanda,,0.0-6.0-0.0-0.0,116411,8,...,,,,,,"[[1 <br/>(Completely unwilling to take risks),...","[[1 <br/>(Completely unwilling to take risks),...","[[1 <br/>(Completely unwilling to take risks),...","[[1 <br/>(Completely unwilling to take risks),...","[[1 <br/>(Completely unwilling to take risks),..."
181,2,9,1542250626338,"{""rt"": 3712, ""trial_type"": ""face-likert-amanda...",3712.0,face-likert-amanda,,0.0-6.0-0.0-1.0,120128,9,...,,,,,,"[[1 <br/>(Completely unwilling to take risks),...","[[1 <br/>(Completely unwilling to take risks),...","[[1 <br/>(Completely unwilling to take risks),...","[[1 <br/>(Completely unwilling to take risks),...","[[1 <br/>(Completely unwilling to take risks),..."


In [13]:

# t contains subject attributes, payment, and total_unique_trials
t = trial[trial.trial_type == 'survey-multi-choice']
subAttribute = {iden: json.loads(resp) for iden, resp in zip(t.SubjectID, t.responses)}
payment = {i + 1: pay for i, pay in enumerate(t.payment)}
tut = {i + 1: to for i, to in enumerate(t.total_unique_trials)}


# This is used to identify the subject's location
numSubjects = max(trial.SubjectID)
subLoc = {i: None for i in range(1, numSubjects + 1)}
txts = trial[trial.trial_type == 'survey-text-req']
for i, txt in zip(txts.SubjectID, txts.responses):
    if i <= 20 and subLoc[i] is None:
        subLoc[i] = json.loads(txt)
        

# Reseting allows the index to be proper since we removed the debugger
trial = trial.reset_index(drop=True)

In [15]:
# Create the new columns/change the values in the columns
for i, subject in enumerate(trial.SubjectID):
    try: 
        trial.loc[i, 'Age'] = subAttribute[subject]['Q0']
        trial.loc[i, 'Gender'] = subAttribute[subject]['Q1']
        trial.loc[i, 'Hispanic/Latino'] = subAttribute[subject]['Q2']
        trial.loc[i, 'Race'] = subAttribute[subject]['Q4']
        trial.loc[i, 'Education'] = subAttribute[subject]['Q5']

        trial.loc[i, 'State'] = subLoc[subject]['Q0']
        trial.loc[i, 'City'] = subLoc[subject]['Q1']
        trial.loc[i, 'Zipcode'] = subLoc[subject]['Q2']

        trial.loc[i, 'payment'] = payment[subject]

        trial.loc[i, 'total_unique_trials'] = tut[subject]
    except:
        pass

In [16]:

# Extraction of feedback and output of file
feedbacks = trial[trial.trial_type == 'survey-text'].responses

# We now only want the trials
trial_only = trial[trial.trial_type == 'face-likert-amanda'].reset_index(drop=True)

In [22]:

# Makes the image names easier to read
easy_name = {name: None for name in list(set(trial_only.imgName))}
for i, key in enumerate(easy_name):
    easy_name[key] = int(i + 1)
trial_only['ImgID'] = trial_only.imgName.map(easy_name)

In [23]:
# Score - we want to extract it from the json type and change it
for i,score in enumerate(trial_only.responses):
    try:
        trial_only.loc[i, 'Score'] = json.loads(score)['Q0']
    except: 
        print i, score

In [24]:
# imgName -> we want it to be converted so that we have only the file name
for i, img in enumerate(trial_only.imgName):
    trial_only.loc[i, 'imgName'] = basename(img)

In [25]:
trial_only.columns

Index([u'SubjectID', u'#', u'trialId', u'jsonStr', u'rt', u'trial_type',
       u'view_history', u'internal_node_id', u'time_elapsed', u'trial_index',
       u'responses', u'questions', u'imgName', u'isRepeat', u'isRandom',
       u'stimulus', u'key_press', u'payment', u'total_unique_trials',
       u'random_num', u'task_version', u'prompt', u'options', u'required',
       u'horizontal', u'labels', u'Age', u'Gender', u'Hispanic/Latino',
       u'Race', u'Education', u'State', u'City', u'Zipcode', u'ImgID',
       u'Score'],
      dtype='object')

In [26]:
# Fixes order and selects the columns we want
trial_only = trial_only[['total_unique_trials', 'SubjectID', 'Score', 'rt', 'ImgID', 'imgName', 'trial_index', 'Age', 'Gender', 'Hispanic/Latino', 'Race', 'Education', 'State', 'Zipcode', 'payment']]

In [27]:
trial_only.head()

Unnamed: 0,total_unique_trials,SubjectID,Score,rt,ImgID,imgName,trial_index,Age,Gender,Hispanic/Latino,Race,Education,State,Zipcode,payment
0,80.0,2,4.0,6706.0,54,empty-image-1.jpg,6,26-35,Male,No,White,High school degree or equivalent (e.g. GED),CO,80014,1.0
1,80.0,2,7.0,6257.0,11,Google_1_Kenneth Cabral_19_oval.jpg,8,26-35,Male,No,White,High school degree or equivalent (e.g. GED),CO,80014,1.0
2,80.0,2,3.0,3712.0,64,Google_1_Gary Nance_3_oval.jpg,9,26-35,Male,No,White,High school degree or equivalent (e.g. GED),CO,80014,1.0
3,80.0,2,4.0,3759.0,6,Google_1_Victoria Ambrose_17_oval.jpg,10,26-35,Male,No,White,High school degree or equivalent (e.g. GED),CO,80014,1.0
4,80.0,2,2.0,4251.0,60,Google_1_Claudia Rohrer_13_oval.jpg,11,26-35,Male,No,White,High school degree or equivalent (e.g. GED),CO,80014,1.0


In [28]:
trial_1 = trial_only[trial_only['payment']==1.0]

In [29]:
trial_1.shape

(973, 15)

In [30]:
trial_2 = trial_only[trial_only['payment']==2.0]

In [31]:
trial_2.shape

(1054, 15)

In [None]:
trial_1.name.unique()