In [63]:
import pandas as pd
import numpy as np
import json
import ast

In [141]:
# Maps each follow-up question key ('fu_ques_<n>') to the full question text.
# Used to turn terse column keys into readable column headers.
follow_up_ques = dict(
    fu_ques_0='Do you use ad blocking software on your computer?',
    fu_ques_1='How relevant was the ad to you?',
    fu_ques_2='Do you generally find that ads are relevant to you?',
    fu_ques_3='How important is it to you that ads help you find products you are interested in?',
    fu_ques_4='Did the ad you saw help you find a product you are interested in?',
    fu_ques_5='Do you think this ad technology can reliably help you find products you are interested in?',
    fu_ques_6='Do you have a smart phone?',
    fu_ques_7='How often do you make purchases using your mobile phone?',
    fu_ques_8='How often do you make purchases using your computer?',
)

# Maps each profile question key ('pro_ques_<n>') to the full question text.
profile_ques = dict(
    pro_ques_0='What is your age?',
    pro_ques_1='What is your gender?',
    pro_ques_2='What type of environment do you most identify with?',
)

In [58]:
# Load the raw survey export, which holds one JSON document per line.
# The context manager closes the file handle as soon as it has been read.
# Blank lines are filtered out instead of slicing off the last element, so a
# file that does not end with a newline keeps its final record.
with open('surveyResponses.json') as responseFile:
    rawData = [line for line in responseFile.read().split('\n') if line.strip()]

# One parsed object per non-blank line.
surveyResponses = [json.loads(res) for res in rawData]

### Process Mouse Movement

In [52]:
def calcDist(x, y):
    """Return the total path length traced by a sequence of mouse positions.

    The length is the sum of the Euclidean distances between every pair of
    consecutive points ``(x[i-1], y[i-1])`` and ``(x[i], y[i])``, starting
    with the segment from the first point to the second.

    Parameters
    ----------
    x, y : sequence of numbers
        Horizontal and vertical coordinates, one entry per sample.

    Returns
    -------
    float
        The summed distance. ``0.0`` when fewer than two points are given
        (there is no segment to measure), or ``np.nan`` when ``x`` and ``y``
        differ in length and therefore cannot be paired up.
    """
    if len(x) != len(y):
        return np.nan

    # np.diff yields one delta per consecutive pair, so the first segment
    # (point 0 -> point 1) is included in the total.
    dx = np.diff(np.asarray(x, dtype=float))
    dy = np.diff(np.asarray(y, dtype=float))

    return float(np.hypot(dx, dy).sum())

In [60]:
def _parseCoords(raw):
    """Convert a comma-separated coordinate string into a list of floats.

    A value that is not a string is returned unchanged, so re-running this
    cell on already-converted responses is harmless.
    """
    if not isinstance(raw, str):
        return raw
    return [float(num) for num in raw.split(',')]


# Convert the recorded mouse coordinates to numbers in place and attach the
# total distance travelled as response['dist'].
for response in surveyResponses:
    # Both axes are needed to measure a path; a response missing either one
    # gets NaN for its distance.
    if 'mouse_x_pos' in response and 'mouse_y_pos' in response:
        response['mouse_x_pos'] = _parseCoords(response['mouse_x_pos'])
        response['mouse_y_pos'] = _parseCoords(response['mouse_y_pos'])
        response['dist'] = calcDist(response['mouse_x_pos'], response['mouse_y_pos'])
    else:
        response['dist'] = np.nan

### Process Responses

In [133]:
def _parseAnswers(raw, prefix):
    """Parse a ``{key:value,key:value,}`` answer string into a dict.

    Every key is prefixed with ``prefix``. Only the first ``:`` of an entry
    separates the key from the value, so a value may itself contain colons.
    Blank entries (such as the one produced by a trailing comma) are skipped.

    Raises
    ------
    ValueError
        If a non-blank entry contains no ``:`` at all.
    """
    answers = {}
    for entry in raw.strip('{}').split(','):
        if not entry.strip():
            continue
        key, value = entry.split(':', 1)
        answers[prefix + key] = value
    return answers


def processQuestions(response):
    """Flatten the question fields of one survey response into a single dict.

    Parameters
    ----------
    response : dict
        Must contain the keys ``'ad_cats'`` and ``'products'`` (strings
        holding Python dict literals) and ``'follow_up'`` and ``'profile'``
        (``{key:value,...}`` answer strings).

    Returns
    -------
    dict
        The ``ad_cats`` and ``products`` entries as parsed, the follow-up
        answers under ``'fu_'``-prefixed keys, and the profile answers under
        ``'pro_'``-prefixed keys. A follow-up answer of the form
        ``'<head> - <rest>'`` is reduced to ``'<head>'``.
    """
    ad_cats = ast.literal_eval(response['ad_cats'])
    products = ast.literal_eval(response['products'])

    # str.split returns the whole string as its only element when ' - ' is
    # absent, so answers without the separator pass through unchanged.
    follow_ups = {
        key: answer.split(' - ')[0]
        for key, answer in _parseAnswers(response['follow_up'], 'fu_').items()
    }
    profile = _parseAnswers(response['profile'], 'pro_')

    return {**ad_cats, **products, **follow_ups, **profile}

In [134]:
# Top-level response fields that are copied as-is into every output row.
keysToKeep = [
    'date',
    'ad',
    'ad_type',
    'primed',
    'browser_size',
    'dist',
]

# Question-derived column names in sorted order, taken from a single response.
# NOTE(review): index 90 is presumably a response known to contain every
# question key -- confirm, since columns absent from it are not selected later.
otherKeys = sorted(processQuestions(surveyResponses[90]))

In [135]:
# Build one flat record per response, keyed by res['__key__']['id'].
# Responses that have no 'profile' field are left out entirely.
flattened = {}

for res in surveyResponses:
    if 'profile' not in res:
        continue

    resID = res['__key__']['id']

    # Any top-level field the response lacks is recorded as the string 'NA'.
    resData = {key: res.get(key, 'NA') for key in keysToKeep}

    ques = processQuestions(res)

    flattened[resID] = {**resData, **ques}
    

In [145]:
# One row per response id; the fixed fields come first, then the question columns.
data = pd.DataFrame.from_dict(flattened, orient='index')[keysToKeep + otherKeys]

# Replace question keys with their full question text. Columns that are not
# question keys keep their current name. Follow-up wording takes precedence
# over profile wording should a key ever appear in both lookups.
questionText = {**profile_ques, **follow_up_ques}
origCols = [questionText.get(col, col) for col in data.columns]

data.columns = origCols

In [148]:
# Write the flattened table to disk; the row index is written as the first column.
data.to_csv('surveyResponses.csv', index=True)