In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import json
from helpers import *

# Load data

In [None]:
# Read the three raw JSON inputs (the file names suggest: experiment setup,
# recorded trials and survey answers).
with open('data/data.json') as setup_fh:
    setup_data = json.loads(setup_fh.read())

with open('data/trials-09-11.json') as trials_fh:
    trials = json.loads(trials_fh.read())

with open('data/survey-09-11.json') as survey_fh:
    survey_records = json.loads(survey_fh.read())

# Tabulate the survey, index it by survey_id and drop rows whose column
# values repeat exactly (the survey_id index is not part of that comparison).
survey = (
    pd.DataFrame(survey_records)
    .set_index('survey_id')
    .drop_duplicates()
)

# Preprocess objects

In [None]:
# Build, for every picture, the list of its object descriptions in the
# order in which the objects are listed in the setup data.
objNames = {}
for picture, picData in setup_data.items():
    names = objNames.setdefault(picture, [])

    # In picture 'MHC2' the object with id 2 is relabelled 'Cat'.
    # Note that this rewrites the entry inside setup_data itself.
    if picture == 'MHC2':
        for mhc_obj in picData['objects']:
            if mhc_obj['id'] == 2:
                mhc_obj['desc'] = 'Cat'

    names.extend(entry['desc'] for entry in picData['objects'])

# Preprocess the trials

In [None]:
# Flatten every raw trial into one record. `analogies` is a list aligned with
# the objects of the test picture (objNames[test]): position k holds the index,
# within the instruction picture's objects, of the object that test object k
# was mapped to, or '' when no mapping to a known object was recorded.
preprocessed_trials = []
for trial in trials:
    # The per-trial results are stored as a JSON string inside the trial record.
    results = json.loads(trial['results'])

    # Start from scratch: one empty slot per object of the test picture
    analogies = [''] * len(objNames[results['test']])

    ## Fix an error:
    # in MHC2, the object description 'Bush' is used twice.
    # Luckily, the ids are unique, so we replace the description
    # of objects with id 2 by 'Cat'
    if results['test'] == 'MHC2':
        for a in results['analogies']:
            if a['id'] == 2:
                a['desc'] = 'Cat'

    # Analogy A_u(t)
    for a in results['analogies']:
        testIndex = objNames[results['test']].index(a['desc'])
        try:
            instrIndex = objNames[results['instruction']].index(a['analogDesc'])
            analogies[testIndex] = instrIndex
        except ValueError:
            # The analog description is not an object of the instruction
            # picture: leave this slot empty.
            analogies[testIndex] = ''

    preprocessed_trials.append({
        'analogies': analogies,
        'num_analogies': results['num_analogies'],
        'instruction': results['instruction'],
        'test': results['test'],
        'trial_id': trial['trial_id'],
        'subject_id': trial['subject_id'],
        'timestamp': trial['timestamp'],
        'trial_number': trial['trial_number']
    })

# DataFrame.sort() no longer exists in current pandas; sort_values() is its
# replacement for ordering rows by a column. Rows are ordered by the raw
# timestamp values here, before the column is converted to datetimes below.
df = pd.DataFrame(preprocessed_trials).sort_values('timestamp').set_index('trial_id')
df['num_analogies'] = df['num_analogies'].astype(int)
df['trial_number'] = df['trial_number'].astype(int)
df['subject_id'] = df['subject_id'].astype(int)
# The trial type is the instruction picture's name without its last character
# (a name like 'MHC2' would become 'MHC').
df['trial_type'] = df['instruction'].str[:-1]
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Drop all duplicates: keep the first (earliest-sorted) row for every
# (subject, trial number, trial type) combination.
df = df.drop_duplicates(subset=['subject_id', 'trial_number', 'trial_type'])

In [None]:
# Due to an error, different subjects have been assigned the same ids.
# Fortunately, we recorded the timestamp and can thus distinguish
# the two subjects. The latest tests are re-indexed as former_id + 1000.
for subject_id in df['subject_id'].unique():
    for trial_number in df['trial_number'].unique():

        # Search for duplicates: rows sharing subject id and trial number
        dupl = df[(df['subject_id'] == subject_id) & (df['trial_number'] == trial_number)]
        if len(dupl) <= 1:
            continue

        # Only the first two matching rows are compared; the second one is
        # treated as belonging to the later subject.
        first_time = dupl['timestamp'].iloc[0]
        second_time = dupl['timestamp'].iloc[1]
        if first_time >= second_time:
            # The two rows cannot be told apart by time: report, don't re-index.
            print('Error: identical time')
            print(dupl)
            continue

        df.loc[dupl.index[1], 'subject_id'] += 1000

        # Fix survey ids: move the second survey row of this subject as well.
        # A subject with fewer than two survey rows (for instance because the
        # second row was already moved for an earlier trial number) has nothing
        # left to fix; this explicit check replaces a bare `except` that hid
        # every kind of error.
        # NOTE(review): assumes the survey rows of one id are listed in
        # chronological order - TODO confirm.
        survey_dupl = survey[survey['subject_id'] == subject_id]
        if len(survey_dupl) > 1:
            survey.loc[survey_dupl.index[1], 'subject_id'] += 1000

In [None]:
# Find the subjects whose number of recorded trials differs from 8;
# these count as incomplete.
trial_counts = df['subject_id'].value_counts()
trial_subject_ids = df['subject_id'].unique()
incompletes = [sid for sid in trial_subject_ids if trial_counts.loc[sid] != 8]

# A subject is valid when it appears in the survey, is not incomplete
# and has trials recorded at all.
valid_ids = []
for sid in survey['subject_id'].unique():
    if sid not in incompletes and sid in trial_subject_ids:
        valid_ids.append(sid)

# Restrict both tables to the valid subjects and keep a single survey row
# per subject.
has_valid_trials = df['subject_id'].isin(valid_ids)
df = df[has_valid_trials]
has_valid_survey = survey['subject_id'].isin(valid_ids)
survey = survey[has_valid_survey].drop_duplicates('subject_id')

# Export

In [None]:
# Write the cleaned survey and trial tables to disk; reset_index() turns the
# index back into an ordinary column so that it is part of the JSON output.
survey_out = survey.reset_index()
survey_out.to_json('data/survey-preprocessed.json')

trials_out = df.reset_index()
trials_out.to_json('data/trials-preprocessed.json')

# Store the per-picture object descriptions next to the tables.
with open('data/objNames.json', 'w') as names_fh:
    names_fh.write(json.dumps(objNames))