# Preprocessing of self reported answers and performance

This file creates a NumPy array that associates each subject's self-reported answers with the corresponding task. The association is done per subject, so for every subject there are four rows that describe these tasks and answers in numerical form.

In addition to the self-reported answers, each row also contains the performance obtained by that subject on a specific task and the number of times that task was selected. For consistency, the latter is included whether it is relevant or not (e.g. during training or testing the number of times a task is selected is the same for all of them).

In order to obtain those metrics, six columns are extracted from the trial by trial file:

- id
- whether the subject has been informed about a random task or not (condition)
- phase
- task family
- task category
- answer

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
import pandas as pd
import warnings

from ipywidgets import widgets
from IPython.html.widgets import *

# Silence every warning for the rest of the session
warnings.filterwarnings(action='ignore')

# Default font size for figures
# NOTE(review): the style sheet applied below may define its own font.size and
# override this value -- confirm the order is intended.
rcParams['font.size'] = 15

# Other style sheets that were tried: 'ggplot', 'seaborn-dark-palette'
plt.style.use('fivethirtyeight')

In [None]:
# Number of task categories per subject
TASKS = 4
USERS = 201
# Phase of the experiment to process
PHASE = 'exploration'
# Integer code of each phase; matches the sorted phase labels found in the
# trial file: [b'free', b'test', b'train']
dphase = {'exploration':0, 'test':1, 'train':2}
# Indicate for which modality you want to generate the files
TYPE = 's' #s=strategic, ft=free exploration and training, f = free exploration

# Trial-by-trial file of the selected modality
if TYPE == 's':
    csvFile = '../data/monsters_data_strategic_052217.csv'
elif TYPE == 'ft':
    csvFile = '../data/monsters_data_free_familiarize_05232017.csv'
elif TYPE == 'f':
    csvFile = '../data/monsters_data_free_only_052417.csv'
else:
    # Fail here instead of leaving csvFile undefined (or stale from a
    # previous run of the notebook)
    raise ValueError("Unknown TYPE %r: expected 's', 'ft' or 'f'" % (TYPE,))

## Preprocess trial instances

In [None]:
def stringToInt(arr, position, values):
    """Replace the labels in one column of ``arr`` by their index in ``values``.

    Parameters
    ----------
    arr : 2-D array
        Array whose column ``position`` holds the labels. It is modified in
        place.
    position : int
        Index of the column to convert.
    values : sequence
        Labels to look for; the label at index ``i`` is replaced by ``i``.
        Entries of the column that match no label are left untouched.

    Returns
    -------
    The same ``arr`` object, for convenience.
    """
    # Match against a snapshot of the column: matching against the column
    # while it is being rewritten would re-map an already written index
    # whenever that index is equal to one of the later labels.
    column = arr[:, position].copy()
    for i, v in enumerate(values):
        arr[column == v, position] = i
    return arr

# Read the trial-by-trial file, keeping only the columns
# id(0), condition(1), phase(2), family(7), category(8), correct(11).
# Everything is read as byte strings; np.bytes_ is used because the
# np.string_ alias no longer exists in NumPy 2.0.
csv = np.genfromtxt(csvFile, dtype=np.bytes_, delimiter=',', usecols=(0,1,2,7,8,11), skip_header=1)

# Get monster type
monsters = list(np.unique(csv[:,3]))
# Get category; the position in this list is the integer code of the category
categories = [b'category1D', b'categoryIgnore1D', b'category2D', b'categoryRandom']
# Get ids
ids = list(np.unique(csv[:,0]))
# Get condition
cond = list(np.unique(csv[:,1]))
# Get unique phases
phases = list(np.unique(csv[:,2]))
# For converting string to int boolean
bo = [b'False', b'True']

# Work on a copy so that the raw byte-string table stays available in csv
csvInt = csv.copy()
# Convert fields to ints for easy processing: column k is encoded with the
# k-th list of labels
for column, labels in enumerate([ids, cond, phases, monsters, categories, bo]):
    csvInt = stringToInt(csvInt, column, labels)
csvInt = csvInt.astype('int')

if PHASE == 'exploration':
    # Keep only the trials whose phase code is the one of the selected phase
    inPhase = csvInt[:, 2] == dphase[PHASE]
    csvInt = csvInt[inPhase]

    # The phase column (2) is always dropped; for the strategic data the
    # condition column (1) is dropped as well, for now
    dropped = (1, 2) if TYPE == 's' else 2
    csvInt = np.delete(csvInt, dropped, axis=1)

# One array of trials per user id
userIds = np.unique(csvInt[:, 0])
splitCsv = [csvInt[csvInt[:, 0] == uid] for uid in userIds]


# Count times correct per task, times task chosen, % for both.
# metricsUser holds one entry per user; each entry is a list with one row per
# task category (TASKS rows).
metricsUser = []

if PHASE == 'exploration':
    # Column layout of the per-user trials:
    #   TYPE != 's': id(0), condition(1), family(2), category(3), correct(4)
    #   TYPE == 's': id(0), family(1), category(2), correct(3)
    hasCondition = TYPE != 's'
    if hasCondition:
        categoryCol, correctCol = 3, 4
    else:
        categoryCol, correctCol = 2, 3

    # len() rather than np.shape(): the per-user arrays have different
    # numbers of rows, so the list cannot be turned into a regular array
    for user, questions in enumerate(splitCsv):
        nTrials = questions.shape[0]

        # Leading fields shared by all rows of this user; the condition is
        # read from the user's first trial
        if hasCondition:
            condition = questions[0, 1]
            prefix = [user, condition]
        else:
            prefix = [user]

        metricsTask = []
        # Count number of times the answer was correct per task and percent
        for t in range(TASKS):
            chosen = questions[:, categoryCol] == t
            # Number of times the task was selected (0 if never selected)
            total = np.sum(chosen)
            correct = np.sum(questions[chosen, correctCol] == 1)
            # A task that was never selected gets 0% correct
            pctCorrect = correct / total if total else 0.
            # [user, (condition,)] task category, times task selected,
            # % selection, #times correct on this task, % correct
            metricsTask.append(prefix + [t, total, total / nTrials, correct, pctCorrect])

        metricsUser.append(metricsTask)

## Preprocess self reported data

In [None]:
# Subjective (self reported) data of the selected modality
if TYPE == 's':
    csvFile = '../data/monsters_extra_data_strategic_052217.csv'
elif TYPE == 'ft':
    csvFile = '../data/monsters_extra_data_free_familiarize_05232017.csv'
else:
    csvFile = '../data/monsters_extra_data_free_only_05242017.csv'

# Read the first 30 columns as byte strings (np.bytes_ is used because the
# np.string_ alias no longer exists in NumPy 2.0)
scsv = np.genfromtxt(csvFile, dtype=np.bytes_, delimiter=',', skip_header=1, usecols=range(30))

# Drop one answer column out of every 7, leaving 6 answer columns per block
if TYPE == 's':
    # NOTE For now remove column future-learn-1-Bear (8), no data there
    # So remove also 15, 22 and 29
    scsv = np.delete(scsv, (8,15,22,29), axis=1)
elif TYPE == 'f':
    scsv = np.delete(scsv, (7,14,21,28), axis=1)


# For those people that did not explore all tasks we still have to find a way to know what monster corresponds
# to what category so get that information from training/testing phase
csvInt2 = csv.copy()
# Convert fields to ints for easy processing: column k is encoded with the
# k-th list of labels
for column, labels in enumerate([ids, cond, phases, monsters, categories, bo]):
    csvInt2 = stringToInt(csvInt2, column, labels)
csvInt2 = csvInt2.astype('int')

# Get only those in the training phase
csvInt2 = csvInt2[csvInt2[:, 2] == dphase['train']]

# For now also remove condition (1), together with the phase (2);
# remaining columns: id(0), family(1), category(2), correct(3)
csvInt2 = np.delete(csvInt2, (1,2), axis=1)
# Split by user
splitCsv = [csvInt2[csvInt2[:,0]==uid] for uid in np.unique(csvInt2[:,0])]

# each row consists of uid, monster, category, answers for that category
postCsv = []
# Number of answer columns per monster: 7 when ALL answer columns are used
# ('ft'), 6 otherwise
nAnswers = 7 if TYPE == 'ft' else 6

# Iterate the list directly: the per-user arrays have different numbers of
# rows, so np.shape() cannot be applied to the list
for user, trials in enumerate(splitCsv):
    # trials columns: id(0), family(1), category(2), correct(3)
    # Get monster family and task category
    questions = trials[:,1:3]
    # NOTE(review): assumes the rows of scsv are in the same order as the
    # users of splitCsv -- confirm against the data files
    postQ = scsv[user]
    # np.unique returns the families sorted, together with the index of the
    # first trial of each one; the category is read from that same trial
    families, firstIdx = np.unique(questions[:,0], return_index=True)
    # stack monster type, complexity type
    moncat = np.column_stack((families, questions[firstIdx,1]))
    for n, (mon, category) in enumerate(moncat):
        # Answers of the n-th monster; the first two columns of postQ are skipped
        answ = postQ[n*nAnswers+2:nAnswers*(n+1)+2]
        # stack user id, monster id, category id, answers for that category
        postCsv.append(np.hstack((user, mon, category, answ.tolist())).tolist())

In [None]:
postCsv = np.asarray(postCsv)
# Print floats with two decimals
float_formatter = lambda x: "%.2f" % x
np.set_printoptions(formatter={'float_kind':float_formatter})
np.set_printoptions(precision=2)

metricsUser = np.asarray(metricsUser, dtype='float')
NUSERS = np.shape(metricsUser)[0]
postCsv = postCsv.astype('float')

# Order by first column (id) and then by third column (category task complexity)
postCsv = postCsv[np.lexsort((postCsv[:,2], postCsv[:,0]))]

# Rows are either
#   user, condition, category task (complexity), #times task selected, % selection, #times correct on this task, % correct
# or the same without the condition, so take the width from the data instead
# of assuming 7. An empty metricsUser keeps the width of 7.
#print(metricsUser)
nCols = metricsUser.shape[2] if metricsUser.ndim == 3 else 7
metricsUser = metricsUser.reshape(NUSERS*TASKS, nCols)
# The task category is always followed by four more fields
taskCol = nCols - 5

# Check that columns (user and task category complexity) are the same in order to fusion them
if np.array_equal(metricsUser[:,0], postCsv[:,0]) and np.array_equal(metricsUser[:,taskCol], postCsv[:,2]):
    finalStats = np.hstack((metricsUser, postCsv[:,3:]))
else:
    # finalStats is not assigned in this case
    print('ERROR: columns do not match, someone did not select all tasks')

## Saving numpy arrays

In [None]:
# Output of every (TYPE, PHASE) combination as (file name, save merged stats?).
# True saves finalStats, False saves metricsUser. The None key is the fallback
# used for any phase that is not listed; a TYPE without it saves nothing for
# unlisted phases.
# NOTE(review): the 'f' train/test outputs reuse the 'strategic-*' file names
# -- confirm this is intended and not a copy-paste leftover.
outputs = {
    's': {
        'train': ('strategic-train.txt', True),
        'test': ('strategic-test.txt', True),
        None: ('strategic-free.txt', False),
    },
    'ft': {
        'train': ('freeTrain-train.txt', True),
        'exploration': ('freeTrain-free.txt', False),
    },
    'f': {
        'train': ('strategic-train.txt', True),
        'test': ('strategic-test.txt', True),
        None: ('free-free.txt', True),
    },
}

byPhase = outputs.get(TYPE, {})
target = byPhase.get(PHASE, byPhase.get(None))
if target is not None:
    fileName, useMerged = target
    # Only the array that is actually saved gets evaluated
    np.savetxt(fileName, finalStats if useMerged else metricsUser)