In [2]:
from utils import *

# 1 Data preparation

In [3]:
# Raw survey exports live in ../data/raw; the first CSV column is used as the index.
rawDir = os.path.join('..', 'data', 'raw')
respondents = pd.read_csv(os.path.join(rawDir, 'raw_respondents.csv'), index_col=0)
results = pd.read_csv(os.path.join(rawDir, 'raw_results.csv'), index_col=0)

# loadData() is provided by utils; only its third and fourth return values
# (the two card sets) are needed in this notebook.
_, _, cardsE, cardsB = loadData()

## Respondents

### CRT calculation

In [4]:
# CRT score = number of items answered with the keyed response (0-3).
crtKey = {'crt1': '5 pence', 'crt2': '5 minutes', 'crt3': '47 days'}
respondents['crt'] = 0
for item, keyed in crtKey.items():
    respondents['crt'] += (respondents[item] == keyed).astype(int)

#### CRT check

In [5]:
# Frequency of each crtCheck answer across all respondents.
respondents['crtCheck'].value_counts()

crtCheck
No     100
Yes     60
Name: count, dtype: int64

In [6]:
# List every free-text elaboration except the 'Did not answer' placeholder.
# Missing values (NaN) are not equal to the placeholder, so they pass the
# filter and are printed as "nan".
gaveAnswer = respondents.crtCheckElaborate != 'Did not answer'
for elaboration in respondents.crtCheckElaborate[gaveAnswer].values:
    print(elaboration)

all of them, honestly. 
During a past survey I did, these questions were present.
MQ3 about the lily pads 
All of them.
All three questions many times over the last five to ten years, I couldn't say when I first encountered them
lilly pad
All three of them I have encountered previously
All of them, although I can't say precisely where.
I've seen a variation of the lily pad question
The bat and ball question
All 3
All three were in a work quiz
The bat and ball, and the lily pads in the lake
I have been asked the exact same three questions multiple times. 
The bat and the ball one is a frequent one I've seen.
The first one - though im not sure where
nan
Bat & Ball definitely and im sure the lily pads but cannot remember the answer.
MQ3: In a lake, there is a patch of lily pads. Every day, the patch doubles in size. If it takes 48 days for the patch to cover the entire lake, how long would it take for the patch to cover half of the lake? *

I have encountered similar questions to all 3 of

In [7]:
# Correlate the CRT score with a 0/1 indicator of crtCheck
# ('No' -> 0, anything else -> 1), using Series.corr's default method.
crtCheckFlag = (respondents.crtCheck != 'No').astype(int)
respondents.crt.corr(crtCheckFlag)

np.float64(0.42771019398377813)

In [8]:
# Descriptive statistics of the CRT score within each crtCheck answer group.
respondents.groupby('crtCheck')['crt'].agg(['mean', 'median', 'std'])

Unnamed: 0_level_0,mean,median,std
crtCheck,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,1.17,1.0,1.128644
Yes,2.25,3.0,1.083388


In [9]:
# Compare CRT scores between respondents who answered 'Yes' and 'No' on crtCheck.
crtOfYes = respondents.loc[respondents.crtCheck == 'Yes', 'crt']
crtOfNo = respondents.loc[respondents.crtCheck == 'No', 'crt']
mannwhitneyu(crtOfYes, crtOfNo)

MannwhitneyuResult(statistic=np.float64(4484.0), pvalue=np.float64(5.309013691076753e-08))

### Big 5 calculation

In [10]:
# Each trait score is the sum of its three items. Items written as `6 - item`
# are reverse-keyed (presumably a 1-5 response scale - TODO confirm).

# bfExtra: bf1 (reversed), bf6, bf11
respondents['bfExtra'] = (6 - respondents['bf1']) + respondents['bf6'] + respondents['bf11']

# bfAgree: bf2, bf7 (reversed), bf12
respondents['bfAgree'] = respondents['bf2'] + (6 - respondents['bf7']) + respondents['bf12']

# bfConsc: bf3 (reversed), bf8 (reversed), bf13
respondents['bfConsc'] = (6 - respondents['bf3']) + (6 - respondents['bf8']) + respondents['bf13']

# bfNegEmo: bf4, bf9, bf14 (reversed)
respondents['bfNegEmo'] = respondents['bf4'] + respondents['bf9'] + (6 - respondents['bf14'])

# bfOpenMind: bf5, bf10 (reversed), bf15
respondents['bfOpenMind'] = respondents['bf5'] + (6 - respondents['bf10']) + respondents['bf15']

### Straightlining check

In [11]:
# Flag possible straightlining: print respondents whose variance across the
# 15 bf items falls below the cutoff.
bfItems = ['bf' + str(k) for k in range(1, 16)]
varianceCutoff = 0.6
for _idx, person in respondents[['respondent', 'variant'] + bfItems].iterrows():
    itemVariance = person[bfItems].var()
    if itemVariance < varianceCutoff:
        print(person.variant, person.respondent, itemVariance)

E50 26 0.40952380952380957
E50 37 0.5714285714285714
E50 39 0.20952380952380958
E30 22 0.42857142857142855
E30 23 0.5523809523809524
E30 36 0.5523809523809524
B50 6 0.5714285714285714
B50 30 0.5523809523809524
B50 40 0.5428571428571429
B30 22 0.49523809523809514


### Reformatting

In [12]:
def _clockToSeconds(stamp):
    """Convert an 'H:M:S' string into a whole number of seconds."""
    fields = stamp.split(':')
    return int(fields[0]) * 60 * 60 + int(fields[1]) * 60 + int(fields[2])

# Replace the textual duration with seconds. The cell expects string input,
# so it is meant to run once per freshly loaded frame.
respondents['time'] = respondents['time'].apply(_clockToSeconds)

In [13]:
# Keep only the analysis columns: identifiers, demographics, self-report items,
# the derived CRT and Big Five scores, attention checks and the free-text note.
# `.copy()` makes the result an independent frame, so the later write of the
# 'informativeness' column via `.loc` does not target a slice of the raw table
# (which would raise SettingWithCopyWarning).
respondents = respondents[[
    'respondent', 'variant', 'time', 'categories', 'age', 'gender',
    'education', 'income', 'domain', 'crt', 'labelClarity', 'sortDifficulty',
    'concentration', 'timeAmount', 'cardQuantity', 'bfExtra', 'bfAgree', 
    'bfConsc', 'bfNegEmo', 'bfOpenMind', 'att1', 'att2', 'note'
]].copy()

## Results and export

In [14]:
# Keep only the columns used downstream. `.copy()` makes the result an
# independent frame, so the new columns added below ('categoryEdited',
# 'agreement') are not written into a slice of the raw table (which would
# raise SettingWithCopyWarning).
results = results[['respondent', 'card', 'category', 'categoryId', 'categoryStandardized',
       'order', 'variant']].copy()

### Categories as keywords

In [15]:
def _labelToKeywords(label):
    """Lower-case a category label, replace punctuation with spaces and collapse whitespace."""
    spaced = re.sub('[.&,\'/,()+!#]|( - )', ' ', label.lower().strip())
    return ' '.join(spaced.split())

# Normalized, space-separated keyword form of every category label.
results['categoryEdited'] = results.category.apply(_labelToKeywords)

In [16]:
# For each respondent: join their distinct edited category labels, score the
# joined text with informativeness() (from utils), and divide by the number of
# distinct labels.
for rowLabel, person in respondents.iterrows():
    ownRows = (results.variant == person.variant) & (results.respondent == person.respondent)
    labels = results[ownRows].categoryEdited.unique()
    respondents.loc[rowLabel, 'informativeness'] = informativeness(' '.join(labels)) / len(labels)

### Similarity matrices

See `makeMatrix()` and `BMM()` in the `utils.ipynb` file.

In [17]:
# Build every matrix type for every variant and write each one to
# ../data/matrices/<type><variant>.csv. E-variants use cardsE, B-variants cardsB.
matrixTypes = ['paired', 'seen', 'similarity', 'similarityAbsolute', 'C-similarity', 'C-similarityAbsolute']
deckByVariant = {'E50': cardsE, 'E30': cardsE, 'B50': cardsB, 'B30': cardsB}

for matrixType in matrixTypes:
    for variantName, deck in deckByVariant.items():
        # The trailing 40 is passed straight through to makeMatrix (see utils for its meaning).
        matrix = makeMatrix(matrixType, results[results.variant == variantName], deck, 40)
        matrix.to_csv(os.path.join('..', 'data', 'matrices', matrixType + variantName + '.csv'))

### Agreement scores

In [18]:
# Agreement score per standardized category, computed separately per variant.
# Only standardized categories that merge more than one original categoryId
# receive a score; every other row keeps NaN in the 'agreement' column.
#
# score = sum over cards of (placements of the card / number of distinct cards * 100)
#         / number of distinct respondents in the standardized category
for variant in ['E50', 'E30', 'B50', 'B30']:
    variantRows = results[results.variant == variant]
    for label, group in variantRows.groupby('categoryStandardized'):
        if len(group.categoryId.unique()) == 1:
            continue

        # Count how often each card was placed into this standardized category.
        # Cards are visited pile by pile (respondent, categoryId) so the
        # accumulation order, and therefore the float sum below, stays stable.
        cardCounts = {}
        for pileKey, pile in group.groupby(['respondent', 'categoryId']):
            for card in pile['card']:
                cardCounts[card] = cardCounts.get(card, 0) + 1

        nRespondents = len(group.respondent.unique())
        shares = [count / len(cardCounts) * 100 for count in cardCounts.values()]
        score = sum(shares) / nRespondents

        # NOTE(review): an earlier version of this cell also counted the
        # (respondent, categoryId) piles but never used that count; the score is
        # normalized by respondents - confirm this is the intended denominator.
        inCategory = (results.variant == variant) & (results.categoryStandardized == label)
        results.loc[inCategory, 'agreement'] = score

#### Author duplicates check

In [19]:
# For every (variant, standardized category, respondent) combination, count how
# many distinct original categories that respondent contributed.
duplicates = (
    results
    .groupby(['variant', 'categoryStandardized', 'respondent'])['categoryId']
    .nunique()
    .rename('unique_categories')
    .reset_index()
)

In [20]:
# Percentage of standardized categories, per variant, in which at least one
# respondent contributed more than one of their own categories.
categoriesWithDuplicates = (
    duplicates[duplicates.unique_categories > 1]
    .groupby('variant')['categoryStandardized']
    .nunique()
)
categoriesPerVariant = (
    results.groupby(['variant', 'categoryStandardized'])
           .size()
           .reset_index()
           .groupby('variant')
           .size()
)
categoriesWithDuplicates / categoriesPerVariant * 100

variant
B30     3.296703
B50     5.376344
E30    11.864407
E50    15.384615
dtype: float64

## File export

In [21]:
# Write both processed tables next to the raw data folder (../data).
exportDir = os.path.join('..', 'data')
respondents.to_csv(os.path.join(exportDir, 'respondents.csv'))
results.to_csv(os.path.join(exportDir, 'results.csv'))