In [None]:
## remember to run conn_cocolab from the terminal before running cells in this notebook!

import os
import urllib, cStringIO

import pymongo as pm

import matplotlib
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import seaborn as sns
sns.set_context('poster')
sns.set_style('white')

import numpy as np
from __future__ import division
import scipy.stats as stats
import pandas as pd
import json
import re

from PIL import Image
import base64

### file hierarchy and database connection vars

In [None]:
# Names and locations shared by the rest of the notebook
iterationName = 'e1'
exp_path = 'museumdraw'

# analysis_dir is the current working directory; exp_dir is the 'experiments'
# folder resolved two levels above it
analysis_dir = os.getcwd()
exp_dir = os.path.abspath(os.path.join(analysis_dir, '../..', 'experiments'))

# rendered sketches are written below analysis_dir/sketches (created if missing)
sketch_dir = os.path.join(analysis_dir, 'sketches')
if not os.path.exists(sketch_dir):
    os.makedirs(sketch_dir)

In [None]:
# set vars
# Read the database password and open the MongoDB collection analysed below.
# dtype=str keeps the password exactly as written in the file (a purely numeric
# password would otherwise be parsed as a number and fail to concatenate).
auth = pd.read_csv('../auth.txt', header = None, dtype = str) # this auth.txt file contains the password for the sketchloop user
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'rxdhawkins.me' ## cocolab ip address (not used below: the client connects to 127.0.0.1)

# have to fix this to be able to analyze from local
# Username and password must be percent-escaped inside a MongoDB URI; reserved
# characters such as '@', ':', '/' or '%' would otherwise corrupt the URI.
# pymongo (pm) is already imported in the first cell, so it is not re-imported here.
mongo_uri = 'mongodb://{}:{}@127.0.0.1'.format(urllib.quote_plus(user),
                                               urllib.quote_plus(pswd))
conn = pm.MongoClient(mongo_uri)
db = conn['kiddraw']
coll = db['E1c']

### render out all pngs you can find!

In [None]:
# Write every submitted drawing to disk as a PNG, one sub-folder per category.
# The time cutoff is applied by the database query, so records from before the
# cutoff are never fetched or decoded.
final_images = coll.find({'dataType':'finalImage',
                          'time': {'$gt': 1510252452134}}).sort('time') ## this is the timepoint after which real data started getting collected
imsize = 224 # not used when writing the PNGs in this cell
for rec in final_images:
    if 'age' not in rec: ## only records that carry an age are rendered
        continue
    category_dir = os.path.join(sketch_dir,rec['category'])
    if not os.path.exists(category_dir):
        os.makedirs(category_dir)
    # NOTE(review): the file name is built from category, age and sessionId only,
    # so two finalImage records sharing all three would overwrite each other.
    fname = os.path.join(category_dir,'{}_sketch_{}_{}.png'.format(rec['category'], rec['age'],rec['sessionId']))
    imgData = rec['imgData']
    filestr = base64.b64decode(imgData) # decode the base64 payload once...
    with open(fname, "wb") as fh:
        fh.write(filestr)               # ...and write those bytes out
                    

### Preprocessing data

Notes:

The collection that contains the data we will analyze for VSS is called 'E1c'.
There are two types of records in the database: 'stroke' and 'finalImage'. This is stored under the key: 'dataType'.
Records of type 'stroke' contain the svg string for a stroke; every stroke event is stored as a separate record.
The session identifier is called "sessionId".

In [None]:
# Constants for this analysis pass
practice_categories = ['circle','triangle']
experiment_name = 'E1c'
time_threshold = 1510252452134 ## this is the timepoint after which real data started getting collected

# every session id stored in the collection, regardless of time
all_sessions = coll.distinct('sessionId') ## this returns ALL sessions in this collection. we will then filter on time_threshold

# session ids that have at least one record newer than the cutoff
after_threshold = {'time':{'$gt': time_threshold}}
valid_sessions = coll.find(after_threshold).distinct('sessionId')

print('We currently have {} valid sessions.'.format(len(valid_sessions)))

In [None]:
## desired output: a dataframe that has trials on the rows, and the following columns:
## category, age, number of strokes, mean_pixel_intensity, bounding_box_coordinates_LTRB, list of strokes, 
## PNG string, submission_time, submission_date
## to be saved out as a nice tidy CSV
##
## This cell fills one list per column; every list gets exactly one entry per kept trial.
## A trial is kept when its category is not a practice category and at least one
## stroke record exists for it.
session_id = []
trial_num = []
category = []
age = []
num_strokes = []
mean_pixel_intensity = [] # not populated in this cell
bounding_box_coords = []  # not populated in this cell
svg = []
svg_times = []
png = []
submit_time = []
submit_date = []
draw_duration = []
for s in valid_sessions:
#     print 'Analyzing {}'.format(s)
    image_recs = coll.find({'time': {'$gt': time_threshold},
                            'sessionId': s,
                            'dataType': 'finalImage'}).sort('time')
    for imrec in image_recs:
        if imrec['category'] in practice_categories: ## don't save practice category trials
            continue
        # Pull this trial's strokes once, in time order, and keep them in memory:
        # calling count() on the cursor (twice) and then iterating it costs extra
        # round-trips to the server for every trial.
        stroke_recs = list(coll.find({'time': {'$gt': time_threshold},
                                      'sessionId': s,
                                      'dataType': 'stroke',
                                      'trialNum': imrec['trialNum']}).sort('time'))
        if len(stroke_recs) == 0: ## only include trials if the drawings are not blank
            continue
        _svg = [strec['svg'] for strec in stroke_recs]         # strokes from THIS final image
        _svg_times = [strec['time'] for strec in stroke_recs]  # matching stroke timestamps
        session_id.append(imrec['sessionId'])
        trial_num.append(imrec['trialNum'])
        category.append(imrec['category'])
        age.append(imrec['age'])
        png.append(imrec['imgData'])
        submit_time.append(imrec['time'])
        submit_date.append(imrec['date'])
        num_strokes.append(len(stroke_recs))
        # time between first and last stroke record; the float divisor keeps this a
        # true division even if `from __future__ import division` is not active
        draw_duration.append((_svg_times[-1] - _svg_times[0])/1000.0) ## in seconds
        svg.append(_svg)
        svg_times.append(_svg_times)

In [None]:
# Turn the per-trial lists into a table: each list is first stacked as a row,
# then the frame is flipped so that trials are rows and the lists are columns.
column_names = ['session_id','trial_num','category','age','submit_time','submit_date','num_strokes','svg','svg_times','png','draw_duration']
column_values = [session_id,trial_num,category,age,submit_time,submit_date,num_strokes,svg,svg_times,png,draw_duration]
X = pd.DataFrame(column_values).T
X.columns = column_names

In [None]:
## add mean pixel intensity (amount of ink spilled) 
# For every trial: decode the stored PNG, resize it to imsize x imsize and record
# the fraction of pixels whose 4th channel exceeds `thresh`. The result is added
# to X as the column 'mean_intensity'.
mean_intensity = []
imsize = 100
numpix = imsize**2
thresh = 250
for i,_d in X.iterrows():
    imgData = _d['png']
    filestr = base64.b64decode(imgData)
    # open the image straight from memory instead of writing a temporary
    # 'sketch.png' into the working directory and reading it back
    im = Image.open(cStringIO.StringIO(filestr)).resize((imsize,imsize))
    _im = np.array(im)
    # channel index 3 -- assumes the PNGs are RGBA with ink in the alpha channel; TODO confirm
    # float divisor: a true proportion even without `from __future__ import division`
    mean_intensity.append((_im[:,:,3] > thresh).sum() / float(numpix))
X = X.assign(mean_intensity=pd.Series(mean_intensity).values)
# rank correlation between amount of ink and number of strokes
# (num_strokes is cast to float so the ranking runs on a numeric array)
print(stats.spearmanr(X['mean_intensity'].values, X['num_strokes'].values.astype(float)))

In [None]:
# Save the trial table as a CSV in the current working directory; the file name
# carries the experiment name.
csv_fname = 'museumdraw_{}_data.csv'.format(experiment_name)
X.to_csv(csv_fname)

### Some very basic descriptive stats

In [None]:
# Histogram of the trial_num column over all rows of X.
# Each observation is weighted by 1/N so that bar heights are proportions summing
# to 1, as the y-axis label says. normed=True plotted a probability density
# instead, and that argument is deprecated in newer matplotlib releases.
fig = plt.figure(figsize=(4,4))
trial_nums = X.trial_num.values.astype(float)
h = plt.hist(trial_nums, weights=np.ones(len(trial_nums)) / float(len(trial_nums)))
plt.ylabel('proportion')
plt.xlabel('number of trials completed')

In [None]:
# Histogram of the number of strokes per drawing.
# Each observation is weighted by 1/N so that bar heights are proportions summing
# to 1, as the y-axis label says. normed=True plotted a probability density
# instead, and that argument is deprecated in newer matplotlib releases.
fig = plt.figure(figsize=(4,4))
stroke_counts = X.num_strokes.values.astype(float)
h = plt.hist(stroke_counts, weights=np.ones(len(stroke_counts)) / float(len(stroke_counts)))
plt.ylabel('proportion')
plt.xlabel('number of strokes')

In [None]:
# Histogram of drawing duration (draw_duration column of X).
# Each observation is weighted by 1/N so that bar heights are proportions summing
# to 1, as the y-axis label says. normed=True plotted a probability density
# instead, and that argument is deprecated in newer matplotlib releases.
fig = plt.figure(figsize=(4,4))
durations = X.draw_duration.values.astype(float)
h = plt.hist(durations, weights=np.ones(len(durations)) / float(len(durations)))
plt.ylabel('proportion')
plt.xlabel('draw duration')

In [None]:
# Mean number of strokes within each age group, drawn as one bar per age.
# The bar order is given as strings, so the 'age' values are presumably stored
# as strings -- TODO confirm.
seq = [str(a) for a in range(4, 11)]
_D = X.groupby('age')['num_strokes'].apply(lambda grp: np.mean(grp))
D = _D.to_frame().T  # single row, one column per age
plt.figure(figsize=(4,4))
sns.barplot(data=D, order=seq)
plt.ylabel('num strokes')

In [None]:
# Mean drawing duration within each age group, drawn as one bar per age.
# The bar order is given as strings, so the 'age' values are presumably stored
# as strings -- TODO confirm.
seq = [str(a) for a in range(4, 11)]
_D = X.groupby('age')['draw_duration'].apply(lambda grp: np.mean(grp))
D = _D.to_frame().T  # single row, one column per age
plt.figure(figsize=(4,4))
sns.barplot(data=D, order=seq)
plt.ylabel('draw duration (s)')