In [48]:
## remember to run conn_cocolab from the terminal before running cells in this notebook!
import os
import urllib, cStringIO

import pymongo as pm
import cv2
import matplotlib
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import seaborn as sns
sns.set_context('poster')
sns.set_style('white')

import numpy as np
from __future__ import division
import scipy.stats as stats
import pandas as pd
import json
import re

from PIL import Image
import base64
import datetime

### File hierarchy and database connection vars 

In [104]:
# directory & file hierarchy
iterationName = 'cdm_run_v3'
exp_path = 'museumstation'
analysis_dir = os.getcwd()
exp_dir = os.path.abspath(os.path.join(os.getcwd(),'../..','experiments'))
sketch_dir = os.path.join(analysis_dir,'sketches')
data_dir = os.path.join(analysis_dir,'usage_data')
scribble_dir = os.path.join(analysis_dir,'scribbles')

# load data
raw_data = pd.read_csv(os.path.join(data_dir, "scribble_annotate.csv"))


In [None]:
# set vars 
auth = pd.read_csv('auth.txt', header = None) # this auth.txt file contains the password for the sketchloop user
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'rxdhawkins.me' ## cocolab ip address

# have to fix this to be able to analyze from local
import pymongo as pm
conn = pm.MongoClient('mongodb://sketchloop:' + pswd + '@127.0.0.1')
db = conn['kiddraw']
coll = db['cdm_run_v3']


### Get features and preprocessing

#### Proportion of black pixels on the png image

In [105]:
# Get pixel proportions on each sketch
raw_data['num_black_pixels'] = 0
raw_data['prop_black'] = 0.0

for index, row in raw_data.iterrows():
    fname = row.filename
    img_name = fname.split('/')[-1]
    cat_name = img_name.replace('.png','').split('_')[0]
    fpath = os.path.join(scribble_dir, cat_name, img_name)
    
    sketch = cv2.imread(fpath, cv2.IMREAD_UNCHANGED)
    
    # get the number of all black pixels
    pixels_black = np.argwhere(sketch.sum(axis=2) > 0)
    num_black_pixels = len(pixels_black)
    
    # the number of all pixels
    num_all_pixels = len(sketch) * len(sketch[0])  # row * col
    
    raw_data.at[index, 'num_black_pixels'] = num_black_pixels
    raw_data.at[index, 'prop_black'] = float(num_black_pixels)/float(num_all_pixels)


#### Stroke length

#### Stroke Similarity

In [106]:
# change age from strings to integers
raw_data['age'] = raw_data['age'].str.replace('age','')
pd.to_numeric(data.age)

# transfer categorical columns into integers
raw_data.category = pd.Categorical(raw_data.category)
raw_data['cat_code'] = raw_data.category.cat.codes

feature_cols = ['cat_code', 'age', 'num_strokes','num_black_pixels', 'prop_black']
select_cols = feature_cols[:]
select_cols.append('scribble')
data = raw_data.loc[:, select_cols]

print data.shape

(800, 6)


### Set a classification model

In [146]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier  
from sklearn.model_selection import train_test_split

# split data into train and test sets
train, test = train_test_split(data, test_size=0.4)

# get features
train_x = train.loc[:, feature_cols]
test_x = test.loc[:, feature_cols]

# get scribble values
train_y = train.scribble
test_y = test.scribble

print train_x.shape, train_y.shape
print test_x.shape, test_y.shape

# train
logreg = LogisticRegression()
logreg.fit(train_x, train_y)

gbm = GradientBoostingClassifier(min_samples_split=50, min_samples_leaf=10)
gbm.fit(train_x, train_y)

# prediction on the test set
test_pred_log = logreg.predict(test_x)
test_pred_gbm = gbm.predict(test_x)


(480, 5) (480,)
(320, 5) (320,)


### Evaluate the model

In [151]:
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# evaluate the model
# f1 = f1_score(test_y, test_pred, average='micro')
# print 'f1 score: ', f1

final_test_pred = np.zeros(len(test_y))
                           
for i, v in enumerate(test_pred_log):
    if v==1 and test_pred_gbm[i]==1:
        final_test_pred[i] = 1
                           
target_names = ['0', '1']
print 'logistic regression'
print(classification_report(test_y, test_pred_log, target_names=target_names))

print 'gradient boosting classifer'
print(classification_report(test_y, test_pred_gbm, target_names=target_names))
                           
print 'combination'
print(classification_report(test_y, final_test_pred, target_names=target_names))

logistic regression
             precision    recall  f1-score   support

          0       0.92      0.99      0.96       282
          1       0.83      0.39      0.54        38

avg / total       0.91      0.92      0.91       320

gradient boosting classifer
             precision    recall  f1-score   support

          0       0.93      0.98      0.96       282
          1       0.75      0.47      0.58        38

avg / total       0.91      0.92      0.91       320

combination
             precision    recall  f1-score   support

          0       0.92      1.00      0.96       282
          1       0.93      0.34      0.50        38

avg / total       0.92      0.92      0.90       320

