In [1]:
# imports and globals
import numpy as np
from cnns.core import LasagneGoogLeNetInceptionV3 as inceptv3
from cnns.utils import cnn_utils as cu
from cnns.utils import process_utils as pu
from cnns.utils import training_utils as tu
from ds_utils import data_utils as du
from sklearn import linear_model
from sklearn import cross_validation
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
import matplotlib.pyplot as plt
import pickle as pkl
import os
import csv
import datetime
import sys
import pandas as pd
import json


# Directory where notebook outputs are meant to be written.
output_dir = '/Users/babasarala/Desktop'

# Root of the cnns checkout; pickled models are stored in its `models` subdirectory.
cnn_dir = '/Users/babasarala/repos/cnns'
model_dir = '%s/models' % (cnn_dir,)

%matplotlib inline

In [2]:
# Load IPython's autoreload extension so edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
# Mode 2 asks autoreload to reload every imported module before each cell runs.
# NOTE(review): the "[autoreload of sklearn... failed" messages in the training
# cell's output look like a side effect of this mode also touching third-party
# packages -- consider restricting autoreload to the project modules; confirm.
%autoreload 2

Collect data from tables 

In [3]:
from collections import defaultdict

# Image categories handled by this notebook. Each value is forwarded as the `p`
# argument of the URL-sampling helpers in the data-loading cell
# (presumably a per-category sampling fraction -- TODO confirm against training_utils).
categories = {
    'food': 0.6,
    'documents': 1.,
    'whiteboards': 1.,
    'sketches': 1.,
    'other': 0.1,
}

# Layer name recorded alongside the saved model metadata.
layer = 'pool3'

# Network used for generating CNN codes, built from its pickled parameters.
param_pkl_filepath = '/Users/babasarala/repos/cnns/models/inception_v3.pkl'
model = inceptv3.LasagneGoogLeNetInceptionV3(param_pkl_filepath)

In [4]:
# Collect sample image URLs for every category from both sources (Everalbum and
# ImageNet), tag each URL with its category, and stack everything into one frame.
cur = tu.start_psycon()
dfs = []
for category, sample_p in categories.items():
    urls_everalbum = tu.get_sample_image_urls_from_everalbum_for_category(
        cur, category=category, p=sample_p)
    urls_imagenet = tu.get_sample_image_urls_from_imagenet_for_category(
        cur, category=category, p=sample_p)

    tagged_urls = pd.DataFrame(urls_everalbum + urls_imagenet, columns=['img_url'])
    tagged_urls['category'] = category
    dfs.append(tagged_urls)

# One row per sampled URL, with columns `img_url` and `category`.
df_img_urls = pd.concat(dfs)

In [5]:
# Persist the sampled (img_url, category) table so feature extraction can be run
# from the file. The path is built from the notebook-level `output_dir` setting
# instead of repeating the Desktop location, so there is one place to change it.
df_img_urls.to_csv(os.path.join(output_dir, 'intermediate_file.csv'), index=False)

In [4]:
def construct_complete_data_from_intermediate_results(results_dirpath, csv_filepath=None):
    """Build a DataFrame of feature vectors keyed by image URL.

    The checkpointed results under `results_dirpath` are combined via
    `pu.combine_checkpoint_intermediate_results`, which is unpacked here as
    `(img_urls, X)` with `X` a 2-D array (one row per URL).

    Parameters
    ----------
    results_dirpath : str
        Directory handed to `pu.combine_checkpoint_intermediate_results`.
    csv_filepath : str or None
        Optional CSV with at least the columns `img_url` and `category`.
        When given, it is merged into the feature table on the columns the
        two tables share. When None, no merge is performed.

    Returns
    -------
    (DataFrame, list of str)
        The table (feature columns `f0..f{d-1}`, `img_url`, plus the CSV's
        columns when a CSV was supplied) and the list of feature column names.

    Raises
    ------
    AssertionError
        If the CSV lacks an `img_url` or `category` column.
    """
    img_urls, X = pu.combine_checkpoint_intermediate_results(results_dirpath)

    # Unpacking into two names also guards that X really is 2-D.
    _, d = X.shape
    feat_cols = ['f%i' % i for i in range(d)]
    data_matrix = pd.DataFrame(data=X, columns=feat_cols)
    data_matrix['img_url'] = img_urls

    # Without a CSV there is nothing to merge with; hand back the plain
    # feature table rather than referencing an undefined category frame.
    if csv_filepath is None:
        return data_matrix, feat_cols

    img_url_cats = pd.read_csv(csv_filepath)
    assert 'img_url' in img_url_cats.columns, "CSV is missing the 'img_url' column"
    assert 'category' in img_url_cats.columns, "CSV is missing the 'category' column"

    complete_data = pd.merge(data_matrix, img_url_cats)

    return complete_data, feat_cols

In [5]:
# Dataset v1.1 layout: the category CSV and the directory of checkpointed
# feature-vector results both live under the dataset directory.
dataset_dirpath = '/Users/babasarala/Desktop/evaluation/dataset/productivity_food_dataset_v1.1'
csv_filepath = '%s/productivity_food_dataset_v1.1.csv' % (dataset_dirpath,)
results_dirpath = '%s/GoogLeNetv3_pool3_fvs' % (dataset_dirpath,)

# Feature table joined with categories, plus the names of the feature columns.
complete_data, feat_cols = construct_complete_data_from_intermediate_results(
    results_dirpath, csv_filepath)

In [6]:
# Show how many rows each category contributes.
category_counts = complete_data['category'].value_counts()
print(category_counts)

# From here on `categories` holds the distinct labels found in the data
# (it was the category -> p dict in the configuration cell).
categories = complete_data['category'].unique()
print(categories)

other          53265
food           14338
documents       1373
whiteboards      168
sketches          69
dtype: int64
['food' 'sketches' 'documents' 'whiteboards' 'other']


Train a classifier

In [7]:
# Seeds are drawn at random here and later stored with the saved model.
train_random_seed = np.random.randint(low=0, high=99999)
prod_random_seed = np.random.randint(low=0, high=99999)

# Two-way lookup in a single dict: category name -> index and index -> category name.
mappings = {}
for idx, category in enumerate(categories):
    mappings.update({category: idx, idx: category})

# Tag every row as belonging to the training or testing set.
# `p` is handed to tu.assign_train_test and later saved as 'perc_tr'.
p = 0.6
m = len(complete_data)
train_test_cats = tu.assign_train_test(m, p=p)
complete_data['set'] = train_test_cats

In [16]:
def _features_and_labels(frame):
    """Return (feature matrix, list of integer labels) for the rows of `frame`."""
    labels = [mappings[name] for name in frame['category'].values]
    return frame[feat_cols].values, labels


# Model selection: standardize the features, then fit a multinomial L2 logistic
# regression, grid-searching the regularization strength C with 5-fold CV.
train_lr = linear_model.LogisticRegression(
    penalty='l2', multi_class='multinomial', solver='lbfgs',
    random_state=train_random_seed)
pipe = Pipeline(steps=[('normalizer', StandardScaler()), ('classifier', train_lr)])
params = {'classifier__C': [1e-4, 1e-3, 1e-2, 0.1, 1, 10, 1e2, 1e3, 1e4]}
train_clf = GridSearchCV(pipe, param_grid=params, cv=5)

# Whole dataset in matrix form (used for the production fit at the end).
X, y = _features_and_labels(complete_data)

# Split according to the 'set' tag assigned earlier.
is_train = complete_data['set'] == 'train'
is_test = complete_data['set'] == 'test'
training_data = complete_data[is_train]
testing_data = complete_data[is_test]
X_tr, y_tr = _features_and_labels(training_data)
X_te, y_te = _features_and_labels(testing_data)

# Fit on the training rows, evaluate on the held-out rows.
best_clf = train_clf.fit(X_tr, y_tr)
y_pred = best_clf.predict(X_te)
clf_report = classification_report(y_te, y_pred)
conf_mat = confusion_matrix(y_te, y_pred)

# Production classifier: same pipeline with the selected C, refit on all rows.
best_C = train_clf.best_params_['classifier__C']
prod_lr = linear_model.LogisticRegression(
    C=best_C, penalty='l2', multi_class='multinomial', solver='lbfgs',
    random_state=prod_random_seed)
prod_clf = Pipeline(steps=[('normalizer', StandardScaler()), ('classifier', prod_lr)])
prod_clf.fit(X, y)

[autoreload of sklearn.utils failed: Traceback (most recent call last):
  File "/usr/local/lib/python2.7/site-packages/IPython/extensions/autoreload.py", line 247, in check
    superreload(m, reload, self.old_objects)
ImportError: cannot import name warn_if_not_float
]
[autoreload of sklearn failed: Traceback (most recent call last):
  File "/usr/local/lib/python2.7/site-packages/IPython/extensions/autoreload.py", line 247, in check
    superreload(m, reload, self.old_objects)
ImportError: cannot import name __check_build
]
[autoreload of sklearn.svm.classes failed: Traceback (most recent call last):
  File "/usr/local/lib/python2.7/site-packages/IPython/extensions/autoreload.py", line 247, in check
    superreload(m, reload, self.old_objects)
ValueError: decision_function() requires a code object with 0 free vars, not 2
]
[autoreload of sklearn.svm.base failed: Traceback (most recent call last):
  File "/usr/local/lib/python2.7/site-packages/IPython/extensions/autoreload.py", line 247

TypeError: super(type, obj): obj must be an instance or subtype of type

In [13]:
# Held-out performance: per-class precision/recall/F1, then the confusion matrix.
# Class indices correspond to the entries of `mappings`.
for summary in (clf_report, conf_mat):
    print(summary)

             precision    recall  f1-score   support

          0       0.95      0.85      0.90      5663
          1       0.92      0.50      0.65        24
          2       0.82      0.64      0.72       547
          3       0.97      0.84      0.90        73
          4       0.95      0.99      0.97     21379

avg / total       0.95      0.95      0.95     27686

[[ 4821     0     0     0   842]
 [    0    12     5     0     7]
 [    2     0   349     2   194]
 [    0     0     7    61     5]
 [  247     1    64     0 21067]]


In [14]:
# Display the index <-> category lookup so the numeric class labels in the
# report and confusion matrix can be read as category names.
mappings

{0: 'food',
 1: 'sketches',
 2: 'documents',
 3: 'whiteboards',
 4: 'other',
 'documents': 2,
 'food': 0,
 'other': 4,
 'sketches': 1,
 'whiteboards': 3}

In [15]:
def _dump_unless_exists(obj, savepath):
    """Pickle `obj` to `savepath`, refusing to overwrite an existing file."""
    if os.path.isfile(savepath):
        print('This file already exists!!')
        return
    # Context manager guarantees the handle is flushed and closed after the dump.
    with open(savepath, 'wb') as out_file:
        pkl.dump(obj, out_file)


# One timestamp identifies this run: formatted for filenames, POSIX as unique id.
dt = datetime.datetime.now()
formatted_time = du.format_tstamp(dt, frmt='%Y-%m-%d %H_%M_%S')
posix_time = du.datetime_to_posix(dt)

# Adjacent literals are concatenated, so no indentation whitespace ends up
# inside the stored description (a backslash continuation within the string did that).
description_str = ('L2-regularized Logistic Regression on CNN codes of Food (No drink) / Documents / '
                   'Sketches / Whiteboards after applying Inception v3 as a filter.')

# Full bundle: metadata, the data, the evaluation artifacts and both classifiers.
data_and_model = { # metadata
                  'unique_id': posix_time,
                  'date_created': formatted_time,
                  'mappings': mappings,
                  'model_lib': model.model_lib,
                  'model_name': model.model_name,
                  'layer': layer,

                  # complete data
                  'complete_data': complete_data,

                  # training and performance eval
                  'perc_tr': p,
                  'training_seed': train_random_seed,
                  'training_model': train_clf,
                  'description': description_str,
                  'confusion_matrix': conf_mat,
                  'classification_report': clf_report,

                  # production classifier
                  'production_seed': prod_random_seed,
                  'prod_model': prod_clf
                  }

# Lightweight bundle: just what is needed to apply the production classifier.
model_only = {'unique_id': posix_time,
              'mappings': mappings,
              'date_created': formatted_time,
              'prod_model': prod_clf}

# NOTE(review): filenames say "L1_LR" while the classifier uses penalty='l2' and the
# description says L2-regularized -- left unchanged in case "L1" is a naming
# convention that other code relies on; confirm.
data_and_model_filename_str = '%s_L1_LR.p'%(formatted_time)
data_and_model_savepath = '%s/%s'%(model_dir, data_and_model_filename_str)
model_only_filename_str = '%s_model_only_L1_LR.p'%(formatted_time)
model_only_savepath = '%s/%s'%(model_dir, model_only_filename_str)

_dump_unless_exists(data_and_model, data_and_model_savepath)
_dump_unless_exists(model_only, model_only_savepath)