# Training a Car Classifier

In [1]:
%matplotlib inline

import numpy as np
from scipy.stats import describe

from sklearn.grid_search import GridSearchCV
# from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.svm import SVC
from sklearn.metrics import make_scorer
from sklearn.cross_validation import StratifiedKFold, KFold

import matplotlib.pyplot as plt
import pickle

seed=0

In [16]:
def load_data(X_fname,Y_fname, drop_mask=True, add_preview_data=True,
              preview_data='test_locations_and_labels_preview.np',
              preview_im = 'parking_test_preview.png'):
    X = np.load(X_fname)
    Y = np.load(Y_fname)
    
    # remove observation with missing data due to being too close to the edge of the image.
    ix_0 = []
    for i,j in enumerate(X):
        if j.shape[0] < 1600:
            ix_0.append(i)
    ix_0

    mask = np.ones(len(X),dtype=bool)
    mask[ix_0]=False
    X, Y = X[mask], Y[mask]
    
    # return 2D array rather than array of arrays
    X = np.vstack(X)
    
    # add on PREVIEW TEST DATA (if we have access to it, why not??)
    if add_preview_data:
        X,Y = add_data(X,Y,preview_data,preview_im)
    
    # drop transparency mask values
    num_ftrs = X.shape[1]
    mask = np.ones(num_ftrs,dtype=bool)
    trans_ix = range(3,num_ftrs,4)
    mask[trans_ix]=False
    X = X[:,mask]
    
    return X,Y
    
def add_data(X,Y,preview_data,preview_im):
    im = plt.imread(preview_im)
    test_locs_labs = np.load('test_locations_and_labels_preview.np')
    test_locs   = test_locs_labs[:,0:2]
    Y_test = test_locs_labs[:,2]
    
    X_test = []
    for loc in test_locs:
        X_test.append( my_feature_vector(loc, im) )
    X_test = np.array(X_test)
    
    X_result = np.vstack((X,X_test))
    Y_result = np.hstack((Y,Y_test))
    return X_result, Y_result
    
def my_feature_vector(loc, im, size = 10):
    w = size
    # a patch of the size w cenetered at loc is extracted as a feature vector
    patch = im[loc[1]-w:loc[1]+w, loc[0]-w:loc[0]+w]
    p = np.array(patch).flatten()
    return p 

def scorer(estimator,X,y):
    score = 0
    y_pred = estimator.predict(X)
    score_arr = 2 * ((y_pred == 1.) & (y == 1.)) + .25 * ((y_pred == 0.) & (y == 0.)) - .5 * (y_pred != y)
    
    # weight such that there are equal cars and non-cars in sample (like in test dataset)
    total_cars = float((y==1.).sum())
    total_noncars = float((y==0.).sum())
    multiplier = ((y==1.) / total_cars + (y==0.) / total_noncars)
    score_arr = score_arr*multiplier
    
    score = score_arr.sum()
    return score

def get_new_search_limits(grid,selected_param):
    """Returns the adjacent grid elements to the optimal parameter from a grid search"""
    ix = np.nonzero(grid == selected_param)[0][0]
    if ix == 0:
        raise IndexError("ERROR: Best parameter is at edge of search grid. Adjust search grid and re-run.")
    return grid[ix-1],grid[ix+1]

def still_above_threshold(diff_dict,th_dict):
    assert len(diff_dict) == len(th_dict)
    for i in diff_dict.keys():
        if diff_dict[i] >= th_dict[i]:
            return True
    return False


In [3]:
X,Y = load_data('X_trn.np','Y_trn.np')




## Train SVC classifier

### Initial, broad grid search

In [17]:
grid_length=9
n_folds = 3
np.random.seed(0)

search_params = {}
search_params['C'] = np.logspace(-3,5,grid_length)
search_params['gamma'] = np.logspace(-5,0,grid_length)
# search_params['degree'] = [4,5,6]
search_params['coef0'] = np.array((-1,-.5,0,.5,1))
# not sure which is better, sample-frequency-weighting OR weighting by "opportunity cost" of
# missing each class (3.33 times greater for misclassifying car (2.5 pt hit) vs. misclassifying
# non-car (.75 pt hit)) OR the product of the two weights
total_cars = float((Y==1.).sum())
total_noncars = float((Y==0.).sum())
opp_cost = {1.:10/3}
opp_cost_X_freq_weight = {1.:(10/3) * total_noncars / total_cars}

# I THINK that the opp_cost_X_freq_weight weighting will be optimal,
# and I am searching across a wide array of params, so I'll leave 
# this set for now. If time, I will try all 3.
search_params['class_weight'] = [opp_cost_X_freq_weight]
# search_params['class_weight'] = ['auto',opp_cost,opp_cost_X_freq_weight]

# using limited grid search for polynomial kernel b/c of time expense
# these parameters were arrived at via separate poly-only, manual grid search
search_grid = [{'kernel':['linear'],  'C':search_params['C'], 'class_weight': search_params['class_weight']},
               {'kernel':['rbf'],     'C':search_params['C'], 'class_weight': search_params['class_weight'],
                    'gamma':search_params['gamma']},
               {'kernel':['poly'],    'C':[.01,.1,1], 'class_weight': search_params['class_weight'],
                    'gamma':[.001,.01,.1], 'coef0':[.5,1,1.5], 'degree':[2,3]},
               {'kernel':['sigmoid'], 'C':search_params['C'], 'class_weight': search_params['class_weight'],
                    'gamma':search_params['gamma'], 'coef0':search_params['coef0']}]
               
# {'kernel':['poly'],    'C':search_params['C'], 'class_weight': search_params['class_weight'],
#                     'gamma':search_params['gamma'], 'coef0':search_params['coef0'], 'degree':search_params['degree']}

score_func = make_scorer(scorer)
n = X.shape[0]
# use stratified k-fold so that hard-coded frequency weights make sense (opp_cost_X_freq_weight)
cross_val = StratifiedKFold(Y, n_folds, shuffle=True, random_state=seed)

In [18]:
# run this cell when comparing across all kernels
pred = GridSearchCV(SVC(random_state = seed),
                    param_grid = search_grid,
                    cv = cross_val, n_jobs = -1, scoring = scorer)


In [None]:
# Run these cells only for testing to make sure that the search grid is appropriately wide for each kernel type
pred = GridSearchCV(SVC(random_state = seed, kernel = 'sigmoid', class_weight = opp_cost_X_freq_weight),
                    param_grid = {'C':search_params['C'],
                                  'gamma':search_params['gamma'],
                                  'coef0':search_params['coef0']},
                    n_jobs = -1, scoring = scorer, cv = cross_val)

In [None]:
# Run these cells only for testing to make sure that the search grid is appropriately wide for each kernel type
pred = GridSearchCV(SVC(random_state = seed, kernel = 'linear', class_weight = opp_cost_X_freq_weight),
                    param_grid = {'C':search_params['C']},
                    n_jobs = -1, scoring = scorer, cv = cross_val)

In [19]:
# Run these cells only for testing to make sure that the search grid is appropriately wide for each kernel type
pred = GridSearchCV(SVC(random_state = seed, kernel = 'poly', class_weight = opp_cost_X_freq_weight),
                    param_grid = {'C':[.01,.1,1],
                                  'gamma':[.001,.01,.1],
                                  'coef0':[.5,1,1.5],
                                  'degree':[2,3]},
                    n_jobs = -1, scoring = scorer, cv = cross_val)

In [19]:
pred.fit(X,Y)
best_params = pred.best_params_
best_score = pred.best_score_

print best_params
print best_score

{'kernel': 'rbf', 'C': 0.10000000000000001, 'gamma': 0.056234132519034911, 'class_weight': {1.0: 6.7682926829268295}}
2.01270050538


In [20]:
# save this initial guess at a classifier
with open('classifier_0.pickle','wb') as f:
pickle.dump(pred.best_estimator_,f)

# pickle best_params, best_score, search_grid too
with open('first_search_params.pickle','wb') as f:
    pickle.dump((best_params,best_score,search_grid),f)
    

### Now, choose best kernel type and parameter order of magnitude, and do finer-grained grid searches until convergence

In [62]:
# load search params
with open('first_search_params.pickle','rb') as f:
    best_params,best_score,search_grid = pickle.load(f)
    

In [63]:
kernel = best_params['kernel']
class_weight = best_params['class_weight']

fixed_params = {}
fixed_params['kernel'] = kernel
fixed_params['class_weight'] = class_weight
if fixed_params['kernel'] == 'poly':
    degree = best_params['degree']
    fixed_params['degree'] = degree
    
# drop the fixed params from "best_params"
for i in fixed_params.keys():
    del best_params[i]

# choose starting parameters
if kernel == 'linear': grid_ix=0
elif kernel == 'rbf' : grid_ix=1
elif kernel == 'poly': grid_ix=2
elif kernel == 'sigmoid': grid_ix=3
    
search_grid = search_grid[grid_ix]

# set thresholds
th_all = {}
for i in best_params.keys():
    th_all[i] = abs(best_params[i]/10.)
th_score = abs(best_score / 1000.)
# create difference dict
diff = th_all.copy()
diff_score = th_score

# create new cv object w/ more folds
n_folds_2 = 10
cross_val_2 = StratifiedKFold(Y, n_folds_2, shuffle=True, random_state=seed)


In [64]:
# iterate linear grid search with finer mesh until the change in best values is under a threshold
iterations = 1
while (still_above_threshold(diff,th_all)) and (diff_score >= th_score):
    print "iteration #:", iterations
    
    search_grid_old = search_grid.copy()
    search_grid = {}

    for i in best_params.keys():
        lims = get_new_search_limits(search_grid_old[i],best_params[i])
        search_grid[i] = np.linspace(lims[0],lims[1],grid_length)
            
    print "search grid:", search_grid


    if kernel == 'poly':
        clf = SVC(random_state=seed, kernel=kernel, class_weight=class_weight, degree = degree)
    else:
        clf = SVC(random_state=seed, kernel=kernel, class_weight=class_weight)
        
    pred = GridSearchCV(clf, param_grid=search_grid, cv=cross_val_2, n_jobs=-1, scoring=scorer)

    pred.fit(X,Y)
    best_params_old = best_params.copy()
    best_score_old = best_score.copy()
    best_params = pred.best_params_
    best_score = pred.best_score_
    for i in best_params.keys():
        diff[i] = abs(best_params[i]-best_params_old[i])
    diff_score = abs(best_score - best_score_old)
    
    iterations += 1

    print "best params:", best_params
    print "best score:", best_score


iteration #: 1
search grid: {'C': array([ 0.01   ,  0.13375,  0.2575 ,  0.38125,  0.505  ,  0.62875,
        0.7525 ,  0.87625,  1.     ]), 'gamma': array([ 0.01333521,  0.04131048,  0.06928575,  0.09726102,  0.12523629,
        0.15321156,  0.18118683,  0.2091621 ,  0.23713737])}
best params: {'C': 0.38124999999999998, 'gamma': 0.097261022913332856}
best score: 2.047905351
iteration #: 2
search grid: {'C': array([ 0.2575   ,  0.2884375,  0.319375 ,  0.3503125,  0.38125  ,
        0.4121875,  0.443125 ,  0.4740625,  0.505    ]), 'gamma': array([ 0.06928575,  0.07627957,  0.08327339,  0.09026721,  0.09726102,
        0.10425484,  0.11124866,  0.11824248,  0.12523629])}
best params: {'C': 0.4740625, 'gamma': 0.11124865767861612}
best score: 2.04925968486


In [65]:
# save final classifier
with open('classifier.pickle','wb') as f:
    pickle.dump(pred.best_estimator_,f)

# pickle best_params, best_score, search_grid too
with open('final_search_params.pickle','wb') as f:
    pickle.dump((best_params,best_score,search_grid),f)