In [2]:
import cPickle as pickle
from matplotlib import pyplot as plt
import os
import numpy as np

# Grab Input Data
training_labels_file = '../finetuning/VOC_cropped_warped_train_small.txt'
data_files_path = '../../VOC2012/fc7_features/'

data = []
labels = []
data_files = []
with open(training_labels_file) as f:
    for line in f:
        data_file, label = line.split(' ')
        data_file = data_file.split('/')[-1].split('.')[0] + '.pi'
        try:
            data.append(pickle.load(open(data_files_path + data_file, 'rb')))
            labels.append(int(label))
            data_files.append(data_file)
        except IOError as e:
            pass
data = np.squeeze(np.array(data))
labels = np.array(labels)
data_files = np.array(data_files)
print data.shape, labels.shape, data_files.shape

(20005, 4096) (20005,) (20005,)


In [3]:
# Split data into training and test
num_vectors = data.shape[0]
test_ratio = .2
num_test = num_vectors * test_ratio

indices = np.random.permutation(num_vectors)
training_idx, test_idx = indices[num_test:], indices[:num_test]
training_data, test_data = data[training_idx,:], data[test_idx,:]
training_labels, test_labels = labels[training_idx,], labels[test_idx,]

print training_data.shape, test_data.shape
print training_labels.shape, test_labels.shape

(16004, 4096) (4001, 4096)
(16004,) (4001,)


In [57]:
# Define Classifier
# Exactly one `classifier = ...` assignment should be active at a time; the
# commented-out lines are alternative models that were tried.
# Grid search reference: http://scikit-learn.org/stable/modules/grid_search.html
from sklearn import svm, neighbors, linear_model, ensemble, grid_search

scoring = 'f1_weighted'  # metric used to select hyper-parameters
class_weight = None  # other options: 'subsample' (forests only), 'auto'

# Active model: linear SVM, regularisation strength C chosen by grid search.
classifier = grid_search.GridSearchCV(
    estimator=svm.LinearSVC(class_weight=class_weight),
    param_grid={'C': [.01, .1, 1, 10, 100]},
    scoring=scoring,
)

## Alternatives
# classifier = ensemble.RandomForestClassifier(max_depth=15, class_weight=class_weight)
# classifier = linear_model.RidgeClassifierCV()
# classifier = linear_model.SGDClassifier()
# classifier = ensemble.AdaBoostClassifier()
# classifier = ensemble.GradientBoostingClassifier()
# classifier = ensemble.ExtraTreesClassifier()

## Slow
# classifier = grid_search.GridSearchCV(svm.SVC(class_weight=class_weight), {'kernel':('linear','rbf'), 'C':[.01, .1, 1, 10, 100]}, scoring=scoring)
# classifier = neighbors.KNeighborsClassifier()

In [None]:
%%time
# Train Classifier
classifier.fit(training_data, training_labels);

In [None]:
# Test Classifier
from sklearn.cross_validation import cross_val_score

predicted_labels = classifier.predict(test_data)
print classifier.score(training_data, training_labels) # score is always accuracy
print cross_val_score(classifier, training_data, training_labels, scoring=scoring)
print classifier.score(test_data, test_labels) # score is always accuracy

In [None]:
import pandas
from sklearn.metrics import confusion_matrix
wrong_test_idx = np.arange(test_labels.shape[0])[predicted_labels != test_labels]
wrong_data_files_idx = test_idx[wrong_test_idx]
wrong_data_files = data_files[wrong_data_files_idx]

wrong_predicted = predicted_labels[predicted_labels != test_labels]
wrong_true = test_labels[predicted_labels != test_labels]

# Labels 0-4 = person, bike, bus, car, motorbike
print "Training Confusion Matrix"
print confusion_matrix(training_labels, classifier.predict(training_data))
print "Test Results"
print "Wrong:", len(wrong_true), '/', len(test_labels), ',', float(len(wrong_true))/len(test_labels)
print confusion_matrix(test_labels, predicted_labels)
print pandas.DataFrame({"Image Index": wrong_data_files, "Predicted": wrong_predicted, "Truth": wrong_true})

In [None]:
%matplotlib inline
# Visualize Feature Importances
# Forest/Tree Classifiers only
import matplotlib.pyplot as plt

importances = classifier.feature_importances_

width = 0.35       # the width of the bars
plt.subplot(211)
plt.bar(np.arange(importances.shape[0]), importances, width, color='r')
plt.title("Feature Importances of Model")
plt.ylabel("Importance")
plt.subplot(212)
plt.bar(np.arange(importances.shape[0]),sorted(importances), width, color='r')
plt.xlabel("Features")
plt.ylabel("Importance")
plt.show()