In [1]:
from __future__ import print_function, division
from data_utils import load_CIFAR10

# Read the CIFAR-10 train/test arrays from the local batch directory.
X_train, y_train, X_test, y_test = load_CIFAR10("cifar-10-batches-py")

# Echo the shape of each array so the split sizes appear in the cell output.
for label, arr in (("X_train:", X_train), ("y_train:", y_train),
                   ("X_test:", X_test), ("y_test:", y_test)):
    print(label, arr.shape)

X_train: (50000, 32, 32, 3)
y_train: (50000,)
X_test: (10000, 32, 32, 3)
y_test: (10000,)


In [5]:
import numpy as np
from skimage.feature import hog
from skimage import color, exposure

# HOG descriptor settings, shared by the train and test splits.
orie = 9        # orientation bins
ppc = (8, 8)    # pixels per cell
cpb = (2, 2)    # cells per block


def extract_hog_features(images):
    """Compute one HOG descriptor per image and stack them row-wise.

    Every image is converted to grayscale with ``color.rgb2gray`` and then
    described with ``hog`` using the ``orie`` / ``ppc`` / ``cpb`` settings
    defined above and ``transform_sqrt=True``.

    Parameters
    ----------
    images : sequence or array of RGB images, iterated along its first axis.

    Returns
    -------
    numpy.ndarray
        2-D array with one row (descriptor) per input image.
    """
    descriptors = []
    for img in images:
        gray_img = color.rgb2gray(img)
        # The original passed `visualise=False`; that is the default, and the
        # keyword was renamed in newer scikit-image releases, so leaving it
        # out keeps the call valid on both old and new versions.
        descriptors.append(hog(gray_img, orientations=orie, pixels_per_cell=ppc,
                               cells_per_block=cpb, transform_sqrt=True))
    # Each descriptor is 1-D; vstack turns the list into (n_images, n_features).
    return np.vstack(descriptors)


# Same extraction for both splits (previously two copy-pasted xrange loops).
hog_train = extract_hog_features(X_train)
hog_test = extract_hog_features(X_test)

print(hog_train.shape)
print(hog_test.shape)

(50000, 324)
(10000, 324)


In [8]:
from skimage.feature import local_binary_pattern

# Local-binary-pattern settings.
radius = 3
n_points = 8 * radius
METHOD = 'uniform'
eps = 1e-7  # keeps the normalisation finite if a histogram were all zeros


def extract_lbp_histograms(images):
    """Return a normalised LBP-code histogram for every image.

    Each image is converted to grayscale, passed through
    ``local_binary_pattern`` with the ``n_points`` / ``radius`` / ``METHOD``
    settings above, and its codes are counted into unit-width bins covering
    the integer values ``0 .. n_points + 1``. Counts are divided by
    ``sum + eps`` so that each row sums to (almost exactly) one.

    Parameters
    ----------
    images : sequence or array of RGB images, iterated along its first axis.

    Returns
    -------
    numpy.ndarray
        2-D array of shape ``(n_images, n_points + 2)``.
    """
    # Explicit bin edges already fix the histogram range, so the extra
    # `range=` argument the original passed had no effect and is dropped.
    bin_edges = np.arange(0, n_points + 3)
    histograms = []
    for img in images:
        gray_img = color.rgb2gray(img)
        codes = local_binary_pattern(gray_img, n_points, radius, METHOD)
        counts = np.histogram(codes.ravel(), bins=bin_edges)[0]
        # Adding the float `eps` makes this a true division even for int counts.
        histograms.append(counts / (counts.sum() + eps))
    return np.vstack(histograms)


# Same extraction for both splits (previously two copy-pasted xrange loops
# that also reused lbp_train / lbp_test first as lists and then as arrays).
lbp_train = extract_lbp_histograms(X_train)
lbp_test = extract_lbp_histograms(X_test)

print(lbp_train.shape)
print(lbp_test.shape)

(50000, 26)
(10000, 26)


In [6]:
# Width of one quantisation step per colour channel (64 -> 4 levels/channel).
interv = 64


def color_histogram_features(images, interv):
    """Return a normalised joint colour histogram for every image.

    Pixels are cast to ``uint8`` and each channel is quantised by integer
    division with ``interv``. The three quantised channel values of a pixel
    are combined into a single bin index
    ``c0 * levels**2 + c1 * levels + c2`` and the bin counts are divided by
    the number of pixels, so each row sums to one.

    Parameters
    ----------
    images : array of images whose last axis holds the 3 colour channels,
        iterated along its first axis. The input is not modified.
    interv : int
        Quantisation step, ``>= 1``.

    Returns
    -------
    numpy.ndarray
        2-D float array of shape ``(n_images, levels**3)`` where
        ``levels = 255 // interv + 1``.

    Raises
    ------
    ValueError
        If ``interv`` is smaller than 1.
    """
    if interv < 1:
        raise ValueError("interv must be >= 1")
    # Number of distinct quantised values per channel. Equals 256 // interv
    # when interv divides 256 (the original case) and stays correct otherwise.
    levels = 255 // interv + 1
    n_bins = levels ** 3
    # astype() already returns a new array, so no explicit copy is needed.
    quantized = np.asarray(images).astype('uint8') // interv

    features = np.zeros((len(quantized), n_bins))
    for idx, img in enumerate(quantized):
        # Widen before mixing channels: in uint8 the combined index would wrap
        # around as soon as levels**3 exceeds 256 (i.e. for interv < 64).
        px = img.reshape(-1, 3).astype(np.intp)
        codes = (px[:, 0] * levels + px[:, 1]) * levels + px[:, 2]
        counts = np.bincount(codes, minlength=n_bins)
        features[idx] = counts / float(counts.sum())
    return features


# Same extraction for both splits. Unlike the original cell, this no longer
# overwrites data_train / data_test with quantised pixels; those names are
# assigned by the cell that concatenates the feature blocks.
fea_train = color_histogram_features(X_train, interv)
fea_test = color_histogram_features(X_test, interv)

print(fea_train.shape)
print(fea_test.shape)

(50000, 64)
(10000, 64)


In [9]:
import pickle
# Join the HOG, LBP and colour-histogram blocks column-wise, one row per image.
train_blocks = [hog_train, lbp_train, fea_train]
test_blocks = [hog_test, lbp_test, fea_test]
data_train = np.concatenate(train_blocks, axis=1)
data_test = np.concatenate(test_blocks, axis=1)

# Persist both matrices with pickle so later cells can reload them without
# recomputing the features.
for path, features in (('data_train.arr', data_train),
                       ('data_test.arr', data_test)):
    with open(path, 'wb') as handle:
        pickle.dump(features, handle)

print(data_train.shape)
print(data_test.shape)


(50000, 414)
(10000, 414)


In [3]:
import pickle
def _unpickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Reload the cached feature matrices.
# NOTE(review): pickle.load can execute arbitrary code, so only load files
# that this notebook wrote itself.
# NOTE(review): the recorded output of this cell shows 3454 columns, while the
# cell that writes these files reports 414 -- the saved output appears to come
# from a different feature set; re-run to confirm.
data_train = _unpickle('data_train.arr')
data_test = _unpickle('data_test.arr')

print(data_train.shape)
print(data_test.shape)

(50000, 3454)
(10000, 3454)


In [4]:
from sklearn.decomposition import PCA
# Fit PCA on the training features only, then project both splits with it.
# NOTE(review): n_components="mle" presumably lets scikit-learn pick the
# dimensionality itself; the recorded run of this cell ended in a
# KeyboardInterrupt -- confirm it finishes in acceptable time.
pca = PCA(n_components="mle").fit(data_train)

# This replaces data_train / data_test with their projections, so re-running
# the cell without reloading the features would fit PCA on projected data.
data_train, data_test = pca.transform(data_train), pca.transform(data_test)

for reduced in (data_train, data_test):
    print(reduced.shape)

KeyboardInterrupt: 

In [None]:
from sklearn import neighbors
import time 

# Neighbourhood sizes to evaluate, one accuracy slot per entry.
# To sweep every odd K below 100 instead, set
#     group_K = list(range(1, 100, 2))
# `acc` below is sized from this list, so the two always stay aligned.
group_K = [1, 3, 5, 20, 50, 100]
acc = np.zeros((len(group_K),))

# The original loop ran `i` over range(1, 100, 2) but still indexed
# group_K[i] and acc[i] (both of length 6): it skipped entries 0, 2 and 4,
# raised IndexError at i == 7, and printed the index as if it were K.
for i, k in enumerate(group_K):
    print("K: %d" % k)
    clf = neighbors.KNeighborsClassifier(k, n_jobs=-1)
    clf.fit(data_train, y_train)

    # Only the prediction step is timed.
    tic = time.time()
    preds = clf.predict(data_test)
    toc = time.time()
    print("time for prediction: %d seconds" % (toc - tic))

    # Fraction of test samples whose predicted label matches y_test.
    acc[i] = (preds == y_test).mean()
    print("accuracy: ", acc[i])

print(acc)

In [22]:
from sklearn import svm

# Train a linear SVM with default hyper-parameters and time the fit.
start = time.time()
lin_clf = svm.LinearSVC()
lin_clf.fit(data_train, y_train)
fit_done = time.time()
print("time for training: %d seconds" % (fit_done - start))

# Score both splits. The "prediction" timer spans the two predict calls and
# the accuracy computations together.
preds_train = lin_clf.predict(data_train)
acc_train = np.mean(preds_train == y_train)

preds_test = lin_clf.predict(data_test)
acc_test = np.mean(preds_test == y_test)
predict_done = time.time()
print("time for prediction: %d seconds" % (predict_done - fit_done))

print("train accuracy: ", acc_train)
print("test accuracy: ", acc_test)

time for training: 221 seconds
time for prediction: 0 seconds
train accuracy:  0.68418
test accuracy:  0.6113


In [20]:
from sklearn import linear_model

# Multinomial logistic regression trained with the SAG solver; the timer
# covers construction and fitting.
t_start = time.time()
lin_reg = linear_model.LogisticRegression(
    multi_class='multinomial',
    solver='sag',
    max_iter=500,
    n_jobs=-1,
)
lin_reg.fit(data_train, y_train)
t_fit = time.time()
print("time for training: %d seconds" % (t_fit - t_start))

# Accuracy = fraction of samples whose predicted label equals the true one.
preds_train = lin_reg.predict(data_train)
preds_test = lin_reg.predict(data_test)
acc_train = np.mean(preds_train == y_train)
acc_test = np.mean(preds_test == y_test)
t_pred = time.time()
print("time for prediction: %d seconds" % (t_pred - t_fit))

print("train accuracy: ", acc_train)
print("test accuracy: ", acc_test)

time for training: 101 seconds
time for prediction: 0 seconds
train accuracy:  0.60714
test accuracy:  0.5855


In [6]:
from sklearn.ensemble import RandomForestClassifier
import time
# Depth- and leaf-size-limited random forest with a fixed seed; the timer
# covers construction and fitting.
rf_params = dict(max_depth=12, min_samples_leaf=15, n_jobs=-1, random_state=0)
started = time.time()
rf = RandomForestClassifier(**rf_params)
rf.fit(data_train, y_train)
trained = time.time()
print("time for training: %d seconds" % (trained - started))

# `preds` is reused: first for the training split, then for the test split.
preds = rf.predict(data_train)
acc_train = np.mean(preds == y_train)

preds = rf.predict(data_test)
acc_test = np.mean(preds == y_test)
predicted = time.time()
print("time for prediction: %d seconds" % (predicted - trained))

print("train accuracy: ", acc_train)
print("test accuracy: ", acc_test)

time for training: 3 seconds
time for prediction: 0 seconds
train accuracy:  0.67062
test accuracy:  0.4267


In [6]:
from sklearn.ensemble import BaggingClassifier
from sklearn import svm
import time
# Bag ten linear SVMs, each trained on 70% of the samples and 70% of the
# features. The ensemble is built before the timer starts, so only the fit
# is counted as training time.
bag_clf = BaggingClassifier(
    base_estimator=svm.LinearSVC(),
    n_estimators=10,
    max_samples=0.7,
    max_features=0.7,
    n_jobs=-1,
)

fit_start = time.time()
bag_clf.fit(data_train, y_train)
fit_end = time.time()
print("time for training: %d seconds" % (fit_end - fit_start))

# `preds` is reused: first for the training split, then for the test split.
preds = bag_clf.predict(data_train)
acc_train = np.mean(preds == y_train)

preds = bag_clf.predict(data_test)
acc_test = np.mean(preds == y_test)
pred_end = time.time()
print("time for prediction: %d seconds" % (pred_end - fit_end))

print("train accuracy: ", acc_train)
print("test accuracy: ", acc_test)

time for training: 229 seconds
time for prediction: 46 seconds
train accuracy:  0.66918
test accuracy:  0.616


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
import time

# AdaBoost (SAMME) over a depth-limited random forest: estimate the
# generalisation score with cross-validation, then fit on the full training
# split to obtain train/test accuracies.
tic = time.time()
rf = RandomForestClassifier(max_depth = 12, min_samples_leaf = 15, n_jobs = -1)
# The base learner is passed positionally because newer scikit-learn releases
# renamed the `base_estimator` keyword; the positional form works either way.
ada_clf = AdaBoostClassifier(rf, algorithm='SAMME')

scores = cross_val_score(ada_clf, data_train, y_train, n_jobs=-1)

# cross_val_score fits *clones* of ada_clf and leaves ada_clf itself
# unfitted, so it must be fitted explicitly before predict() can be called
# (the original cell skipped this and predict() would raise NotFittedError).
ada_clf.fit(data_train, y_train)
toc_1 = time.time()
# The training timer covers the cross-validation and the final fit together.
print("time for training: %d seconds" %(toc_1-tic))
print("scores",scores.mean())

preds = ada_clf.predict(data_train)
acc_train = (preds==y_train).mean()

preds = ada_clf.predict(data_test)
acc_test = (preds==y_test).mean()
toc_2 = time.time()
print("time for prediction: %d seconds" %(toc_2-toc_1))

print("train accuracy: ", acc_train)
print("test accuracy: ", acc_test)