In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from imutils import paths
import numpy as np
import imutils
import cv2
import os

In [2]:
# Takes the image and collapses it down into a feature vector, in
# which the RGB pixel intensities are flattened into a single list
# of numbers
def make_vector(image, size=(32, 32)):
    return cv2.resize(image, size).flatten()

In [3]:
# Makes a color histogram of each image, and normalize it to make
# another feature vector based on frequency of color
def make_histogram(image, bins=(8, 8, 8)):

    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    hist = cv2.calcHist([hsv], [0, 1, 2], None, bins,
        [0, 180, 0, 256, 0, 256])

    if imutils.is_cv2():
        hist = cv2.normalize(hist)
        
    else:
        cv2.normalize(hist, hist)

    return hist.flatten()

In [4]:
# Path to the images
imagePaths = list(paths.list_images("./training_set/"))

In [5]:
# Initialize each matrix
rawImages = []
features = []
labels = []

In [6]:
# Loop over the input images, load the images, and get the label and
# feature vectors for each image.
for (i, imagePath) in enumerate(imagePaths):
    image = cv2.imread(imagePath)
    label = imagePath.split(os.path.sep)[-1].split(".")[0]

    pixels = make_vector(image)
    hist = make_histogram(image)

    rawImages.append(pixels)
    features.append(hist)
    labels.append(label)

    # show an update every 1,000 images
    if i > 0 and i % 1000 == 0:
        print("{}/{}".format(i, len(imagePaths)))

1000/8005
2000/8005
3000/8005
4000/8005
5000/8005
6000/8005
7000/8005
8000/8005


In [7]:
# show some information on the memory consumed by the raw images
# matrix and features matrix
rawImages = np.array(rawImages)
features = np.array(features)
labels = np.array(labels)
print("[INFO] pixels matrix: {:.2f}MB".format(
	rawImages.nbytes / (1024 * 1000.0)))
print("[INFO] features matrix: {:.2f}MB".format(
	features.nbytes / (1024 * 1000.0)))

[INFO] pixels matrix: 24.02MB
[INFO] features matrix: 16.01MB


In [8]:
# partition the data into training and testing splits, using 75%
# of the data for training and the remaining 25% for testing
(trainRI, testRI, trainRL, testRL) = train_test_split(
	rawImages, labels, test_size=0.25, random_state=42)
(trainFeat, testFeat, trainLabels, testLabels) = train_test_split(
	features, labels, test_size=0.25, random_state=42)

In [11]:
# train and evaluate a k-NN classifer on the raw pixel intensities
print("[INFO] evaluating raw pixel accuracy...")
model = KNeighborsClassifier(n_neighbors=10, n_jobs=-1)
model.fit(trainRI, trainRL)
acc1 = model.score(trainRI, trainRL)
acc = model.score(testRI, testRL)
print("Raw pixel training accuracy: {:.2f}%".format(acc1 * 100))
print("Raw pixel testing accuracy: {:.2f}%".format(acc * 100))

[INFO] evaluating raw pixel accuracy...
[INFO] raw pixel training accuracy: 63.32%
[INFO] raw pixel testing accuracy: 54.95%


In [12]:
# train and evaluate a k-NN classifer on the histogram
# representations
print("[INFO] evaluating histogram accuracy...")
model = KNeighborsClassifier(n_neighbors=40,
	n_jobs=-1)
model.fit(trainFeat, trainLabels)
acc = model.score(trainFeat, trainLabels)
acc = model.score(testFeat, testLabels)
print("Histogram training accuracy: {:.2f}%".format(acc1 * 100))
print("Histogram testing accuracy: {:.2f}%".format(acc * 100))

[INFO] evaluating histogram accuracy...
Histogram training accuracy: 63.32%
Histogram testing accuracy: 59.49%
