# Classification of images by KNN

In [3]:
# Import the necessary packages
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf 
from tensorflow.examples.tutorials.mnist import input_data
import pandas as pd
import matplotlib.image as mpimg
import numpy as np
import os

## CIFAR10 dataset classification by KNN

The dataset labeled "train" contains 50,000 images saved as PNG files, together with a CSV file that gives the class label of each image. We will use this dataset and split it into a training set and a testing set for this project.

We will use the `imread()` function to extract the RGB pixel values of each image, and read the matching labels from the CSV file.

Then we split the data 80/20: the first 40,000 images form the training set and the remaining 10,000 images form the testing set.

In [10]:
# Load the CIFAR10 dataset
# Folder holding 1.png ... 50000.png and trainLabels.csv. Every path is built
# from it explicitly instead of calling os.chdir(), so the kernel's working
# directory (and any later relative path such as 'MNIST_data') is untouched.
DATA_DIR = '/Users/YINA/Desktop/MLPythonProj/train'
N_IMAGES = 50000   # images are named 1.png ... N_IMAGES.png
N_TRAIN = 40000    # first 80% -> training set, remaining 20% -> testing set
N_FEATURES = 3072  # length of one flattened image (32 x 32 pixels x 3 channels)

# Read the images features and save them in 'data' list
data = [
    mpimg.imread(os.path.join(DATA_DIR, '{}.png'.format(i)))
    for i in range(1, N_IMAGES + 1)
]

# Load the label file and encode the class names as integer class ids
label_o = pd.read_csv(os.path.join(DATA_DIR, 'trainLabels.csv'))
class_le = LabelEncoder()
label_ = class_le.fit_transform(label_o['label'])
label = np.asarray(label_)

# The split below pairs images and labels purely by position, so fail early
# if the two do not line up one-to-one.
# NOTE(review): this also assumes the CSV rows are ordered by image number
# (row k describes '<k+1>.png') - confirm against trainLabels.csv.
if len(label) != len(data):
    raise ValueError(
        'Expected {} labels but trainLabels.csv has {}'.format(len(data), len(label))
    )

# Split 80% as the training set, 20% as the testing set
# Flatten every image into one row of N_FEATURES values, then slice by position.
images = np.asarray(data).reshape(-1, N_FEATURES)
X_train, X_test = images[:N_TRAIN], images[N_TRAIN:]
y_train, y_test = label[:N_TRAIN], label[N_TRAIN:]

## Fit the training model and evaluate the testing accuracy score

In [11]:
# CIFAR-10 training process
# A nearest-neighbour classifier that looks at a single neighbour
# (n_neighbors=1); n_jobs=-1 is forwarded to scikit-learn unchanged.
model = KNeighborsClassifier(n_jobs=-1, n_neighbors=1)
# Fitting is the cell's last expression, so the fitted estimator is displayed.
model.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=1, p=2,
           weights='uniform')

In [7]:
# CIFAR-10 testing result
# Score the fitted classifier on the test split and report the value
# as a percentage with two decimal places.
report = "Accuracy: {:.2f}%"
acc = model.score(X_test, y_test)
print(report.format(acc * 100))

Accuracy: 33.83%


## MNIST dataset classification by KNN

The images have already been preprocessed and are readily available through the tensorflow package. For this analysis, we simply import the data as numpy arrays.

In [7]:
# Load the MNIST dataset from the tensorflow package
# Labels are loaded as integer class ids (one_hot=False), not one-hot vectors.
# KNeighborsClassifier treats a 2-D label matrix as a multi-output problem with
# one independent binary target per column; that only coincides with real
# multiclass accuracy when n_neighbors=1. With integer labels the classifier
# solves the intended 10-class problem for any number of neighbours.
# NOTE(review): tensorflow.examples.tutorials appears to exist only in
# TensorFlow 1.x - confirm the environment before re-running.
mnist = input_data.read_data_sets('MNIST_data', one_hot=False)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


## Fit the training model and evaluate the testing accuracy score

In [8]:
# MNIST training process
# Same classifier configuration as before: one neighbour, n_jobs=-1.
model = KNeighborsClassifier(n_jobs=-1, n_neighbors=1)
# Train on the MNIST training images and their labels; the fitted estimator
# is the cell's displayed value.
train_split = mnist.train
model.fit(X=train_split.images, y=train_split.labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=1, p=2,
           weights='uniform')

In [9]:
# MNIST testing result
# Score the fitted classifier on the MNIST test images and report the value
# as a percentage with two decimal places.
test_split = mnist.test
acc = model.score(test_split.images, test_split.labels)
print("Accuracy: {:.2f}%".format(acc * 100))

Accuracy: 96.77%
