# MNIST

This notebook contains code for using OpenCV on the MNIST handwritten digit dataset. 

We ingest and pre-process the data, then extract features using Histogram of Oriented Gradients (HOG). 

Then, we train SVM, Decision Tree, Random Forest, and Logistic Regression classifiers on the extracted features and report each one's accuracy on the test set.

The fitted decision tree is exported to `tree.dot`, a Graphviz DOT file that can be rendered to visualize the tree.

In [None]:
import cv2
from sklearn import datasets, svm, metrics, tree, ensemble, linear_model
import matplotlib.pyplot as plt
import numpy as np
import scipy.misc
import os
import glob

# Root folders of the MNIST PNG export, relative to the notebook's working directory.
# Images are looked up as <root>/<subfolder>/<file>.png, and the subfolder name is
# used as the class label.
rootdir_training = 'mnist_png/training'
rootdir_testing = 'mnist_png/testing'

In [None]:
# HOG (Histogram of Oriented Gradients) descriptor configuration.
# Window and block are both 16x16 with 8x8 cells, i.e. one block of 2x2 cells
# per window, each cell contributing a 9-bin orientation histogram.
winSize = (16, 16)
blockSize = (16, 16)
blockStride = (1, 1)
cellSize = (8, 8)
nbins = 9
derivAperture = 1
winSigma = 4.0
histogramNormType = 0  # 0 selects L2-Hys normalisation in OpenCV's HOGDescriptor
L2HysThreshold = 0.2
gammaCorrection = True

# The arguments are positional and must stay in the order HOGDescriptor expects.
hog2 = cv2.HOGDescriptor(
    winSize,
    blockSize,
    blockStride,
    cellSize,
    nbins,
    derivAperture,
    winSigma,
    histogramNormType,
    L2HysThreshold,
    gammaCorrection,
)

In [None]:
# Load every training image, label it with the name of its parent folder and
# compute its HOG feature vector.
# Sorting makes the sample order stable across runs (glob order depends on the filesystem).
img_paths = sorted(glob.glob(rootdir_training + "/*/*.png"))
if not img_paths:
    # Fail here rather than later with a confusing shape error in the reshape step.
    raise IOError("No PNG images found under '%s'" % rootdir_training)

train_labels = []
training = []
for img_path in img_paths:
    # Derive the label with os.path so it is also correct when glob returns
    # backslash-separated paths (e.g. on Windows), where splitting on "/" is wrong.
    label = os.path.basename(os.path.dirname(img_path))
    data = cv2.imread(img_path)
    if data is None:
        # cv2.imread reports an unreadable file by returning None instead of raising.
        raise IOError("Could not read image '%s'" % img_path)
    vec = hog2.compute(data)
    train_labels.append(label)
    training.append(vec)

In [None]:
# Load every test image, label it with the name of its parent folder and
# compute its HOG feature vector.
# Sorting makes the sample order stable across runs (glob order depends on the filesystem).
img_paths = sorted(glob.glob(rootdir_testing + "/*/*.png"))
if not img_paths:
    # Fail here rather than later with a confusing shape error in the reshape step.
    raise IOError("No PNG images found under '%s'" % rootdir_testing)

test_labels = []
testing = []
for img_path in img_paths:
    # Derive the label with os.path so it is also correct when glob returns
    # backslash-separated paths (e.g. on Windows), where splitting on "/" is wrong.
    label = os.path.basename(os.path.dirname(img_path))
    data = cv2.imread(img_path)
    if data is None:
        # cv2.imread reports an unreadable file by returning None instead of raising.
        raise IOError("Could not read image '%s'" % img_path)
    vec = hog2.compute(data)
    test_labels.append(label)
    testing.append(vec)

In [None]:
# Flatten the per-image HOG vectors into 2-D (n_samples, n_features) matrices,
# the layout the scikit-learn estimators below are fitted on.
# reshape(n, -1) works whether each vector is shaped (n_features, 1) or is already
# flat, and leaves an already 2-D matrix unchanged, so this cell can be re-run safely.
training = np.asarray(training).reshape(len(training), -1)
testing = np.asarray(testing).reshape(len(testing), -1)

# SVM Classifier

In [None]:
# Fit a support vector classifier (scikit-learn defaults) on the HOG features
# and report its accuracy on the test set.
clf = svm.SVC()
clf.fit(training, train_labels)
predict = clf.predict(testing)
# accuracy_score is documented as (y_true, y_pred): pass the ground truth first.
metrics.accuracy_score(test_labels, predict)

# Decision Tree Classifier

In [7]:
# Fit a decision tree on the HOG features and report its accuracy on the test set.
# A fixed random_state makes the fitted tree, and therefore the score, repeatable
# across runs.
tree_clf = tree.DecisionTreeClassifier(random_state=0)
tree_clf.fit(training, train_labels)
predict = tree_clf.predict(testing)
# accuracy_score is documented as (y_true, y_pred): pass the ground truth first.
metrics.accuracy_score(test_labels, predict)

0.89139999999999997

# Visualize the Decision Tree

In [9]:
# Write the fitted decision tree to tree.dot (Graphviz DOT format) in the working directory.
tree.export_graphviz(decision_tree=tree_clf, out_file='tree.dot')

# Random Forest Classifier

In [13]:
# Fit a random forest on the HOG features and report its accuracy on the test set.
# A fixed random_state makes the bootstrap samples and feature choices, and
# therefore the score, repeatable across runs.
rf_clf = ensemble.RandomForestClassifier(random_state=0)
rf_clf.fit(training, train_labels)
predict = rf_clf.predict(testing)
# accuracy_score is documented as (y_true, y_pred): pass the ground truth first.
metrics.accuracy_score(test_labels, predict)

0.95840000000000003

# Logistic Regression Classifier


In [15]:
# Fit a logistic regression model (scikit-learn defaults) on the HOG features
# and report its accuracy on the test set.
lr_clf = linear_model.LogisticRegression()
lr_clf.fit(training, train_labels)
predict = lr_clf.predict(testing)
# accuracy_score is documented as (y_true, y_pred): pass the ground truth first.
metrics.accuracy_score(test_labels, predict)

0.97250000000000003