# Project: Decision Trees

## Part a: feature generation

In [175]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import skimage.io as io
from skimage import feature
from skimage.color import rgb2gray
from skimage.feature import hog
from skimage import data, exposure
import skimage
import os
import cv2
from skimage.transform import rotate
from skimage.feature import local_binary_pattern
from skimage import data
from skimage.color import label2rgb

import numpy as np
import skimage.io as io
from skimage.color import rgb2hsv
from skimage import data
import matplotlib.pyplot as plt

In [176]:
# --- User-tunable settings -------------------------------------------------
# feature_name: which descriptor get_features() computes for every image.
#   The names it checks for are 'raw', 'RGB', 'canny', 'hog',
#   'LBP_edge', 'LBP_flat' and 'LBP_corner'.
# dataset_name: directory that holds the *_images_32.npy / *_labels.npy files.
dataset_name = "./Shoes"
feature_name = "RGB"

In [177]:
def LBP(image, region):
    """Overlay the pixels of one LBP texture class on ``image``.

    A 'uniform' local binary pattern is computed with radius 3 and 24 sample
    points; pixels whose LBP code belongs to the requested class are turned
    into a boolean mask, which is blended over ``image`` with ``label2rgb``.

    Parameters
    ----------
    image : array
        Image passed straight to ``local_binary_pattern``; callers in this
        notebook pass the output of ``rgb2gray`` (i.e. a grayscale image).
    region : str
        Texture class to highlight: 'edge', 'flat' or 'corner'.

    Returns
    -------
    The array returned by ``label2rgb`` for the selected mask.

    Raises
    ------
    ValueError
        If ``region`` is not one of the three recognised names.
    """
    # Fail fast: reject a bad region before paying for the LBP computation.
    if region not in ('edge', 'flat', 'corner'):
        raise ValueError("Can't recognize region name!!!")

    METHOD = 'uniform'
    # Global matplotlib setting inherited from the original code; kept so that
    # calling this function has the same side effect as before.
    plt.rcParams['font.size'] = 9

    # settings for LBP
    radius = 3
    n_points = 8 * radius
    w = radius - 1  # half-width of each band of LBP codes

    if region == 'edge':
        # Codes centred on the middle of the histogram.
        labels = list(range(n_points // 2 - w, n_points // 2 + w + 1))
    elif region == 'flat':
        # Codes at both ends of the histogram.
        labels = list(range(0, w + 1)) + list(range(n_points - w, n_points + 2))
    else:  # 'corner'
        i_14 = n_points // 4            # 1/4th of the histogram
        i_34 = 3 * (n_points // 4)      # 3/4th of the histogram
        labels = (list(range(i_14 - w, i_14 + w + 1)) +
                  list(range(i_34 - w, i_34 + w + 1)))

    lbp = local_binary_pattern(image, n_points, radius, METHOD)

    # True wherever the pixel's LBP code is one of the selected labels.
    mask = np.isin(lbp, labels)
    return label2rgb(mask, image=image, bg_label=0, alpha=0.5)

In [178]:
def get_features(image, feature_name):
    """Turn one image into a flat feature vector.

    Parameters
    ----------
    image : array
        A single image as loaded from the dataset ``.npy`` files.
    feature_name : str
        Which descriptor to compute:

        * 'raw'        - the pixel values, flattened.
        * 'RGB'        - joint 16x16x16 colour histogram over the three
                         channels (4096 values).
        * 'canny'      - flattened Canny edge map of the grayscale image.
        * 'hog'        - flattened, intensity-rescaled HOG visualisation image.
        * 'LBP_edge' / 'LBP_flat' / 'LBP_corner'
                       - flattened LBP overlay for the matching texture class.

    Returns
    -------
    1-D array with the selected features.

    Raises
    ------
    ValueError
        If ``feature_name`` is not one of the names listed above.
    """
    if feature_name == 'raw':
        return image.flatten()

    if feature_name == 'RGB':
        hist = cv2.calcHist([image], [0, 1, 2], None, [16, 16, 16], [0, 256, 0, 256, 0, 256])
        return hist.flatten()

    if feature_name == 'canny':
        edges = feature.canny(rgb2gray(image), sigma=1)
        return edges.flatten()

    if feature_name == 'hog':
        # Only the visualisation image is used; the descriptor vector is discarded.
        _, hog_image = hog(image, orientations=8, pixels_per_cell=(16, 16), cells_per_block=(1, 1), visualize=True, channel_axis=-1)
        hog_image_rescaled = exposure.rescale_intensity(hog_image, in_range=(0, 10))
        return hog_image_rescaled.flatten()

    if feature_name in ('LBP_edge', 'LBP_flat', 'LBP_corner'):
        # The suffix after 'LBP_' is exactly the region name LBP() expects.
        # (Previously every LBP variant was computed with region 'flat'.)
        region = feature_name[len('LBP_'):]
        return LBP(rgb2gray(image), region).flatten()

    raise ValueError("Can't recognize feature name!!!")

In [179]:
def process_data(images, labels, feature_name):
    """Load one dataset split and featurize every image in it.

    Parameters
    ----------
    images : path of the ``.npy`` file holding the image stack.
    labels : path of the ``.npy`` file holding the labels.
    feature_name : descriptor name forwarded to ``get_features``.

    Returns
    -------
    (X, y) where X is ``np.array`` of the per-image feature vectors and y is
    the label array exactly as loaded.
    """
    # NOTE(review): allow_pickle=True makes np.load unpickle object arrays,
    # which can execute arbitrary code - only point this at trusted files.
    image_stack = np.load(images, allow_pickle=True)
    y = np.load(labels, allow_pickle=True)

    feature_rows = [
        get_features(image_stack[idx], feature_name)
        for idx in range(image_stack.shape[0])
    ]
    return np.array(feature_rows), y

In [180]:
# Build the feature matrix and label vector for each split of the dataset.
X_train, y_train = process_data(
    images=dataset_name + "/train_images_32.npy",
    labels=dataset_name + "/train_labels.npy",
    feature_name=feature_name,
)
X_valid, y_valid = process_data(
    images=dataset_name + "/valid_images_32.npy",
    labels=dataset_name + "/valid_labels.npy",
    feature_name=feature_name,
)
X_test, y_test = process_data(
    images=dataset_name + "/test_images_32.npy",
    labels=dataset_name + "/test_labels.npy",
    feature_name=feature_name,
)

In [181]:
# X, y = get_data("train")
# X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.7)
# X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5)

In [182]:
# Sanity check: feature-matrix and label shapes for each split.
# One print per statement: writing `print(a), print(b)` builds a tuple of the
# two None return values, which the notebook then echoes as `(None, None)`.
print(X_train.shape)
print(y_train.shape)
print(X_valid.shape)
print(y_valid.shape)
print(X_test.shape)
print(y_test.shape)

(10000, 4096)
(10000,)
(2500, 4096)
(2500,)
(1215, 4096)
(1215,)


(None, None)

In [183]:
# np.save("./X_train", X_train)
# np.save("./y_train", y_train)
# np.save("./X_test", X_rem)
# np.save("./y_test", y_rem)

## Part b: training

In [184]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [185]:
def select_tree_model(model_num, max_depth, criteria, X_train, y_train, X_valid, y_valid, random_state=0):
    """Fit one decision tree and score it on the validation split.

    Parameters
    ----------
    model_num : identifier used only in the progress message.
    max_depth : maximum tree depth passed to ``DecisionTreeClassifier``
        (``None`` is accepted and reported as-is).
    criteria : split criterion passed as ``criterion``.
    X_train, y_train : training features and labels.
    X_valid, y_valid : validation features and labels.
    random_state : seed passed to ``DecisionTreeClassifier`` so that repeated
        runs build the same tree; pass ``None`` for unseeded behaviour.

    Returns
    -------
    (clf, accuracy) - the fitted classifier and its validation accuracy.
    """
    clf = DecisionTreeClassifier(criterion=criteria, max_depth=max_depth, random_state=random_state)
    clf = clf.fit(X_train, y_train)

    y_pred = clf.predict(X_valid)
    accuracy = metrics.accuracy_score(y_valid, y_pred)

    # max_depth is formatted with %s rather than %d so that max_depth=None
    # does not raise a TypeError while printing.
    print("Model %s: when training with max_depth = %s and criterion = %s, the accuracy = %f" % (model_num, max_depth, criteria, accuracy))

    return (clf, accuracy)

In [186]:
# Grid search over tree depth and split criterion. The winner is the model
# with the highest validation accuracy; on a tie the earlier model is kept.
depth_options = [5, 10, 15, 20, 25]
criterion_options = ['gini', 'entropy']

clf_best, accuracy_best, model_best = None, -1, 0
model_num = 1  # running 1-based id, shown in the progress messages

for max_depth in depth_options:
    for criteria in criterion_options:
        clf_curr, accuracy = select_tree_model(
            model_num, max_depth, criteria,
            X_train, y_train, X_valid, y_valid,
        )

        # Strictly better only, so the first model reaching a score wins.
        if accuracy > accuracy_best:
            clf_best, accuracy_best, model_best = clf_curr, accuracy, model_num

        model_num += 1

print()
print("The best model is model", model_best, 'with the accuracy of', accuracy_best)

Model 1: when training with max_depth = 5 and criterion = gini, the accuracy = 0.371600
Model 2: when training with max_depth = 5 and criterion = entropy, the accuracy = 0.379600
Model 3: when training with max_depth = 10 and criterion = gini, the accuracy = 0.420000
Model 4: when training with max_depth = 10 and criterion = entropy, the accuracy = 0.421200
Model 5: when training with max_depth = 15 and criterion = gini, the accuracy = 0.428800
Model 6: when training with max_depth = 15 and criterion = entropy, the accuracy = 0.437600
Model 7: when training with max_depth = 20 and criterion = gini, the accuracy = 0.435600
Model 8: when training with max_depth = 20 and criterion = entropy, the accuracy = 0.428800
Model 9: when training with max_depth = 25 and criterion = gini, the accuracy = 0.425200
Model 10: when training with max_depth = 25 and criterion = entropy, the accuracy = 0.430400

The best model is model 6 with the accuracy of 0.4376


In [187]:
# np.save("./X_train", X_train)
# np.save("./y_train", y_train)
# np.save("./X_valid", X_valid)
# np.save("./y_valid", y_valid)
# np.save("./X_test", X_test)
# np.save("./y_test", y_test)

## Part c: evaluation

In [188]:
# Score the tree selected on the validation split against the test split.
y_pred = clf_best.predict(X_test)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print("The testing accuracy is", accuracy)

The testing accuracy is 0.43868312757201644


In [189]:
# from matplotlib import pyplot as plt
# from sklearn import tree

In [190]:
# text_representation = tree.export_text(clf_best)
# fig = plt.figure(figsize=(25,20))
# _ = tree.plot_tree(clf_best,
#                    class_names=['0', '1', '2', '3', '4'],
#                    max_depth=2,
#                    filled=True)