# Project: Decision Trees

## Part a: feature generation

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import skimage.io as io
from skimage import feature
from skimage.color import rgb2gray
from skimage.feature import hog
from skimage import data, exposure
import skimage
import os
import cv2
from skimage.transform import rotate
from skimage.feature import local_binary_pattern
from skimage import data
from skimage.color import label2rgb

import numpy as np
import skimage.io as io
from skimage.color import rgb2hsv
from skimage import data
import matplotlib.pyplot as plt

In [2]:
# # You can change this part!!!
# feature_name = 'LBP_corner'
# dataset_name = 'Fast_Food'

In [3]:
def LBP(image, region):
    """Overlay the pixels of one uniform-LBP texture class on ``image``.

    A local binary pattern is computed with ``radius = 3`` and
    ``n_points = 24`` using the ``'uniform'`` method. The pixels whose LBP
    code belongs to the requested class are turned into a boolean mask,
    which ``label2rgb`` blends over ``image`` (``alpha=0.5``).

    Args:
        image: Grayscale image passed straight to ``local_binary_pattern``
            (callers in this file pass the output of ``rgb2gray``).
        region: One of ``'edge'``, ``'flat'`` or ``'corner'``.

    Returns:
        Whatever ``label2rgb`` returns for the mask/image pair
        (presumably an RGB overlay array; callers flatten it).

    Raises:
        ValueError: If ``region`` is not a recognised name. ``ValueError``
            is a subclass of ``Exception``, so existing handlers still work.
    """
    # Validate first so a bad region name fails fast, before any LBP work.
    if region not in ('edge', 'flat', 'corner'):
        raise ValueError("Can't recognize region name!!!")

    # NOTE(review): this mutates global matplotlib state and looks like a
    # leftover from the plotting example this code was adapted from. It is
    # kept so that any later plotting keeps its current look; consider
    # removing it.
    plt.rcParams['font.size'] = 9

    # Settings for LBP.
    radius = 3
    n_points = 8 * radius
    w = radius - 1  # half-width of each band of LBP codes

    if region == 'edge':
        # Codes around the middle of the histogram.
        labels = list(range(n_points // 2 - w, n_points // 2 + w + 1))
    elif region == 'flat':
        # Codes at both ends of the histogram.
        labels = (list(range(0, w + 1)) +
                  list(range(n_points - w, n_points + 2)))
    else:  # 'corner'
        i_14 = n_points // 4            # 1/4th of the histogram
        i_34 = 3 * (n_points // 4)      # 3/4th of the histogram
        labels = (list(range(i_14 - w, i_14 + w + 1)) +
                  list(range(i_34 - w, i_34 + w + 1)))

    lbp = local_binary_pattern(image, n_points, radius, 'uniform')

    # True wherever the pixel's LBP code is one of the selected labels.
    mask = np.isin(lbp, labels)
    return label2rgb(mask, image=image, bg_label=0, alpha=0.5)

In [4]:
def get_features(image, feature_name):
    """Turn one image into a flat feature vector.

    Supported ``feature_name`` values:

    * ``'raw'``        - the image's own values, flattened.
    * ``'RGB'``        - joint 3-channel histogram from ``cv2.calcHist``
      with 16 bins per channel over ``[0, 256)``, flattened.
    * ``'canny'``      - Canny edge map (``sigma=1``) of the grayscale
      image, flattened.
    * ``'hog'``        - the HOG visualisation image (8 orientations,
      16x16 pixels per cell, 1x1 cells per block), intensity-rescaled
      from ``in_range=(0, 10)`` and flattened.
    * ``'LBP_edge'``, ``'LBP_flat'``, ``'LBP_corner'`` - the flattened
      output of ``LBP`` on the grayscale image for that region.

    Args:
        image: A single image array (assumed to have the colour channel
            last, as implied by ``channel_axis=-1`` - TODO confirm).
        feature_name: Name of the feature extractor to apply.

    Returns:
        A 1-D array of features.

    Raises:
        ValueError: If ``feature_name`` is not recognised. ``ValueError``
            is a subclass of ``Exception``, so existing handlers still work.
    """
    if feature_name == 'raw':
        return image.flatten()
    elif feature_name == 'RGB':
        hist = cv2.calcHist([image], [0, 1, 2], None, [16, 16, 16],
                            [0, 256, 0, 256, 0, 256])
        return hist.flatten()
    elif feature_name == 'canny':
        edges = feature.canny(rgb2gray(image), sigma=1)
        return edges.flatten()
    elif feature_name == 'hog':
        # Only the visualisation image is used; the descriptor is discarded.
        _, hog_image = hog(image, orientations=8, pixels_per_cell=(16, 16),
                           cells_per_block=(1, 1), visualize=True,
                           channel_axis=-1)
        hog_image_rescaled = exposure.rescale_intensity(hog_image,
                                                        in_range=(0, 10))
        return hog_image_rescaled.flatten()
    elif feature_name in ('LBP_edge', 'LBP_flat', 'LBP_corner'):
        # The text after the 'LBP_' prefix is the region name LBP() expects.
        region = feature_name[len('LBP_'):]
        return LBP(rgb2gray(image), region).flatten()
    else:
        raise ValueError("Can't recognize feature name!!!")

In [5]:
def process_data(images, labels, feature_name):
    """Load an image/label pair of ``.npy`` files and featurize every image.

    Args:
        images: Path of the ``.npy`` file holding the images; the loaded
            array is iterated along its first axis.
        labels: Path of the ``.npy`` file holding the labels.
        feature_name: Feature extractor name forwarded to ``get_features``.

    Returns:
        ``(X, y)`` where ``X`` is ``np.array`` of the per-image feature
        vectors and ``y`` is the label array exactly as loaded.
    """
    # NOTE(security): allow_pickle=True lets np.load unpickle arbitrary
    # objects, which can execute code. Only load files from a trusted source.
    raw_imgs = np.load(images, allow_pickle=True)
    y = np.load(labels, allow_pickle=True)

    X = [get_features(img, feature_name) for img in raw_imgs]

    return np.array(X), y

In [6]:
def get_dataset(dataset_name, feature_name):
    """Load and featurize the train, validation and test splits of a dataset.

    Files are read relative to the current working directory from a folder
    named after ``dataset_name``. ``"CIFAR_10"`` uses the ``*_small`` train
    files for training; every other dataset uses the regular train/valid
    files.

    Args:
        dataset_name: Folder name of the dataset.
        feature_name: Feature extractor name forwarded to ``process_data``.

    Returns:
        ``(X_train, y_train, X_valid, y_valid, X_test, y_test)``.
    """
    prefixed = './' + dataset_name

    def load_split(images_path, labels_path):
        return process_data(images_path, labels_path, feature_name)

    if dataset_name == "CIFAR_10":
        train = load_split(prefixed + "/train_images_small_32.npy",
                           prefixed + "/train_labels_small.npy")
        # NOTE(review): the "validation" split here is read from the regular
        # train files - confirm it does not overlap the *_small train split.
        valid = load_split(prefixed + "/train_images_32.npy",
                           prefixed + "/train_labels.npy")
    else:
        train = load_split(prefixed + "/train_images_32.npy",
                           dataset_name + "/train_labels.npy")
        valid = load_split(prefixed + "/valid_images_32.npy",
                           dataset_name + "/valid_labels.npy")

    test = load_split(prefixed + "/test_images_32.npy",
                      dataset_name + "/test_labels.npy")

    return (*train, *valid, *test)

In [7]:
# np.save("./X_train", X_train)
# np.save("./y_train", y_train)
# np.save("./X_test", X_rem)
# np.save("./y_test", y_rem)

## Part b: training

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [9]:
def select_tree_model(model_num, max_depth, criteria, X_train, y_train, X_valid, y_valid):
    """Fit one decision tree and score it on the validation data.

    Args:
        model_num: Identifier of this candidate; not used in the computation.
        max_depth: ``max_depth`` passed to ``DecisionTreeClassifier``.
        criteria: ``criterion`` passed to ``DecisionTreeClassifier``.
        X_train, y_train: Data the tree is fitted on.
        X_valid, y_valid: Data the fitted tree is scored on.

    Returns:
        ``(fitted_tree, validation_accuracy)``.
    """
    tree = DecisionTreeClassifier(criterion=criteria, max_depth=max_depth).fit(X_train, y_train)
    valid_accuracy = metrics.accuracy_score(y_valid, tree.predict(X_valid))
    return (tree, valid_accuracy)

In [10]:
def train_model(X_train, y_train, X_valid, y_valid):
    """Grid-search tree depth and split criterion; return the best tree.

    Every combination of ``max_depth`` in ``[5, 10, 15, 20, 25]`` and
    criterion in ``['gini', 'entropy']`` is fitted via ``select_tree_model``.
    The tree with the highest validation accuracy wins; on ties the
    earliest candidate is kept.

    Returns:
        The best fitted classifier.
    """
    # Depth varies slowest, criterion fastest - same order as a nested loop.
    grid = [(depth, criterion)
            for depth in [5, 10, 15, 20, 25]
            for criterion in ['gini', 'entropy']]

    top_clf = None
    top_accuracy = -1

    for model_num, (depth, criterion) in enumerate(grid, start=1):
        candidate, score = select_tree_model(model_num, depth, criterion,
                                             X_train, y_train, X_valid, y_valid)
        # Strict comparison: a later tie never replaces the current best.
        if score > top_accuracy:
            top_accuracy, top_clf = score, candidate

    return top_clf

In [11]:
# np.save("./X_train", X_train)
# np.save("./y_train", y_train)
# np.save("./X_valid", X_valid)
# np.save("./y_valid", y_valid)
# np.save("./X_test", X_test)
# np.save("./y_test", y_test)

## Part c: evaluation

In [12]:
def validate_model(clf_best, X_test, y_test):
    """Return the accuracy of ``clf_best`` on the test data.

    Args:
        clf_best: Fitted model exposing ``predict``.
        X_test: Test features.
        y_test: True test labels.

    Returns:
        The value of ``metrics.accuracy_score(y_test, predictions)``.
    """
    predictions = clf_best.predict(X_test)
    return metrics.accuracy_score(y_test, predictions)

## Part d: Main Method

In [13]:
# # datasets = ['Shoes', 'Fast_Food', 'CIFAR_10']
# datasets = ['CIFAR_10']
# # features = ['raw', 'RGB', 'canny', 'hog', 'LBP_edge', 'LBP_flat', 'LBP_corner']
# features = ['LBP_edge', 'LBP_flat', 'LBP_corner']

In [14]:
# for dataset in datasets:
#     print(dataset + ":")
    
#     for f in features:
#         X_train, y_train, X_valid, y_valid, X_test, y_test = get_dataset(dataset, f)
        
#         clf_best = train_model(X_train, y_train, X_valid, y_valid)
        
#         best_accuracy = validate_model(clf_best, X_test, y_test)
        
#         print(f + ": " + str(best_accuracy))

In [15]:
def merge_features(X_train_1, X_train_2, X_test_1, X_test_2):
    """Join two feature sets column-wise for both the train and test data.

    Returns:
        ``(merged_train, merged_test)``, each the concatenation of its two
        inputs along axis 1.
    """
    merged_train = np.concatenate([X_train_1, X_train_2], axis=1)
    merged_test = np.concatenate([X_test_1, X_test_2], axis=1)
    return merged_train, merged_test

def save_features(dataset, feature, X_train, y_train, X_test, y_test):
    """Save one feature set's train/test arrays as ``.npy`` files.

    Files are written to ``./final_results/<dataset>/<feature>/`` as
    ``X_train_<feature>.npy``, ``y_train_<feature>.npy``,
    ``X_test_<feature>.npy`` and ``y_test_<feature>.npy`` (``np.save``
    appends the ``.npy`` suffix).

    Args:
        dataset: Dataset name, used as a directory level.
        feature: Feature-set name, used as a directory level and file suffix.
        X_train, y_train, X_test, y_test: Arrays to save.
    """
    path = os.path.join("./final_results", dataset, feature)
    # exist_ok lets the cell be re-run without a FileExistsError.
    os.makedirs(path, exist_ok=True)

    # Join with os.path.join: the previous `path + './X_train_'` produced
    # ".../<feature>./X_train_...", i.e. a different, non-existent directory.
    np.save(os.path.join(path, 'X_train_' + feature), X_train)
    np.save(os.path.join(path, 'y_train_' + feature), y_train)
    np.save(os.path.join(path, 'X_test_' + feature), X_test)
    np.save(os.path.join(path, 'y_test_' + feature), y_test)

In [19]:
# Build the raw, RGB and LBP_corner feature sets for each dataset, merge them
# in every combination, and save each train/test feature set to disk.
datasets = ['CIFAR_10']

for dataset in datasets:
    # exist_ok lets this cell be re-run without a FileExistsError.
    os.makedirs("./final_results/" + dataset, exist_ok=True)

    # Only the train and test splits are saved; the validation split that
    # get_dataset also returns is discarded here.
    X_train_raw, y_train, _, _, X_test_raw, y_test = get_dataset(dataset, 'raw')
    X_train_RGB, _, _, _, X_test_RGB, _ = get_dataset(dataset, 'RGB')
    X_train_LBP_corner, _, _, _, X_test_LBP_corner, _ = get_dataset(dataset, 'LBP_corner')

    # Pairwise combinations.
    X_train_raw_RGB, X_test_raw_RGB = merge_features(X_train_raw, X_train_RGB, X_test_raw, X_test_RGB)
    X_train_raw_LBP_corner, X_test_raw_LBP_corner = merge_features(X_train_raw, X_train_LBP_corner, X_test_raw, X_test_LBP_corner)
    X_train_RGB_LBP_corner, X_test_RGB_LBP_corner = merge_features(X_train_RGB, X_train_LBP_corner, X_test_RGB, X_test_LBP_corner)

    # All three feature sets together.
    X_train_raw_RGB_LBP_corner, X_test_raw_RGB_LBP_corner = merge_features(X_train_raw_RGB, X_train_LBP_corner, X_test_raw_RGB, X_test_LBP_corner)

    # Name -> (train features, test features); saved in insertion order.
    feature_sets = {
        "raw": (X_train_raw, X_test_raw),
        "RGB": (X_train_RGB, X_test_RGB),
        "LBP_corner": (X_train_LBP_corner, X_test_LBP_corner),
        "raw_RGB": (X_train_raw_RGB, X_test_raw_RGB),
        "raw_LBP_corner": (X_train_raw_LBP_corner, X_test_raw_LBP_corner),
        "RGB_LBP_corner": (X_train_RGB_LBP_corner, X_test_RGB_LBP_corner),
        "raw_RGB_LBP_corner": (X_train_raw_RGB_LBP_corner, X_test_raw_RGB_LBP_corner),
    }

    for feature_set_name, (X_train_set, X_test_set) in feature_sets.items():
        save_features(dataset, feature_set_name, X_train_set, y_train, X_test_set, y_test)