## Unsupervised Decision Tree

In [1]:
#Importing the required libraries
import os
import cv2
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize

In [2]:
# Dataset names and feature encodings that the driver cell iterates over.
datasets = ["Shoes"]
features = ["RGB"]

# Directory name reserved for saved results (not referenced by the cells shown here).
output_dir = "./unsupervised_results/"

In [3]:
def load_data(feature, dataset="Shoes", data_root="."):
    '''
    Load the pre-split train/test arrays saved for one dataset/feature pair.

    The four arrays are read from
    <data_root>/<dataset>/<feature>/{X_train,X_test,y_train,y_test}_<feature>.npy

    Parameters
    ----------
    feature : str
        Feature name; used both as the sub-directory and as the file-name suffix (e.g. "RGB").
    dataset : str, optional
        Dataset directory name. Defaults to "Shoes", the directory that used to be hard-coded.
    data_root : str, optional
        Directory that contains the dataset folders. Defaults to the current working directory.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test), each as returned by np.load.
    '''
    feature_dir = os.path.join(data_root, dataset, feature)

    def _load(split):
        # File names follow the pattern "<split>_<feature>.npy", e.g. "X_train_RGB.npy".
        return np.load(os.path.join(feature_dir, f"{split}_{feature}.npy"))

    return _load("X_train"), _load("X_test"), _load("y_train"), _load("y_test")

## Decision Tree: max_leaf_nodes = number of classes

In [4]:
# # Prepare dataset
# # CHANGE THIS
# X_train = np.load("./supervised/final_results/Shoes/RGB/X_train_RGB.npy")
# X_test = np.load("./supervised/final_results/Shoes/RGB/X_test_RGB.npy")
# y_test = np.load("./supervised/final_results/Shoes/RGB/y_test_RGB.npy")
# y_train = np.load("./supervised/final_results/Shoes/RGB/y_train_RGB.npy")

In [5]:
def train(X_train, X_test, y_test, y_train, number_of_class):
    '''
    Cluster the data with a regression tree that is fitted to reproduce its own input.

    We treat this as a regression problem: X_train is used both as the features
    and as the target, so the tree is split by the variance-reduction
    (squared error) rule and is limited to number_of_class leaves. The value the
    tree predicts for a sample identifies the leaf (cluster) that sample fell in.

    y_test and y_train are accepted for signature compatibility; they are not used.

    Returns (cluster_index, train_cluster_index): float arrays with one cluster
    id per row of X_test and X_train respectively. Both arrays share the same
    numbering, so an id means the same leaf in either of them.
    '''
    # Regression Decision Tree
    reg = DecisionTreeRegressor(criterion="squared_error", max_leaf_nodes=number_of_class)
    reg = reg.fit(X_train, X_train)
    y_pred = reg.predict(X_test)  # values in y_pred are cluster centers!
    y_train_pred = reg.predict(X_train)

    # Number the clusters over the train AND test predictions together.
    # Numbering them from the test predictions alone left every training sample
    # whose leaf was not reached by any test sample with the default id 0,
    # silently merging it into an unrelated cluster.
    n_train = y_train_pred.shape[0]
    all_pred = np.concatenate([y_train_pred, y_pred], axis=0)
    _, inverse = np.unique(all_pred, axis=0, return_inverse=True)
    # reshape(-1): some NumPy releases return the inverse with an extra axis when axis= is given.
    # Cast to float because the ids were previously stored in an np.zeros (float) array.
    inverse = np.asarray(inverse).reshape(-1).astype(float)

    train_cluster_index = inverse[:n_train]
    cluster_index = inverse[n_train:]
    return cluster_index, train_cluster_index
    
    

def get_population_in_cluster_i(number_of_class, y_test, cluster_index, cluster_i):
    '''
    Count, for cluster `cluster_i`, how many of its samples carry each true label.

    The returned array is meant to hold one count per label 0 .. number_of_class-1;
    the zero-padding below relies on the labels being exactly those integers.
    '''
    member_rows = np.nonzero(np.array(cluster_index) == cluster_i)[0]
    present_labels, tallies = np.unique(y_test[member_rows], return_counts=True)

    # Labels that never occur in this cluster get an explicit 0 at their own position.
    for label in range(number_of_class):
        if label in present_labels:
            continue
        tallies = np.insert(tallies, label, 0)
    return tallies

def label_assign(y_test, cluster_index, number_of_class):
    '''
    Assign a class label to every cluster, using two different voting rules.

    y_test holds the true labels of the samples and cluster_index the cluster id
    of each of those samples. Cluster ids 0 .. number_of_class-1 are considered.

    Returns (labels_simple, labels_weight), both indexed by cluster id:
      * labels_simple - list of ints: the most frequent label in each cluster.
      * labels_weight - 1-D integer array: the label with the highest weighted
        score, where a label's count in the cluster is multiplied by the fraction
        of that label's samples that fell into the cluster.
    A cluster without samples has all-zero counts and therefore gets label 0
    under both rules.
    '''
    # Rows are clusters, columns are labels. A plain float ndarray is used instead
    # of np.matrix: np.matrix is discouraged by NumPy and is not accepted as input
    # by recent scikit-learn releases.
    population_of_cluster = np.array(
        [get_population_in_cluster_i(number_of_class, y_test, cluster_index, i)
         for i in range(number_of_class)],
        dtype=float,
    )

    # Use the majority rule to assign a label to each cluster
    labels_simple = np.argmax(population_of_cluster, axis=1).tolist()

    # Majority rule with assigned weight: each column (label) is L1-normalised so
    # an entry becomes the share of that label's samples found in the cluster.
    precentage_in_label = normalize(population_of_cluster, axis=0, norm='l1')
    score_matrix = population_of_cluster * precentage_in_label
    norm_score = normalize(score_matrix, axis=1, norm='l1')
    labels_weight = np.argmax(norm_score, axis=1)

    return labels_simple, labels_weight
    
def get_accuracy(cluster_new_label, cluster_index, y_test):
    '''
    Score a cluster -> label mapping against the true labels.

    Every entry of cluster_index is converted to an int and looked up in
    cluster_new_label; the resulting predictions are compared with y_test
    through accuracy_score.
    '''
    predicted = []
    for assigned_cluster in cluster_index:
        predicted.append(cluster_new_label[int(assigned_cluster)])
    return accuracy_score(y_test, predicted)
    

What you need to save for both methods: `cluster_index`, `labels_simple`, `labels_weight`, `accuracy_simple`, and `accuracy_weight`.

If you don't want printed results, comment out all the `print` calls in the function definitions.

In [6]:
def no_regroup(dataset, feature, X_train, X_test, y_train, y_test):
    '''
    Wrapper: cluster with one tree leaf per class, label the clusters, score on the test set.

    The number of classes is taken from the distinct values in y_test. The
    cluster -> label mapping is derived from the training split
    (y_train with the training cluster ids) and is then applied to the test
    cluster ids to compute the accuracy against y_test.

    dataset and feature are accepted for signature compatibility; they are not used.

    Prints both label mappings and both accuracies, and returns
    (cluster_index, labels_simple, labels_weight, accuracy_simple, accuracy_weight)
    so the caller can save them.
    '''
    number_of_class = len(np.unique(y_test))
    cluster_index, train_cluster_index = train(X_train, X_test, y_test, y_train, number_of_class)
    labels_simple, labels_weight = label_assign(y_train, train_cluster_index, number_of_class)
    print(labels_simple, labels_weight)

    accuracy_simple = get_accuracy(labels_simple, cluster_index, y_test)
    accuracy_weight = get_accuracy(labels_weight, cluster_index, y_test)
    print("Accuracy score using unweighted labeling method", accuracy_simple)
    print("Accuracy score using weighted labeling method", accuracy_weight)

    return cluster_index, labels_simple, labels_weight, accuracy_simple, accuracy_weight


## Decision Tree: Merge clusters

In [7]:
def train_merge(X_train, X_test, y_test, y_train, number_of_class, max_leaf_nodes=50):
    '''
    Over-segment with a regression tree, then merge the leaves into number_of_class clusters.

    The tree is fitted with X_train as both features and target and limited to
    max_leaf_nodes leaves (50 by default, the value that used to be hard-coded).
    Its predictions for X_test are then grouped by KMeans into number_of_class
    clusters.

    y_test and y_train are accepted for signature compatibility; they are not used.

    Returns the KMeans cluster id of every row of X_test.
    '''
    reg = DecisionTreeRegressor(criterion="squared_error", max_leaf_nodes=max_leaf_nodes)
    reg = reg.fit(X_train, X_train)
    y_pred = reg.predict(X_test)

    # Regroup using KMeans. n_init is pinned to 10 (the long-standing default) so the
    # result does not change between scikit-learn releases that use a different default.
    kmeans = KMeans(n_clusters=number_of_class, n_init=10, random_state=0).fit(y_pred)
    cluster_index = kmeans.labels_
    return cluster_index

In [8]:
def regroup(dataset, feature, X_train, X_test, y_train, y_test):
    '''
    Wrapper: cluster with train_merge, label the clusters, score on the test set.

    The number of classes is taken from the distinct values in y_test.
    NOTE: the cluster -> label mapping is derived from y_test itself, i.e. from
    the same samples that the accuracy is then measured on.

    dataset and feature are accepted for signature compatibility; they are not used.

    Nothing is printed. Returns
    (cluster_index, labels_simple, labels_weight, accuracy_simple, accuracy_weight);
    previously these values were computed and then discarded.
    '''
    number_of_class = len(np.unique(y_test))
    cluster_index = train_merge(X_train, X_test, y_test, y_train, number_of_class)
    labels_simple, labels_weight = label_assign(y_test, cluster_index, number_of_class)

    accuracy_simple = get_accuracy(labels_simple, cluster_index, y_test)
    accuracy_weight = get_accuracy(labels_weight, cluster_index, y_test)

    return cluster_index, labels_simple, labels_weight, accuracy_simple, accuracy_weight



In [9]:
def trim(X_test, y_test, number_of_class):
    '''
    Balance the classes by keeping the same number of samples from each one.

    The first number_of_class distinct labels, in order of first appearance in
    y_test, are kept. From each of them the first `m` samples are taken, where
    `m` is the size of the smallest of those classes. The output is grouped by
    class in that same order.

    Samples are selected by label value, so y_test does not have to be sorted
    or grouped, and any label value (including -1) may come first. For input
    that is already grouped into one contiguous run per class the result is
    the same as slicing each run to the shortest run length.

    Parameters
    ----------
    X_test : array-like, indexed along its first axis in step with y_test
    y_test : 1-D array-like of labels
    number_of_class : int, number of classes to keep

    Returns
    -------
    (trim_x, trim_y) : the selected rows of X_test and the matching labels.

    Raises
    ------
    ValueError
        If y_test contains fewer than number_of_class distinct labels.
    '''
    X_test = np.asarray(X_test)
    y_test = np.asarray(y_test)

    labels, first_seen = np.unique(y_test, return_index=True)
    if len(labels) < number_of_class:
        raise ValueError(
            "y_test contains %d distinct labels, expected at least %d"
            % (len(labels), number_of_class)
        )

    # np.unique sorts by value; re-order by first occurrence to keep the input's class order.
    kept_labels = labels[np.argsort(first_seen)][:number_of_class]
    rows_per_class = [np.flatnonzero(y_test == label) for label in kept_labels]
    minimun = min(len(rows) for rows in rows_per_class)

    keep = np.concatenate([rows[:minimun] for rows in rows_per_class])
    trim_x = X_test[keep]
    trim_y = y_test[keep]

    return trim_x, trim_y

In [11]:
# Driver: run the "one leaf per class" experiment for every dataset/feature pair
# and report the total wall-clock time. `time` is imported in the first cell.
start = time.time()

index = 0  # running count of completed dataset/feature runs

for dataset in datasets:

    for feature in features:

        X_train, X_test, y_train, y_test = load_data(feature)
        # Balance the test classes; the class count is read from the labels
        # instead of being hard-coded.
        number_of_class = len(np.unique(y_test))
        X_test, y_test = trim(X_test, y_test, number_of_class)
        no_regroup(dataset, feature, X_train, X_test, y_train, y_test)

        print(index)

        index += 1

print("The training took %f hours" % ((time.time()-start)/3600))

[4, 0, 3, 1, 2] [4 0 3 1 2]
Accuracy score using unweighted labeling method 0.1979381443298969
Accuracy score using weighted labeling method 0.1979381443298969
0
The training took 0.005351 hours


