<a href="https://colab.research.google.com/github/cakwok/CS6140-Machine-Learning/blob/main/1_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

CS6140 Assignment 1 Q1.1
Wing Man, Kwok  
05/18/2022

In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split  #Split training and testing data

In [26]:
class ClassificationDecisionTree:
    """Binary classification decision tree grown with entropy / information gain.

    Every internal node tests ``x[feature] <= threshold``; samples satisfying
    the test go to the left child, the rest to the right child.  Leaves store
    the majority class label of the training samples that reached them.

    Class labels must be non-negative integers because class counts are
    computed with ``np.bincount``.

    Parameters
    ----------
    max_depth : int
        A node at this depth (root is depth 0) is always turned into a leaf.
    min_samples : int
        A node holding fewer samples than this is turned into a leaf.
    """

    def __init__(self, max_depth=10, min_samples=2):
        self.max_depth = max_depth
        self.min_samples = min_samples
        self.root = None  # set by fit()

    def terminate(self, depth):
        """Return True when the node currently being built must become a leaf.

        Reads ``self.numofClasslabels`` and ``self.numofSamples``, which
        ``build_tree`` refreshes for the node it is working on before calling
        this method.
        """
        return bool(
            depth >= self.max_depth
            or self.numofClasslabels == 1
            or self.numofSamples < self.min_samples
        )

    def fit(self, X, y):
        """Grow the tree from feature matrix ``X`` and integer labels ``y``.

        ``X`` and ``y`` may be NumPy arrays or anything ``np.asarray`` accepts
        (lists, pandas DataFrame / Series); they are converted up front so
        that all later indexing is positional.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, if ``X`` and ``y`` differ in length, or if
            the dataset is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y).ravel()
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        if X.shape[0] == 0:
            raise ValueError("cannot fit a decision tree on an empty dataset")
        self.root = self.build_tree(X, y)

    def compute_entropy(self, y):
        """Return the Shannon entropy (in bits) of the integer label array ``y``.

        An empty ``y`` has entropy 0.
        """
        if len(y) == 0:
            return 0
        # bincount[i] is the number of occurrences of label i; normalise to probabilities.
        probabilities = np.bincount(y) / len(y)
        # Drop absent classes: p * log2(p) is taken as 0 when p == 0.
        probabilities = probabilities[probabilities > 0]
        return float(-np.sum(probabilities * np.log2(probabilities)))

    def split_datapoints_to_leftright(self, X, best_information_gain_X):
        """Split a single feature column ``X`` at threshold ``best_information_gain_X``.

        Returns ``(left_index, right_index)``: the positions whose value is
        ``<=`` the threshold, and the positions whose value is ``>`` it.
        """
        X = np.asarray(X)
        left_index = np.flatnonzero(X <= best_information_gain_X)
        right_index = np.flatnonzero(X > best_information_gain_X)
        return left_index, right_index

    def compute_information_gain(self, X, y, best_information_gain_X):
        """Return the information gain of splitting column ``X`` at the given threshold.

        The gain is the parent entropy minus the size-weighted entropy of the
        two children.  A split that leaves either side empty has gain 0.
        """
        left_index, right_index = self.split_datapoints_to_leftright(X, best_information_gain_X)
        if len(left_index) == 0 or len(right_index) == 0:
            return 0

        total = len(y)
        child_loss = (
            (len(left_index) / total) * self.compute_entropy(y[left_index])
            + (len(right_index) / total) * self.compute_entropy(y[right_index])
        )
        return self.compute_entropy(y) - child_loss

    def find_max_information_gain(self, X, y, features):
        """Find the (feature, threshold) pair with the highest information gain.

        ``features`` is an iterable of column indices of ``X`` to try, in
        order; ties are resolved in favour of the first candidate evaluated.
        Candidate thresholds are the unique values of each column except the
        largest one: splitting at the largest value sends every sample left
        and leaves the right child empty, so it is never a usable split.

        Returns ``(None, None)`` when no candidate exists (every listed
        feature is constant, or ``features`` is empty).
        """
        best_score = -1
        best_feature = None
        best_threshold = None

        for feat in features:
            column = X[:, feat]
            # np.unique returns the sorted unique values; drop the maximum (see docstring).
            for threshold in np.unique(column)[:-1]:
                score = self.compute_information_gain(column, y, threshold)
                if score > best_score:
                    best_score = score
                    best_feature = feat
                    best_threshold = threshold

        return best_feature, best_threshold

    def build_tree(self, X, y, depth=0):
        """Recursively build the subtree for samples ``X`` / labels ``y``.

        Returns the root ``Node`` of the subtree (``Node`` is defined in
        another cell of this notebook).
        """
        # State consumed by terminate(); refreshed for every node visited.
        self.numofSamples, self.numofFeatures = X.shape
        self.numofClasslabels = len(np.unique(y))

        if self.terminate(depth):
            return Node(value=self._majority_label(y))

        # Visit the features in random order so that ties between equally
        # good splits are not always resolved towards the lowest column index.
        random_features = np.random.choice(self.numofFeatures, self.numofFeatures, replace=False)
        best_feature, best_threshold = self.find_max_information_gain(X, y, random_features)

        # No threshold separates the samples (e.g. identical rows with
        # different labels): stop here instead of recursing into an empty child.
        if best_feature is None:
            return Node(value=self._majority_label(y))

        left_index, right_index = self.split_datapoints_to_leftright(X[:, best_feature], best_threshold)
        left_child = self.build_tree(X[left_index, :], y[left_index], depth + 1)
        right_child = self.build_tree(X[right_index, :], y[right_index], depth + 1)
        return Node(best_feature, best_threshold, left_child, right_child)

    def _majority_label(self, y):
        """Return the most frequent label in ``y`` (the smallest label wins ties)."""
        return np.argmax(np.bincount(y))

    def traverse_tree(self, x, node):
        """Return the label of the leaf reached by sample ``x`` starting at ``node``."""
        # Same rule as used while building: ``<=`` goes left, ``>`` goes right.
        while not node.is_leaf():
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

    def print_tree(self, node):
        """Print the subtree rooted at ``node`` in pre-order (node, left, right).

        Returns the leaf value when ``node`` is a leaf, otherwise ``None``.
        """
        if node.is_leaf():
            print("node", node)
            print("Class", node.value)
            return node.value

        print("node", node)
        print("feature", node.feature, "<=", node.threshold)
        print("node.left", node.left)
        print("node.right", node.right)
        print("\n")

        print("left ", end='')
        self.print_tree(node.left)

        print("right ", end='')
        self.print_tree(node.right)

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``.

        ``X`` is converted with ``np.asarray`` so that a pandas DataFrame is
        iterated row by row rather than by column name.
        """
        rows = np.asarray(X)
        return np.array([self.traverse_tree(x, self.root) for x in rows])
        

In [27]:
class Node:
    """One node of the decision tree.

    An internal node carries a split (``feature`` index and ``threshold``)
    together with its ``left`` and ``right`` children.  A leaf carries only
    ``value``, which must be passed by keyword.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        # Split description (meaningful for internal nodes).
        self.feature, self.threshold = feature, threshold
        # Child subtrees (meaningful for internal nodes).
        self.left, self.right = left, right
        # Predicted label (meaningful for leaves).
        self.value = value

    def is_leaf(self):
        """Return True when this node stores a value, i.e. it is a leaf."""
        if self.value is None:
            return False
        return True

In [28]:
def accuracy(y_true, y_pred):
    """Return the fraction of entries of ``y_pred`` that equal ``y_true``.

    Both arguments are converted with ``np.asarray`` so the comparison is
    always element-wise and positional: plain lists would otherwise compare
    as a single bool.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    return np.sum(y_true == y_pred) / len(y_true)

In [29]:
# Load the assignment data into a DataFrame.
# NOTE(review): the path sits under /content/drive, presumably a mounted Google Drive in Colab —
# confirm the drive is mounted before running this cell.
dataset = pd.read_csv("/content/drive/My Drive/Colab Notebooks/CS6140 Assignment1/data.csv")


In [30]:
# Names of the columns of the dataset that are used as input features.
feature_cols = ['feature1', 'feature2', 'feature3', 'feature4']
X = dataset[feature_cols]                         # feature matrix: the four feature columns
y = dataset['class']                              # target vector: the 'class' column

# Hold out 20% of the rows for testing; random_state=1 fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Train the tree on the underlying NumPy arrays (.values) of the training split.
model = ClassificationDecisionTree(max_depth=10)
model.fit(X_train.values , y_train.values)

In [None]:
# Show the held-out feature rows, predict their classes, and report the test accuracy.
print ("X_test", X_test)
y_pred = model.predict(X_test.values)             # one predicted label per test row
print ("y_pred", y_pred)
acc = accuracy(y_test, y_pred)                    # compares the predictions against the true test labels
print("Accuracy:", acc)

In [32]:
# Dump the fitted tree in pre-order: each node, then its left subtree, then its right subtree.
model.print_tree(model.root)




node <__main__.Node object at 0x7f02abc94490>
feature 3 <= 0.6
node.left <__main__.Node object at 0x7f02abc94690>
node.right <__main__.Node object at 0x7f02abc94a10>


left node <__main__.Node object at 0x7f02abc94690>
Class 0
right node <__main__.Node object at 0x7f02abc94a10>
feature 3 <= 1.6
node.left <__main__.Node object at 0x7f02abc94650>
node.right <__main__.Node object at 0x7f02abc94510>


left node <__main__.Node object at 0x7f02abc94650>
feature 2 <= 4.9
node.left <__main__.Node object at 0x7f02abc94290>
node.right <__main__.Node object at 0x7f02abc94610>


left node <__main__.Node object at 0x7f02abc94290>
Class 1
right node <__main__.Node object at 0x7f02abc94610>
feature 3 <= 1.5
node.left <__main__.Node object at 0x7f02abc94ed0>
node.right <__main__.Node object at 0x7f02abc94990>


left node <__main__.Node object at 0x7f02abc94ed0>
Class 2
right node <__main__.Node object at 0x7f02abc94990>
Class 1
right node <__main__.Node object at 0x7f02abc94510>
Class 2
