In [1]:
import numpy as np
import pandas as pd

## OC1: Oblique decision tree
Algorithm:

1. Get all possible hyperplanes $H_{i}$.
2. Choose one.
3. Perturb and find $v_{j}$.
4. Calculate the Gini index of each $H_{i}$.
5. Choose the $H_{i}$ with the lowest Gini index.

In [151]:
class Node:
    """Tree node holding a subset of samples, their labels and its child nodes."""

    def __init__(self, elements, labels):
        # Samples that reached this node and the labels that go with them.
        self.elements, self.labels = elements, labels
        # A fresh node has no children; they are attached via set_children().
        self.children = list()

    def get_children(self):
        """Return the child nodes currently attached to this node."""
        return self.children

    def set_children(self, children):
        """Replace the attached child nodes with ``children``."""
        self.children = children

In [157]:
class Leaf(Node):
    """Terminal node that stores the majority label of its samples in ``self.label``.

    Parameters
    ----------
    elements : the samples that ended up in this leaf.
    labels : the class labels of those samples (any array-like).

    Raises
    ------
    ValueError
        If ``labels`` is empty, since no majority label exists.
    """

    def __init__(self, elements, labels):
        Node.__init__(self, elements, labels)
        # Count how often each distinct label occurs. The previous code took
        # len(np.where(...)), which is the length of the 1-tuple np.where
        # returns, so every count was 1 regardless of the data.
        values, counts = np.unique(labels, return_counts=True)
        if len(values) == 0:
            raise ValueError("Leaf requires at least one label")
        # max over (count, label) pairs: highest count wins, ties go to the
        # larger label.
        _, self.label = max(zip(counts, values))
    

- Simple binary tree, using oc1 method. 
- Binary classification (True, False) 
- Pandas dataframe.

In [158]:
class ID3Tree:
    """ID3 decision tree for categorical features stored in a 2-D array.

    ``data`` is indexed as ``data[:, i]``: one row per sample, one column per
    feature. ``labels`` holds one class label per row. Call ``_build_tree`` to
    grow the tree; the result is stored in ``self.root``.
    """

    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def _information_gain(self, arr, labels):
        """Return the information gain of splitting ``labels`` by the values in ``arr``.

        ``arr`` is one feature column and ``labels`` the matching class labels.
        Entropy is measured with the natural logarithm (nats).
        """
        arr = np.asarray(arr)
        labels = np.asarray(labels)

        def calc_entropy(subset):
            # np.unique only reports labels that actually occur, so every
            # probability is > 0 and the logarithm is always finite.
            _, counts = np.unique(subset, return_counts=True)
            probs = counts / len(subset)
            return -np.sum(probs * np.log(probs))

        information_gain = calc_entropy(labels)
        for value in np.unique(arr):
            subset = labels[arr == value]
            # Subtract the entropy of each partition, weighted by its share of the samples.
            information_gain -= calc_entropy(subset) * len(subset) / len(labels)
        return information_gain

    def _build_tree(self):
        """Grow the tree from ``self.data`` / ``self.labels`` and store it in ``self.root``."""
        # asarray lets list or DataFrame input use the ``[:, i]`` indexing below.
        data = np.asarray(self.data)
        labels = np.asarray(self.labels)

        columns = list(range(data.shape[1]))
        self.root = self._build_node(data, labels, columns)

    def _build_node(self, data, labels, columns):
        """Recursively build the subtree for ``data`` using the features in ``columns``.

        Returns a ``Leaf`` when the labels are pure or no feature is left,
        otherwise a ``Node`` with one child per distinct value of the feature
        with the highest information gain. ``columns`` is not modified.
        """
        if len(np.unique(labels)) == 1 or len(columns) == 0:
            # The leaf must receive the labels of its samples; the previous
            # code passed the list of remaining column indices instead.
            return Leaf(data, labels)

        # Ties in gain are resolved by the larger column index (tuple comparison).
        _, best = max((self._information_gain(data[:, c], labels), c) for c in columns)

        # Build a new list rather than calling columns.remove(): the same list
        # was shared by all sibling branches (and the caller), so features
        # consumed in one branch vanished from the others.
        remaining = [c for c in columns if c != best]

        split_values = np.unique(data[:, best])
        children = []
        for value in split_values:
            mask = data[:, best] == value
            children.append(self._build_node(data[mask], labels[mask], remaining))

        result = Node(data, labels)
        result.set_children(children)
        # Record the split so the tree can be traversed later:
        # children[k] holds the samples with data[:, split_column] == split_values[k].
        result.split_column = best
        result.split_values = split_values
        return result

In [159]:
# Toy training set: 14 samples described by 4 categorical features.
# Each row of `x` holds ONE feature across all samples, so `x.T` is the
# samples-by-features matrix.
x = np.array([
    [1, 1, 2, 3, 3, 3, 2, 1, 1, 3, 1, 2, 2, 3],
    [1, 1, 1, 2, 3, 3, 3, 2, 3, 2, 2, 2, 1, 2],
    [1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1],
    [1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 2],
])
# Class label (+1 / -1) of each of the 14 samples.
y = np.array([-1, -1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1])

In [161]:
# `x` stores one row per feature, while ID3Tree indexes features as columns
# (data[:, i]), hence the transpose.
tree = ID3Tree(data=x.T, labels=y)
tree._build_tree()

In [72]:
# class OC1Tree:
#     def __init__(self, data):
#         self.data = data
#         self.data
#     def _feature_val_gini(self, column, data, feature_val):
#         tot = len(data[data[column] == feature_val])
#         truths = len(data[data[column] == feature_val & data['y']])
#         return 1 - (truths / tot) ** 2 + ((1-truths) / tot) ** 2
    
#     def _column_gini(self, column, data):
#         ginis = [self._feature_val_gini(column, data, feature_val) for feature_val in data[column].unique()]
        
            
#         pass
    
#     def _get_possible_hyperplanes_by_gini(self, data):
#         for c in data.columns:
#             data[c]
    
#     def build_tree(self, data):
#         hyperplanes = self._get_possible_hyperplanes()
        

In [63]:
np.unique(x)

array([1, 2, 3])

In [61]:
x = np.array([1,2,1,3,1,1,1])
np.where(x == 1)

(array([0, 2, 4, 5, 6]),)

In [54]:
np.linspace(0,3,4).astype(np.int32)

array([0, 1, 2, 3], dtype=int32)

In [15]:
# Second toy data set: 15 samples (rows) with 4 categorical features (columns).
x = np.array([
    [1, 1, 2, 2],
    [2, 1, 2, 2],
    [1, 1, 1, 2],
    [1, 2, 1, 2],
    [2, 3, 2, 2],
    [2, 2, 1, 2],
    [3, 2, 2, 1],
    [1, 3, 2, 2],
    [3, 3, 2, 1],
    [2, 3, 1, 2],
    [3, 1, 1, 1],
    [1, 2, 1, 1],
    [2, 3, 1, 1],
    [2, 1, 1, 2],
    [2, 2, 1, 1],
])
# Class label (+1 / -1) of each sample.
y = np.array([1, 1, -1, 1, -1, -1, 1, 1, 1, 1, 1, 1, -1, -1, -1])

# Same feature matrix as a DataFrame; columns are named 0..3.
df = pd.DataFrame(x)

In [51]:
dir(x)

['T',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_finalize__',
 '__array_interface__',
 '__array_prepare__',
 '__array_priority__',
 '__array_struct__',
 '__array_ufunc__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__complex__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__ilshift__',
 '__imatmul__',
 '__imod__',
 '__imul__',
 '__index__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__irshift__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lshift__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdivmod__',
 '__reduce__',
 '__reduce_e

In [42]:
x[:,1]

array([1, 1, 1, 2, 3, 2, 2, 3, 3, 3, 1, 2, 3, 1, 2])

In [16]:
df['y'] = y

In [19]:
# Turn the label column into booleans: True where the label equals 1.
df['y'] = df['y'] == 1

In [30]:
for i in df[0].unique():
    print(i)

1
2
3


In [36]:
# Rows whose feature 0 equals 1 AND whose label is True.
# The comparison must be parenthesised: `&` binds tighter than `==`, so the
# unparenthesised form is evaluated as df[0] == (1 & df['y']).
df[(df[0] == 1) & df['y']]

Unnamed: 0,0,1,2,3,y
0,1,1,2,2,True
3,1,2,1,2,True
7,1,3,2,2,True
11,1,2,1,1,True


In [33]:
# NOTE(review): `a` is not defined in any cell visible in this notebook; it
# presumably came from a cell that was later edited or deleted. Confirm and
# define it explicitly so this cell survives Restart & Run All.
a[a['y']]

Unnamed: 0,0,1,2,3,y
0,1,1,2,2,True
3,1,2,1,2,True
7,1,3,2,2,True
11,1,2,1,1,True


In [39]:
(len([3])/3 ) **2

0.1111111111111111

In [40]:
2**4

16

In [41]:
df

Unnamed: 0,0,1,2,3,y
0,1,1,2,2,True
1,2,1,2,2,True
2,1,1,1,2,False
3,1,2,1,2,True
4,2,3,2,2,False
5,2,2,1,2,False
6,3,2,2,1,True
7,1,3,2,2,True
8,3,3,2,1,True
9,2,3,1,2,True
